# Pixar Web Scrape

### We will need to create a data set

To gather the Pixar movie information we will scrape the IMDB movie database. We will begin by importinging beautiful soup, pandas and requests.

In [1]:
import bs4
import pandas as pd
import requests
import numpy as np

We will create a function that will extract the html on a webpage into a BeautifulSoup object.

In [2]:
def get_page_contents(url):
    page = requests.get(url, headers={"Accept-Language": "en-US"})
    return bs4.BeautifulSoup(page.text, "html.parser")

Next we will create a few more functions to scrape numeric movie data, text values and nested values.

In [3]:
def numeric_value(movie, tag, class_=None, order=None):
    if order:
        if len(movie.findAll(tag, class_)) > 1:
            to_extract = movie.find_all(tag, class_)[order]['data-value']
        else:
            to_extract = None
    else:
        to_extract = movie.find(tag, class_)['data-value']

    return to_extract

In [4]:
def text_value(movie, tag, class_=None):
    if movie.find(tag, class_):
        return movie.find(tag, class_).text
    else:
        return

In [5]:
def nested_text_value(movie, tag_1, class_1, tag_2, class_2, order=None):
    if not order:
        try:
            return movie.find(tag_1, class_1).find(tag_2, class_2).text
        except:
            return ""
    else:
        return [val.text for val in movie.find(tag_1, class_1).findAll(tag_2, class_2)[order]]

We will now create a function that will run the previous functions depending on need.

In [6]:
def extract_attribute(soup, tag_1, class_1='', tag_2='', class_2='',
                      text_attribute=True, order=None, nested=False):
    movies = soup.findAll('div', class_='lister-item-content')
    data_list = []
    for movie in movies:
        if text_attribute:
            if nested:
                data_list.append(nested_text_value(movie, tag_1, class_1, tag_2, class_2, order))
            else:
                data_list.append(text_value(movie, tag_1, class_1))
        else:
            data_list.append(numeric_value(movie, tag_1, class_1, order))

    return data_list

Our final function will create a dictionary with the information scraped as well as two functions to handle specific cases not covered by our previous functions.

In [7]:
def create_dict():
    title = extract_attribute(soup, 'a')
    release = extract_attribute(soup, 'span', 'lister-item-year text-muted unbold')
    audience_rating = extract_attribute(soup, 'span', 'certificate')
    runtime = extract_attribute(soup, 'span', 'runtime')
    genre = extract_attribute(soup, 'span', 'genre')
    imdb_rating = extract_attribute(soup, 'div', 'inline-block ratings-imdb-rating', False)
    metascore = extract_attribute(soup, 'div', 'inline-block ratings-metascore', False)
    directors = extract_attribute(soup, 'p', '', 'a', '', True, 0, True)
    actors = extract_attribute(soup, 'p', '', 'a', '', True, slice(1, 5, None), True)
    
    movies = soup.findAll('div', class_='lister-item-content')
    imdb_id = []
    for movie in movies:
        imdb_id.append(soup.find('h3').a['href'].split('/')[2])
    
    movies = soup.findAll('div', class_='lister-item-content')
    description = []
    for movie in movies:
        description.append(movie.findAll('p', class_='text-muted')[-1].text.lstrip())
        
    movies = soup.findAll('div', class_='lister-item-content')
    votes = []
    earnings = []
    
    for movie in movies:
        movie_numbers = movie.findAll("span",attrs={"name": "nv"})
        if len(movie_numbers) == 2:
            votes.append(movie_numbers[0].text)
            earnings.append(movie_numbers[1].text)
        elif len(movie_numbers) == 1:
            votes.append(movie_numbers[0].text)
            earnings.append("")
        else:
            votes.append("")
            earnings.append("")
  
            
    
    df_dict = {'IMDB ID': imdb_id, 'Title': title, 'Year': release, 'Audience Rating': audience_rating,
           'Runtime': runtime, 'Genre': genre, 'IMDB Rating': imdb_rating,
           'Votes': votes, 'Box Office Earnings': earnings, 'Description' : description, 'Metascore': metascore, 'Director': directors,
           'Actors': actors}
    
    return df_dict

We will now run the get_page_contents function and connect to a IMDB pixar movie list.

In [8]:
soup = get_page_contents('https://www.imdb.com/list/ls087116422/')

df_dict2 = {}
df_dict2.update(create_dict())

In [9]:
df = pd.DataFrame(df_dict2)
df

Unnamed: 0,IMDB ID,Title,Year,Audience Rating,Runtime,Genre,IMDB Rating,Votes,Box Office Earnings,Description,Metascore,Director,Actors
0,tt0114709,Toy Story,(1995),G,81 min,"\nAnimation, Adventure, Comedy",,860844,$191.80M,"Votes:\n860,844\n| Gross:\n$191.80M\n",\n95 \n Metascore\n,,[]
1,tt0114709,Toy Story 2,(1999),G,92 min,"\nAnimation, Adventure, Comedy",,512617,$245.85M,"Votes:\n512,617\n| Gross:\n$245.85M\n",\n88 \n Metascore\n,,[]
2,tt0114709,Toy Story 3,(2010),G,103 min,"\nAnimation, Adventure, Comedy",,736991,$415.00M,"Votes:\n736,991\n| Gross:\n$415.00M\n",\n92 \n Metascore\n,,[]
3,tt0114709,Toy Story 4,(2019),G,100 min,"\nAnimation, Adventure, Comedy",,186520,$434.04M,"Votes:\n186,520\n| Gross:\n$434.04M\n",\n84 \n Metascore\n,,[]
4,tt0114709,A Bug's Life,(1998),G,95 min,"\nAnimation, Adventure, Comedy",,260535,$162.80M,"Votes:\n260,535\n| Gross:\n$162.80M\n",\n77 \n Metascore\n,,[]
5,tt0114709,The Incredibles,(2004),PG,115 min,"\nAnimation, Action, Adventure",,640491,$261.44M,"Votes:\n640,491\n| Gross:\n$261.44M\n",\n90 \n Metascore\n,,[]
6,tt0114709,The Incredibles 2,(2018),PG,118 min,"\nAnimation, Action, Adventure",,237642,$608.58M,"Votes:\n237,642\n| Gross:\n$608.58M\n",\n80 \n Metascore\n,,[]
7,tt0114709,Finding Nemo,(2003),G,100 min,"\nAnimation, Adventure, Comedy",,924523,$380.84M,"Votes:\n924,523\n| Gross:\n$380.84M\n",\n90 \n Metascore\n,,[]
8,tt0114709,Finding Dory,(2016),PG,97 min,"\nAnimation, Adventure, Comedy",,232355,$486.30M,"Votes:\n232,355\n| Gross:\n$486.30M\n",\n77 \n Metascore\n,,[]
9,tt0114709,Inside Out,(I) (2015),PG,95 min,"\nAnimation, Adventure, Comedy",,589378,$356.46M,"Votes:\n589,378\n| Gross:\n$356.46M\n",\n94 \n Metascore\n,,[]


We will now clean the data.

In [10]:
clean_df = df

In [11]:
clean_df['Year'] = clean_df['Year'].str.slice(start=-5, stop=-1)

In [12]:
clean_df['IMDB Rating'] = clean_df['IMDB Rating'].str.replace('\n',"")

In [13]:
clean_df['Metascore'] = clean_df['Metascore'].str.replace('\n',"")
clean_df['Metascore'] = clean_df['Metascore'].str.replace('Metascore',"")

In [14]:
clean_df['Runtime'] = clean_df['Runtime'].str.replace('min',"")

In [15]:
clean_df['Genre'] = clean_df['Genre'].str.slice(start=1)

In [16]:
clean_df['Studio'] = "Pixar Animation Studio"

In [17]:
clean_df.head()

Unnamed: 0,IMDB ID,Title,Year,Audience Rating,Runtime,Genre,IMDB Rating,Votes,Box Office Earnings,Description,Metascore,Director,Actors,Studio
0,tt0114709,Toy Story,1995,G,81,"Animation, Adventure, Comedy",,860844,$191.80M,"Votes:\n860,844\n| Gross:\n$191.80M\n",95,,[],Pixar Animation Studio
1,tt0114709,Toy Story 2,1999,G,92,"Animation, Adventure, Comedy",,512617,$245.85M,"Votes:\n512,617\n| Gross:\n$245.85M\n",88,,[],Pixar Animation Studio
2,tt0114709,Toy Story 3,2010,G,103,"Animation, Adventure, Comedy",,736991,$415.00M,"Votes:\n736,991\n| Gross:\n$415.00M\n",92,,[],Pixar Animation Studio
3,tt0114709,Toy Story 4,2019,G,100,"Animation, Adventure, Comedy",,186520,$434.04M,"Votes:\n186,520\n| Gross:\n$434.04M\n",84,,[],Pixar Animation Studio
4,tt0114709,A Bug's Life,1998,G,95,"Animation, Adventure, Comedy",,260535,$162.80M,"Votes:\n260,535\n| Gross:\n$162.80M\n",77,,[],Pixar Animation Studio


In [18]:
clean_df = clean_df[['IMDB ID', 'Title', 'Year', 'Genre', 'Audience Rating', 'Description', 'Studio', 'Director', 'Actors', 'Box Office Earnings', 'Metascore', 'IMDB Rating', 'Votes']]

Now that we are done we will save the file.

In [19]:
save_path = r"C:\Users\Basil\Documents\Data Science\Projects\20200521 Disney\1. Original Data\Pixar Animation Studios.csv"
clean_df.to_csv(save_path)