# Pixar Web Scrape

### We will need to create a data set

To gather the Pixar movie information we will scrape the IMDB movie database. We will begin by importinging beautiful soup, pandas and requests.

In [1]:
import bs4
import pandas as pd
import requests
import numpy as np

We will create a function that will extract the html on a webpage into a BeautifulSoup object.

In [2]:
def get_page_contents(url):
    page = requests.get(url, headers={"Accept-Language": "en-US"})
    return bs4.BeautifulSoup(page.text, "html.parser")

Next we will create a few more functions to scrape numeric movie data, text values and nested values.

In [3]:
def numeric_value(movie, tag, class_=None, order=None):
    if order:
        if len(movie.findAll(tag, class_)) > 1:
            to_extract = movie.find_all(tag, class_)[order]['data-value']
        else:
            to_extract = None
    else:
        to_extract = movie.find(tag, class_)['data-value']

    return to_extract

In [4]:
def text_value(movie, tag, class_=None):
    if movie.find(tag, class_):
        return movie.find(tag, class_).text
    else:
        return

In [5]:
def nested_text_value(movie, tag_1, class_1, tag_2, class_2, order=None):
    if not order:
        try:
            return movie.find(tag_1, class_1).find(tag_2, class_2).text
        except:
            return ""
    else:
        return [val.text for val in movie.find(tag_1, class_1).findAll(tag_2, class_2)[order]]

We will now create a function that will run the previous functions depending on need.

In [6]:
def extract_attribute(soup, tag_1, class_1='', tag_2='', class_2='',
                      text_attribute=True, order=None, nested=False):
    movies = soup.findAll('div', class_='lister-item-content')
    data_list = []
    for movie in movies:
        if text_attribute:
            if nested:
                data_list.append(nested_text_value(movie, tag_1, class_1, tag_2, class_2, order))
            else:
                data_list.append(text_value(movie, tag_1, class_1))
        else:
            data_list.append(numeric_value(movie, tag_1, class_1, order))

    return data_list

Our final function will create a dictionary with the information scraped as well as two functions to handle specific cases not covered by our previous functions.

In [30]:
def create_dict():
    title = extract_attribute(soup, 'a')
    release = extract_attribute(soup, 'span', 'lister-item-year text-muted unbold')
    audience_rating = extract_attribute(soup, 'span', 'certificate')
    runtime = extract_attribute(soup, 'span', 'runtime')
    genre = extract_attribute(soup, 'span', 'genre')
    imdb_rating = extract_attribute(soup, 'div', 'inline-block ratings-imdb-rating', False)
    metascore = extract_attribute(soup, 'div', 'inline-block ratings-metascore', False)
    directors = extract_attribute(soup, 'p', '', 'a', '', True, 0, True)
    actors = extract_attribute(soup, 'p', '', 'a', '', True, slice(1, 5, None), True)
    
    movies = soup.findAll('div', class_='lister-item-content')
    imdb_id = []
    for movie in movies:
        imdb_id.append(soup.find('h3').a['href'].split('/')[2])
    
    movies = soup.findAll('div', class_='lister-item-content')
    description = []
    for movie in movies:
        description.append(movie.findAll('p', class_='text-muted')[-1].text.lstrip())
        
    movies = soup.findAll('div', class_='lister-item-content')
    votes = []
    earnings = []
    
    for movie in movies:
        movie_numbers = movie.findAll("span",attrs={"name": "nv"})
        if len(movie_numbers) == 2:
            votes.append(movie_numbers[0].text)
            earnings.append(movie_numbers[1].text)
        elif len(movie_numbers) == 1:
            votes.append(movie_numbers[0].text)
            earnings.append("")
        else:
            votes.append("")
            earnings.append("")
  
            
    
    df_dict = {'IMDB ID': imdb_id, 'Title': title, 'Year': release, 'Audience Rating': audience_rating,
           'Runtime': runtime, 'Genre': genre, 'IMDB Rating': imdb_rating,
           'Votes': votes, 'Box Office Earnings': earnings, 'Description' : description, 'Metascore': metascore, 'Director': directors,
           'Actors': actors}
    
    return df_dict

We will now run the get_page_contents function and connect to the IMDB Pixar animations studios movie search pages 1 and 2.

In [31]:
soup = get_page_contents('https://www.imdb.com/search/title/?companies=co0017902&ref_=adv_prv')

df_dict2 = {}
df_dict2.update(create_dict())

In [32]:
df = pd.DataFrame(df_dict2)
df

Unnamed: 0,IMDB ID,Title,Year,Audience Rating,Runtime,Genre,IMDB Rating,Votes,Box Office Earnings,Description,Metascore,Director,Actors
0,tt7146812,Onward,(I) (2020),PG,102 min,"\nAnimation, Adventure, Comedy",\n\n7.4\n,72544.0,$61.56M,Two elven brothers embark on a quest to bring ...,\n61 \n Metascore\n,Dan Scanlon,"[Tom Holland, Chris Pratt, Julia Louis-Dreyfus..."
1,tt7146812,Baby Driver,(2017),R,113 min,"\nAction, Crime, Drama",\n\n7.6\n,418868.0,$107.83M,After being coerced into working for a crime b...,\n86 \n Metascore\n,Edgar Wright,"[Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Go..."
2,tt7146812,Toy Story 4,(2019),G,100 min,"\nAnimation, Adventure, Comedy",\n\n7.8\n,186486.0,$434.04M,"When a new toy called ""Forky"" joins Woody and ...",\n84 \n Metascore\n,Josh Cooley,"[Tom Hanks, Tim Allen, Annie Potts, Tony Hale]"
3,tt7146812,Cars,(2006),G,117 min,"\nAnimation, Comedy, Family",\n\n7.1\n,356204.0,$244.08M,A hot-shot race-car named Lightning McQueen ge...,\n73 \n Metascore\n,John Lasseter,"[Joe Ranft, Owen Wilson, Bonnie Hunt, Paul New..."
4,tt7146812,Coco,(I) (2017),PG,105 min,"\nAnimation, Adventure, Family",\n\n8.4\n,348237.0,$209.73M,"Aspiring musician Miguel, confronted with his ...",\n81 \n Metascore\n,Lee Unkrich,"[Adrian Molina, Anthony Gonzalez, Gael García ..."
5,tt7146812,Ratatouille,(2007),G,111 min,"\nAnimation, Adventure, Comedy",\n\n8.0\n,622146.0,$206.45M,A rat who can cook makes an unusual alliance w...,\n96 \n Metascore\n,Brad Bird,"[Jan Pinkava, Brad Garrett, Lou Romano, Patton..."
6,tt7146812,Inside Out,(I) (2015),PG,95 min,"\nAnimation, Adventure, Comedy",\n\n8.1\n,589328.0,$356.46M,After young Riley is uprooted from her Midwest...,\n94 \n Metascore\n,Pete Docter,"[Ronnie Del Carmen, Amy Poehler, Bill Hader, L..."
7,tt7146812,Brave,(2012),PG,93 min,"\nAnimation, Adventure, Comedy",\n\n7.1\n,362927.0,$237.28M,"Determined to make her own path in life, Princ...",\n69 \n Metascore\n,Mark Andrews,"[Brenda Chapman, Steve Purcell, Kelly Macdonal..."
8,tt7146812,The Incredibles 2,(2018),PG,118 min,"\nAnimation, Action, Adventure",\n\n7.6\n,237620.0,$608.58M,The Incredibles hero family takes on a new mis...,\n80 \n Metascore\n,Brad Bird,"[Craig T. Nelson, Holly Hunter, Sarah Vowell, ..."
9,tt7146812,The Incredibles,(2004),PG,115 min,"\nAnimation, Action, Adventure",\n\n8.0\n,640457.0,$261.44M,"A family of undercover superheroes, while tryi...",\n90 \n Metascore\n,Brad Bird,"[Craig T. Nelson, Samuel L. Jackson, Holly Hun..."


In [None]:
soup = get_page_contents('https://www.imdb.com/search/title/?companies=co0017902&start=51&ref_=adv_nxt')

df_dict2 = {}
df_dict2.update(create_dict())

In [None]:
df2 = pd.DataFrame(df_dict2)
df2

We will now combine the data from the two pages.

In [None]:
combined_data = pd.concat([df, df2], ignore_index = True, sort=True)

In [None]:
clean_df = combined_data

We will now clean the data.

In [None]:
clean_df['Year'] = clean_df['Year'].str.slice(start=-5, stop=-1)

In [None]:
clean_df['IMDB Rating'] = clean_df['IMDB Rating'].str.replace('\n',"")

In [None]:
clean_df['Metascore'] = clean_df['Metascore'].str.replace('\n',"")
clean_df['Metascore'] = clean_df['Metascore'].str.replace('Metascore',"")

In [None]:
clean_df['Runtime'] = clean_df['Runtime'].str.replace('min',"")

In [None]:
clean_df['Genre'] = clean_df['Genre'].str.slice(start=1)

In [None]:
clean_df['Studio'] = "Pixar Animation Studio"

In [None]:
clean_df.head()

In [None]:
clean_df = clean_df[['IMDB ID', 'Title', 'Year', 'Genre', 'Audience Rating', 'Description', 'Studio', 'Director', 'Actors', 'Box Office Earnings', 'Metascore', 'IMDB Rating', 'Votes']]

Now that we are done we will save the file.

In [None]:
save_path = r"C:\Users\Basil\Documents\Data Science\Projects\20200521 Disney\1. Original Data\Pixar Animation Studios.csv"
clean_df.to_csv(save_path)