# Disney Web Scrape

### We will need to create datasets

To gather the disney movie information we will scrape the IMDB movie database. We will begin by importinging beautiful soup, pandas and requests.

In [1]:
import bs4
import pandas as pd
import requests

We will create a function that will extract the html on a webpage into a BeautifulSoup object.

In [2]:
def get_page_contents(url):
    page = requests.get(url, headers={"Accept-Language": "en-US"})
    return bs4.BeautifulSoup(page.text, "html.parser")

In [3]:
def numeric_value(movie, tag, class_=None, order=None):
    if order:
        if len(movie.findAll(tag, class_)) > 1:
            to_extract = movie.findAll(tag, class_)[order]['data-value']
        else:
            to_extract = None
    else:
        to_extract = movie.find(tag, class_)['data-value']

    return to_extract

In [4]:
def text_value(movie, tag, class_=None):
    if movie.find(tag, class_):
        return movie.find(tag, class_).text
    else:
        return

In [5]:
def nested_text_value(movie, tag_1, class_1, tag_2, class_2, order=None):
    if not order:
        try:
            return movie.find(tag_1, class_1).find(tag_2, class_2).text
        except:
            return ""
    else:
        return [val.text for val in movie.find(tag_1, class_1).findAll(tag_2, class_2)[order]]

In [6]:
def extract_attribute(soup, tag_1, class_1='', tag_2='', class_2='',
                      text_attribute=True, order=None, nested=False):
    movies = soup.findAll('div', class_='lister-item-content')
    data_list = []
    for movie in movies:
        if text_attribute:
            if nested:
                data_list.append(nested_text_value(movie, tag_1, class_1, tag_2, class_2, order))
            else:
                data_list.append(text_value(movie, tag_1, class_1))
        else:
            data_list.append(numeric_value(movie, tag_1, class_1, order))

    return data_list

In [7]:
def create_dict():
    title = extract_attribute(soup, 'a')
    release = extract_attribute(soup, 'span', 'lister-item-year text-muted unbold')
    audience_rating = extract_attribute(soup, 'span', 'certificate')
    runtime = extract_attribute(soup, 'span', 'runtime')
    genre = extract_attribute(soup, 'span', 'genre')
    imdb_rating = extract_attribute(soup, 'span', 'ipl-rating-star__rating', False)
    metascore = extract_attribute(soup, 'div', 'inline-block ratings-metascore', False)
    description = extract_attribute(soup, 'p',"")
    
    movies = soup.findAll('div', class_='lister-item-content')
    imdb_id = []
    for movie in movies:
        imdb_id.append(soup.find('h3').a['href'].split('/')[2])
        
    movies = soup.findAll('div', class_='lister-item-content')
    votes = []
    earnings = []
    
    for movie in movies:
        movie_numbers = movie.findAll("span",attrs={"name": "nv"})
        if len(movie_numbers) == 2:
            votes.append(movie_numbers[0].text)
            earnings.append(movie_numbers[1].text)
        elif len(movie_numbers) == 1:
            votes.append(movie_numbers[0].text)
            earnings.append("")
        else:
            votes.append("")
            earnings.append("")
  
            
    
    df_dict = {'IMDB ID': imdb_id, 'Title': title, 'Year': release, 'Audience Rating': audience_rating,
           'Runtime': runtime, 'Genre': genre, 'IMDB Rating': imdb_rating,
           'Votes': votes, 'Box Office Earnings': earnings, 'Description' : description, 'Metascore': metascore}
    
    return df_dict

We will now run the function and connect to the IMDB Disney animations list I created. Contains all movies from all disney studios from Snow White and the Seven Dwarves (1937) to Onward (2020).

In [8]:
soup = get_page_contents('https://www.imdb.com/list/ls085145473/')

df_dict2 = {}
df_dict2.update(create_dict())

In [9]:
df = pd.DataFrame(df_dict2)
df

Unnamed: 0,IMDB ID,Title,Year,Audience Rating,Runtime,Genre,IMDB Rating,Votes,Box Office Earnings,Description,Metascore
0,tt0029583,Snow White and the Seven Dwarfs,(1937),Approved,83 min,"\nAnimation, Family, Fantasy",7.6,176896,$184.93M,\n Exiled into the dangerous forest by her ...,\n95 \n Metascore\n
1,tt0029583,Pinocchio,(1940),G,88 min,"\nAnimation, Comedy, Family",7.4,127391,$84.25M,"\n A living puppet, with the help of a cric...",\n99 \n Metascore\n
2,tt0029583,Fantasia,(1940),G,125 min,"\nAnimation, Family, Fantasy",7.8,86686,$76.41M,\n A collection of animated interpretations...,\n96 \n Metascore\n
3,tt0029583,The Reluctant Dragon,(I) (1941),Approved,74 min,"\nAnimation, Comedy, Family",6.9,2605,$0.87M,\n Humorist Robert Benchley learns about th...,
4,tt0029583,Dumbo,(1941),G,64 min,"\nAnimation, Drama, Family",7.2,116679,$1.60M,"\n Ridiculed because of his enormous ears, ...",\n96 \n Metascore\n
5,tt0029583,Bambi,(1942),G,70 min,"\nAnimation, Drama, Family",7.3,128039,$102.80M,\n The story of a young deer growing up in ...,\n91 \n Metascore\n
6,tt0029583,Saludos Amigos,(1942),Approved,42 min,"\nAnimation, Short, Adventure",6.1,5108,,\n Disney animators tour South America and ...,\n60 \n Metascore\n
7,tt0029583,Victory Through Air Power,(1943),,70 min,"\nDocumentary, Animation, History",6.6,846,$0.80M,\n An animated documentary promoting of the...,
8,tt0029583,The Three Caballeros,(1944),Approved,71 min,"\nAnimation, Comedy, Family",6.4,11917,,"\n Donald receives his birthday gifts, whic...",\n85 \n Metascore\n
9,tt0029583,Make Mine Music,(1946),Approved,75 min,"\nAnimation, Adventure, Comedy",6.3,3588,,\n Animation done to contemporary popular m...,\n60 \n Metascore\n


The first page looks good. There are a total of 749 titles. We will pull all 8 pages of the list. Then we will combine them.

In [10]:
soup = get_page_contents('https://www.imdb.com/list/ls085145473/?sort=list_order,asc&st_dt=&mode=detail&page=2')

df_dict2 = {}
df_dict2.update(create_dict())
df2 = pd.DataFrame(df_dict2)

In [11]:
soup = get_page_contents('https://www.imdb.com/list/ls085145473/?sort=list_order,asc&st_dt=&mode=detail&page=3')

df_dict2 = {}
df_dict2.update(create_dict())
df3 = pd.DataFrame(df_dict2)

In [12]:
soup = get_page_contents('https://www.imdb.com/list/ls085145473/?sort=list_order,asc&st_dt=&mode=detail&page=4')

df_dict2 = {}
df_dict2.update(create_dict())
df4 = pd.DataFrame(df_dict2)

In [13]:
soup = get_page_contents('https://www.imdb.com/list/ls085145473/?sort=list_order,asc&st_dt=&mode=detail&page=5')

df_dict2 = {}
df_dict2.update(create_dict())
df5 = pd.DataFrame(df_dict2)

In [14]:
soup = get_page_contents('https://www.imdb.com/list/ls085145473/?sort=list_order,asc&st_dt=&mode=detail&page=6')

df_dict2 = {}
df_dict2.update(create_dict())
df6 = pd.DataFrame(df_dict2)

In [15]:
soup = get_page_contents('https://www.imdb.com/list/ls085145473/?sort=list_order,asc&st_dt=&mode=detail&page=7')

df_dict2 = {}
df_dict2.update(create_dict())
df7 = pd.DataFrame(df_dict2)

In [16]:
soup = get_page_contents('https://www.imdb.com/list/ls085145473/?sort=list_order,asc&st_dt=&mode=detail&page=8')

df_dict2 = {}
df_dict2.update(create_dict())
df8 = pd.DataFrame(df_dict2)

In [38]:
combined_data = pd.concat([df, df2, df3, df4, df5, df6, df7, df8], ignore_index = True, sort=True)

Now that we have a full list we will clean the data. We will rename the headers and sort them in the order we desire.

In [39]:
clean_df = combined_data

In [40]:
clean_df['Year'] = clean_df['Year'].str.slice(start=-5, stop=-1)

In [41]:
clean_df['IMDB Rating'] = clean_df['IMDB Rating'].str.replace('\n',"")

In [42]:
clean_df['Metascore'] = clean_df['Metascore'].str.replace('\n',"")
clean_df['Metascore'] = clean_df['Metascore'].str.replace('Metascore',"")

In [43]:
clean_df['Runtime'] = clean_df['Runtime'].str.replace('min',"")

In [44]:
clean_df['Genre'] = clean_df['Genre'].str.slice(start=1)

In [45]:
clean_df.head()

Unnamed: 0,Audience Rating,Box Office Earnings,Description,Genre,IMDB ID,IMDB Rating,Metascore,Runtime,Title,Votes,Year
0,Approved,$184.93M,\n Exiled into the dangerous forest by her ...,"Animation, Family, Fantasy",tt0029583,7.6,95.0,83,Snow White and the Seven Dwarfs,176896,1937
1,G,$84.25M,"\n A living puppet, with the help of a cric...","Animation, Comedy, Family",tt0029583,7.4,99.0,88,Pinocchio,127391,1940
2,G,$76.41M,\n A collection of animated interpretations...,"Animation, Family, Fantasy",tt0029583,7.8,96.0,125,Fantasia,86686,1940
3,Approved,$0.87M,\n Humorist Robert Benchley learns about th...,"Animation, Comedy, Family",tt0029583,6.9,,74,The Reluctant Dragon,2605,1941
4,G,$1.60M,"\n Ridiculed because of his enormous ears, ...","Animation, Drama, Family",tt0029583,7.2,96.0,64,Dumbo,116679,1941


In [46]:
clean_df = clean_df[['IMDB ID', 'Title', 'Year', 'Genre', 'Audience Rating', 'Description', 'Box Office Earnings', 'Metascore', 'IMDB Rating', 'Votes']]

Now that the cleaning is done we will save it as a csv file.

In [47]:
save_path = r"C:\Users\Basil\Documents\Data Science\Projects\20200521 Disney\1. Original Data\Walt Disney Studios.csv"
clean_df.to_csv(save_path)