## Dataset Creation (w/ Python BeautifulSoup)

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re

### Scraping Wikipedia info box into Python dictionary

#### Loading the webpage

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to a beautiful soup object
soup = bs(r.content)

#Print out the HTML
contents = soup.prettify()

In [3]:
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")

In [7]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

movie_info = {}
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
            continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value


            
movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release dates': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

### Getting info box for all movies in the list

In [8]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

soup = bs(r.content)

contents = soup.prettify()

In [9]:
movies = soup.select(".wikitable.sortable i")
movies[0:5]

[<i><a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">Academy Award Review of Walt Disney Cartoons</a></i>,
 <i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>,
 <i><a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a></i>,
 <i><a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a></i>,
 <i><a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a></i>]

In [6]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
        
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):  
    for tag in soup.find_all(["sup","span"]):
         tag.decompose() 
    
    
def get_info_box(url):
    
    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)
#     for tr in info_rows:
#         for tag in tr(['sup']):
#             tag.decompose()
    
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value

    return movie_info

In [44]:
get_info_box("https://en.wikipedia.org/wiki/Pirates_of_the_Caribbean:_On_Stranger_Tides")

{'title': 'Pirates of the Caribbean: On Stranger Tides',
 'Directed by': 'Rob Marshall',
 'Written by': ['Ted Elliott', 'Terry Rossio'],
 'Suggested by': 'On Stranger Tides by Tim Powers',
 'Based on': ['Characters by Ted Elliott Terry Rossio Stuart Beattie Jay Wolpert',
  'Ted Elliott',
  'Terry Rossio',
  'Stuart Beattie',
  'Jay Wolpert',
  "Walt Disney 's Pirates of the Caribbean"],
 'Produced by': 'Jerry Bruckheimer',
 'Starring': ['Johnny Depp',
  'Penélope Cruz',
  'Ian McShane',
  'Kevin R. McNally',
  'Geoffrey Rush'],
 'Cinematography': 'Dariusz Wolski',
 'Edited by': ['David Brenner', 'Wyatt Smith'],
 'Music by': 'Hans Zimmer',
 'Production companies': ['Walt Disney Pictures', 'Jerry Bruckheimer Films'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['May 7, 2011 ( Disneyland Resort )',
  'May 20, 2011 (United States)'],
 'Running time': '137 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': ['$410.6 million (gross)'

In [45]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title'] 
        
        movie_info_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
490
500
Elemental
'NoneType' object has no attribute 'find_all'
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
510
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
520
Shrunk
'NoneType' object has no attribute 'find'
Sister Act 3
'NoneType' object has no attribute 'find'
The Graveyard Book
'NoneType' object has no attribute 'find_all'
The Thief
'NoneType' object has no attribute 'find_all'
530
Tom Sawyer
'NoneType' object has no attribute 'find_all'
Tower of Terror


In [11]:
len(movie_info_list)

521

#### Saving/loading movie data

In [73]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [74]:
import json

def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [31]:
save_data("disney_data_cleaned.json", movie_info_list)

### Cleaning the data

In [8]:
movie_info_list = load_data("disney_data_cleaned.json")

#### Tasks
- ~~Clean up references [1]~~
- ~~Convert running time to integer~~
- ~~Convert dates into datetime object~~
- ~~Split up the long strings~~
- ~~Convert Budget & Box Office to integers~~

In [8]:
movie_info_list[-10]

{'title': '',
 'Music': 'Stephen Flaherty',
 'Lyrics': 'Lynn Ahrens',
 'Book': 'Lynn Ahrens',
 'Basis': ['My Love, My Love; or, The Peasant Girl', 'by', 'Rosa Guy'],
 'Productions': ['1990',
  'Broadway',
  '1994 UK/Europe',
  '1995',
  'West End',
  '2009 UK',
  'revival',
  '2017 Broadway revival',
  '2019 National Tour'],
 'Awards': ['1995',
  'Laurence Olivier Award for Best New Musical',
  '2018',
  'Tony Award for Best Revival of a Musical']}

In [4]:
#print([movie.get('Running time', 'N/A') for movie in movie_info_list])

In [10]:
def minutes_to_integer(running_time):
    if running_time == "N/A":
        return None
    if isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    else:
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie['Running time (mins)'] = minutes_to_integer(movie.get('Running time', "N/A"))


In [5]:
#print([movie.get('Budget', 'N/A') for movie in movie_info_list])

In [6]:
#print([movie.get('Running time (mins)', 'N/A') for movie in movie_info_list])

#### Converting Budget and Box office to float values

In [13]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

range_og = rf"\${number}(-|\sto\s|–)({number})\s({amounts})"
range_re1 = rf"\${number}(-|\sto\s|–)"
range_re2 = rf"({number})\s({amounts})"
range_re = rf"{range_re1}{range_re2}"
word_re = rf"\${number}(-|\sto\s)?({number})?\s({amounts})"
value_re = rf"\${number}"


def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def value_conversion(string):   
        v_string = re.search(number, string).group()
        value = float(v_string.replace(",", ""))
        return value

def range_value_conversion(string):
    trim = re.findall(range_re2, string, flags=re.I)
    value1 = value_conversion(string)
    value2 = float(trim[0][0])
    return ((value1 + value2)*0.5)

def parse_word_syntax(string):
    value = value_conversion(string)
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value
    
def parse_range_syntax(string):
    value = range_value_conversion(string)
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

def parse_value_syntax(string):
    return value_conversion(string)

'''
money_conversion("$12.2 million") --> 12200000 # Word syntax
money_conversion("$790,000") --> 790000 ## Value syntax
'''

def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]
        
    range_syntax = re.search(range_re,money, flags=re.I)
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)


    if range_syntax:
        return parse_range_syntax(range_syntax.group())

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    else:
        return None

In [14]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

In [43]:
money_conversion(movie_info_list[-44]['Budget'])

150000000.0

#### Converting Release date to datetime object

In [59]:
# Converting Release dates to datetime object
from datetime import datetime

# dates = [movie.get(['Release date', 'Release dates'], 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]

    if date == "N/A":
        return None

    date_str = clean_date(date)
    fmts = ["%B %d, %Y", "%d %B, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None


In [41]:
from datetime import datetime



def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion_complex(movies):
    date_title = rf"Release date(s?)"
 
    for movie in movies:
        date = movie.get(date_title, 'N/A')
        print(date)
#         return date


    if isinstance(date, list):
        date = date[0]

    if date == "N/A":
        return None

    date_str = clean_date(date)
    print(date_str)
    fmts = ["%B %d, %Y", "%d %B, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None


In [60]:
key1 = 'Release date'
key2 = 'Release dates'
for movie in movie_info_list:
    if key1 in movie.keys():
        movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))
    if key2 in movie.keys():
        movie['Release date (datetime)'] = date_conversion(movie.get('Release dates', 'N/A'))

#### Using Pickle to allow saving with datetime objects

In [27]:
# New method of saving data to allow saving of new data with datetime objects
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [2]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [67]:
save_data_pickle("disney_movie_data_cleaned_further.pickle", movie_info_list)

In [68]:
a = load_data_pickle("disney_movie_data_cleaned_further.pickle")

In [69]:
a == movie_info_list

True

### Attaching IMDB/Rotten Tomatoes/ Metacritic scores

In [4]:
movie_info_list = load_data_pickle('disney_movie_data_cleaned_further.pickle')

In [5]:
movie_info_list[-60]

{'title': 'Onward',
 'Directed by': 'Dan Scanlon',
 'Screenplay by': ['Dan Scanlon', 'Jason Headley', 'Keith Bunin'],
 'Story by': ['Dan Scanlon', 'Keith Bunin', 'Jason Headley'],
 'Produced by': 'Kori Rae',
 'Starring': ['Tom Holland',
  'Chris Pratt',
  'Julia Louis-Dreyfus',
  'Octavia Spencer'],
 'Cinematography': ['Sharon Calahan', 'Adam Habib'],
 'Edited by': 'Catherine Apple',
 'Music by': ['Mychael Danna', 'Jeff Danna'],
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['February 21, 2020 ( Berlinale )',
  'March 6, 2020 (United States)'],
 'Running time': '102 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$175–200 million',
 'Box office': '$141.9 million',
 'Running time (mins)': 102,
 'Budget (float)': 187500000.0,
 'Box office (float)': 141900000.0,
 'Release date (datetime)': datetime.datetime(2020, 2, 21, 0, 0)}

In [None]:
http://www.omdbapi.com/?apikey=[]&

In [1]:
import requests
import urllib
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": os.environ['OMDB_API_KEY'], 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']

In [19]:
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['IMDb rating'] = omdb_info.get('imdbRating', None)
    movie['Metascore'] = omdb_info.get('Metascore', None)
    movie['Rotten Tomatoes'] = get_rotten_tomato_score(omdb_info)

In [25]:
movie_info_list[-5]

{'title': 'The Aristocats',
 'Directed by': 'Wolfgang Reitherman',
 'Story by': ['Ken Anderson',
  'Larry Clemmons',
  'Eric Cleworth',
  'Vance Gerry',
  'Julius Svendsen',
  'Frank Thomas',
  'Ralph Wright'],
 'Based on': ['Tom McGowan', 'Tom Rowe'],
 'Produced by': ['Winston Hibler', 'Wolfgang Reitherman'],
 'Starring': ['Phil Harris',
  'Eva Gabor',
  'Sterling Holloway',
  'Scatman Crothers',
  'Paul Winchell',
  'Lord Tim Hudson',
  'Thurl Ravenscroft',
  'Dean Clark',
  'Liz English',
  'Gary Dubin'],
 'Edited by': 'Tom Acosta',
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release dates': ['December 11, 1970 (premiere)',
  'December 24, 1970 (United States)'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$4 million',
 'Box office': '$191 million',
 'Running time (mins)': 79,
 'Budget (float)': 4000000.0,
 'Box office (float)': 191000000.0,
 'Relea

In [28]:
save_data_pickle('disney_movie_data_final.pickle', movie_info_list)

### Saving data as JSON & CSV

In [65]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [70]:
movie_info_copy[-2]

{'title': 'The Rocketeer',
 'Directed by': 'Joe Johnston',
 'Screenplay by': ['Danny Bilson', 'Paul De Meo'],
 'Story by': ['Danny Bilson', 'Paul De Meo', 'William Dear'],
 'Based on': ['The Rocketeer', 'by', 'Dave Stevens'],
 'Produced by': ['Charles Gordon', 'Lawrence Gordon', 'Lloyd Levin'],
 'Starring': ['Bill Campbell',
  'Alan Arkin',
  'Jennifer Connelly',
  'Paul Sorvino',
  'Timothy Dalton'],
 'Cinematography': 'Hiro Narita',
 'Edited by': ['Michael A. Stevenson', 'Arthur Schmidt'],
 'Music by': 'James Horner',
 'Production companies': ['Walt Disney Pictures',
  'Touchstone Pictures',
  'Silver Screen Partners IV',
  'Gordon Company'],
 'Distributed by': 'Buena Vista Pictures Distribution',
 'Release date': ['June 21, 1991'],
 'Running time': '108 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$35-40 million',
 'Box office': '$46.7 million (USA)',
 'Running time (mins)': 108,
 'Budget (float)': 37500000.0,
 'Box office (float)': 46700000.0,
 'Releas

In [66]:
from datetime import datetime
date_key = 'Release date (datetime)'

for movie in movie_info_copy:
    if date_key in movie.keys():
        current_date = movie['Release date (datetime)']
        if current_date:
            movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
        else:
            movie['Release date (datetime)'] = None
    else:
        pass

In [75]:
save_data("disney_data_final.json", movie_info_copy)

#### Converting data to CSV

In [76]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [77]:
df.head()

Unnamed: 0,title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Running time (mins),Budget (float),...,Original title,Suggested by,Layouts by,Original concept by,Music,Lyrics,Book,Basis,Productions,Awards
0,Academy Award Review of,Walt Disney Productions,United Artists,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,RKO Radio Pictures,,83 minutes,United States,English,$418 million,83.0,1490000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,RKO Radio Pictures,,88 minutes,United States,English,$164 million,88.0,2600000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,RKO Radio Pictures,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),126.0,2280000.0,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,RKO Radio Pictures,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,...,,,,,,,,,,


In [79]:
df.to_csv("disney_movie_data_final.csv")

In [82]:
running_times = df.sort_values(['Running time (mins)'], ascending=True)
running_times.head()

Unnamed: 0,title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Running time (mins),Budget (float),...,Original title,Suggested by,Layouts by,Original concept by,Music,Lyrics,Book,Basis,Productions,Awards
287,Sacred Planet,Walt Disney Pictures,Buena Vista Pictures,"[April 22, 2004]",40 minutes,,English,"$1,108,356",40.0,,...,,,,,,,,,,
312,Roving Mars,,Buena Vista Pictures,"[January 27, 2006]",40 minutes,United States,English,$11 million,40.0,1000000.0,...,,,,,,,,,,
0,Academy Award Review of,Walt Disney Productions,United Artists,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,...,,,,,,,,,,
7,Saludos Amigos,Walt Disney Productions,RKO Radio Pictures,,42 minutes,United States,,$1.135 million (worldwide rentals),42.0,,...,,,,,,,,,,
131,A Tale of Two Critters,Walt Disney Productions,Buena Vista Distribution,"[June 20, 1977]",48 minutes,United States,English,,48.0,,...,,,,,,,,,,
