## Disney Data set Creation (w/ Python Beautiful Soup)

#### Scrape and clean a list of disney wikidpedia pages to create a data set for further analysis

### Tutorial by: Keith Galli
[Tutorial YouTube Link](https://www.youtube.com/watch?v=Ewgy-G9cmbg&list=PLFCB5Dp81iNVmuoGIqcT5oF4K-7kTI5vp&index=2)


- robots.txt = tells us what we can do on a website

### Import the required libraries

In [5]:
import requests
from bs4 import BeautifulSoup as bs

import pandas as pd
import json
import pickle

In [19]:
url = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
url_ts = "https://en.wikipedia.org/wiki/Toy_Story_3"

In [20]:
r = requests.get(url_ts)

# Convert to beautiful soup object
soup = bs(r.content)

content = soup.prettify()


In [21]:
info_box = soup.find(class_ = "infobox vevent")
info_rows = info_box.find_all("tr")


In [212]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", "") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", "")

movie_info = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info["title"] = row.find("th").get_text(" ", strip=True)
    elif index == 1: 
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value
        




## 2. Get the info box for all movies

In [35]:
r = requests.get(url)

# Convert to beautiful soup object
soup = bs(r.content)

content = soup.prettify()

movies = soup.select(".wikitable.sortable i")

In [40]:
print(movies[0].a["href"])
print()
print(movies[0].a["title"])

/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons

Academy Award Review of Walt Disney Cartoons


In [70]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", "") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", "")
    
def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()    
    
def get_info_box(url):
    
    r = requests.get(url)
    soup = bs(r.content)
    
    info_box = soup.find(class_ = "infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info["title"] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
            
    return movie_info


In [71]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path  = "https://en.wikipedia.org"

movie_info_list = []

for index, movie in enumerate(movies):
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
        
        
    except Exception as e:
            print(movie.get_text())
            print(e)
        
def clean_tags(soup):
    for tag in soup.find_all("sup"):
        tag.decompose()

Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
True-Life Adventures
'NoneType' object has no attribute 'find_all'


## Save / Reload Movie Data

In [2]:
import json

def save_data(title, data):
    with open(title, 'w', encoding="utf-8")as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        

In [3]:

def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [74]:
save_data("disney_data_cleaned.json", movie_info_list)

# 3. Cleaning the scraped data

- ~~Clean up references~~
- ~~Convert running time into an integer~~
- Convert dates into datetime object
- ~~SPlit up the long strings~~
- ~~Convert budget and box office to numbers~~

### Clean up references (remove [1] [2] etc.)

### Split up the long strings

### Convert running time into an interger

In [77]:
#[movie.get("Running time", "N/A") for movie in movie_info_list]

In [83]:
def minutes_to_integer(running_time):
    if running_time == "N/A":
        return None
    
    if isinstance(running_time, list):
        entry = running_time[0]
        return entry.split(" ")[0]
    else:
        return running_time.split(" ")[0]
    

for movie in movie_info_list:
    movie["Running time (int)"] = minutes_to_integer(movie.get("Running time", "N/A"))

In [84]:
print([movie.get("Running time (int)", "N/A") for movie in movie_info_list])

['41', '83', '88', '126', '74', '64', '70', '42', '65', '71', '75', '94', '73', '75', '82', '68', '74', '96', '75', '84', '77', '92', '69', '81', '60', '127', '92', '76', '75', '73', '85', '81', '70', '90', '80', '75', '83', '83', '72', '97', '75', '104', '93', '105', '95', '97', '134', '69', '92', '126', '79', '97', '128', '74', '91', '105', '98', '130', '89', '93', '67', '98', '100', '118', '103', '110', '80', '79', '91', '91', '97', '118', '139', '92', '131', '87', '116', '93', '110', '110', '131', '101', '108', '84', '78', '75', '164', '106', '110', '99', '113', '108', '112', '93', '91', '93', '100', '100', '79', '96', '113', '89', '118', '92', '88', '92', '87', '93', '93', '93', '90', '83', '96', '88', '89', '91', '93', '92', '97', '100', '100', '89', '91', '112', '115', '95', '91', '95', '104', '74', '48', '77', '104', '128', '101', '94', '104', '90', '100', '88', '93', '98', '100', '112', '84', '98', '97', '114', '96', '100', '109', '83', '90', '107', '96', '103', '91', '95', '1

### Convert budget and box office to numbers

In [175]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"

def word_to_value(word):
    
    value_dict = {"thousand" : 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value * word_value
    

def parse_value_syntax(string):
        
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value


def extract_number(money):
    
    if money == "N/A":
        return None
    
    if isinstance(money, list):
        money = money[0]
    
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)
    
    if word_syntax:
        return parse_word_syntax(word_syntax.group())
        
    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    
    else:
        return None

    
 

In [176]:
print(extract_number("$245 Billion"))
    

245000000000.0


In [177]:
for movie in movie_info_list:
    movie["Budget Value ($)"] = extract_number(movie.get("Budget", "N/A"))
    movie["Box Office ($)"] = extract_number(movie.get("Box office", "N/A"))

In [178]:
movie_info_list[-41]

{'title': 'A Wrinkle in Time',
 'Directed by': 'Ava DuVernay',
 'Produced by': ['Jim Whitaker', 'Catherine Hand'],
 'Screenplay by': ['Jennifer Lee', 'Jeff Stockwell'],
 'Based on': ['A Wrinkle in Time', 'by', "Madeleine L'Engle"],
 'Starring': ['Oprah Winfrey',
  'Reese Witherspoon',
  'Mindy Kaling',
  'Levi Miller',
  'Storm Reid',
  'Gugu Mbatha-Raw',
  'Michael Peña',
  'Zach Galifianakis',
  'Chris Pine'],
 'Music by': 'Ramin Djawadi',
 'Cinematography': 'Tobias A. Schliessler',
 'Edited by': 'Spencer Averick',
 'Production companies': ['Walt Disney Pictures', 'Whitaker Entertainment'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['February26,2018 ( El Capitan Theatre )',
  'March9,2018 (United States)'],
 'Running time': '109 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$100–130 million',
 'Box office': '$133.4 million',
 'Running time (int)': '109',
 'Budget Value ($)': 100000000.0,
 'Box Office ($)': 133400000.0

## Convert dates into datetimes

In [39]:
# Convert i.e. June 28, 1940
from datetime import datetime

In [38]:
dates = [movie.get("Release date", "N/A") for movie in movie_info_list]


def clean_date(date):
    return date.split("(")[0].strip()


def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == "N/A":
        return None
    
        
    date_str = clean_date(date)
    fmts = ["%B, %d, %Y", "%B%d,%Y", "%d%B%Y"]
    
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None
    
    


In [197]:
for movie in movie_info_list:
    movie["Relase date (datetime)"] = date_conversion(movie.get("Release date", "N\A"))

In [199]:
movie_info_list[23]

{'title': 'Rob Roy, the Highland Rogue',
 'Directed by': 'Harold French',
 'Produced by': ['Perce Pearce', 'Walt Disney'],
 'Written by': 'Lawrence Edward Watkin',
 'Starring': ['Richard Todd',
  'Glynis Johns',
  'James Robertson Justice',
  'Michael Gough',
  'Finlay Currie',
  'Geoffrey Keen'],
 'Music by': 'Cedric Thorpe Davie',
 'Cinematography': 'Guy Green',
 'Edited by': 'Geoffrey Foot',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['26October1953 (Premiere- London )', '4February1954 ( US )'],
 'Running time': '81 minutes',
 'Country': ['United Kingdom', 'United States'],
 'Language': 'English',
 'Budget': '$1.8 million',
 'Running time (int)': '81',
 'Budget Value ($)': 1800000.0,
 'Box Office ($)': None,
 'Relase date (datetime)': datetime.datetime(1953, 10, 26, 0, 0)}

In [202]:
#save_data("disney_movie_data_cleaned201", movie_info_list)

In [33]:
# Save the object using 'pickle'

import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)
    


In [34]:
def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)
    

In [209]:
save_data_pickle('disney_movie_data_cleaned_201.pickle', movie_info_list)

In [46]:
movie_info_list = load_data_pickle('disney_movie_data_cleaned_201.pickle')


## Attach IMBD/Rotten Tomatoes Scores

In [1]:
# OMDb API: 
import os
import urllib

api_key = os.environ.get("PRIVATE_API_KEY")


In [2]:
OMDb = "http://www.omdbapi.com/?apikey=[ ]&"
base_url = "http://www.omdbapi.com/"

#os.environ.get['OMDB_API_KEY']


def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    # Place your key into an environment variable then access it below:
    parameters = {"apikey" : api_key, 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get("Ratings", [])
    for rating in ratings:
        if rating["Source"] == "Rotteon Tomatoes":
            return rating["Value"]
    return None
    

In [3]:
for movie in movie_info_list:
    title = movie["title"]
    omdb_info = get_omdb_info(title)
    movie["imdb"] = omdb_info.get("imdbRating", None)
    movie["metascore"] = omdb_info.get("Metascore", None)
    movie["rotten_tomatoes"] = get_rotteon_tomato_score(omdb_info)

NameError: name 'movie_info_list' is not defined

In [51]:
movie_info_list[3]

{'title': 'Fantasia',
 'Directed by': ['Samuel Armstrong',
  'James Algar',
  'Bill Roberts',
  'Paul Satterfield',
  'Ben Sharpsteen',
  'David D. Hand',
  'Hamilton Luske',
  'Jim Handley',
  'Ford Beebe',
  'T. Hee',
  'Norman Ferguson',
  'Wilfred Jackson'],
 'Produced by': ['Walt Disney', 'Ben Sharpsteen'],
 'Story by': ['Joe Grant', 'Dick Huemer'],
 'Starring': ['Leopold Stokowski', 'Deems Taylor'],
 'Narrated by': 'Deems Taylor',
 'Music by': 'See program',
 'Cinematography': 'James Wong Howe',
 'Production company': 'Walt Disney Productions',
 'Distributed by': ['Walt Disney Productions', 'RKO Radio Pictures'],
 'Release date': ['November13,1940'],
 'Running time': '126 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$2.28 million',
 'Box office': '$76.4–$83.3 million',
 'Running time (int)': '126',
 'Budget Value ($)': 2280000.0,
 'Box Office ($)': 83300000.0,
 'Relase date (datetime)': datetime.datetime(1940, 11, 13, 0, 0),
 'imdb': '7.7',
 'metasco

In [52]:
save_data_pickle("disney_movie_data_final", movie_info_list)

## Save data as JSON and CSV

In [53]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [57]:
for movie in movie_info_copy:
    current_date = movie["Relase date (datetime)"]
    if current_date:
        movie["Relase date (datetime)"] = current_date.strftime("%B %d, %Y")
    else:
        movie["Relase date (datetime)"] = None

In [61]:
save_data_pickle("disney_date_final.json", movie_info_copy)

### Convert data to CSV

In [62]:
df = pd.DataFrame(movie_info_list)

In [64]:
df.to_csv("disney_movie_final.csv")