# Disney Dataset Creation

#### Scrape and clean a list of Disney Wikipedia pages to create a dataset for further analysis

## Info Box for One Movie

Import necessary libraries

In [None]:
from bs4 import BeautifulSoup as bs
import json
import requests
import re
import pickle
import urllib
import os
from datetime import datetime
import pandas as pd

Load the webpage

In [None]:
# Download one film article (Toy Story 3) to use as the worked example for
# exploring how an infobox is laid out.
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to a beautiful soup object. The parser is named explicitly so the
# parse tree does not depend on which optional parsers happen to be installed
# and bs4 does not warn that no parser was specified.
soup = bs(r.content, "html.parser")

# Print out the HTML
contents = soup.prettify()
print(contents)


In [None]:
# Grab the infobox vevent: the first element in the parsed page whose class
# attribute is "infobox vevent".
info_box = soup.find(class_="infobox vevent")
# print(info_box.prettify())

# Print every table row of the infobox to inspect how the header and value
# cells are arranged.
info_rows = info_box.find_all("tr")
for row in info_rows:
    print(row.prettify())

## Info box for all Disney movies

In [None]:
# Download the article that lists the Walt Disney Pictures films.
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# Convert to a beautiful soup object. The parser is named explicitly so the
# parse tree does not depend on which optional parsers happen to be installed
# and bs4 does not warn that no parser was specified.
soup = bs(r.content, "html.parser")

# Print out the HTML
contents = soup.prettify()
print(contents)


In [None]:
# movies = soup.find_all(class_="wikitable")
# CSS selector: every <i> element nested inside an element that carries both
# the "wikitable" and "sortable" classes.
movies = soup.select(".wikitable.sortable i ")


### Functions

Get the content

In [None]:
def get_content_value(row_data):
    """Extract the text of one infobox value cell.

    Args:
        row_data: The parsed cell element (the caller passes a row's <td>).

    Returns:
        A list of strings when the cell contains <li> items (one entry per
        item) or contains a <br> or <a> tag (one entry per non-empty text
        fragment); otherwise a single string with the cell's text.
        Non-breaking spaces are replaced by ordinary spaces in every case.
    """
    # Check if there is a list
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]

    if row_data.find("br") or row_data.find("a"):
        # Normalise non-breaking spaces here too, exactly like the other two
        # branches, so later code that splits on " " sees the same kind of
        # text regardless of how the cell was marked up.
        return [text.replace("\xa0", " ") for text in row_data.stripped_strings]

    return row_data.get_text(" ", strip=True).replace("\xa0", " ")
    

Strip out references

In [None]:
def clean_tags(soup):
    """Delete every <sup> and <span> element under ``soup``, in place.

    Used to strip references out of a page before its text is read.
    Nothing is returned; the tree passed in is modified.
    """
    unwanted = soup.find_all(["sup", "span"])
    for element in unwanted:
        element.decompose()

Get the info box

In [None]:
def get_info_box(url, timeout=30):
    """Scrape the film infobox of a Wikipedia article into a dict.

    Args:
        url: Address of the article to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with a "title" entry taken from the header cell of the first
        infobox row, plus one entry per later row that has both a header
        (<th>) and a value (<td>) cell. Keys are the header texts; values are
        whatever ``get_content_value`` returns for the value cell. The second
        row is always skipped.

    Raises:
        ValueError: If the page has no element with class "infobox vevent".
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail with the real HTTP error instead of trying to parse an error page.
    r.raise_for_status()

    # Convert to a beautiful soup object; naming the parser keeps the result
    # independent of which optional parsers are installed.
    soup = bs(r.content, "html.parser")

    info_box = soup.find(class_="infobox vevent")
    if info_box is None:
        # Say what went wrong instead of failing later with an AttributeError
        # on None.
        raise ValueError(f"no film infobox found at {url}")

    # clean out the references (only the infobox is read afterwards, so only
    # the infobox needs cleaning)
    clean_tags(info_box)

    movie_info = {}

    # get_text(" ", strip=True) removes surrounding whitespace
    for index, row in enumerate(info_box.find_all("tr")):
        if index == 0:
            movie_info["title"] = row.find("th").get_text(" ", strip=True)
            continue
        if index == 1:
            continue

        header = row.find("th")
        value_cell = row.find("td")
        # A row needs both cells to form a key/value pair. Skipping an
        # incomplete row keeps the rest of the movie instead of failing the
        # whole page on it.
        if header is None or value_cell is None:
            continue

        content_key = header.get_text(" ", strip=True)
        movie_info[content_key] = get_content_value(value_cell)

    return movie_info

In [None]:
from urllib.parse import urljoin

r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
# Convert to a beautiful soup object (parser named explicitly so the result
# does not depend on which optional parsers are installed)
soup = bs(r.content, "html.parser")
# Every link inside an italic element of a sortable wikitable.
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"
movie_info_list = []

for index, movie in enumerate(movies):
    # Progress indicator: print the running index every 100 links.
    if index % 100 == 0:
        print(index)

    try:
        relative_path = movie["href"]
        # urljoin resolves the href against the site root, so a leading "/"
        # in the href does not produce "https://en.wikipedia.org//wiki/...".
        full_path = urljoin(base_path, relative_path)

        movie_info_list.append(get_info_box(full_path))

    except Exception as e:
        # Best effort: report the link that failed and carry on with the rest.
        print(movie.get_text())
        print(e)
      

## Save/Reload Movie Data

Save Data function

In [None]:
def save_data(title, data):
    """Write ``data`` to the file ``title`` as UTF-8, 2-space-indented JSON.

    Args:
        title: Path of the file to write.
        data: Any JSON-serializable object.

    Raises:
        TypeError: If ``data`` contains a value json cannot serialize. The
            target file is left untouched in that case.
    """
    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization error raised afterwards would leave an empty or
    # half-written file in place of whatever was saved earlier.
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(title, "w", encoding="utf-8") as f:
        f.write(serialized)
    

Load Data function

In [None]:
def load_data(title):
    """Read the UTF-8 JSON file ``title`` and return the decoded object."""
    with open(title, encoding="utf-8") as source:
        raw_text = source.read()
    return json.loads(raw_text)

Save the data

In [None]:
# Write the scraped movie records to disk as JSON.
save_data("disney_movie_data_cleaned.json", movie_info_list)

Load Data

In [None]:
# Reload the saved records, replacing whatever movie_info_list held before.
movie_info_list = load_data("disney_movie_data_cleaned.json")

## Convert running time field to an integer

In [None]:
# Preview the raw "Running time" value of every movie ("N/A" where missing).
[movie.get("Running time", "N/A") for movie in movie_info_list]

In [None]:
def minutes_to_integer(running_time):
    """Return the leading number of a running-time value as an int.

    Args:
        running_time: A string such as "85 minutes", a list of such strings
            (only the first entry is used), or the placeholder "N/A".

    Returns:
        The first run of digits found in the text as an int, or None when the
        value is "N/A", None, an empty list, or contains no digits.
    """
    if running_time is None or running_time == "N/A":
        return None

    # if running time is a list, only its first entry is considered
    if isinstance(running_time, list):
        if not running_time:
            return None
        running_time = running_time[0]

    # Search for digits rather than splitting on " ": this also copes with a
    # non-breaking space after the number and with text before it.
    match = re.search(r"\d+", str(running_time))
    return int(match.group()) if match else None

# Store the parsed running time under its own key, next to the raw value;
# movies without a "Running time" entry are passed the "N/A" placeholder.
for movie in movie_info_list:
    movie["Running time (int)"] = minutes_to_integer(movie.get("Running time", "N/A"))


## Convert "Budget" & "Box Office" fields to an integer

In [None]:
# Preview the raw "Budget" value of every movie ("N/A" where missing).
[movie.get("Budget", "N/A") for movie in movie_info_list]

In [None]:

# any number: digits with optional ",ddd" thousands groups and an optional
# decimal part. The leading run is \d+ (not \d{1,3}) so an unformatted value
# such as "2000" is matched whole instead of being cut off after three digits.
number = r"\d+(?:,\d{3})*(?:\.\d+)?"
# magnitude words that may follow a number
amounts = r"thousand|million|billion"
# plain dollar value, e.g. "$790,000"
value_re = rf"\${number}"
# dollar value, or a range of values, followed by a magnitude word, e.g.
# "$200 million", "$70-80 million", "$70 to 80 million". The range separator
# may be a hyphen, an en dash (\u2013) or an em dash (\u2014).
word_re = rf"\${number}(-|\sto\s|\u2013|\u2014)?({number})?\s({amounts})"


def word_to_value(word):
    """Return the multiplier for "thousand", "million" or "billion".

    The lookup ignores case. Raises KeyError for any other word.
    """
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word.lower()]


def parse_word_syntax(string):
    """Convert text like "$200 million" to a float (200000000.0).

    The first number found in ``string`` is multiplied by the first magnitude
    word found; for a range this means the lower bound is used. ``string``
    must contain both (it is meant to be a ``word_re`` match).
    """
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.IGNORECASE).group()
    return value * word_to_value(word)


def parse_value_syntax(string):
    """Convert text like "$790,000" to a float (790000.0).

    ``string`` must contain a number (it is meant to be a ``value_re`` match).
    """
    value_string = re.search(number, string).group()
    return float(value_string.replace(",", ""))


def budget_conversion(money):
    """Convert an infobox money value to a float number of dollars.

    Args:
        money: A string, a list of strings (only the first entry is used), or
            the placeholder "N/A".

    Returns:
        The amount as a float, or None when the value is "N/A", None, an
        empty list, or contains no recognisable dollar amount.
    """
    if money is None or money == "N/A":
        return None

    if isinstance(money, list):
        if not money:
            return None
        money = money[0]

    # Try the "number + magnitude word" form first: the plain form would also
    # match its leading "$200" and drop the multiplier.
    word_syntax = re.search(word_re, money, flags=re.IGNORECASE)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None

In [None]:
# Store float versions of "Budget" and "Box office" under their own keys;
# a missing field is passed to the converter as the "N/A" placeholder.
for movie in movie_info_list:
    movie["Budget (float)"] = budget_conversion(movie.get("Budget", "N/A"))
    movie["Box office (float)"] = budget_conversion(movie.get("Box office", "N/A"))

## Convert dates into datetime objects

In [None]:
# Preview the raw release date of every movie: "Release date" when present,
# otherwise "Release dates" (None when neither key exists).
[movie.get('Release date', movie.get('Release dates')) for movie in movie_info_list]

In [None]:
def clean_date(date):
    """Return the part of ``date`` before the first "(", stripped.

    None is turned into an empty string.
    """
    if date is None:
        return ""
    return date.split("(")[0].strip()


def date_conversion(date):
    """Parse a release-date value into a ``datetime``.

    Args:
        date: A string such as "June 18, 2010 (United States)", a list of
            such strings (only the first entry is used), or None.

    Returns:
        The parsed ``datetime``, or None when the value is None, an empty
        list, or matches neither "%B %d, %Y" nor "%d %B %Y".
    """
    if isinstance(date, list):
        date = date[0] if date else None

    date_str = clean_date(date)

    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # strptime signals "text does not match this format" with
            # ValueError; anything else is a real error and should surface.
            continue
    return None
    


In [None]:
# Store the parsed release date under its own key, reading "Release date"
# when present and "Release dates" otherwise.
for movie in movie_info_list:
    movie["Release date (datetime)"] = date_conversion(movie.get('Release date', movie.get('Release dates')))


## Save data using pickle

Save Data function

In [None]:
def save_data_pickle(name, data):
    """Pickle ``data`` into the binary file ``name``, overwriting it."""
    with open(name, 'wb') as sink:
        pickler = pickle.Pickler(sink)
        pickler.dump(data)

Load Data function

In [None]:
def load_data_pickle(name):
    """Return the object stored in the pickle file ``name``.

    Unpickling can execute arbitrary code, so only load files you created
    yourself.
    """
    with open(name, "rb") as source:
        unpickler = pickle.Unpickler(source)
        return unpickler.load()

In [None]:
# Pickle the records; unlike the JSON save, this keeps the datetime objects
# stored under "Release date (datetime)" as datetimes.
save_data_pickle("disney_movie_data_cleaned.pickle", movie_info_list)

## Attach IMDB/Rotten Tomatoes/Metascore Scores

In [None]:
# Reload the pickled records, replacing whatever movie_info_list held before.
movie_info_list = load_data_pickle("disney_movie_data_cleaned.pickle")

In [None]:
def get_omdb_info(title, timeout=30):
    """Look a movie up on OMDb by title and return the decoded JSON reply.

    Args:
        title: Movie title, sent as the "t" query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded from JSON.

    Raises:
        KeyError: If the OMDB_API_KEY environment variable is not set.
        requests.RequestException: If the request fails or times out.
    """
    # HTTPS so the API key is not sent in clear text.
    base_url = "https://www.omdbapi.com/"
    parameters = {"apikey": os.environ['OMDB_API_KEY'], "t": title}
    # requests URL-encodes the query itself; this avoids reaching into
    # urllib.parse, which a bare "import urllib" does not guarantee is loaded.
    response = requests.get(base_url, params=parameters, timeout=timeout)
    return response.json()
    
def get_rotten_tomato_score(omdb_info):
    """Return the "Value" of the first "Rotten Tomatoes" rating, else None.

    Ratings are read from the "Ratings" list of ``omdb_info``; a missing
    list is treated as empty.
    """
    matching_values = (
        entry["Value"]
        for entry in omdb_info.get("Ratings", [])
        if entry["Source"] == "Rotten Tomatoes"
    )
    return next(matching_values, None)

In [None]:
# Look every movie up on OMDb by its title and copy the scores onto the
# record. Each score is None when the response does not contain it.
for movie in movie_info_list:
    title = movie["title"]
    omdb_info = get_omdb_info(title)
    movie["imdb"] = omdb_info.get("imdbRating", None)
    movie["metascore"] = omdb_info.get("Metascore", None)
    movie["rotten_tomatoes"] = get_rotten_tomato_score(omdb_info)

In [None]:
# Pickle the records again now that the scores have been attached.
save_data_pickle("disney_movie_data_final.pickle", movie_info_list)

## Save data as JSON & CSV

### Solve the datetime issue

In [None]:
# Shallow-copy every record so that replacing a key in the copy leaves the
# corresponding dict in movie_info_list unchanged.
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [None]:
# Swap each datetime for its "%B %d %Y" text form; records without a date
# keep None.
for movie in movie_info_copy:
    current_date = movie["Release date (datetime)"]
    movie["Release date (datetime)"] = (
        current_date.strftime("%B %d %Y") if current_date else None
    )

In [None]:
# Write the copies (dates already converted to strings) as JSON.
save_data("disney_movie_data_final.json", movie_info_copy)

### Convert data to CSV

In [None]:
# Build a DataFrame from the list of record dicts.
df = pd.DataFrame(movie_info_copy)

In [None]:
# Write the table as CSV. NOTE(review): index=False is not passed, so the
# DataFrame index is written as an extra first column — confirm that is wanted.
df.to_csv("disney_movie_data_final.csv")