# Disney Dataset Creation (w/ BeautifulSoup)

### Link: https://www.youtube.com/watch?v=Ewgy-G9cmbg&list=PLGF0tR95DzSftyT1wDhnT94Ysi1_s1biV&index=30&t=8s
#### Scrape and clean a list of disney wikipedia pages to creat a dataset to further analyze

##### Task 1: Get Info Box (store in Python dictionary)

### Import Necessary Libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re

### Load the webpage

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out HTML
contents = soup.prettify()


for s in soup.select('sup'):
    s.extract()

In [3]:
info_box = soup.find("table", attrs={"class": "infobox vevent"})
info_rows = info_box.find_all("tr")

In [5]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    if row_data.find("sup"):
        for sup in row_Data.find("sup"):
            sup.decompose()
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")


movie_info = {}
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    
    elif index == 1:
        continue
    
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value


for sup in info_box.find_all("sup"):
    sup.decompose()

##### Task 2: Get Info Box for all movies

In [6]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out HTML
contents = soup.prettify()

In [7]:
movies = soup.select(".wikitable.sortable i")
movies[0].a['href']

'/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons'

In [8]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return[text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()    
    
def get_info_box(url):
    
    r = requests.get(url)
    soup = bs(r.content)  
    info_box = soup.find("table", attrs={"class": "infobox vevent"})
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)
    
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)

        else:
            header = row.find("th")
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
    
    return movie_info

In [11]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):    
    try:
        relative_path = movie['href']
        title = movie['title']
        full_path = base_path + relative_path
        
        movie_info_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print(e)

Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
True-Life Adventures
'NoneType' object has no attribute 'find_all'
Luca
'NoneType' object has no attribute 'find_all'


##### Save/Reload Movie Data

In [12]:
import json

In [13]:
# Saves the move_info_list dictionnary as a json file

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [14]:
# Reload the movie data

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [15]:
save_data("disney_data.json", movie_info_list)

#### Task 3: Clean your data!

In [16]:
movie_info_list = load_data("disney_data.json")

#### Subtasks
- Clean up references: for example [1]
- Convert running time into an integer
- Convert dates into datetime object
- Split up the long strings
- Convert Budget & Box office numbers

In [17]:
## Clean up references: Done
# => Look at the def clean_tags() function

In [18]:
## Split up the long strings: Done

In [22]:
## Convert running time into an integer

def minutes_to_integer(running_time):
    if running_time == "N/A":
        return None
    
    if isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    
    else:
        return int(running_time.split(" ")[0])


for movie in movie_info_list:
    movie["Running time (int)"] = minutes_to_integer(movie.get("Running time", "N/A"))
    

In [35]:
# Convert Budget & Box office numbers

import re

number = r"\d+(,\d{3})*\.*\d*"
amounts = r"thousand|million|billion"

word_re = rf"\${number}(-|\sto\s)?({number})?\s({amounts})"
value_re = rf"\${number}"


def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]


def parse_word_syntax(string):
    value = float((re.search(number, string).group()).replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value


def parse_value_syntax(string):
    value = float((re.search(number, string).group()).replace(",", ""))
    return value


def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]

    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    
    else:
        return None

In [36]:
for movie in movie_info_list:
    movie["Budget (float)"] = money_conversion(movie.get("Budget", "N/A"))
    movie["Box office (float)"] = money_conversion(movie.get("Box office", "N/A"))

In [None]:
# Convert Dates into datetimes
