Trying to follow a tutorial from:
https://www.youtube.com/watch?v=Ewgy-G9cmbg

Gonna attempt to do the tasks myself first before watching the video

# Import python library

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Task1: Scrape Toy story 3 summary table to a dictionary

Let's start from Toy story 3

In [2]:
# define the link
my_url = 'https://en.wikipedia.org/wiki/Toy_Story_3'
r = requests.get(my_url)

# call the soup function to make the html readable
page_soup = bs(r.content)
info_box = page_soup.find("table",{"class":"infobox vevent"})

In [3]:
info_rows = info_box.find_all("tr")

def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

movie_info = {}
    
for index, row in enumerate(info_rows):

    # For title row, movie info is in the title
    if index == 0:
        movie_info['title'] = row.find('th').get_text()
    elif index == 1:   # can skip this second row since this is the picture of the movie
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
    
        movie_info[content_key] = content_value



        
movie_info
        

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Produced by': 'Darla K. Anderson',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Blake Clark',
  'Jeff Pidgeon',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Music by': 'Randy Newman',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Production company': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

In [4]:
movie_info['title']

'Toy Story 3'

# Task2: get info box for all movies

In [5]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all(["sup","span"]):
        tag.decompose()
        
def get_info_box(url):

    r = requests.get(url)
    soup = bs(r.content)
    
    info_box = soup.find("table",{"class":"infobox vevent"})
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)
    
    movie_info = {}

    for index, row in enumerate(info_rows):

        # For title row, movie info is in the title
        if index == 0:
            movie_info['title'] = row.find('th').get_text()
        else:
            header = row.find("th")
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))

                movie_info[content_key] = content_value
    
    return movie_info


In [6]:
get_info_box("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")

{'title': 'One Little Indian',
 'Directed by': 'Bernard McEveety',
 'Produced by': 'Winston Hibler',
 'Written by': 'Harry Spalding',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  'Morgan Woodward',
  'Jodie Foster'],
 'Music by': 'Jerry Goldsmith',
 'Cinematography': 'Charles F. Wheeler',
 'Edited by': 'Robert Stafford',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$2 million'}

In [7]:
# define the link
my_url = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
r = requests.get(my_url)

# call the soup function to make the html readable
page_soup = bs(r.content)

Notice that all the movie names in italic. Lets try to grab just the italic inside the table

In [8]:
movies = page_soup.select('.wikitable.sortable i a')
movies[0:10]

[<a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">Academy Award Review of Walt Disney Cartoons</a>,
 <a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a>,
 <a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a>,
 <a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a>,
 <a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a>,
 <a href="/wiki/Dumbo" title="Dumbo">Dumbo</a>,
 <a href="/wiki/Bambi" title="Bambi">Bambi</a>,
 <a href="/wiki/Saludos_Amigos" title="Saludos Amigos">Saludos Amigos</a>,
 <a href="/wiki/Victory_Through_Air_Power_(film)" title="Victory Through Air Power (film)">Victory Through Air Power</a>,
 <a href="/wiki/The_Three_Caballeros" title="The Three Caballeros">The Three Caballeros</a>]

In [9]:
print(len(movies))
movie_info_list = []
base_path = 'https://en.wikipedia.org/'

for index, movie in enumerate(movies):
    if index % 10 ==0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
    
    except Exception as e:
        print(movie.get_text())
        print(e)

439
0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430


## Save and reload data

In [10]:
import json

def save_data(title,data):
    with open(title, 'w', encoding ='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent = 2)

In [11]:
import json

def load_data(title):
    with open(title, encoding ='utf-8') as f:
        return json.load(f)

In [12]:
save_data("disney_data_cleaned.json",movie_info_list)

# Task3: Clean data

In [13]:
movie_info_list = load_data("disney_data_cleaned.json")

Subtasks:
* Clean up references [1]
* convert running time into an integer
* convert dates into datetime boject
* split up long strings
* convert budget and box office to numbers

Clean up references and split up long strings already done above

## Convert running time

In [14]:
# for movie in movie_info_list[0:20]:
#     movie['Running time'] = movie['Running time'].split("min", 1)[0]


### Solution

In [15]:
[movie.get('Running time', 'N/A') for movie in movie_info_list]

['41 minutes (74 minutes 1966 release)',
 '83 minutes',
 '88 minutes',
 '126 minutes',
 '74 minutes',
 '64 minutes',
 '70 minutes',
 '42 minutes',
 '65 min.',
 '71 minutes',
 '75 minutes',
 '94 minutes',
 '73 minutes',
 '75 minutes',
 '82 minutes',
 '68 minutes',
 '74 minutes',
 '96 minutes',
 '75 minutes',
 '84 minutes',
 '77 minutes',
 '92 minutes',
 '69 minutes',
 '81 minutes',
 ['60 minutes (VHS version)', '71 minutes (original)'],
 '127 minutes',
 '92 minutes',
 '76 minutes',
 '75 minutes',
 '73 minutes',
 '85 minutes',
 '81 minutes',
 '70 minutes',
 '90 min.',
 '80 minutes',
 '75 minutes',
 '83 minutes',
 '83 minutes',
 '72 minutes',
 '97 minutes',
 '75 minutes',
 '104 minutes',
 '93 minutes',
 '105 minutes',
 '95 minutes',
 '97 minutes',
 '134 minutes',
 '69 minutes',
 '92 minutes',
 '126 minutes',
 '79 minutes',
 '97 minutes',
 '128 minutes',
 '74 minutes',
 '91 minutes',
 '105 minutes',
 '98 minutes',
 '130 minutes',
 '89 min.',
 '93 minutes',
 '67 minutes',
 '98 minutes',
 '1

In [16]:
"85 minutes"

def minute_to_integer(running_time):
    if running_time == 'N/A':
        return None
    if isinstance(running_time, list):
        entry = running_time[0]
        return int(entry.split(" ")[0])
    else:
        return int(running_time.split(" ")[0])
    
print(minute_to_integer(["85 minutes", "90 minutes"]))

85


In [17]:
for movie in movie_info_list:
    movie['Running time (int)'] = minute_to_integer(movie.get('Running time', "N/A"))

In [18]:
movie_info_list[-10]

{'title': 'Black Is King',
 'Directed by': ['Beyoncé Knowles-Carter',
  'Kwasi Fordjour',
  'Emmanuel Adjei',
  'Blitz Bazawule',
  'Ibra Ake',
  'Jenn Nkiru',
  'Jake Nava',
  'Pierre Debusschere',
  'Dikayl Rimmasch'],
 'Produced by': ['Jeremy Sullivan',
  'Jimi Adesanya',
  'Blitz Bazawule',
  'Ben Cooper',
  'Astrid Edwards',
  'Durwin Julies',
  'Yoli Mes',
  'Dafe Oboro',
  'Akin Omotoso',
  'Will Whitney',
  'Lauren Baker',
  'Jason Baum',
  'Alex Chamberlain',
  'Robert Day',
  'Christophe Faubert',
  'Brien Justiniano',
  'Rethabile Molatela Mothobi',
  'Sylvia Zakhary',
  'Nathan Scherrer',
  'Erinn Williams'],
 'Written by': ['Beyoncé Knowles-Carter',
  'Yrsa Daley-Ward',
  'Clover Hope',
  'Andrew Morrow'],
 'Based on': ['The Lion King: The Gift'],
 'Starring': ['Beyoncé',
  'Folajomi Akinmurele',
  'Connie Chiume',
  'Nyaniso Ntsikelelo Dzedze',
  'Nandi Madida',
  'Warren Masemola',
  'Sibusiso Mbeje',
  'Fumi Odede',
  'Stephen Ojo',
  'Mary Twala'],
 'Music by': ['James

## Convert budget and box movies to numbers

In [19]:
movie_info_list[-50]

{'title': 'The Jungle Book',
 'Directed by': 'Jon Favreau',
 'Produced by': ['Jon Favreau', 'Brigham Taylor'],
 'Written by': 'Justin Marks',
 'Based on': ['The Jungle Book', 'by', 'Rudyard Kipling'],
 'Starring': ['Bill Murray',
  'Ben Kingsley',
  'Idris Elba',
  "Lupita Nyong'o",
  'Scarlett Johansson',
  'Giancarlo Esposito',
  'Christopher Walken',
  'Neel Sethi'],
 'Music by': 'John Debney',
 'Cinematography': 'Bill Pope',
 'Edited by': 'Mark Livolsi',
 'Production company': ['Walt Disney Pictures', 'Fairview Entertainment'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['April 4, 2016 ( El Capitan Theatre )',
  'April 15, 2016 (United States)'],
 'Running time': '106 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$175–177 million',
 'Box office': '$966.6 million',
 'Running time (int)': 106}

In [20]:
[movie.get('Budget', 'N/A') for movie in movie_info_list]

['N/A',
 '$1.49 million',
 '$2.6 million',
 '$2.28 million',
 '$600,000',
 '$950,000',
 '$858,000',
 'N/A',
 '$788,000',
 'N/A',
 '$1.35 million',
 '$2.125 million',
 'N/A',
 '$1.5 million',
 '$1.5 million',
 'N/A',
 '$2.9 million',
 '$1,800,000',
 '$3 million',
 'N/A',
 '$4 million',
 '$2 million',
 '$300,000',
 '$1.8 million',
 'N/A',
 '$5 million',
 'N/A',
 '$4 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$700,000',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$6 million',
 'under $1 million or $1,250,000',
 'N/A',
 '$2 million',
 'N/A',
 'N/A',
 '$2.5 million',
 'N/A',
 'N/A',
 '$4 million',
 '$3.6 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$3 million',
 'N/A',
 '$3 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$3 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$4.4–6 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$4 million',
 'N/A',
 '$5 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '

In [21]:
import re

# this is the regex syntax. important to learn
amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"

def word_to_value(word):
    value_dict ={"thousand":1000, "million":1000000, "billion":1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number,string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags = re.I).group().lower()
    word_value = word_to_value(word)
    return value * word_value
    
def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

def money_conversion(money):
    
    if money == "N/A":
        return None
    
    if isinstance(money,list):
        money = money[0]

    word_syntax = re.search(word_re,money, flags = re.I)
    value_syntax = re.search(value_re,money)
    
    if word_syntax:
        return parse_word_syntax(word_syntax.group()) 
        
    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    
    else:
        return None
        

In [22]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

# Task4: Convert dates into datetime object

In [None]:
# June 28, 1950

In [25]:
from datetime import datetime

In [30]:
dates = [movie.get('Release date','N/A') for movie in movie_info_list ]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
    
    if date == "N/A":
        return None
    
    date_str = clean_date(date)
    
    fmts = ["%B %d, %Y", "%d %B %Y" ]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str,fmt)
        except:
            pass
    
    return None

for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date','N/A'))

In [31]:
movie_info_list[-40]

{'title': 'Beauty and the Beast',
 'Directed by': 'Bill Condon',
 'Produced by': ['David Hoberman', 'Todd Lieberman'],
 'Screenplay by': ['Stephen Chbosky', 'Evan Spiliotopoulos'],
 'Based on': ["Disney 's Beauty and the Beast by Linda Woolverton",
  'Beauty and the Beast by Jeanne-Marie Leprince de Beaumont'],
 'Starring': ['Emma Watson',
  'Dan Stevens',
  'Luke Evans',
  'Kevin Kline',
  'Josh Gad',
  'Ewan McGregor',
  'Stanley Tucci',
  'Audra McDonald',
  'Gugu Mbatha-Raw',
  'Ian McKellen',
  'Emma Thompson'],
 'Music by': 'Alan Menken',
 'Cinematography': 'Tobias A. Schliessler',
 'Edited by': 'Virginia Katz',
 'Production company': ['Walt Disney Pictures', 'Mandeville Films'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['February 23, 2017 ( Spencer House )',
  'March 17, 2017 (United States)'],
 'Running time': '129 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$160–255 million',
 'Box office': '$1.264 billion',

# Task 5 saving again

In [35]:
import pickle

def save_data_pickle(name, data):
    with open(name,'wb') as f:
        pickle.dump(data, f)


In [36]:
import pickle

def load_data_pickle(name):
    with open(name,'rb') as f:
        return pickle.load(f)

In [37]:
save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)

In [38]:
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [39]:
a[5]

{'title': 'Dumbo',
 'Directed by': ['Supervising director:',
  'Ben Sharpsteen',
  'Sequence directors:',
  'Norman Ferguson',
  'Wilfred Jackson',
  'Bill Roberts',
  'Jack Kinney',
  'Samuel Armstrong'],
 'Produced by': 'Walt Disney',
 'Story by': ['Otto Englander', 'Joe Grant', 'Dick Huemer'],
 'Based on': ['Dumbo, the Flying Elephant',
  'by',
  'Helen Aberson',
  'Harold Pearl'],
 'Starring': ['Edward Brophy',
  'Herman Bing',
  'Margaret Wright',
  'Sterling Holloway',
  'Verna Felton',
  'Cliff Edwards',
  'James Baskett',
  'Nick Stewart',
  'Hall Johnson',
  'Jim Carmichael'],
 'Narrated by': 'John McLeish',
 'Music by': ['Frank Churchill', 'Oliver Wallace'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['October 23, 1941 (New York City)',
  'October 31, 1941 (U.S.)'],
 'Running time': '64 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$950,000',
 'Box office': '$1.3 million (est. United

# Task 6 attach IMDB/Rotten tomatoes socre

In [40]:
movie_info_list = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [42]:
yourkey = '86268b8b'

In [None]:
/?apikey=[yourkey]&

In [58]:
import requests
import urllib
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": yourkey, 't':title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    #print(full_url)
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None



        

In [59]:
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get('imdbRating',None)
    movie['metascore'] = omdb_info.get('Metascore',None)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [60]:
movie_info_list[138]

{'title': 'The North Avenue Irregulars',
 'Directed by': 'Bruce Bilson',
 'Produced by': 'Ron Miller',
 'Written by': 'Don Tait',
 'Starring': ['Edward Herrmann',
  'Barbara Harris',
  'Susan Clark',
  'Karen Valentine',
  'Michael Constantine',
  'Cloris Leachman',
  'Steve Franken',
  'Patsy Kelly',
  'Douglas Fowley',
  'Virginia Capers',
  'Melora Hardin',
  'Alan Hale, Jr.',
  'Ruth Buzzi'],
 'Music by': ['Richard Bowden', 'Robert F. Brunner'],
 'Cinematography': 'Leonard J. South',
 'Edited by': 'Gordon D. Brenner',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'February 9, 1979',
 'Running time': '100 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': 100,
 'Budget (float)': None,
 'Box office (float)': None,
 'Release date (datetime)': datetime.datetime(1979, 2, 9, 0, 0),
 'imdb': '6.8',
 'metascore': 'N/A',
 'rotten_tomatoes': '78%'}

# Save

In [61]:
save_data_pickle('disney_movie_data_final_pickle', movie_info_list)

In [62]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [63]:
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [64]:
save_data('disney_data_final.json', movie_info_copy)

In [65]:
import pandas as pd
df = pd.DataFrame(movie_info_list)


In [66]:
df.head()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Cinematography,Edited by,Screenplay by,Production companies,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45.472,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre , ...",83 minutes,United States,English,$418 million,83.0,1490000.0,418000000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,164000000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million,126.0,2280000.0,83300000.0,...,James Wong Howe,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 20, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,960000.0,...,Bert Giennon,Paul Weatherwax,,,,,,,,


In [67]:
df.to_csv('disney_movie_data_final.csv')