In [1]:
import pandas as pd

import requests
from bs4 import BeautifulSoup

# STEP 1 - Collecting the Data

## Scrape 1:  Table data from Top Lifetime WorldWide Grossing Movies

In [2]:
# There are 5 pages of data, lets focus on the first one

box_office_mojo_url = 'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?area=XWW'
response = requests.get(box_office_mojo_url)
response.status_code

200

In [3]:
page = response.text
soup = BeautifulSoup(page, "html5lib")

In [4]:
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo">
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta charset="utf-8"/>
  <title dir="ltr">
   Top Lifetime Grosses - Box Office Mojo
  </title>
  <meta content="Top Lifetime Grosses" name="title"/>
  <meta content="Box Office Mojo" property="og:site_name"/>
  <meta content="https://m.media-amazon.com/images/G/01/boxofficemojo/logo/mojo-logo-bg.png" property="og:image"/>
  <meta content="telephone=no" name="format-detection"/>
  <link href="https://m.media-amazon.com/images/G/01/boxofficemojo/v2/favicon._CB448965889_.ico" rel="icon" type="image/x-icon"/>
  <link href="https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?area=XWW" rel="canonical"/>
  <link href="https://images-na.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41DAFIecsVL.css,51IB+wfP8qL.css,01ZfXnjPmmL.css,01oD

In [5]:
table = soup.find('table')
table

<table class="a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated"><tbody><tr><th class="a-text-right mojo-field-type-rank a-nowrap"><span title="Rank">Rank</span>
            </th><th class="a-text-left mojo-field-type-title a-nowrap"><span title="Title">Title</span>
            </th><th class="a-text-right mojo-field-type-money a-nowrap"><span title="Worldwide Lifetime Gross">Worldwide Lifetime Gross</span>
            </th><th class="a-text-right mojo-field-type-money a-nowrap"><span title="Domestic Lifetime Gross">Domestic Lifetime Gross</span>
            </th><th class="a-text-right mojo-field-type-percent a-nowrap"><span title="Domestic %">Domestic %</span>
            </th><th class="a-text-right mojo-field-type-money a-nowrap"><span title="Foreign Lifetime Gross">Foreign Lifetime Gross</span>
            </th><th class="a-text-right mojo-field-type-percent a-nowrap"><span title="Foreign %">Foreign %</span>
            </th><th class="a-tex

In [6]:
rows = [row for row in table.find_all('tr')]

In [13]:
# Create helper functions to aid in converting common features..
#..and obtaining values on main tables page

def money_to_int(money_string):
    try:    
        money_string = money_string.replace('$','').replace(',','')
        return int(money_string)

    except:
        return None
    
def theaters_to_int(theater_string):
    try:
        theater_string = theater_string.replace(',','')
        return(int(theater_string))
    
    except:
        return None

def get_ww_gross(element):
    try:
        raw_worldwide_gross = element.find_all(class_='a-text-right mojo-field-type-money')[0].text
        return raw_worldwide_gross
    
    except:
        return None

In [14]:
## Create function to iterate elements in each row of table 

movie_table_dict_list = []
def get_table_movie_info(element):
 
    movie_dict= {}
    headers = ['movie_title', 'link_stub','worldwide_gross']
    
    movie_title = element.find(class_='a-text-left mojo-field-type-title').next.text

    
    link_stub = element.find('a')['href']
   
    
    raw_worldwide_gross = get_ww_gross(element)
    worldwide_gross = money_to_int(raw_worldwide_gross)
   
       
       
    movie_table_dict = dict(zip(headers, [movie_title, link_stub, worldwide_gross]))
    
   
    
     
    return movie_table_dict_list.append(movie_table_dict)
    
    
    
    

In [15]:
# Create list of dictionaries containing each movie and its features from the table
# Experiment on 1 movie first
# ***Movies start at the 2nd row of the table
for element in rows[1:2]:
        get_table_movie_info(element)

movie_table_dict_list

[{'movie_title': 'Avatar',
  'link_stub': '/title/tt0499549/?ref_=bo_cso_table_1',
  'worldwide_gross': 2847246203}]

In [16]:
movie_table_dict_list.clear()

In [17]:
# Experiment on a few more, just to make sure
for element in rows[1:7]:
        get_table_movie_info(element)

movie_table_dict_list

[{'movie_title': 'Avatar',
  'link_stub': '/title/tt0499549/?ref_=bo_cso_table_1',
  'worldwide_gross': 2847246203},
 {'movie_title': 'Avengers: Endgame',
  'link_stub': '/title/tt4154796/?ref_=bo_cso_table_2',
  'worldwide_gross': 2797501328},
 {'movie_title': 'Titanic',
  'link_stub': '/title/tt0120338/?ref_=bo_cso_table_3',
  'worldwide_gross': 2201647264},
 {'movie_title': 'Star Wars: Episode VII - The Force Awakens',
  'link_stub': '/title/tt2488496/?ref_=bo_cso_table_4',
  'worldwide_gross': 2069521700},
 {'movie_title': 'Avengers: Infinity War',
  'link_stub': '/title/tt4154756/?ref_=bo_cso_table_5',
  'worldwide_gross': 2048359754},
 {'movie_title': 'Jurassic World',
  'link_stub': '/title/tt0369610/?ref_=bo_cso_table_6',
  'worldwide_gross': 1670516444}]

In [18]:
movie_table_dict_list.clear()

In [21]:
# Run function on the entire first page of data
for element in rows[1:201]:
        get_table_movie_info(element)
print('Scrape was successful') 
print('Num Movies Scraped: {}'.format(len(movie_table_dict_list)))

Scrape was successful
Num Movies Scraped: 200


In [24]:
# insure print was successful
print(movie_table_dict_list[0])
print(movie_table_dict_list[100])
print(movie_table_dict_list[199])

# movies match 

{'movie_title': 'Avatar', 'link_stub': '/title/tt0499549/?ref_=bo_cso_table_1', 'worldwide_gross': 2847246203}
{'movie_title': 'Guardians of the Galaxy', 'link_stub': '/title/tt2015381/?ref_=bo_cso_table_101', 'worldwide_gross': 773350147}
{'movie_title': 'The Meg', 'link_stub': '/title/tt4779682/?ref_=bo_cso_table_200', 'worldwide_gross': 530259473}


## Scrape 2:  Data collection from all subsequent pages

In [25]:
for page_offset in range(200,15001,200):
    url = "https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?area=XWW&offset={}".format(page_offset)
  

    response = requests.get(url)
    page = response.text

    soup = BeautifulSoup(page, "html5lib")
    #print(soup.prettify())

    table = soup.find('table')
    #print(table)

    rows_2 = [row for row in table.find_all('tr')]
    #print(rows)

    for element in rows_2[1:201]:
        #print(row)
        get_table_movie_info(element)  
        
print(len(movie_table_dict_list))  

15200


In [27]:
# insure print was successful

print(movie_table_dict_list[0])
print(movie_table_dict_list[10000])
print(movie_table_dict_list[15199])

# movies match 

{'movie_title': 'Avatar', 'link_stub': '/title/tt0499549/?ref_=bo_cso_table_1', 'worldwide_gross': 2847246203}
{'movie_title': 'Pawn Sacrifice', 'link_stub': '/title/tt1596345/?ref_=bo_cso_table_1', 'worldwide_gross': 5578519}
{'movie_title': 'Queen Bees', 'link_stub': '/title/tt8338076/?ref_=bo_cso_table_200', 'worldwide_gross': 1855827}


In [28]:
# Create a dataframe from this list

movie_table_info_df = pd.DataFrame(movie_table_dict_list)

In [29]:
movie_table_info_df.sample(10)

Unnamed: 0,movie_title,link_stub,worldwide_gross
3910,Two Tigers,/title/tt10333606/?ref_=bo_cso_table_111,32001580
8858,Respiro,/title/tt0286516/?ref_=bo_cso_table_59,7309845
12686,Moby Dick,/title/tt1984208/?ref_=bo_cso_table_87,3062455
372,Planet of the Apes,/title/tt0133152/?ref_=bo_cso_table_173,362211740
10230,"7 rzeczy, których nie wiecie o facetach",/title/tt5312370/?ref_=bo_cso_table_31,5267920
347,Batman Begins,/title/tt0372784/?ref_=bo_cso_table_148,373661946
8263,Tune in for Love,/title/tt10763618/?ref_=bo_cso_table_64,8494414
9639,Heaven Help Us,/title/tt0089264/?ref_=bo_cso_table_40,6070794
8703,Into the Night,/title/tt0089346/?ref_=bo_cso_table_104,7562164
5852,The Concubine,/title/tt2544120/?ref_=bo_cso_table_53,16465764


In [31]:
# No missing values, great success!

movie_table_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15200 entries, 0 to 15199
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   movie_title      15200 non-null  object
 1   link_stub        15200 non-null  object
 2   worldwide_gross  15200 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 356.4+ KB


###### *Further cleaning will be conducted upon completion of data gathering*

## Scape 3:  Data collection from each movie's page

In [84]:
# Create function that help capture features that can be matched by text
# Will work for a number of values but not all
import re

def get_movie_value(soup, field_name):
    
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
   
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None
    

    
# Create functions to properly format...
# ...collected data(in addition to ones previously created)
[]
def runtime_to_minutes(runtimestring):
    
    try:
        run_time = runtimestring.split()
        minutes = int(run_time[0])*60 + int(run_time[2])
        return minutes
        
    except:
        return None

def to_date(datestring):
    import dateutil.parser
    date = dateutil.parser.parse(datestring)
    return date 
 
    
    
    
def get_release_date(soup):
    try:
        raw_release_date = soup.find(text="Earliest Release Date").findNext('span').text.split('\n')[0]
        return raw_release_date
    
    except:
        return None

def get_distributor(soup, field_name):
    
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
   
    next_element = obj.findNext()
    
    if next_element:
        distributor = next_element.text.split('See')[0]
        return distributor
    else:
        return None   
    
    
def get_genre(soup, field_name):
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
   
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text.split() 
    else:
        return None
    

In [85]:
def get_movie_page_info(link):    
        movie_page_dict = {}
        headers = ['movie_title','rating',
                   'budget', 'runtime', 'release_date', 'distributor', 'genres']

        base_url  = 'https://www.boxofficemojo.com'

        movie_page_url = base_url+ str(link)


        movie_page_response = requests.get(movie_page_url)

        movie_page = movie_page_response.text
        movie_page_soup = BeautifulSoup(movie_page,"html5lib")

        # Get Title
        movie_title = movie_page_soup.find('title').text.split('-')[0].strip()
     

        # Get Rating
        #rating = movie_page_soup.find(text='MPAA').next.text
        rating = get_movie_value(movie_page_soup, 'MPAA')


        # Get Budget
        raw_budget = get_movie_value(movie_page_soup,'Budget')
        budget = money_to_int(raw_budget)


        # Get Run-time
        raw_runtime = get_movie_value(movie_page_soup,'Running')
        runtime = runtime_to_minutes(raw_runtime)


        # Get Release Date
        #raw_release_date = movie_page_soup.find(text="Earliest Release Date").findNext('span').text.split('\n')[0]
        raw_release_date = get_release_date(movie_page_soup)
        release_date = to_date(raw_release_date)
    
        # Get Distributor
        distributor = get_distributor(movie_page_soup, 'Distributor')
        
        #Get genre
        genres = get_genre(movie_page_soup,'Genres')
        
        movie_page_dict = dict(zip(headers, [movie_title, rating, budget, runtime, release_date, distributor, genres]))

        return movie_page_dict_list.append(movie_page_dict)


In [87]:
# Experiment on movie's page first
link_stub_series = movie_table_info_df['link_stub']
movie_page_dict_list = []
for link in link_stub_series[0:1]:
    get_movie_page_info(link)

movie_page_dict_list    

[{'movie_title': 'Avatar',
  'rating': 'PG-13',
  'budget': 237000000,
  'runtime': 162,
  'release_date': datetime.datetime(2009, 12, 16, 0, 0),
  'distributor': 'Twentieth Century Fox',
  'genres': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi']}]

In [67]:
movie_page_dict_list.clear()

In [88]:
# Experiment on a few more pages

link_stub_series = movie_table_info_df['link_stub']
movie_page_dict_list = []
for link in link_stub_series[0:7]:
    get_movie_page_info(link)

movie_page_dict_list    

[{'movie_title': 'Avatar',
  'rating': 'PG-13',
  'budget': 237000000,
  'runtime': 162,
  'release_date': datetime.datetime(2009, 12, 16, 0, 0),
  'distributor': 'Twentieth Century Fox',
  'genres': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi']},
 {'movie_title': 'Avengers: Endgame',
  'rating': 'PG-13',
  'budget': 356000000,
  'runtime': 181,
  'release_date': datetime.datetime(2019, 4, 24, 0, 0),
  'distributor': 'Walt Disney Studios Motion Pictures',
  'genres': ['Action', 'Adventure', 'Drama', 'Sci-Fi']},
 {'movie_title': 'Titanic',
  'rating': 'PG-13',
  'budget': 200000000,
  'runtime': 194,
  'release_date': datetime.datetime(1997, 12, 19, 0, 0),
  'distributor': 'Paramount Pictures',
  'genres': ['Drama', 'Romance']},
 {'movie_title': 'Star Wars: Episode VII',
  'rating': 'PG-13',
  'budget': 245000000,
  'runtime': 138,
  'release_date': datetime.datetime(2015, 12, 16, 0, 0),
  'distributor': 'Walt Disney Studios Motion Pictures',
  'genres': ['Action', 'Adventure', 'Sci-Fi']

In [69]:
movie_page_dict_list.clear()

In [89]:
# Run function on the entire list of movies
link_stub_series = movie_table_info_df['link_stub']
movie_page_dict_list = []
for link in link_stub_series:
    get_movie_page_info(link)

len(movie_page_dict_list)    


15200

In [92]:
# Movies match

print(movie_page_dict_list[0])
print(movie_page_dict_list[10000])
print(movie_page_dict_list[-1])

{'movie_title': 'Avatar', 'rating': 'PG-13', 'budget': 237000000, 'runtime': 162, 'release_date': datetime.datetime(2009, 12, 16, 0, 0), 'distributor': 'Twentieth Century Fox', 'genres': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi']}
{'movie_title': 'Pawn Sacrifice', 'rating': 'PG-13', 'budget': None, 'runtime': 115, 'release_date': datetime.datetime(2014, 9, 17, 0, 0), 'distributor': 'Bleecker Street Media', 'genres': ['Biography', 'Drama', 'Sport', 'Thriller']}
{'movie_title': 'Queen Bees', 'rating': 'PG-13', 'budget': None, 'runtime': 100, 'release_date': datetime.datetime(2021, 6, 11, 0, 0), 'distributor': 'Gravitas Ventures', 'genres': ['Comedy', 'Drama', 'Romance']}


In [93]:
# Create a second dataframe for info from movie pages
movie_pages_info_df = pd.DataFrame(movie_page_dict_list)

In [95]:
# Conduct initial data inspection
movie_pages_info_df.sample(20)

Unnamed: 0,movie_title,rating,budget,runtime,release_date,distributor,genres
4813,On Her Majesty's Secret Service,PG,,142.0,1969-12-18,United Artists,"[Action, Adventure, Thriller]"
13491,A Castle in Italy,,,104.0,2013-10-25,,"[Comedy, Drama]"
11455,Sacred Heart,,,,2005-02-25,,[Drama]
8141,True Believer,,,105.0,1989-02-17,Columbia Pictures,"[Crime, Drama]"
8537,Mujhse Shaadi Karogi,,,163.0,2004-07-30,Zee TV,"[Comedy, Romance]"
11789,Le père Noël,,,,2014-12-04,,
4728,Tomcats,R,11000000.0,95.0,2001-03-30,Revolution Studios,[Comedy]
12208,Rough,,,106.0,2006-08-26,,"[Romance, Sport]"
5781,Bringing Out the Dead,R,55000000.0,121.0,1999-10-22,Paramount Pictures,"[Drama, Thriller]"
12417,Go Brother! 2,,,105.0,2021-07-16,,[Drama]


In [96]:
# Missing values will be explored once all dataframes have been joined
# Will attempt to fill in values from other data sources
movie_pages_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15200 entries, 0 to 15199
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   movie_title   15200 non-null  object        
 1   rating        6500 non-null   object        
 2   budget        3132 non-null   float64       
 3   runtime       14466 non-null  float64       
 4   release_date  15200 non-null  datetime64[ns]
 5   distributor   8994 non-null   object        
 6   genres        15032 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 831.4+ KB


## Scrape 4:  Data collection of info on cast, crew & production

In [101]:


def get_producer(soup):
    try:
        raw_producer = soup.find('td',text='Producer').previous_sibling.findNext('a').text
        producer = " ".join(raw_producer.split())
        return producer
    
    except:
        return None
    
    
def get_director(soup):
    try:
        raw_director = soup.find('td',text='Director').previous_sibling.findNext('a').text
        director = " ".join(raw_director.split())
        return director
    
    except:
        return None
    
def get_writer(soup):
    try:
        raw_writer = soup.find('td',text='Writer').previous_sibling.findNext('a').text
        writer = " ".join(raw_writer.split())
        return writer
    
    except:
        return None
    
def get_actor_1(soup):  
    try:
        raw_lead_actor_1 = soup.find(text='Actor').findNext('a').text
        lead_actor_1 = " ".join(raw_lead_actor_1.split())
        return lead_actor_1
    
    except:
        return None
    
def get_actor_2(soup):
    try:
        raw_lead_actor_2 = soup.find(text='Actor').findNext('a').findNext('a', class_='a-link-normal').text
        lead_actor_2 = " ".join(raw_lead_actor_2.split())
        return lead_actor_2
    except:
        return None
        
def get_actor_3(soup):
    try:
        raw_actor_3 = soup.find(text='Actor').findNext('a').findNext('a', class_='a-link-normal').findNext('a', class_='a-link-normal').text
        actor_3 = " ".join(raw_actor_3.split())
        return actor_3
    except:
        return None
    


In [102]:
#Create function to scrape data from cast and crew portion of movie's pages

def get_crew_info(link):

    crew_dict = {}
    headers = ['movie_title','director','writer', 'producer', 'lead_actor_1', 
               'lead_actor_2', 'actor_3'] 
    
    base_url = 'https://www.boxofficemojo.com'
    
    url = base_url + link[:16]
   
    cast_url = url + "/credits"
  
    response = requests.get(cast_url)
 
    page = response.text
    crew_soup = BeautifulSoup(page,"html5lib")
    
    title_string = crew_soup.find('title').text
    movie_title = title_string.split('- Box')[0].strip()   
     
    director = get_director(crew_soup)
    
    writer = get_writer(crew_soup)
    
    producer = get_producer(crew_soup)
    
    lead_actor_1 = get_actor_1(crew_soup)
    
    lead_actor_2 = get_actor_2(crew_soup)
        
    actor_3 = get_actor_3(crew_soup)
 
    crew_dict = dict(zip(headers, [movie_title, director, writer, producer,
                                   lead_actor_1, lead_actor_2, actor_3]))
    

    return crew_dict_list.append(crew_dict)    


In [103]:
# Experiment with one movie

crew_dict_list = []
link_stubs = movie_table_info_df['link_stub']
for link in link_stubs[1:2]:
    get_crew_info(link)

crew_dict_list

[{'movie_title': 'Avengers: Endgame',
  'director': 'Anthony Russo',
  'writer': 'Christopher Markus',
  'producer': 'Kevin Feige',
  'lead_actor_1': 'Robert Downey Jr.',
  'lead_actor_2': 'Chris Evans',
  'actor_3': 'Mark Ruffalo'}]

In [104]:
# Experiment with a few more movies

crew_dict_list = []
link_stubs = movie_table_info_df['link_stub']
for link in link_stubs[1:10]:
    get_crew_info(link)

crew_dict_list

[{'movie_title': 'Avengers: Endgame',
  'director': 'Anthony Russo',
  'writer': 'Christopher Markus',
  'producer': 'Kevin Feige',
  'lead_actor_1': 'Robert Downey Jr.',
  'lead_actor_2': 'Chris Evans',
  'actor_3': 'Mark Ruffalo'},
 {'movie_title': 'Titanic',
  'director': 'James Cameron',
  'writer': 'James Cameron',
  'producer': 'James Cameron',
  'lead_actor_1': 'Leonardo DiCaprio',
  'lead_actor_2': 'Kate Winslet',
  'actor_3': 'Billy Zane'},
 {'movie_title': 'Star Wars: Episode VII - The Force Awakens',
  'director': 'J.J. Abrams',
  'writer': 'Lawrence Kasdan',
  'producer': 'J.J. Abrams',
  'lead_actor_1': 'Daisy Ridley',
  'lead_actor_2': 'John Boyega',
  'actor_3': 'Oscar Isaac'},
 {'movie_title': 'Avengers: Infinity War',
  'director': 'Anthony Russo',
  'writer': 'Christopher Markus',
  'producer': 'Kevin Feige',
  'lead_actor_1': 'Robert Downey Jr.',
  'lead_actor_2': 'Chris Hemsworth',
  'actor_3': 'Mark Ruffalo'},
 {'movie_title': 'Jurassic World',
  'director': 'Colin

In [105]:
crew_dict_list.clear()

In [106]:
# Run function on the entire list of movies
crew_dict_list = []
link_stubs = movie_table_info_df['link_stub']
for link in link_stubs:
    get_crew_info(link)
    
print(len(crew_dict_list))    

15200


In [108]:
# Double check values match

print(crew_dict_list[0])
print(crew_dict_list[10000])
print(crew_dict_list[-1])

{'movie_title': 'Avatar', 'director': 'James Cameron', 'writer': 'James Cameron', 'producer': 'James Cameron', 'lead_actor_1': 'Sam Worthington', 'lead_actor_2': 'Zoe Saldana', 'actor_3': 'Sigourney Weaver'}
{'movie_title': 'Pawn Sacrifice', 'director': 'Edward Zwick', 'writer': 'Steven Knight', 'producer': 'Gail Katz', 'lead_actor_1': 'Tobey Maguire', 'lead_actor_2': 'Liev Schreiber', 'actor_3': 'Peter Sarsgaard'}
{'movie_title': 'Queen Bees', 'director': 'Michael Lembeck', 'writer': 'Donald Martin', 'producer': 'Fred Bernstein', 'lead_actor_1': 'Ellen Burstyn', 'lead_actor_2': 'Jane Curtin', 'actor_3': 'Loretta Devine'}


In [109]:
crew_info_df = pd.DataFrame(crew_dict_list)
crew_info_df.head()

Unnamed: 0,movie_title,director,writer,producer,lead_actor_1,lead_actor_2,actor_3
0,Avatar,James Cameron,James Cameron,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver
1,Avengers: Endgame,Anthony Russo,Christopher Markus,Kevin Feige,Robert Downey Jr.,Chris Evans,Mark Ruffalo
2,Titanic,James Cameron,James Cameron,James Cameron,Leonardo DiCaprio,Kate Winslet,Billy Zane
3,Star Wars: Episode VII - The Force Awakens,J.J. Abrams,Lawrence Kasdan,J.J. Abrams,Daisy Ridley,John Boyega,Oscar Isaac
4,Avengers: Infinity War,Anthony Russo,Christopher Markus,Kevin Feige,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo


In [110]:
crew_info_df.sample(10)

Unnamed: 0,movie_title,director,writer,producer,lead_actor_1,lead_actor_2,actor_3
11163,Hartenstraat,Sanne Vogel,Judith Goudsmit,Kees Abrahams,Marwan Kenzari,Bracha van Doesburgh,Georgina Verbaan
4073,Major Payne,Nick Castle,Joe Connelly,Eric L. Gold,Damon Wayans,Michael Ironside,Bam Bam Bigelow
6683,Mubarakan,Anees Bazmee,Rupinder Chahal,Murad Khetani,Anil Kapoor,Arjun Kapoor,Ileana D'Cruz
2092,Serendipity,Peter Chelsom,Marc Klein,Peter Abrams,John Cusack,Kate Beckinsale,Jeremy Piven
6753,Bad Influence,Curtis Hanson,David Koepp,Steve Tisch,Rob Lowe,James Spader,Lisa Zane
11743,Imagine: John Lennon,Andrew Solt,Sam Egan,Andrew Solt,John Lennon,Yoko Ono,Paul McCartney
12860,The Horde,Andrey Proshkin,Yuriy Arabov,Nataliya Gostyushina,Maksim Sukhanov,Roza Khayrullina,Innokenti Dakaiarov
3341,2010: The Year We Make Contact,Peter Hyams,Arthur C. Clarke,Peter Hyams,Roy Scheider,John Lithgow,Helen Mirren
11677,Le séminaire Caméra Café,Charles Nemes,Alexandre Apergis,Jean-Yves Robin,Bruno Solo,Yvan Le Bolloc'h,Armelle
12369,Natale da chef,Neri Parenti,Alessandro Bencivenni,Fabio Boldi,Massimo Boldi,Dario Bandiera,Rocío Muñoz


In [111]:
crew_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15200 entries, 0 to 15199
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movie_title   15200 non-null  object
 1   director      14745 non-null  object
 2   writer        14477 non-null  object
 3   producer      13734 non-null  object
 4   lead_actor_1  14907 non-null  object
 5   lead_actor_2  14907 non-null  object
 6   actor_3       14907 non-null  object
dtypes: object(7)
memory usage: 831.4+ KB


#### There are three data frames so far;
 - movie_table_info_df:  contains all the movies and relevant features from the main table
 - movie_pages_info_df:  contains data from the individual pages of each movie
 - crew_info_df:         contains the cast,crew & production data from each movie's page 

Let's merge these into one dataframe 

In [112]:
# First save dfs into csv in case we create errors

movie_table_info_df.to_csv('./data/movie_table_info_df_3rd.csv')    
movie_pages_info_df.to_csv('./data/movie_pages_info_df_3rd.csv')
crew_info_df.to_csv('./data/crew_info_df_3rd.csv')

In [113]:
# # Merge all dfs

merge_table_pages_df = pd.merge(movie_table_info_df, movie_pages_info_df, left_index=True,
                                                right_index=True)

merge_table_pages_df

Unnamed: 0,movie_title_x,link_stub,worldwide_gross,movie_title_y,rating,budget,runtime,release_date,distributor,genres
0,Avatar,/title/tt0499549/?ref_=bo_cso_table_1,2847246203,Avatar,PG-13,237000000.0,162.0,2009-12-16,Twentieth Century Fox,"[Action, Adventure, Fantasy, Sci-Fi]"
1,Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2797501328,Avengers: Endgame,PG-13,356000000.0,181.0,2019-04-24,Walt Disney Studios Motion Pictures,"[Action, Adventure, Drama, Sci-Fi]"
2,Titanic,/title/tt0120338/?ref_=bo_cso_table_3,2201647264,Titanic,PG-13,200000000.0,194.0,1997-12-19,Paramount Pictures,"[Drama, Romance]"
3,Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_4,2069521700,Star Wars: Episode VII,PG-13,245000000.0,138.0,2015-12-16,Walt Disney Studios Motion Pictures,"[Action, Adventure, Sci-Fi]"
4,Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,2048359754,Avengers: Infinity War,PG-13,,149.0,2018-04-25,Walt Disney Studios Motion Pictures,"[Action, Adventure, Sci-Fi]"
...,...,...,...,...,...,...,...,...,...,...
15195,Omar W Salma 3,/title/tt6448088/?ref_=bo_cso_table_196,1856517,Omar W Salma 3,,,,2012-01-12,,
15196,Saraba itoshi no daitôryô,/title/tt1614392/?ref_=bo_cso_table_197,1856488,Saraba itoshi no daitôryô,,,86.0,2010-10-30,,[Comedy]
15197,Rich Kids,/title/tt0079806/?ref_=bo_cso_table_198,1856122,Rich Kids,,,101.0,1979-08-17,Metro-Goldwyn-Mayer (MGM),"[Comedy, Drama]"
15198,Golden Oldies,/title/tt0831904/?ref_=bo_cso_table_199,1856100,Golden Oldies,,,95.0,2008-09-19,,[Comedy]


In [114]:
merge_table_pages_df.rename(columns={'movie_title_x': 'movie_title'}, inplace = True)

In [115]:
merge_table_pages_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15200 entries, 0 to 15199
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   movie_title      15200 non-null  object        
 1   link_stub        15200 non-null  object        
 2   worldwide_gross  15200 non-null  int64         
 3   movie_title_y    15200 non-null  object        
 4   rating           6500 non-null   object        
 5   budget           3132 non-null   float64       
 6   runtime          14466 non-null  float64       
 7   release_date     15200 non-null  datetime64[ns]
 8   distributor      8994 non-null   object        
 9   genres           15032 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(6)
memory usage: 1.2+ MB


In [116]:
full_bom_df = pd.merge(merge_table_pages_df, crew_info_df, right_index=True, left_index=True)

full_bom_df.sample(5)

Unnamed: 0,movie_title_x,link_stub,worldwide_gross,movie_title_y,rating,budget,runtime,release_date,distributor,genres,movie_title_y.1,director,writer,producer,lead_actor_1,lead_actor_2,actor_3
7945,Searching for Sugar Man,/title/tt2125608/?ref_=bo_cso_table_146,9216552,Searching for Sugar Man,PG-13,,86.0,2012-07-27,Sony Pictures Classics,"[Biography, Documentary, Music]",Searching for Sugar Man,Malik Bendjelloul,Malik Bendjelloul,Malik Bendjelloul,Rodriguez,Stephen 'Sugar' Segerman,Dennis Coffey
11278,Edison,/title/tt0389957/?ref_=bo_cso_table_79,4165675,Edison,R,25000000.0,99.0,2005-10-06,,"[Action, Crime, Drama, Thriller]",Edison,David J. Burke,David J. Burke,Boaz Davidson,Morgan Freeman,Kevin Spacey,Justin Timberlake
12511,Chocolate,/title/tt1183252/?ref_=bo_cso_table_112,3179014,Chocolate,R,,110.0,2008-02-06,Magnolia Pictures,"[Action, Drama]",Chocolate,Prachya Pinkaew,Napalee,Tech Akarapol,JeeJa Yanin,Hiroshi Abe,Pongpat Wachirabunjong
1004,John Wick: Chapter 2,/title/tt4425200/?ref_=bo_cso_table_5,171547802,John Wick: Chapter 2,R,,122.0,2017-02-08,Lionsgate,"[Action, Crime, Thriller]",John Wick: Chapter 2,Chad Stahelski,Derek Kolstad,Basil Iwanyk,Keanu Reeves,Riccardo Scamarcio,Ian McShane
13946,Tiger Team: The Mountain of the 1000 Dragons,/title/tt1463208/?ref_=bo_cso_table_147,2351933,Tiger Team: The Mountain of the 1000 Dragons,,,89.0,2010-05-06,,"[Adventure, Family]",Tiger Team: The Mountain of the 1000 Dragons,Peter Gersina,Thomas Brezina,Susanne Freyer,Helena Siegmund-Schultze,Bruno Schubert,Justus Kammerer


In [117]:
del full_bom_df['movie_title_y']
full_bom_df.rename(columns={'movie_title_x': 'movie_title'}, inplace = True)

full_bom_df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15200 entries, 0 to 15199
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   movie_title      15200 non-null  object        
 1   link_stub        15200 non-null  object        
 2   worldwide_gross  15200 non-null  int64         
 3   rating           6500 non-null   object        
 4   budget           3132 non-null   float64       
 5   runtime          14466 non-null  float64       
 6   release_date     15200 non-null  datetime64[ns]
 7   distributor      8994 non-null   object        
 8   genres           15032 non-null  object        
 9   director         14745 non-null  object        
 10  writer           14477 non-null  object        
 11  producer         13734 non-null  object        
 12  lead_actor_1     14907 non-null  object        
 13  lead_actor_2     14907 non-null  object        
 14  actor_3          14907 non-null  objec

In [152]:


full_bom_df[82:83]

Unnamed: 0,movie_title,link_stub,worldwide_gross,rating,budget,runtime,release_date,distributor,genres,director,writer,producer,lead_actor_1,lead_actor_2,actor_3
82,Wonder Woman,/title/tt0451279/?ref_=bo_cso_table_83,822824522,PG-13,149000000.0,141.0,2017-05-30,Warner Bros.,"[Action, Adventure, Fantasy, Sci-Fi, War]",Patty Jenkins,Allan Heinberg,Charles Roven,Gal Gadot,Chris Pine,Robin Wright


### "full_bom_df"  is now the master dataframe for all info from Box-Office-Mojo

In [150]:
# Lots of missing values in budget and rating columns...
#...lets investigate
mask = (full_bom_df['budget'].isna()) & (full_bom_df['rating'].isna())
full_bom_df[mask].head()

# Manual inspection reveals NAs are from older films or foreign films 
# Will attempt to fill with values from imdb.com

Unnamed: 0,movie_title,link_stub,worldwide_gross,rating,budget,runtime,release_date,distributor,genres,director,writer,producer,lead_actor_1,lead_actor_2,actor_3
83,"Hi, Mom",/title/tt13364790/?ref_=bo_cso_table_84,822054381,,,128.0,2021-02-12,,"[Comedy, Drama, Fantasy]",Jan van den Nieuwenhuyzen,,Johan Nijenhuis,Fatma Genç,Sebastian Wulff,Roos Smit
114,Ne Zha,/title/tt10627720/?ref_=bo_cso_table_115,726264074,,,110.0,2019-07-26,Well Go USA Entertainment,"[Action, Adventure, Animation, Family, Fantasy...",Tony Collingwood,Mark Zaslove,Christopher O'Hare,Alan Marriott,Kate Harbour,Rob Rackstraw
124,The Wandering Earth,/title/tt7605074/?ref_=bo_cso_table_125,699992512,,,125.0,2019-01-31,CMC Pictures,"[Action, Adventure, Sci-Fi, Thriller]",Frant Gwo,Gong Geer,Gong Geer,Jing Wu,Chuxiao Qu,Guangjie Li
174,Operation Red Sea,/title/tt6878882/?ref_=bo_cso_table_175,579330426,,,142.0,2018-02-16,Well Go USA Entertainment,"[Action, Drama, Thriller, War]",Dante Lam,Zhuzhu Chen,Candy Leung,Yi Zhang,Johnny Huang,Hai-Qing
250,The Eight Hundred,/title/tt7294150/?ref_=bo_cso_table_51,461421559,,,149.0,2020-08-21,CMC Pictures,"[Action, Drama, History, War]",Hu Guan,Hu Guan,Wenjiu Zhu,Zhi-zhong Huang,Zhang Junyi,Hao Ou


## Scrape 5:  Data Collection of IMDB site
 Obtain the following data from IMBD;
   - Movie Title(for reference)
   - Ratings(supplemental to fill in missing values in full_bom_df)
   - Budget(supplemental to fill in missing values in full_bom_df)
   - Runtime(supplemental to fill in missing values in full_bom_df)
   - IMDB Scores(**new info**)

In [191]:
url = 'https://www.imdb.com/title/tt0926084/?ref_=fn_al_tt_1'

response = requests.get(url)
response.status_code

page = response.text
soup = BeautifulSoup(page, "html5lib")

In [193]:
soup.find(text=re.compile('Certificate')).findNext().text

''

In [192]:
soup.find('title').text.split("(")[0].strip()

'Harry Potter and the Deathly Hallows: Part 1'

In [200]:
soup.find(text=re.compile('Budget')).findNext().text.split('(')[0]

'£150,000,000 '

In [128]:
run_time = soup.find(text='Runtime').findNext().text
run_time

'1h 39min'

In [129]:
 int(run_time[0])*60 + int(run_time[3:5])

99

In [None]:

full_bom_df[full_bom_df.producer.isna()]

In [160]:
def get_imdb_budget(soup):
    try:
        raw_budget = soup.find(text=re.compile('Budget')).findNext().text.split('(')[0]
        return raw_budget
    
    except: 
        return None

def get_imdb_score(soup):
    try:
        raw_score = soup.find('span', class_="AggregateRatingButton__RatingScore-sc-1ll29m0-1 iTLWoV").text
        return float(raw_score)
    
    except:
        return None
        
def get_rating_imbd(soup):
      try:
            rating = soup.find(text=re.compile('MPAA')).findNext().text.split()[1]
            return rating
      except:
            pass
    
      try:
            certificate = soup.find(text=re.compile('Certificate')).findNext().text
            return certificate
    
      except:
            return None
        
def imdb_runtime_to_minutes(runtimestring):
    
    try:
        #run_time = runtimestring.split()
        minutes = int(runtimestring[0])*60 + int(runtimestring[3:5])
        return minutes
        
    except:
        return None

# def get_certificate_imdb(soup):
#     try:
#         certificate = soup.find(text=re.compile('Certificate')).findNext().text
#         return certificate
    
#     except:
#         return None

def get_raw_runtime(soup):
    try: 
        raw_runtime = soup.find(text='Runtime').findNext().text 
        return raw_runtime
    
    except:
        return None

In [161]:
def get_imdb_info(link):
    imdb_info_dict = {}
    
    headers = ['movie_title', 'rating', 'budget', 'imdb_score', 'runtime']
    
    base_url = 'https://www.imdb.com/title/'
    
    url = base_url + link##fill in info
    
    response = requests.get(url)
    page = response.text
    
    soup = BeautifulSoup(page, "html.parser")
    
    movie_title = soup.find('title').text.split("(")[0].strip()
    
    #rating = get_movie_value(soup, 'MPAA')
    rating = get_rating_imbd(soup)
    
    raw_budget = get_imdb_budget(soup) 
    budget = money_to_int(raw_budget)
    
    imdb_score = get_imdb_score(soup)
    
    raw_runtime = get_raw_runtime(soup)
    runtime = imdb_runtime_to_minutes(raw_runtime)

    distributor = get_distributor(soup, 'Production companies')
    
    imdb_info_dict = dict(zip(headers,[movie_title, rating, budget, imdb_score, runtime]))
    
    return imdb_info_dict_list.append(imdb_info_dict)    

In [156]:
# Experiment on 1

imdb_info_dict_list = []
link_stub_series = full_bom_df['link_stub'][0:1]
for link in link_stub_series:
    get_imdb_info(link[7:17])

imdb_info_dict_list

[{'movie_title': 'Avatar',
  'rating': 'PG-13',
  'budget': 237000000,
  'imdb_score': 7.8,
  'runtime': 162}]

In [159]:
# Experiment on a few

imdb_info_dict_list = []
link_stub_series = full_bom_df['link_stub'][250:258]
for link in link_stub_series:
    get_imdb_info(link[7:17])

imdb_info_dict_list

[{'movie_title': 'The Eight Hundred',
  'rating': 'Not Rated',
  'budget': 80000000,
  'imdb_score': 6.8,
  'runtime': 149},
 {'movie_title': 'X-Men: The Last Stand',
  'rating': 'PG-13',
  'budget': 210000000,
  'imdb_score': 6.7,
  'runtime': 104},
 {'movie_title': 'National Treasure: Book of Secrets',
  'rating': 'PG',
  'budget': 130000000,
  'imdb_score': 6.5,
  'runtime': None},
 {'movie_title': 'Lucy',
  'rating': 'R',
  'budget': 40000000,
  'imdb_score': 6.4,
  'runtime': 89},
 {'movie_title': 'Mission: Impossible',
  'rating': 'PG-13',
  'budget': 80000000,
  'imdb_score': 7.1,
  'runtime': 110},
 {'movie_title': '300',
  'rating': 'R',
  'budget': 65000000,
  'imdb_score': 7.6,
  'runtime': 117},
 {'movie_title': 'The Last Samurai',
  'rating': 'R',
  'budget': 140000000,
  'imdb_score': 7.7,
  'runtime': 154},
 {'movie_title': 'Demon Slayer: Mugen Train',
  'rating': '',
  'budget': None,
  'imdb_score': 8.3,
  'runtime': 117}]

In [162]:
# Run the function on the entire series

imdb_info_dict_list = []
link_stub_series = full_bom_df['link_stub']
for link in link_stub_series:
    get_imdb_info(link[7:17])

len(imdb_info_dict_list)

15200

In [None]:
link_stub_series[0]

In [163]:
len(imdb_info_dict_list)
print(imdb_info_dict_list[0])
print(imdb_info_dict_list[350])
print(imdb_info_dict_list[-1])

{'movie_title': 'Avatar', 'rating': 'PG-13', 'budget': 237000000, 'imdb_score': 7.8, 'runtime': 162}
{'movie_title': 'The Golden Compass', 'rating': 'PG-13', 'budget': 180000000, 'imdb_score': 6.1, 'runtime': 113}
{'movie_title': 'Queen Bees', 'rating': 'PG-13', 'budget': None, 'imdb_score': 5.9, 'runtime': 100}


In [164]:
full_imdb_df = pd.DataFrame(imdb_info_dict_list)

In [184]:
full_imdb_df.to_csv('./data/full_imdb_df_3rd.csv')

In [165]:
full_imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15200 entries, 0 to 15199
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movie_title  15200 non-null  object 
 1   rating       15200 non-null  object 
 2   budget       6533 non-null   float64
 3   imdb_score   14978 non-null  float64
 4   runtime      13117 non-null  float64
dtypes: float64(3), object(2)
memory usage: 593.9+ KB


In [166]:
#Merge box-office-mojo and imdb_scores
#Remaining columns will attempted to be used to fill in missing values 

imdb_score_df = full_imdb_df[['movie_title', 'imdb_score']]

full_movie_df = pd.merge(full_bom_df, imdb_score_df, right_index=True, left_index=True)

full_movie_df

Unnamed: 0,movie_title_x,link_stub,worldwide_gross,rating,budget,runtime,release_date,distributor,genres,director,writer,producer,lead_actor_1,lead_actor_2,actor_3,movie_title_y,imdb_score
0,Avatar,/title/tt0499549/?ref_=bo_cso_table_1,2847246203,PG-13,237000000.0,162.0,2009-12-16,Twentieth Century Fox,"[Action, Adventure, Fantasy, Sci-Fi]",James Cameron,James Cameron,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Avatar,7.8
1,Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2797501328,PG-13,356000000.0,181.0,2019-04-24,Walt Disney Studios Motion Pictures,"[Action, Adventure, Drama, Sci-Fi]",Anthony Russo,Christopher Markus,Kevin Feige,Robert Downey Jr.,Chris Evans,Mark Ruffalo,Avengers: Endgame,8.4
2,Titanic,/title/tt0120338/?ref_=bo_cso_table_3,2201647264,PG-13,200000000.0,194.0,1997-12-19,Paramount Pictures,"[Drama, Romance]",James Cameron,James Cameron,James Cameron,Leonardo DiCaprio,Kate Winslet,Billy Zane,Titanic,7.8
3,Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_4,2069521700,PG-13,245000000.0,138.0,2015-12-16,Walt Disney Studios Motion Pictures,"[Action, Adventure, Sci-Fi]",J.J. Abrams,Lawrence Kasdan,J.J. Abrams,Daisy Ridley,John Boyega,Oscar Isaac,Star Wars: Episode VII - The Force Awakens,7.8
4,Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,2048359754,PG-13,,149.0,2018-04-25,Walt Disney Studios Motion Pictures,"[Action, Adventure, Sci-Fi]",Anthony Russo,Christopher Markus,Kevin Feige,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,Avengers: Infinity War,8.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15195,Omar W Salma 3,/title/tt6448088/?ref_=bo_cso_table_196,1856517,,,,2012-01-12,,,,,,,,,Omar W Salma 3,
15196,Saraba itoshi no daitôryô,/title/tt1614392/?ref_=bo_cso_table_197,1856488,,,86.0,2010-10-30,,[Comedy],Sando Katsura,Keita Yamada,Takashige Ichise,Sando Katsura,Daisuke Miyagawa,Kendô Kobayashi,Saraba itoshi no daitôryô,4.0
15197,Rich Kids,/title/tt0079806/?ref_=bo_cso_table_198,1856122,,,101.0,1979-08-17,Metro-Goldwyn-Mayer (MGM),"[Comedy, Drama]",Robert M. Young,Judith Ross,George W. George,Trini Alvarado,Jeremy Levy,Kathryn Walker,Rich Kids,6.5
15198,Golden Oldies,/title/tt0831904/?ref_=bo_cso_table_199,1856100,,,95.0,2008-09-19,,[Comedy],Hugo Carvana,Paulo Halm,,Paulo Betti,José Wilker,Antonio Pedro,Casa da Mãe Joana,5.2


In [167]:
del full_movie_df['movie_title_y']
full_movie_df.rename(columns={'movie_title_x':'movie_title'}, inplace=True)

In [170]:
full_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15200 entries, 0 to 15199
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   movie_title      15200 non-null  object        
 1   link_stub        15200 non-null  object        
 2   worldwide_gross  15200 non-null  int64         
 3   rating           6500 non-null   object        
 4   budget           3132 non-null   float64       
 5   runtime          14466 non-null  float64       
 6   release_date     15200 non-null  datetime64[ns]
 7   distributor      8994 non-null   object        
 8   genres           15032 non-null  object        
 9   director         14745 non-null  object        
 10  writer           14477 non-null  object        
 11  producer         13734 non-null  object        
 12  lead_actor_1     14907 non-null  object        
 13  lead_actor_2     14907 non-null  object        
 14  actor_3          14907 non-null  objec

#### Fill in missing data in full_movie_df with data from full_imdb_df

In [173]:
# 1) fill in budget, 12,068 missing values

full_movie_df['budget'].fillna(full_imdb_df['budget'], inplace=True)

In [222]:
# Successfully filled 3,406 values
# most of the remaining values are independent, foreign or older movies
full_movie_df['budget'].isna().sum()

3944

In [177]:
# 2) fill-in any runtimes, 734 missing values
full_movie_df['runtime'].fillna(full_imdb_df['runtime'], inplace=True)

In [179]:
# only filled 17 values

full_movie_df['runtime'].isna().sum()

717

In [181]:
# 3) fill-in any ratings, 8700 missing

full_movie_df['rating'].fillna(full_imdb_df['rating'], inplace=True)

In [197]:
#this seems wrong(to good to be true), investigate

full_movie_df['rating'].isna().sum()

0

In [199]:
# odd ratings were passed in


full_movie_df['rating'].unique()

array(['PG-13', 'PG', 'G', 'R', 'Not Rated', '', 'TV-MA', 'Passed',
       'TV-Y7', 'TV-PG', 'Approved', 'Unrated', 'TV-14', 'NC-17', 'X',
       'GP', 'Hams', 'M', 'TV-Y7-FV', 'TV-G', 'Nin', 'close-up', 'TV-Y',
       'Bernardi', 'E', 'TV-13', 'Samantha', 'Open', 'Seconds',
       ' —rAjOo (gunwanti@hotmail.com)', 'scene'], dtype=object)

In [207]:
exp_rating_list = ['', 'TV-MA', 'Passed','TV-Y7', 'TV-PG', 'Approved','TV-14',
                   'X',  'GP', 'Hams', 'M', 'TV-Y7-FV', 'TV-G', 'Nin', 'close-up', 'TV-Y',
                   'Bernardi', 'E', 'TV-13', 'Samantha', 'Open', 'Seconds',
                   ' —rAjOo (gunwanti@hotmail.com)', 'scene' ]
full_movie_df[full_movie_df['rating'].isin(exp_rating_list)]

Unnamed: 0,movie_title,link_stub,worldwide_gross,rating,budget,runtime,release_date,distributor,genres,director,writer,producer,lead_actor_1,lead_actor_2,actor_3,imdb_score
83,"Hi, Mom",/title/tt13364790/?ref_=bo_cso_table_84,822054381,,,128.0,2021-02-12,,"[Comedy, Drama, Fantasy]",Jan van den Nieuwenhuyzen,,Johan Nijenhuis,Fatma Genç,Sebastian Wulff,Roos Smit,7.1
124,The Wandering Earth,/title/tt7605074/?ref_=bo_cso_table_125,699992512,TV-MA,48000000.0,125.0,2019-01-31,CMC Pictures,"[Action, Adventure, Sci-Fi, Thriller]",Frant Gwo,Gong Geer,Gong Geer,Jing Wu,Chuxiao Qu,Guangjie Li,6.0
174,Operation Red Sea,/title/tt6878882/?ref_=bo_cso_table_175,579330426,TV-MA,70000000.0,142.0,2018-02-16,Well Go USA Entertainment,"[Action, Drama, Thriller, War]",Dante Lam,Zhuzhu Chen,Candy Leung,Yi Zhang,Johnny Huang,Hai-Qing,6.7
260,"My People, My Country",/title/tt10147382/?ref_=bo_cso_table_61,450064993,,,154.0,2019-09-25,CMC Pictures,"[Drama, History]",,,Danielle Franco,,,,6.5
289,"My People, My Homeland",/title/tt12363162/?ref_=bo_cso_table_90,422390820,,,153.0,2020-10-01,,[Drama],Juraj Jakubisko,Jaroslav Dietl,Kurt J. Mrkwicka,Viveca Lindfors,Martin Hreben,Bolek Polívka,6.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15192,His Last Gift,/title/tt1232099/?ref_=bo_cso_table_193,1857015,,,112.0,2008-02-06,,[Drama],Young-jun Kim,Ee-hwan Bom,Tae-won Jeong,Hyeon-jun Shin,Joon-ho Huh,In-hwa Jeong,7.0
15194,My Father Is a Cleaning Lady,/title/tt1773578/?ref_=bo_cso_table_195,1856973,,,80.0,2011-04-13,,"[Comedy, Drama]",Saphia Azzeddine,Saphia Azzeddine,Thomas Langmann,François Cluzet,Jérémie Duvall,Nanou Garcia,5.7
15195,Omar W Salma 3,/title/tt6448088/?ref_=bo_cso_table_196,1856517,,,,2012-01-12,,,,,,,,,
15196,Saraba itoshi no daitôryô,/title/tt1614392/?ref_=bo_cso_table_197,1856488,,,86.0,2010-10-30,,[Comedy],Sando Katsura,Keita Yamada,Takashige Ichise,Sando Katsura,Daisuke Miyagawa,Kendô Kobayashi,4.0


#### Upon manual inspection it apprears these movies are either foreign made or extremely old
#### Will need to drop these unfortunately, cutting our rows by 5,061 movies

In [208]:
drop_rating_df = full_movie_df[full_movie_df['rating'].isin(exp_rating_list)]
full_movie_df = full_movie_df.drop(drop_rating_df.index, axis=0)

In [209]:

full_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10139 entries, 0 to 15199
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   movie_title      10139 non-null  object        
 1   link_stub        10139 non-null  object        
 2   worldwide_gross  10139 non-null  int64         
 3   rating           10139 non-null  object        
 4   budget           6195 non-null   float64       
 5   runtime          9942 non-null   float64       
 6   release_date     10139 non-null  datetime64[ns]
 7   distributor      8653 non-null   object        
 8   genres           10138 non-null  object        
 9   director         10092 non-null  object        
 10  writer           9996 non-null   object        
 11  producer         9851 non-null   object        
 12  lead_actor_1     10110 non-null  object        
 13  lead_actor_2     10110 non-null  object        
 14  actor_3          10110 non-null  objec

In [225]:
full_movie_df.to_csv('./data/full_movie_df_3rd.csv')