Note: Budget numbers for movies can be both difficult to find and unreliable. Studios and film-makers often try to keep the information secret and will use accounting tricks to inflate or reduce announced budgets.

This chart shows the budget of every film in our database, where we have it. The data we have is, to the best of our knowledge, accurate but there are gaps and disputed figures.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
from datetime import datetime
from io import StringIO
import re

### Get the links

In [243]:
'''
This function finds all links of all the movies in a table and create a list
'''

def get_links(table):
    links = []
    for i in range(len(table.find_all('tr')) -1):
        i += 1
        try:
            messy = table.find_all('tr')[i].find_all('td')[-4]
            links.append(messy.find('a',href=True)['href'])
        except:
            links.append(None)
    
    return links

### Get the description column

In [258]:
'''
This function finds the gross profit and budget for the movie
'''

def get_profit(soup):
    try:
        table = soup.find_all(class_="infobox vevent")[0]
        budget = table.find_all('tr')[-2]
        proof = budget.find_all(class_ ='infobox-label')[0].get_text()
    except:
        return None, None
    if proof == 'Budget':
        budget = budget.find_all(class_ ='infobox-data')[0].get_text(strip=True)
        box_office = table.find_all('tr')[-1]
        box_office = box_office.find_all(class_ ='infobox-data')[0].get_text(strip=True)
        return budget, box_office

    else:
        return None, None

In [254]:
'''
This function finds the genre of the movie and calls
the get_profit so to find also the gross profit and budget for the movie
'''

pattern3 = r'\d{4} [A-Z]\w+([^\d\.]+) film[\s\[]'
pattern4 = r'\d{4} ([^\d\.]+) film[\s\[]'

def get_descr(x):

    try:
        x = r'https://en.wikipedia.org' + x
        r1 = requests.get(x)
    except:
        return [None, None, None]

    if r1.status_code==200:
        r2 = r1
        r1.close()
        soup1_2 = BeautifulSoup(r2.content, 'html.parser')

        budget, box_office = get_profit(soup1_2)

        first_line = soup1_2.find_all('p')[0].get_text(strip=False)
        print('-', end='')
        if len(first_line) < 75:
            first_line = soup1_2.find_all('p')[1].get_text(strip=False)
        result = re.findall(pattern3, first_line)

        if len(result) > 0:
            return [result[0],budget, box_office]
        
        else:
            result = re.findall(pattern4, first_line)
            if len(result) > 0:
                return [result[0],budget, box_office]
            else:
                return [None,budget, box_office]
    else:
        return [None,budget, box_office]

### Convert the months to numbers

In [195]:
def months_to_number(x):
    return months[x]

months = {'January':1, 'February':2, 'March':3,
          'April':4, 'May':5, 'June':6, 'July':7,
          'August':8,'September':9, 'October':10,
          'November':11, 'December':12}

## Scrape the whole page

In [212]:
def scrape_whole_page(url):

    # Initialize the request
    r = requests.get(url)
    if r.status_code != 200:
        print('Bad request.')
        return None
    
    # Parse the html page 
    soup1 = BeautifulSoup(r.content, 'html.parser')
    tables = soup1.find_all('table')
    tables = tables[2:6]

    # Get all the movie links of the page
    all_links = {}
    for t in range(len(tables)):
        links = get_links(tables[t])
        all_links[t] = links
    
    # Adjust all the df and add the link column
    all_df = pd.read_html(StringIO(str(tables)))
    result_df = {}
    for n,d in enumerate(all_df):
        d.rename(columns={'Opening':'Month', 'Opening.1':'Day'}, inplace=True)
        d = d[['Month', 'Day', 'Title']].copy().dropna(thresh=3)
        d['Month'] = d['Month'].str.replace(' ', '').str.capitalize()
        d['Link'] = all_links[n]
    
        d['List'] = d['Link'].apply(get_descr)
        result_df[n] = d
        print('\nTable done!\n')

    print('\nPAGE DONE!!!\n')
    return result_df


#df3 = scrape_whole_page(r'https://en.wikipedia.org/wiki/List_of_American_films_of_2015')       

## Test section

In [261]:
r = requests.get(r'https://en.wikipedia.org/wiki/List_of_American_films_of_2017')
r.status_code

200

In [None]:
soup1 = BeautifulSoup(r.content, 'html.parser')
tables = soup1.find_all('table')

In [277]:
tables[3] = str(tables[3]).replace('q', '')

In [283]:
pd.read_html(StringIO(str(tables[-1])))

[            Opening  Opening.1                                      Title  \
 0     O C T O B E R          6                          Blade Runner 2049   
 1     O C T O B E R          6                    The Mountain Between Us   
 2     O C T O B E R          6                  My Little Pony: The Movie   
 3     O C T O B E R          6                        The Florida Project   
 4     O C T O B E R          6                     Brawl in Cell Block 99   
 5     O C T O B E R         13                            Happy Death Day   
 6     O C T O B E R         13                                   Marshall   
 7     O C T O B E R         13                                    Breathe   
 8     O C T O B E R         13                             Carving a Life   
 9     O C T O B E R         13     Professor Marston and the Wonder Women   
 10    O C T O B E R         13                                Blood Money   
 11    O C T O B E R         20                                 

In [264]:
soup1 = BeautifulSoup(r.content, 'html.parser')
tables = soup1.find_all('table')
tables = tables[2:6]

get_links(tables[0])

['/wiki/Underworld:_Blood_Wars',
 '/wiki/Arsenal_(2017_film)',
 '/wiki/Between_Us_(2016_film)',
 '/wiki/Monster_Trucks_(film)',
 '/wiki/The_Bye_Bye_Man',
 '/wiki/Sleepless_(2017_film)',
 '/wiki/The_Book_of_Love_(2016_film)',
 '/wiki/Split_(2016_American_film)',
 '/wiki/XXX:_Return_of_Xander_Cage',
 '/wiki/The_Resurrection_of_Gavin_Stone',
 '/wiki/Trespass_Against_Us',
 '/wiki/Sophie_and_the_Rising_Sun_(film)',
 '/wiki/A_Dog%27s_Purpose_(film)',
 '/wiki/Resident_Evil:_The_Final_Chapter',
 '/wiki/Lost_in_Florence',
 '/wiki/I_Am_Michael',
 '/wiki/IBoy',
 '/wiki/Rings_(2017_film)',
 '/wiki/The_Space_Between_Us_(film)',
 '/wiki/Youth_in_Oregon',
 '/wiki/I_Am_Not_Your_Negro',
 '/wiki/Growing_Up_Smith',
 '/wiki/The_Lego_Batman_Movie',
 '/wiki/Fifty_Shades_Darker_(film)',
 '/wiki/John_Wick:_Chapter_2',
 '/wiki/Bornless_Ones',
 '/wiki/The_Great_Wall_(film)',
 '/wiki/A_Cure_for_Wellness',
 '/wiki/Fist_Fight',
 '/wiki/American_Fable',
 '/wiki/XX_(film)',
 '/wiki/Lovesong_(film)',
 '/wiki/Get_Out'

### 2015 - 2016

In [214]:
all_years_df = {}

for year in np.arange(2015,2017):
    url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_' + str(year)
    
    all_years_df[year] = scrape_whole_page(url)


-------------------------------------------------------------
Table done!

--------------------------------------------
Table done!

---------------------------------------------
Table done!

---------------------------------------------------
Table done!


PAGE DONE!!!

---------------------------------------------
Table done!

-----------------------------------------
Table done!

---------------------------------------------
Table done!

----------------------------------------------
Table done!


PAGE DONE!!!



### 2017
This had a problem with the last table. I didn't understand the problem, so I took out that table. It's a tradeoff I'm willing to make.

In [293]:
def scrape_whole_page_2017(url):

    # Initialize the request
    r = requests.get(url)
    if r.status_code != 200:
        print('Bad request.')
        return None
    
    # Parse the html page 
    soup1 = BeautifulSoup(r.content, 'html.parser')
    tables2 = soup1.find_all('table')
    tables2 = tables2[2:5]
    

    # Get all the movie links of the page
    all_links = {}
    for t in range(len(tables2)):
        links = get_links(tables2[t])
        all_links[t] = links
    
    # Adjust all the df and add the link column
    all_df = pd.read_html(StringIO(str(tables2)))
    result_df = {}
    for n,d in enumerate(all_df):
        d.rename(columns={'Opening':'Month', 'Opening.1':'Day'}, inplace=True)
        d = d[['Month', 'Day', 'Title']].copy().dropna(thresh=3)
        d['Month'] = d['Month'].str.replace(' ', '').str.capitalize()
        d['Link'] = all_links[n]
    
        d['List'] = d['Link'].apply(get_descr)
        result_df[n] = d
        print('\nTable done!\n')

    print('\nPAGE DONE!!!\n')
    return result_df

all_years_df[2017] = scrape_whole_page_2017(r'https://en.wikipedia.org/wiki/List_of_American_films_of_2017')


----------------------------------------------------------------
Table done!

-----------------------------------------------------------
Table done!

-----------------------------------------------------
Table done!


PAGE DONE!!!



### 2018-2019

In [218]:
for year in np.arange(2018,2020):
    url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_' + str(year)
    
    all_years_df[year] = scrape_whole_page(url)

-----------------------------------------------------------
Table done!

--------------------------------------------------------
Table done!

------------------------------------------------------------------
Table done!

------------------------------------------------------------
Table done!


PAGE DONE!!!

-------------------------------------------------
Table done!

-------------------------------------------------------------------
Table done!

-----------------------------------------------------------
Table done!

-----------------------------------------------------------------------
Table done!


PAGE DONE!!!



### 2020

In [256]:
all_years_df[2020] = scrape_whole_page(r'https://en.wikipedia.org/wiki/List_of_American_films_of_2020')

----------------------------------------------------------
Table done!

-----------------------------------------------
Table done!

----------------------------------------------------------------------
Table done!

---------------------------------------------------------------------------------------------------
Table done!


PAGE DONE!!!



### 2021
This was different becase it has one tab more than the others

In [259]:
def scrape_whole_page_2021(url):

    # Initialize the request
    r = requests.get(url)
    if r.status_code != 200:
        print('Bad request.')
        return None
    
    # Parse the html page 
    soup1 = BeautifulSoup(r.content, 'html.parser')
    tables = soup1.find_all('table')
    tables = tables[3:7]

    # Get all the movie links of the page
    all_links = {}
    for t in range(len(tables)):
        links = get_links(tables[t])
        all_links[t] = links
    
    # Adjust all the df and add the link column
    all_df = pd.read_html(StringIO(str(tables)))
    result_df = {}
    for n,d in enumerate(all_df):
        d.rename(columns={'Opening':'Month', 'Opening.1':'Day'}, inplace=True)
        d = d[['Month', 'Day', 'Title']].copy().dropna(thresh=3)
        d['Month'] = d['Month'].str.replace(' ', '').str.capitalize()
        d['Link'] = all_links[n]
    
        d['List'] = d['Link'].apply(get_descr)
        result_df[n] = d
        print('\nTable done!\n')

    print('\nPAGE DONE!!!\n')
    return result_df

all_years_df[2021] = scrape_whole_page_2021(r'https://en.wikipedia.org/wiki/List_of_American_films_of_2021')

--------------------------------------------------------------------
Table done!

----------------------------------------------------------------------------------
Table done!

---------------------------------------------------------------------------------------------------------
Table done!

-------------------------------------------------------------------------------------------------
Table done!


PAGE DONE!!!



### 2022

In [295]:
all_years_df[2022] = scrape_whole_page(r'https://en.wikipedia.org/wiki/List_of_American_films_of_2022')

--------------------------------------------------------------
Table done!

-----------------------------------------------------------------------
Table done!

-------------------------------------------------------------------------------------------
Table done!

---------------------------------------------------------------------------------------
Table done!


PAGE DONE!!!



In [296]:
all_years_df.keys()

dict_keys([2015, 2016, 2018, 2019, 2020, 2021, 2017, 2022])

## Cleaning

In [321]:
all_years_df[2015].keys()

dict_keys([0, 1, 2, 3])

In [320]:
all_df = all_years_df[2015][0].iloc[0:2]
all_df = all_df.drop([0,1])
all_df = all_df._append(all_years_df[2015][0])
all_df.head()

Unnamed: 0,Month,Day,Title,Link,List
0,January,2,The Woman in Black: Angel of Death,/wiki/The_Woman_in_Black:_Angel_of_Death,"[supernatural horror, $15 million[6], $48.9 mi..."
1,January,9,Taken 3,/wiki/Taken_3,"[-language French action-thriller, $48 million..."
2,January,9,Let's Kill Ward's Wife,/wiki/Let%27s_Kill_Ward%27s_Wife,"[ black comedy, None, None]"
3,January,14,Match,/wiki/Match_(film),"[ drama, None, None]"
4,January,16,Blackhat,/wiki/Blackhat_(film),"[ action thriller, $70 million[3], $19.7 milli..."


In [323]:
all_df = all_years_df[2015][0].iloc[0:2]
all_df = all_df.drop([0,1])

for n in all_years_df.keys():
    for i in all_years_df[n].keys():
        all_years_df[n][i]['Year'] = n
        all_df = all_df._append(all_years_df[n][i])

all_df.shape

(1990, 6)

In [325]:
all_df = all_df.reset_index(drop=True)
all_df['Month'] = all_df['Month'].apply(months_to_number)
all_df.head()

Unnamed: 0,Month,Day,Title,Link,List,Year
0,1,2.0,The Woman in Black: Angel of Death,/wiki/The_Woman_in_Black:_Angel_of_Death,"[supernatural horror, $15 million[6], $48.9 mi...",2015.0
1,1,9.0,Taken 3,/wiki/Taken_3,"[-language French action-thriller, $48 million...",2015.0
2,1,9.0,Let's Kill Ward's Wife,/wiki/Let%27s_Kill_Ward%27s_Wife,"[ black comedy, None, None]",2015.0
3,1,14.0,Match,/wiki/Match_(film),"[ drama, None, None]",2015.0
4,1,16.0,Blackhat,/wiki/Blackhat_(film),"[ action thriller, $70 million[3], $19.7 milli...",2015.0


In [326]:
all_df['Genres'] = all_df['List'].str.get(0)
all_df['Budget'] = all_df['List'].str.get(1)
all_df['Box office'] = all_df['List'].str.get(2)
all_df.tail()

Unnamed: 0,Month,Day,Title,Link,List,Year,Genres,Budget,Box office
1985,12,23.0,Whitney Houston: I Wanna Dance with Somebody,/wiki/Whitney_Houston:_I_Wanna_Dance_with_Some...,"[ biographical musical drama, $45 million[1], ...",2022.0,biographical musical drama,$45 million[1],$59.8 million[2][3]
1986,12,23.0,The Pale Blue Eye,/wiki/The_Pale_Blue_Eye,"[ mystery thriller, None, None]",2022.0,mystery thriller,,
1987,12,23.0,Women Talking,/wiki/Women_Talking_(film),"[drama, None, None]",2022.0,drama,,
1988,12,29.0,A Man Called Otto,/wiki/A_Man_Called_Otto,"[ comedy-drama, $50 million, $113.2 million[1]]",2022.0,comedy-drama,$50 million,$113.2 million[1]
1989,12,30.0,"Alice, Darling","/wiki/Alice,_Darling","[psychological thriller, None, None]",2022.0,psychological thriller,,


In [331]:
mask1 = all_df['Budget'].isna()
mask2 = all_df['Box office'].isna()

mask3 = (mask1 | mask2)
all_df = all_df[~mask3]

In [333]:
mask1 = all_df['Genres'].isna()
all_df = all_df[~mask1]
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1032 entries, 0 to 1988
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Month       1032 non-null   int64  
 1   Day         1032 non-null   float64
 2   Title       1032 non-null   object 
 3   Link        1032 non-null   object 
 4   List        1032 non-null   object 
 5   Year        1032 non-null   float64
 6   Genres      1032 non-null   object 
 7   Budget      1032 non-null   object 
 8   Box office  1032 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 80.6+ KB


In [334]:
all_df.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\jup_lab\movies_profi.csv',index=False)