In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
from io import StringIO
import re

### Get the links

In [53]:
'''
This function finds all links of all the movies in a table and create a list
'''

def get_links(table, title_pos):
    links = []
    for i in range(len(table.find_all('tr')) -1):
        i += 1
        try:
            messy = table.find_all('tr')[i].find_all('td')[title_pos]
            links.append(messy.find('a',href=True)['href'])
        except:
            links.append(None)
    
    return links

### Get the description column

In [4]:
'''
This function finds the gross profit and budget for the movie
'''

def get_profit(soup):
    try:
        table = soup.find_all(class_="infobox vevent")[0]
        budget = table.find_all('tr')[-2]
        proof = budget.find_all(class_ ='infobox-label')[0].get_text()
    except:
        return None, None
    if proof == 'Budget':
        budget = budget.find_all(class_ ='infobox-data')[0].get_text(strip=True)
        box_office = table.find_all('tr')[-1]
        box_office = box_office.find_all(class_ ='infobox-data')[0].get_text(strip=True)
        return budget, box_office

    else:
        return None, None

In [69]:
'''
This function finds the genre of the movie and calls
the get_profit so to find also the gross profit and budget for the movie
'''

pattern3 = r'\d{4} [A-Z]\w+([^\d\.]+) film[\s\[]'
pattern4 = r'\d{4} ([^\d\.]+) film[\s\[]'

def get_descr(x, session):

    try:
        x = r'https://en.wikipedia.org' + x
        r1 = session.get(x)
    except:
        return [None, None, None]

    if r1.status_code==200:
        r2 = r1
        r1.close()
        soup1_2 = BeautifulSoup(r2.content, 'html.parser')

        budget, box_office = get_profit(soup1_2)

        first_line = soup1_2.find_all('p')[0].get_text(strip=False)
        print('-', end='')
        if len(first_line) < 75:
            first_line = soup1_2.find_all('p')[1].get_text(strip=False)
        result = re.findall(pattern3, first_line)

        if len(result) > 0:
            return [result[0],budget, box_office]
        
        else:
            result = re.findall(pattern4, first_line)
            if len(result) > 0:
                return [result[0],budget, box_office]
            else:
                return [None,budget, box_office]
    else:
        return [None,budget, box_office]

### Convert the months to numbers

In [6]:
def months_to_number(x):
    return months[x]

months = {'January':1, 'February':2, 'March':3,
          'April':4, 'May':5, 'June':6, 'July':7,
          'August':8,'September':9, 'October':10,
          'November':11, 'December':12}

## Find the position of 'Title'

In [49]:
def get_title_pos(tables):
    length = len(tables.select('tr')[0].select('th'))

    for n,x in enumerate(tables.select('tr')[0].select('th')):
        if re.findall('(\w+)', x.get_text())[0] == 'Title':

            return n - length

## Scrape the whole page

In [92]:
def scrape_whole_page(url):

    # Initialize the request
    session = requests.Session()
    r = session.get(url)
    if r.status_code != 200:
        print('Bad request.')
        return None
    
    # Parse the html page 
    soup1 = BeautifulSoup(r.content, 'html.parser')
    tables = soup1.select(".wikitable.sortable")

    title_pos = get_title_pos(tables[0])

    # Get all the movie links of the page
    all_links = {}
    for t in range(len(tables)):
        links = get_links(tables[t], title_pos)
        all_links[t] = links
    
    # Combine all tables and add the Link & List column
    df_list = []
    for n,t in enumerate(tables):
        try:
            df = pd.read_html(StringIO(str(t)))[0]
            df.rename(columns={'Opening':'Month', 'Opening.1':'Day'}, inplace=True)
            df = df[['Month', 'Day', 'Title']].copy().dropna(thresh=3)

            df['Month'] = df['Month'].str.replace(' ', '').str.capitalize()
            df['Link'] = all_links[n]

            df['List'] = df['Link'].apply(get_descr, session=session)

            df_list.append(df)
            print('0\n')
        except Exception as e:
            print(e)
            pass

    df = pd.concat(df_list, axis=0,ignore_index=True)

    return df

## Scraping

In [90]:
years_df = []

for year in np.arange(2004,2011):
    url = r'https://en.wikipedia.org/wiki/List_of_American_films_of_' + str(year)
    df = scrape_whole_page(url)
    df['Year'] = year
    years_df.append(df)

--------------------------------------------0

-----------------------cannot access local variable 'budget' where it is not associated with a value
--------------------------------------------------0

------------------------------------------------------------0

PAGE DONE!!!

------------------------------------0

--------------------------------------------0

----------------------------------------------------0

-------------------------------------------------------------0

PAGE DONE!!!

-----------------------------------------------0

------------------------------------------0

-------------------------------------------------------0

---------------------------------------------------------------------0

PAGE DONE!!!

-----------------------------------------------------0

-------------------------------------------------------------------0

----------------------------------------------------------------------0

-----------------------------------------------------------------

In [93]:
years_df2 = []

for year in np.arange(2011,2023):
    url = r'https://en.wikipedia.org/wiki/List_of_American_films_of_' + str(year)
    df = scrape_whole_page(url)
    df['Year'] = year
    years_df2.append(df)
    print(str(year),'PAGE DONE!!!\n')

-------------------------------------------0

------------------------------------------------------0

----------------------------------------------------------0

-------------------------------------------------------0

2011 PAGE DONE!!!

----------------------------------------------------------------0

------------------------------------------------------------------------0

----------------------------------------------------------------0

------------------------------------------------------------------------0

2012 PAGE DONE!!!

----------------------------------------------------------0

--------------------------------------------------------------0

-------------------------------------------------------------0

-----------------------------------------------0

2013 PAGE DONE!!!

----------------------------------------------------------0

----------------------------------------------------0

-----------------------------------------------------------0

-------------------

In [94]:
url = r'https://en.wikipedia.org/wiki/List_of_American_films_of_' + str(2023)
df = scrape_whole_page(url)
df['Year'] = 2023
years_df2.append(df)
print(str(year),'PAGE DONE!!!\n')

----------------------------------------------------------------------0

-------------------------------------------------------------------------------0

----------------------------------------------------------------------------0

--------------------------------------------------------------------------------------------------0

2022 PAGE DONE!!!



### Combining all the datasets

In [97]:
df_final = pd.concat(years_df + years_df2, axis=0,ignore_index=True)
df_final.head()

Unnamed: 0,Month,Day,Title,Link,List,Year
0,January,9.0,Chasing Liberty,/wiki/Chasing_Liberty,"[romantic comedy, $23 million[1], $12 million[1]]",2004
1,January,9.0,My Baby's Daddy,/wiki/My_Baby%27s_Daddy,"[None, $12 million, $18.5 million]",2004
2,January,16.0,Along Came Polly,/wiki/Along_Came_Polly,"[ romantic comedy, $42 million, $178.3 million...",2004
3,January,16.0,The Goodbye Girl,/wiki/The_Goodbye_Girl_(2004_film),"[ television comedy, None, None]",2004
4,January,16.0,Pixel Perfect,/wiki/Pixel_Perfect,"[None, None, None]",2004


In [98]:
df_final.tail()

Unnamed: 0,Month,Day,Title,Link,List,Year
4931,December,22.0,Memory,/wiki/Memory_(2023_film),"[drama, None, None]",2023
4932,December,25.0,The Color Purple,/wiki/The_Color_Purple_(2023_film),"[ coming-of-age musical period drama, $90–100 ...",2023
4933,December,25.0,The Boys in the Boat,/wiki/The_Boys_in_the_Boat_(film),"[ biographical sports drama, $40 million[4], $...",2023
4934,December,25.0,Ferrari,/wiki/Ferrari_(2023_film),"[ biographical sports drama, $95 million[4], $...",2023
4935,December,29.0,Good Grief,/wiki/Good_Grief_(film),[ comedy-drama film written and directed by Da...,2023


## Basic Cleaning

In [99]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4936 entries, 0 to 4935
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Month   4936 non-null   object 
 1   Day     4936 non-null   float64
 2   Title   4936 non-null   object 
 3   Link    4882 non-null   object 
 4   List    4936 non-null   object 
 5   Year    4936 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 231.5+ KB


In [100]:
df_final = df_final.reset_index(drop=True)
df_final['Month'] = df_final['Month'].apply(months_to_number)

In [103]:
import warnings

warnings.filterwarnings('ignore')

In [104]:
df_final = df_final[['Year','Month','Day','Title','Link','List']]
df_final['Genres'] = df_final['List'].str.get(0)
df_final['Budget'] = df_final['List'].str.get(1)
df_final['Box office'] = df_final['List'].str.get(2)
df_final.tail()

Unnamed: 0,Year,Month,Day,Title,Link,List,Genres,Budget,Box office
4931,2023,12,22.0,Memory,/wiki/Memory_(2023_film),"[drama, None, None]",drama,,
4932,2023,12,25.0,The Color Purple,/wiki/The_Color_Purple_(2023_film),"[ coming-of-age musical period drama, $90–100 ...",coming-of-age musical period drama,$90–100 million[2][3],$47.2 million[4][5]
4933,2023,12,25.0,The Boys in the Boat,/wiki/The_Boys_in_the_Boat_(film),"[ biographical sports drama, $40 million[4], $...",biographical sports drama,$40 million[4],$24.8 million[5][6]
4934,2023,12,25.0,Ferrari,/wiki/Ferrari_(2023_film),"[ biographical sports drama, $95 million[4], $...",biographical sports drama,$95 million[4],$14.7 million[5]
4935,2023,12,29.0,Good Grief,/wiki/Good_Grief_(film),[ comedy-drama film written and directed by Da...,comedy-drama film written and directed by Dan...,,


In [105]:
mask1 = df_final['Budget'].isna()
mask2 = df_final['Box office'].isna()

mask3 = (mask1 | mask2)
df_final2 = df_final[~mask3]

In [106]:
mask1 = df_final2['Genres'].isna()
df_final2 = df_final2[~mask1]
df_final2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2938 entries, 0 to 4934
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Year        2938 non-null   int64  
 1   Month       2938 non-null   int64  
 2   Day         2938 non-null   float64
 3   Title       2938 non-null   object 
 4   Link        2938 non-null   object 
 5   List        2938 non-null   object 
 6   Genres      2938 non-null   object 
 7   Budget      2938 non-null   object 
 8   Box office  2938 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 229.5+ KB


In [107]:
df_final2.to_csv(r'movies_2004_2023.csv',index=False)