In [1]:
# import dependencies
from io import StringIO

import numpy as np
import pandas as pd
import requests

In [2]:
# links
url1 = 'https://www.boxofficemojo.com/year/2023/?ref_=bo_yl_table_1'
url2 = 'https://www.boxofficemojo.com/holiday/?ref_=bo_nb_yld_secondarytab'
url3 = 'https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13&ref_=bo_cso_ac'

# retrieve html for links
# A timeout (seconds) stops a stalled connection from hanging the kernel forever.
response1 = requests.get(url1, timeout=30)
response2 = requests.get(url2, timeout=30)
response3 = requests.get(url3, timeout=30)

# Stop here on an HTTP error (4xx/5xx) instead of handing an error page to the parser.
for fetched in (response1, response2, response3):
    fetched.raise_for_status()

# Read responses into tables
# read_html returns a list of DataFrames, one per <table> found in the page.
# The HTML is wrapped in StringIO because passing a literal HTML string is deprecated.
top_domestic_2023 = pd.read_html(StringIO(response1.text))
holiday_revenue_2023 = pd.read_html(StringIO(response2.text))
top_200_alltime = pd.read_html(StringIO(response3.text))

In [3]:
# Select and display tables
# Each page's parse result is a list of tables; the first one is used for every page.
top_dom_2023_df, holiday_rev_2023_df, top_200_alltime_df = (
    top_domestic_2023[0],
    holiday_revenue_2023[0],
    top_200_alltime[0],
)

# Preview the first rows of each table, in the same order as they were selected.
display(
    top_dom_2023_df.head(),
    holiday_rev_2023_df.head(),
    top_200_alltime_df.head(),
)

Unnamed: 0,Rank,Release,Genre,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated
0,1,Barbie,-,-,-,"$636,218,186",4337,"$636,218,186",Jul 21,Warner Bros.,False
1,2,The Super Mario Bros. Movie,-,-,-,"$574,934,330",4371,"$574,934,330",Apr 5,Universal Pictures,False
2,3,Spider-Man: Across the Spider-Verse,-,-,-,"$381,311,319",4332,"$381,311,319",Jun 2,Columbia Pictures,False
3,4,Guardians of the Galaxy Vol. 3,-,-,-,"$358,995,815",4450,"$358,995,815",May 5,Walt Disney Studios Motion Pictures,False
4,5,Oppenheimer,-,-,-,"$326,067,790",3761,"$326,067,790",Jul 21,Universal Pictures,False


Unnamed: 0,Holiday,Cumulative Gross,% of Year,Releases,Average,#1 Release,Genre,Budget,Running Time,Gross,% of Total
0,Post-Thanksgiving Weekend,"$97,203,446",1.1%,52,"$1,869,297",Renaissance: A Film by Beyoncé,-,-,-,"$21,801,216",22.4%
1,Thanksgiving Weekend,"$116,005,630",1.4%,54,"$2,148,252",The Hunger Games: The Ballad of Songbirds & Sn...,-,-,-,"$29,042,517",25%
2,Thanksgiving 4-Day Weekend,"$138,550,022",1.6%,37,"$3,744,595",The Hunger Games: The Ballad of Songbirds & Sn...,-,-,-,"$34,924,898",25.2%
3,Thanksgiving,"$22,728,673",0.3%,31,"$733,183",The Hunger Games: The Ballad of Songbirds & Sn...,-,-,-,"$5,882,381",25.9%
4,Thanksgiving 5-Day Weekend,"$173,203,005",2%,40,"$4,330,075",The Hunger Games: The Ballad of Songbirds & Sn...,-,-,-,"$42,206,080",24.4%


Unnamed: 0,Title,Rank,Lifetime Gross,Overall Rank,Year
0,Star Wars: Episode VII - The Force Awakens,1,"$936,662,225",1,2015
1,Avengers: Endgame,2,"$858,373,000",2,2019
2,Spider-Man: No Way Home,3,"$814,115,070",3,2021
3,Avatar,4,"$785,221,649",4,2009
4,Top Gun: Maverick,5,"$718,732,821",5,2022


### Table - Top Domestic Movies of 2023 

In [4]:
# List the column labels of the 2023 domestic table
top_dom_2023_df.columns

Index(['Rank', 'Release', 'Genre', 'Budget', 'Running Time', 'Gross',
       'Theaters', 'Total Gross', 'Release Date', 'Distributor', 'Estimated'],
      dtype='object')

In [5]:
# Count how often each value occurs in 'Estimated'
top_dom_2023_df['Estimated'].value_counts()

Estimated
False    200
Name: count, dtype: int64

In [6]:
# Keep only the columns listed below. This removes Genre, Budget, Running Time,
# Release Date, and Estimated.
top_dom_columns = ['Rank', 'Release', 'Gross', 'Theaters', 'Total Gross', 'Distributor']
top_dom_2023_df = top_dom_2023_df.loc[:, top_dom_columns]

In [7]:
# Show dtypes and non-null counts of the remaining columns
top_dom_2023_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Rank         200 non-null    int64 
 1   Release      200 non-null    object
 2   Gross        200 non-null    object
 3   Theaters     200 non-null    object
 4   Total Gross  200 non-null    object
 5   Distributor  200 non-null    object
dtypes: int64(1), object(5)
memory usage: 9.5+ KB


In [8]:
# Work on a copy so the conversions below do not modify top_dom_2023_df
top_200 = top_dom_2023_df.copy()

In [9]:
# Check which Python types the 'Total Gross' values have before converting
top_200['Total Gross'].apply(type).value_counts()

Total Gross
<class 'str'>    200
Name: count, dtype: int64

In [10]:
# Convert 'Gross' and 'Total Gross' from currency strings (e.g. '$636,218,186') to int64

# The raw strings are read from top_dom_2023_df, which is never modified, and the
# parsed values are written into top_200. Re-running this cell therefore works,
# instead of failing on `.str` once the column in top_200 is already numeric.
for gross_column in ['Gross', 'Total Gross']:
    top_200[gross_column] = (
        top_dom_2023_df[gross_column]
        # Explicit regex so '$' is not left to the version-dependent `regex` default;
        # the character class removes both the dollar sign and the thousands separators.
        .str.replace(r'[$,]', '', regex=True)
        .astype('int64')
    )

top_200[['Gross', 'Total Gross']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Gross        200 non-null    int64
 1   Total Gross  200 non-null    int64
dtypes: int64(2)
memory usage: 3.2 KB


In [11]:
# Check which Python types the 'Theaters' values have before converting
top_200['Theaters'].apply(type).value_counts()

Theaters
<class 'str'>    200
Name: count, dtype: int64

In [12]:
# Convert 'Theaters' column to float, nan values cannot be converted to integer.
# The raw strings are read from top_dom_2023_df (never modified), so this cell can be re-run.
theaters_raw = top_dom_2023_df['Theaters']

# mask() turns the '-' placeholder into NaN; to_numeric then parses the remaining
# strings and raises if any other non-numeric value is present.
top_200['Theaters'] = pd.to_numeric(theaters_raw.mask(theaters_raw == '-'))

top_200.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Rank         200 non-null    int64  
 1   Release      200 non-null    object 
 2   Gross        200 non-null    int64  
 3   Theaters     198 non-null    float64
 4   Total Gross  200 non-null    int64  
 5   Distributor  200 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.5+ KB


In [13]:
# TODO: rename the columns (not implemented yet)

In [14]:
# Export as csv
# index=True is the pandas default, spelled out here: the row index is written
# as the first column of the file.
top_200.to_csv('top_dom_200_2023.csv', index=True)

### Holidays Table

In [15]:
# List the column labels of the holiday table
holiday_rev_2023_df.columns

Index(['Holiday', 'Cumulative Gross', '% of Year', 'Releases', 'Average',
       '#1 Release', 'Genre', 'Budget', 'Running Time', 'Gross', '% of Total'],
      dtype='object')

In [16]:
# Preview two rows to see how the values are formatted
holiday_rev_2023_df.head(2)

# To convert: 'Cumulative Gross', 'Average', 'Gross' (currency strings)
# and '% of Year', '% of Total' (percent strings)

Unnamed: 0,Holiday,Cumulative Gross,% of Year,Releases,Average,#1 Release,Genre,Budget,Running Time,Gross,% of Total
0,Post-Thanksgiving Weekend,"$97,203,446",1.1%,52,"$1,869,297",Renaissance: A Film by Beyoncé,-,-,-,"$21,801,216",22.4%
1,Thanksgiving Weekend,"$116,005,630",1.4%,54,"$2,148,252",The Hunger Games: The Ballad of Songbirds & Sn...,-,-,-,"$29,042,517",25%


In [17]:
# Work on a copy so the conversions below do not modify holiday_rev_2023_df
holiday_df = holiday_rev_2023_df.copy()

In [18]:
# Convert 'Cumulative Gross', 'Average', 'Gross' from str to int

# The raw strings are read from holiday_rev_2023_df, which is never modified, and the
# parsed values are written into holiday_df. Re-running this cell therefore works,
# instead of failing on `.str` once the column in holiday_df is already numeric.
for currency_column in ['Cumulative Gross', 'Average', 'Gross']:
    holiday_df[currency_column] = (
        holiday_rev_2023_df[currency_column]
        # Explicit regex so '$' is not left to the version-dependent `regex` default;
        # the character class removes both the dollar sign and the thousands separators.
        .str.replace(r'[$,]', '', regex=True)
        .astype('int64')
    )


holiday_df[['Cumulative Gross', 'Average', 'Gross']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Cumulative Gross  21 non-null     int64
 1   Average           21 non-null     int64
 2   Gross             21 non-null     int64
dtypes: int64(3)
memory usage: 632.0 bytes


In [19]:
# Show dtypes of all columns to see which ones are still strings (object)
holiday_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Holiday           21 non-null     object
 1   Cumulative Gross  21 non-null     int64 
 2   % of Year         21 non-null     object
 3   Releases          21 non-null     int64 
 4   Average           21 non-null     int64 
 5   #1 Release        21 non-null     object
 6   Genre             21 non-null     object
 7   Budget            21 non-null     object
 8   Running Time      21 non-null     object
 9   Gross             21 non-null     int64 
 10  % of Total        21 non-null     object
dtypes: int64(4), object(7)
memory usage: 1.9+ KB


In [20]:
# Convert columns '% of Year' and '% of Total' to float

# The raw strings are read from holiday_rev_2023_df (never modified) so this cell can
# be re-run. '22.4%' becomes the fraction 0.224.
for percent_column in ['% of Year', '% of Total']:
    holiday_df[percent_column] = (
        holiday_rev_2023_df[percent_column]
        .str.replace('%', '', regex=False)
        .astype(float)
        / 100
    )


holiday_df[['% of Year', '% of Total']].head()

Unnamed: 0,% of Year,% of Total
0,0.011,0.224
1,0.014,0.25
2,0.016,0.252
3,0.003,0.259
4,0.02,0.244


In [21]:
# Remove columns without data
# Columns before the selection
display(holiday_df.columns)

# Genre, Budget and Running Time are left out of the list of columns to keep.
holiday_columns = [
    'Holiday',
    'Cumulative Gross',
    '% of Year',
    'Releases',
    'Average',
    '#1 Release',
    'Gross',
    '% of Total',
]
holiday_df = holiday_df.loc[:, holiday_columns]

# Columns after the selection
display(holiday_df.columns)

Index(['Holiday', 'Cumulative Gross', '% of Year', 'Releases', 'Average',
       '#1 Release', 'Genre', 'Budget', 'Running Time', 'Gross', '% of Total'],
      dtype='object')

Index(['Holiday', 'Cumulative Gross', '% of Year', 'Releases', 'Average',
       '#1 Release', 'Gross', '% of Total'],
      dtype='object')

In [22]:
# Pull the holiday names out of the frame as a plain Python list, in row order
holiday_list = holiday_df['Holiday'].tolist()

holiday_list

['Post-Thanksgiving Weekend',
 'Thanksgiving Weekend',
 'Thanksgiving 4-Day Weekend',
 'Thanksgiving',
 'Thanksgiving 5-Day Weekend',
 'Halloween',
 "Indigenous People's Day",
 "Indigenous People's Day Weekend",
 'Labor Day',
 'Labor Day Weekend',
 'Independence Day',
 'Independence Day Weekend',
 'Memorial Day',
 'Memorial Day Weekend',
 'Easter',
 'Easter Weekend',
 "President's Day",
 "President's Day Weekend",
 'MLK Day',
 'MLK Day Weekend',
 "New Year's Day"]

In [23]:
# Encode each holiday as a month
# The month is looked up from the holiday's name instead of its row position, so the
# encoding stays correct if the scraped table gains rows or changes their order.
# Each entry: (text the holiday name starts with, month number, month name).
HOLIDAY_MONTHS = [
    ('Post-Thanksgiving', 11, 'November'),
    ('Thanksgiving', 11, 'November'),
    ('Halloween', 10, 'October'),
    ("Indigenous People's Day", 10, 'October'),
    ('Labor Day', 9, 'September'),
    ('Independence Day', 7, 'July'),
    ('Memorial Day', 5, 'May'),
    ('Easter', 4, 'April'),
    ("President's Day", 2, 'February'),
    ('MLK Day', 1, 'January'),
    ("New Year's Day", 1, 'January'),
]


def month_for_holiday(holiday):
    """Return (month number, month name) for a holiday name.

    The name is matched against the prefixes in HOLIDAY_MONTHS, so
    'Labor Day Weekend' gives (9, 'September').

    Raises:
        ValueError: if the name matches no prefix. Failing here is deliberate:
            an unknown holiday must be added to HOLIDAY_MONTHS rather than be
            given a wrong month silently.
    """
    for prefix, month_number, month_name in HOLIDAY_MONTHS:
        if holiday.startswith(prefix):
            return month_number, month_name
    raise ValueError(
        f'No month mapping for holiday {holiday!r}; add it to HOLIDAY_MONTHS'
    )


# holiday month lists, in the same order as holiday_list
month_list = []
month_name_list = []

for holiday in holiday_list:
    month_number, month_name = month_for_holiday(holiday)
    month_list.append(month_number)
    month_name_list.append(month_name)

# Both lengths should equal the number of rows in holiday_df
display(len(month_list))
display(len(month_name_list))

# Add lists to df as columns
# assign() returns a new frame; re-running the cell simply overwrites both columns.
holiday_df = holiday_df.assign(num_month=month_list, month=month_name_list)

holiday_df.head()

21

21

Unnamed: 0,Holiday,Cumulative Gross,% of Year,Releases,Average,#1 Release,Gross,% of Total,num_month,month
0,Post-Thanksgiving Weekend,97203446,0.011,52,1869297,Renaissance: A Film by Beyoncé,21801216,0.224,11,November
1,Thanksgiving Weekend,116005630,0.014,54,2148252,The Hunger Games: The Ballad of Songbirds & Sn...,29042517,0.25,11,November
2,Thanksgiving 4-Day Weekend,138550022,0.016,37,3744595,The Hunger Games: The Ballad of Songbirds & Sn...,34924898,0.252,11,November
3,Thanksgiving,22728673,0.003,31,733183,The Hunger Games: The Ballad of Songbirds & Sn...,5882381,0.259,11,November
4,Thanksgiving 5-Day Weekend,173203005,0.02,40,4330075,The Hunger Games: The Ballad of Songbirds & Sn...,42206080,0.244,11,November


In [24]:
# List the column labels, including the two month columns added above
holiday_df.columns

Index(['Holiday', 'Cumulative Gross', '% of Year', 'Releases', 'Average',
       '#1 Release', 'Gross', '% of Total', 'num_month', 'month'],
      dtype='object')

In [25]:
# Rearrange order of columns
# Column order before
display(holiday_df.columns)

# The two month columns move directly after 'Holiday'; the other columns keep their order.
holiday_column_order = [
    'Holiday',
    'month',
    'num_month',
    'Cumulative Gross',
    '% of Year',
    'Releases',
    'Average',
    '#1 Release',
    'Gross',
    '% of Total',
]
holiday_df = holiday_df.loc[:, holiday_column_order]

# Column order after
display(holiday_df.columns)

Index(['Holiday', 'Cumulative Gross', '% of Year', 'Releases', 'Average',
       '#1 Release', 'Gross', '% of Total', 'num_month', 'month'],
      dtype='object')

Index(['Holiday', 'month', 'num_month', 'Cumulative Gross', '% of Year',
       'Releases', 'Average', '#1 Release', 'Gross', '% of Total'],
      dtype='object')

In [26]:
# TODO: rename the columns to snake_case (the rename below is still commented out)
# holiday_df.columns = ['holiday', 'month', 'num_month', 'cumulative_gross', 'percent_of_year', 'releases', 'average', 'top_release', 'gross', 'percent_of_total']

# holiday_df.columns

In [27]:
# Export to csv
# index=True is the pandas default, spelled out here: the row index is written
# as the first column of the file.
holiday_df.to_csv('holiday_revenue_2023.csv', index=True)

### Top 200 PG-13 Movies of All Time (by Lifetime Gross)

In [28]:
# Preview the first three rows of the lifetime-gross table
top_200_alltime_df.head(3)

Unnamed: 0,Title,Rank,Lifetime Gross,Overall Rank,Year
0,Star Wars: Episode VII - The Force Awakens,1,"$936,662,225",1,2015
1,Avengers: Endgame,2,"$858,373,000",2,2019
2,Spider-Man: No Way Home,3,"$814,115,070",3,2021


In [29]:
# Show dtypes and non-null counts to see which columns need converting
top_200_alltime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Title           200 non-null    object
 1   Rank            200 non-null    int64 
 2   Lifetime Gross  200 non-null    object
 3   Overall Rank    200 non-null    int64 
 4   Year            200 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 7.9+ KB


In [30]:
# Work on a copy so the conversion below does not modify top_200_alltime_df
alltime_df = top_200_alltime_df.copy()

In [31]:
# Convert 'Lifetime Gross' column to integer type

# The raw strings are read from top_200_alltime_df, which is never modified, and the
# parsed values are written into alltime_df. Re-running this cell therefore works,
# instead of failing on `.str` once the column in alltime_df is already numeric.
alltime_df['Lifetime Gross'] = (
    top_200_alltime_df['Lifetime Gross']
    # Explicit regex so '$' is not left to the version-dependent `regex` default;
    # the character class removes both the dollar sign and the thousands separators.
    .str.replace(r'[$,]', '', regex=True)
    .astype('int64')
)

alltime_df.head(3)

Unnamed: 0,Title,Rank,Lifetime Gross,Overall Rank,Year
0,Star Wars: Episode VII - The Force Awakens,1,936662225,1,2015
1,Avengers: Endgame,2,858373000,2,2019
2,Spider-Man: No Way Home,3,814115070,3,2021


In [32]:
# Export to csv
# index=True is the pandas default, spelled out here: the row index is written
# as the first column of the file.
alltime_df.to_csv('top_200_alltime.csv', index=True)