In [263]:
from bs4 import BeautifulSoup as bsoup
import urllib.robotparser
import requests
import time
import pandas as pd
import re

In [265]:
# Daily revenue for the movie industry: the page lists, per day, the top grossing
# movie with its associated revenue.
# Test run: fetch the January 2019 page and print the cell texts of every table row.

url = "https://www.boxofficemojo.com/daily/2019/?interval=january&sort=date&sortDir=asc&view=month&ref_=bo_di__resort#table"
# requests has no default timeout; without one a stalled connection hangs the kernel
page = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of trying to parse an error page
page.raise_for_status()
soup = bsoup(page.text, 'lxml')

all_divs = soup.main.find_all("div", id="table")
all_tables = all_divs[0].find_all("table")
all_trs = all_tables[0].find_all("tr")

# the first <tr> is skipped (presumably the header row); start=1 keeps the printed
# row numbers identical to the row's position in the table
for i, tr in enumerate(all_trs[1:], start=1):
    print(f"Entry row {i} = ")
    entry = [td.text for td in tr.find_all("td")]
    print(entry)
    

Entry row 1 = 
["Jan 1, 2019New Year's Day", 'Tuesday', '1', '$52,588,390', '-0.3%', '+23.3%', '53', 'Aquaman', '$16,377,779']
Entry row 2 = 
['Jan 2, 2019', 'Wednesday', '2', '$25,043,176', '-52.4%', '-54.9%', '53', 'Aquaman', '$7,379,476']
Entry row 3 = 
['Jan 3, 2019', 'Thursday', '3', '$22,001,708', '-12.1%', '-57.4%', '55', 'Aquaman', '$6,203,801']
Entry row 4 = 
['Jan 4, 2019', 'Friday', '4', '$38,870,140', '+76.7%', '-31.4%', '52', 'Aquaman', '$9,388,082']
Entry row 5 = 
['Jan 5, 2019', 'Saturday', '5', '$48,843,884', '+25.7%', '-16%', '52', 'Aquaman', '$13,053,690']
Entry row 6 = 
['Jan 6, 2019', 'Sunday', '6', '$31,293,715', '-35.9%', '-38.6%', '52', 'Aquaman', '$8,561,508']
Entry row 7 = 
['Jan 7, 2019', 'Monday', '7', '$9,979,794', '-68.1%', '-72.5%', '48', 'Aquaman', '$2,583,295']
Entry row 8 = 
['Jan 8, 2019', 'Tuesday', '8', '$14,364,595', '+43.9%', '-72.7%', '50', 'Aquaman', '$3,852,619']
Entry row 9 = 
['Jan 9, 2019', 'Wednesday', '9', '$9,403,436', '-34.5%', '-62.5%', 

In [266]:
# Daily revenue for the movie industry: the page lists, per day, the top grossing
# movie with its associated revenue.
# Runs through every month of the years listed below on boxofficemojo.com and
# gathers the revenue rows into a single data frame `df` (one row per table row).


months = ["january","february","march","april","may","june","july","august","september", "october", "november", "december"]
years = ["2017","2018","2019","2020"]

column_names = ["date_full","weekday","day_number","top_10_gross_total","ytd_delta",
                "lw_delta","releases","top_release","top_gross"]

print("running")

# Collect plain lists of cell texts and build the frame once at the end;
# concatenating a frame per month re-copies all earlier rows on every iteration.
all_rows = []

for year in years:
    for month in months:
        url = (
            "https://www.boxofficemojo.com/daily/"
            + year
            + "/?interval="
            + month
            + "&sort=date&sortDir=asc&view=month&ref_=bo_di__resort#table"
        )
        # requests has no default timeout; without one a stalled connection hangs the kernel
        page = requests.get(url, timeout=30)
        # fail loudly on an HTTP error instead of trying to parse an error page
        page.raise_for_status()
        soup = bsoup(page.text, 'lxml')

        all_divs = soup.main.find_all("div", id="table")
        all_tables = all_divs[0].find_all("table")
        all_trs = all_tables[0].find_all("tr")

        # the first <tr> is skipped (presumably the header row)
        for tr in all_trs[1:]:
            all_rows.append([td.text for td in tr.find_all("td")])

        # short pause between requests
        time.sleep(0.1)

    print(f"{year} processed...")

# Building from one list yields a fresh 0..n-1 index directly, so no reset_index is needed.
df = pd.DataFrame(all_rows, columns=column_names).drop(columns=["ytd_delta", "lw_delta"])
print("complete")

running
2017 processed...
2018 processed...
2019 processed...
2020 processed...
complete


In [267]:
# preview the first ten scraped rows; a bare last expression lets the notebook render the frame
df.head(n=10)

Unnamed: 0,date_full,weekday,day_number,top_10_gross_total,releases,top_release,top_gross
0,"Jan 1, 2017New Year's Day",Sunday,1,"$56,752,913",52,Rogue One: A Star Wars Story,"$16,751,857"
1,"Jan 2, 2017",Monday,2,"$54,065,844",54,Rogue One: A Star Wars Story,"$15,913,674"
2,"Jan 3, 2017",Tuesday,3,"$24,799,823",48,Rogue One: A Star Wars Story,"$6,268,921"
3,"Jan 4, 2017",Wednesday,4,"$16,695,754",49,Rogue One: A Star Wars Story,"$4,237,535"
4,"Jan 5, 2017",Thursday,5,"$14,928,109",50,Rogue One: A Star Wars Story,"$3,893,517"
5,"Jan 6, 2017",Friday,6,"$35,952,754",46,Hidden Figures,"$7,621,334"
6,"Jan 7, 2017",Saturday,7,"$50,026,406",44,Rogue One: A Star Wars Story,"$9,558,458"
7,"Jan 8, 2017",Sunday,8,"$34,512,473",45,Rogue One: A Star Wars Story,"$6,420,138"
8,"Jan 9, 2017",Monday,9,"$10,419,513",46,Hidden Figures,"$1,857,416"
9,"Jan 10, 2017",Tuesday,10,"$15,267,153",44,Hidden Figures,"$2,877,410"


In [268]:
# cleaning up the revenue values: remove the "$" sign and the "," thousands separators
# (the values remain strings).
# regex=False forces literal matching: "$" is a regex end-of-string anchor, and the
# default of str.replace's `regex` argument differs between pandas versions.

for revenue_col in ('top_10_gross_total', 'top_gross'):
    df[revenue_col] = (
        df[revenue_col]
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
    )

df.head()

Unnamed: 0,date_full,weekday,day_number,top_10_gross_total,releases,top_release,top_gross
0,"Jan 1, 2017New Year's Day",Sunday,1,56752913,52,Rogue One: A Star Wars Story,16751857
1,"Jan 2, 2017",Monday,2,54065844,54,Rogue One: A Star Wars Story,15913674
2,"Jan 3, 2017",Tuesday,3,24799823,48,Rogue One: A Star Wars Story,6268921
3,"Jan 4, 2017",Wednesday,4,16695754,49,Rogue One: A Star Wars Story,4237535
4,"Jan 5, 2017",Thursday,5,14928109,50,Rogue One: A Star Wars Story,3893517


In [269]:
# cleaning up the date column: build a helper frame with month, day, year, and
# event (the text that follows the year, e.g. a holiday name; empty when absent)

dates = df[['date_full']]

# Split each value around its first 4-digit run: [text before, the 4 digits, text after].
# - raw string: "\d" is an invalid escape sequence in a normal string literal
# - n=1: later digit runs in the trailing text cannot create extra columns
date_parts = dates['date_full'].str.split(r'(\d{4})', n=1, expand=True)

holiday = pd.DataFrame({
    # e.g. "Jan 1, " -> "Jan 1"; strip() removes the trailing space that would
    # otherwise end up in `day` as "1 "
    "date": date_parts[0].str.replace(",", "", regex=False).str.strip(),
    "year": date_parts[1],
    "event": date_parts[2],
})
holiday[["month", "day"]] = holiday["date"].str.split(' ', n=1, expand=True)
final = holiday[['month', 'day', 'year', 'event']]

final.head()

Unnamed: 0,month,day,year,event
0,Jan,1,2017,New Year's Day
1,Jan,2,2017,
2,Jan,3,2017,
3,Jan,4,2017,
4,Jan,5,2017,


In [270]:
# copy the parsed date parts from `final` onto the main frame, one column at a time,
# then remove the raw date string column in place

for part in ['month', 'day', 'year', 'event']:
    df[part] = final[part]
del df["date_full"]

In [271]:
# write the cleaned frame to disk (row index included as the first CSV column),
# then preview the first five rows
df.to_csv("movie_data_v1.csv", index=True)
df.head(5)

Unnamed: 0,weekday,day_number,top_10_gross_total,releases,top_release,top_gross,month,day,year,event
0,Sunday,1,56752913,52,Rogue One: A Star Wars Story,16751857,Jan,1,2017,New Year's Day
1,Monday,2,54065844,54,Rogue One: A Star Wars Story,15913674,Jan,2,2017,
2,Tuesday,3,24799823,48,Rogue One: A Star Wars Story,6268921,Jan,3,2017,
3,Wednesday,4,16695754,49,Rogue One: A Star Wars Story,4237535,Jan,4,2017,
4,Thursday,5,14928109,50,Rogue One: A Star Wars Story,3893517,Jan,5,2017,
