In [2]:
from bs4 import BeautifulSoup as bsoup
import urllib.robotparser
import requests
import time
import pandas as pd
import numpy as np
import re
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [None]:
# DATA COLLECTION

## MOVIE
# First scrape daily box office info for all movies in theaters from boxofficemojo.com

# Build date for URL and make requests for each day from 2015-2019.
# Both lookup tables map a zero-padded month string ('01'..'12') to the list of
# day numbers in that month; they differ only in February.
_month_lengths = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
month_and_days = {
    f'{month_no:02d}': list(range(1, length + 1))
    for month_no, length in enumerate(_month_lengths, start=1)
}

# Years for which the 29-day February table is used.
leap_years = [2008, 2016, 2020]
leap_year_month_days = {
    month: list(range(1, 30)) if month == '02' else list(days)
    for month, days in month_and_days.items()
}

# Accumulates one list per scraped table row.
all_data = []

# Scrape the daily chart for every calendar day of 2015-2019 and append one
# list per table row to all_data.
for year in range(2015, 2020):
    # build url for each year-month-day and scrape table data
    url = 'https://www.boxofficemojo.com/date/'

    m_d = leap_year_month_days if year in leap_years else month_and_days
    y = str(year)

    for month in m_d:
        for day in m_d[month]:
            date = f'{y}-{month}-{day:02d}'
            url_curr = url + date

            # get data
            page = requests.get(url_curr)
            soup = bsoup(page.text, 'lxml')

            # Drill down <main> -> div#table -> table -> tr one level at a time.
            # If any level is missing, report the date and skip the day;
            # otherwise the rows left over from the previously scraped day
            # would be appended again under this date.
            main = soup.main
            all_divs = main.find_all("div", id="table") if main is not None else []
            all_tables = all_divs[0].find_all("table") if all_divs else []
            if not all_tables:
                print(date)
                continue
            all_trs = all_tables[0].find_all("tr")

            # The first <tr> is skipped (presumably the header row).
            for tr in all_trs[1:]:
                all_tds = tr.find_all("td")
                a_href = all_tds[2].find('a', href=True)
                a_href = 'http://boxofficemojo.com' + a_href['href']

                # Text of the first 11 cells, then the movie link and the date.
                entry = [td.text for td in all_tds][0:11]
                entry.append(a_href)
                entry.append(date)

                # append row data to all_data list
                all_data.append(entry)
    print(f'{year} processed...')

In [None]:
# Check length: number of row entries collected in all_data
len(all_data)

In [None]:
# Create data frame from all_data and save to pickle.
# The 13 names line up with each scraped entry: the first 11 cell texts,
# then the movie link ('href') and the scrape date ('Date').
column_names = ['TD', 'YD', 'Movie', 'Daily', '%YD', '%LW', 'Theaters', 'Avg', 'To_Date', 
                'Days', 'Distributor', 'href','Date']
df = pd.DataFrame(all_data, columns=column_names)
df.to_pickle('boxofficemojo-daily_gross.pkl')
df

In [None]:
# read data back in from the pickle written above, replacing df
df = pd.read_pickle('boxofficemojo-daily_gross.pkl')

In [None]:
# Make a list of unique titles.
# Going through set() removes duplicates; the resulting order is not the row order.
titles_unique = list(set(df['Movie']))
len(titles_unique)

In [None]:
# Go to each movie page to scrape genre data and add to dataframe.
# genre_dict maps each title to the list of whitespace-separated genre tokens
# found on its page (an empty list when none are found).
genre_dict = {}

# One href per title (the first one in row order), looked up once instead of
# re-filtering the whole frame for every title.
first_href = df.drop_duplicates(subset='Movie').set_index('Movie')['href']

for m in titles_unique:
    h = first_href.loc[m]
    movie_page = requests.get(h)
    movie_soup = bsoup(movie_page.text, 'lxml')

    # Start empty for every movie so that a page without a "Genres" entry (or
    # without the summary block at all) does not inherit the previous movie's
    # genres.
    genres = ''
    table_div = movie_soup.find('div', {'class': 'mojo-summary-values'})
    sub_divs = table_div.find_all('div') if table_div is not None else []

    for s in sub_divs:
        if re.search('^Genres', s.text):
            # Drop the label and collapse whitespace runs to single spaces.
            genres = re.sub('Genres', '', s.text)
            genres = re.sub(r'\s+', ' ', genres)
    genre_dict[m] = genres.split()

genre_dict

In [None]:
# For each movie in the dictionary, unpack genres into string and add to genre column for that film.
# Every genre is followed by '|', e.g. ['Action', 'Drama'] -> 'Action|Drama|'.
genre_strings = {
    title: ''.join(g + '|' for g in title_genres)
    for title, title_genres in genre_dict.items()
}

# A single lookup over the Movie column replaces one boolean-mask assignment
# per title; titles that are missing from genre_dict get NaN.
df['genre'] = df['Movie'].map(genre_strings)

df.genre.sample(50)

In [None]:
# Split the genres into their own columns with True/False values
genre_lists = df.genre.str.split('|')

# Collect the distinct tokens with a set; summing the per-row lists would
# concatenate them pairwise, which gets very slow on a large frame.
unique_genre = set()
for tokens in genre_lists:
    unique_genre.update(tokens)
# A trailing '|' in the genre string produces an empty token; drop it so no
# column named '' is created.
unique_genre.discard('')

# Sorted so the new columns are added in the same order on every run.
for g in sorted(unique_genre):
    df[g] = genre_lists.map(lambda tokens, g=g: g in tokens)

df.sample(10)

In [None]:
# Replace non-numeric values for numeric data columns.
# `regex` is passed explicitly on every call because the default of
# Series.str.replace is not the same across pandas versions.


def _clean_percent(col):
    """Strip formatting from a percent-change column of strings.

    Removes '+', '%' and ',' and turns '<0.1' into '0'. A cell that is only
    '-' becomes an empty string, while a leading minus on a number is kept so
    decreases stay negative once the column is converted to numeric.
    """
    col = col.str.replace(r'[+%,]', '', regex=True)
    col = col.str.replace('<0.1', '0', regex=False)
    return col.str.replace(r'^-$', '', regex=True)


# Columns where everything except digits is dropped.
for name in ['Daily', 'To_Date', 'Theaters', 'Avg', 'Days']:
    df[name] = df[name].str.replace('[^0-9]', '', regex=True)

for name in ['%YD', '%LW']:
    df[name] = _clean_percent(df[name])

# Remove newline characters from the distributor text.
df['Distributor'] = df['Distributor'].str.replace('\n', '', regex=False)
df.head()

In [None]:
# Update data types: the cleaned string columns become numeric, Date becomes datetime.
for numeric_col in ('Daily', '%YD', '%LW', 'Theaters', 'Avg', 'To_Date', 'Days'):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

df['Date'] = pd.to_datetime(df['Date'])
df.dtypes

In [None]:
# Show 10 randomly sampled rows of df
df.sample(10)

In [None]:
# Send data to pickle; the "Merge Datasets" cell reads this file back in
df.to_pickle('clean-boxofficemojo-daily_gross.pkl')

In [None]:
## LABOR STATISTICS

# Scrape data from the US Bureau of Labor Statistics
# Consumer prices increase 5.0 percent for the year ended May 2021
url = "https://www.bls.gov/opub/ted/2021/consumer-prices-increase-5-0-percent-for-the-year-ended-may-2021.htm"
# Download the article page and parse it with the lxml parser; `page` and
# `soup` replace the values left over from the movie scrape.
page = requests.get(url)
soup = bsoup(page.text, 'lxml')

In [None]:
# Turn the first table with class "regular" into a data frame and save it.
all_tables = soup.find_all("table", class_="regular")
first_table = all_tables[0]

# Column names are the newline-separated pieces of the <thead> text.
headers = first_table.find('thead').text.strip().split("\n")

tbody = first_table.find('tbody')
entries = tbody.find_all('tr')

# One list per body row: the row text split on newlines, minus its second field.
row_fields = (row.text.strip().split("\n") for row in entries)
all_rows = [fields[:1] + fields[2:] for fields in row_fields]

all_dat = pd.DataFrame(all_rows, columns=headers)
all_dat['Month'] = pd.to_datetime(all_dat['Month'])
all_dat = all_dat.replace('%', '', regex=True)

# Every column after the first is converted to a numeric dtype.
heads = all_dat.columns[1:]
for value_col in heads:
    all_dat[value_col] = pd.to_numeric(all_dat[value_col])

all_dat.to_pickle("final-clean-consumer_prices.pkl")
all_dat.to_csv("final-clean-consumer_prices.csv")
all_dat

In [None]:
# Real average weekly earnings down 2.2 percent from May 2020 to May 2021
url = "https://www.bls.gov/opub/ted/2021/real-average-weekly-earnings-down-2-2-percent-from-may-2020-to-may-2021.htm"
# Download and parse the page; `page` and `soup` are overwritten with this article.
page = requests.get(url)
soup = bsoup(page.text, 'lxml')

In [None]:
# Turn the first table with class "regular" into a data frame and save it.
all_tables = soup.find_all("table", class_="regular")
first_table = all_tables[0]

# Column names: "Month" followed by the newline-separated pieces of the second
# header row.
headers = first_table.find('thead')
header_rows = headers.find_all('tr')
head = ["Month"] + header_rows[1].text.strip().split("\n")

tbody = first_table.find('tbody')
entries = tbody.find_all('tr')

# One list per body row: the row text split on newlines, minus its second field.
all_rows = []
for row in entries:
    fields = row.text.strip().split("\n")
    all_rows.append(fields[:1] + fields[2:])

all_dat = pd.DataFrame(all_rows, columns=head)
all_dat['Month'] = pd.to_datetime(all_dat['Month'])
# Raw string: '\$' in a normal string literal is an invalid escape sequence.
all_dat = all_dat.replace(r'\$', '', regex=True)
all_dat = all_dat.replace('%', '', regex=True)

# Every column after the first is converted to a numeric dtype.
heads = all_dat.columns[1:]
for item in heads:
    all_dat[item] = pd.to_numeric(all_dat[item])

all_dat.to_pickle("final-clean-weekly_earnings.pkl")
all_dat.to_csv("final-clean-weekly_earnings.csv")
all_dat

In [None]:
## Merge Datasets

# Read with pandas, as the other cells do; the `pickle` module is not imported
# in this notebook's import cell.
dat = pd.read_pickle('clean-boxofficemojo-daily_gross.pkl')

# Genre indicator columns carried into the merged data set.
genre_cols = ['Music',
       'Short', 'Family', 'Horror', 'Crime', 'Documentary', 'News', 'Musical',
       'History', 'Western', 'War', 'Comedy', 'Thriller', 'Action', 'Romance',
       'Biography', 'Sport', 'Sci-Fi', 'Drama', 'Fantasy', 'Animation',
       'Mystery', 'Adventure', 'Film-Noir', 'Adult']

# .copy() gives an independent frame, so the assignments below do not write
# into a slice of `dat`.
df = dat[['Date', 'Movie', 'Daily', 'Theaters', 'Avg',
          'To_Date', 'Days', 'Distributor', 'genre'] + genre_cols].dropna().copy()
col = df.columns.tolist()

# multiply genre by daily to display total revenue per genre:
# each True/False indicator becomes that day's gross or 0.
for name in genre_cols:
    df[name] = df[name] * df['Daily']

# This is the main raw movie DF
df.head()

In [None]:
# create month column and group movie data by month
def _year_month(stamp):
    """Format a timestamp as 'YYYY-MM'."""
    return stamp.strftime('%Y-%m')


df['Month'] = df['Date'].apply(_year_month)

# Genre columns to total per month, in output column order.
monthly_genres = ['Music', 'Short', 'Family', 'Horror', 'Crime', 'Documentary',
                  'News', 'Musical', 'History', 'Western', 'War', 'Comedy',
                  'Thriller', 'Action', 'Romance', 'Biography', 'Sport',
                  'Sci-Fi', 'Drama', 'Fantasy', 'Animation', 'Mystery',
                  'Adventure', 'Film-Noir', 'Adult']

# df is replaced by the per-month sums, rounded to 2 decimals and indexed by Month.
monthly_sums = df.groupby(['Month']).agg({name: 'sum' for name in monthly_genres})
df = monthly_sums.round(2)
df.sample(10)

In [None]:
# create new DF genre from df with Month moved out of the index,
# and make month date time
genre = df.reset_index().assign(Month=lambda frame: pd.to_datetime(frame['Month']))

# read and display economic data
cp = pd.read_pickle('final-clean-consumer_prices.pkl')
we = pd.read_pickle('final-clean-weekly_earnings.pkl')
for frame in (genre, cp, we):
    print(frame.columns)
genre.dtypes

In [None]:
# merge all movie and economic data, save as pkl.
# Both joins are left joins on Month, so every row of `genre` is kept.
alldat = (
    genre
    .merge(cp, how='left', left_on='Month', right_on='Month')
    .merge(we, how='left', left_on='Month', right_on='Month')
)

alldat.to_pickle('alldat.pkl')

In [4]:
# reload the alldat pickle and store in df.
# Read with pandas, as the other cells do; the `pickle` module is not imported
# in this notebook's import cell.
df = pd.read_pickle('alldat.pkl')

df

FileNotFoundError: [Errno 2] No such file or directory: './alldat.pkl'

In [8]:
# Load the merged data set written by the merge cell and show 10 random rows
df = pd.read_pickle('alldat.pkl')
df.sample(10)

FileNotFoundError: [Errno 2] No such file or directory: '../analysis_AL/alldat.pkl'

In [None]:
# create new data frame with only movie data to show the monthly revenue trends over time
genre_cols = ['Music', 'Short', 'Family', 'Horror', 'Crime', 'Documentary',
       'News', 'Musical', 'History', 'Western', 'War', 'Comedy', 'Thriller',
       'Action', 'Romance', 'Biography', 'Sport', 'Sci-Fi', 'Drama', 'Fantasy',
       'Animation', 'Mystery', 'Adventure', 'Film-Noir', 'Adult']

cf = df[['Month'] + genre_cols]

# melt data so the genre is categorical variable and revenue is the value
cf = pd.melt(cf, id_vars=['Month'], value_vars=genre_cols,
             var_name='Genre', value_name='Revenue')

# NOTE(review): `sns` (seaborn) is not imported in the import cell of this
# notebook; it needs `import seaborn as sns` for this cell to run on a fresh kernel.
# Only `height` is passed: `size` is the deprecated name of the same setting and
# should not be given alongside it (where it was still accepted it took
# precedence, hence 3).
g = sns.FacetGrid(cf, col="Genre", col_wrap=5, height=3)
g.map(sns.lineplot, "Month", "Revenue")

# One tick for every 12th distinct month. The melted frame repeats each month
# once per genre, so striding over its rows would repeat tick positions.
g.set(xticks=cf['Month'].drop_duplicates().iloc[::12])