In [16]:
from bs4 import BeautifulSoup as bsoup
import urllib.robotparser
import requests
import time
import pandas as pd
import re
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [7]:
# daily revenue for movie industry, contains data for the top grossing movie by day with associated revenue
# test run: goes to url and pulls all data for 1 month

url = "https://www.boxofficemojo.com/daily/2019/?interval=january&sort=date&sortDir=asc&view=month&ref_=bo_di__resort#table"
page = requests.get(url)
soup = bsoup(page.text, 'lxml')

all_divs = soup.main.find_all("div", id="table")
all_tables = all_divs[0].find_all("table")
all_trs = all_tables[0].find_all("tr")

for i in range(1, len(all_trs)):
    print(f"Entry row {i} = ")
    
    all_tds = all_trs[i].find_all("td")
    entry = []
    
    for j in range(0, len(all_tds)):
        entry.append(all_tds[j].text)       
    
    print(entry)
    

Entry row 1 = 
["Jan 1, 2019New Year's Day", 'Tuesday', '1', '$52,588,390', '-0.3%', '+23.3%', '53', 'Aquaman', '$16,377,779']
Entry row 2 = 
['Jan 2, 2019', 'Wednesday', '2', '$25,043,176', '-52.4%', '-54.9%', '53', 'Aquaman', '$7,379,476']
Entry row 3 = 
['Jan 3, 2019', 'Thursday', '3', '$22,001,708', '-12.1%', '-57.4%', '55', 'Aquaman', '$6,203,801']
Entry row 4 = 
['Jan 4, 2019', 'Friday', '4', '$38,870,140', '+76.7%', '-31.4%', '52', 'Aquaman', '$9,388,082']
Entry row 5 = 
['Jan 5, 2019', 'Saturday', '5', '$48,843,884', '+25.7%', '-16%', '52', 'Aquaman', '$13,053,690']
Entry row 6 = 
['Jan 6, 2019', 'Sunday', '6', '$31,293,715', '-35.9%', '-38.6%', '52', 'Aquaman', '$8,561,508']
Entry row 7 = 
['Jan 7, 2019', 'Monday', '7', '$9,979,794', '-68.1%', '-72.5%', '48', 'Aquaman', '$2,583,295']
Entry row 8 = 
['Jan 8, 2019', 'Tuesday', '8', '$14,364,595', '+43.9%', '-72.7%', '50', 'Aquaman', '$3,852,619']
Entry row 9 = 
['Jan 9, 2019', 'Wednesday', '9', '$9,403,436', '-34.5%', '-62.5%', 

In [8]:
# daily revenue for movie industry, contains data for the top grossing movie by day with associated revenue
# this will run through all months and the past 4 years worth of data on boxofficemojo.com to gather the revenue data
# the result is a data frame with the top grossing movie by day for the past 4 years


months = ["january","february","march","april","may","june","july","august","september", "october", "november", "december"]
years = ["2017","2018","2019","2020"]

column_names = ["date_full","weekday","day_number","top_10_gross_total","ytd_delta",
                "lw_delta","releases","top_release","top_gross"]

df = pd.DataFrame(columns=column_names)

print("running")

for year in years:
    for month in months:
        url_a = "https://www.boxofficemojo.com/daily/"
        url_b = year
        url_c = "/?interval="
        url_d = month
        url_e = "&sort=date&sortDir=asc&view=month&ref_=bo_di__resort#table"

        url = url_a + url_b + url_c + url_d + url_e
        page = requests.get(url)
        soup = bsoup(page.text, 'lxml')
        current_month = []

        all_divs = soup.main.find_all("div", id="table")
        all_tables = all_divs[0].find_all("table")
        all_trs = all_tables[0].find_all("tr")

        for i in range(1, len(all_trs)):
            all_tds = all_trs[i].find_all("td")
            entry = []
            for j in range(0, len(all_tds)):
                entry.append(all_tds[j].text)
            current_month.append(entry)

        df_month = pd.DataFrame(current_month, columns=column_names)
        frames = [df, df_month]
        df = pd.concat(frames)
        
        time.sleep(0.1)
        
    print(f"{year} processed...")
    
df.reset_index(inplace=True)
df.drop(["index", "ytd_delta", "lw_delta"], axis = 1, inplace = True)
print("complete")

running
2017 processed...
2018 processed...
2019 processed...
2020 processed...
complete


In [9]:
df.head(10)

Unnamed: 0,date_full,weekday,day_number,top_10_gross_total,releases,top_release,top_gross
0,"Jan 1, 2017New Year's Day",Sunday,1,"$56,752,913",52,Rogue One: A Star Wars Story,"$16,751,857"
1,"Jan 2, 2017",Monday,2,"$54,065,844",54,Rogue One: A Star Wars Story,"$15,913,674"
2,"Jan 3, 2017",Tuesday,3,"$24,799,823",48,Rogue One: A Star Wars Story,"$6,268,921"
3,"Jan 4, 2017",Wednesday,4,"$16,695,754",49,Rogue One: A Star Wars Story,"$4,237,535"
4,"Jan 5, 2017",Thursday,5,"$14,928,109",50,Rogue One: A Star Wars Story,"$3,893,517"
5,"Jan 6, 2017",Friday,6,"$35,952,754",46,Hidden Figures,"$7,621,334"
6,"Jan 7, 2017",Saturday,7,"$50,026,406",44,Rogue One: A Star Wars Story,"$9,558,458"
7,"Jan 8, 2017",Sunday,8,"$34,512,473",45,Rogue One: A Star Wars Story,"$6,420,138"
8,"Jan 9, 2017",Monday,9,"$10,419,513",46,Hidden Figures,"$1,857,416"
9,"Jan 10, 2017",Tuesday,10,"$15,267,153",44,Hidden Figures,"$2,877,410"


In [10]:
# cleaning up the revenue values

df['top_10_gross_total'] = df['top_10_gross_total'].str.replace("$", "")
df['top_10_gross_total'] = df['top_10_gross_total'].str.replace(",", "")
df['top_gross'] = df['top_gross'].str.replace("$", "")
df['top_gross'] = df['top_gross'].str.replace(",", "")

df.head()

Unnamed: 0,date_full,weekday,day_number,top_10_gross_total,releases,top_release,top_gross
0,"Jan 1, 2017New Year's Day",Sunday,1,56752913,52,Rogue One: A Star Wars Story,16751857
1,"Jan 2, 2017",Monday,2,54065844,54,Rogue One: A Star Wars Story,15913674
2,"Jan 3, 2017",Tuesday,3,24799823,48,Rogue One: A Star Wars Story,6268921
3,"Jan 4, 2017",Wednesday,4,16695754,49,Rogue One: A Star Wars Story,4237535
4,"Jan 5, 2017",Thursday,5,14928109,50,Rogue One: A Star Wars Story,3893517


In [11]:
# cleaning up the date column, adding a new columns for month, day, year, and event (holiday)

dates = df[['date_full']]
holiday = pd.DataFrame(columns = ["date", "year", "event", "month", "day"])
holiday[["date", "year", "event"]] = dates.date_full.str.split('(\d{4})', expand = True)
holiday['date'] = holiday['date'].str.replace(",", "")
holiday[["month", "day"]] = holiday.date.str.split(' ', n= 1, expand = True)
final = holiday[['month', 'day', 'year', 'event']]

final.head()

Unnamed: 0,month,day,year,event
0,Jan,1,2017,New Year's Day
1,Jan,2,2017,
2,Jan,3,2017,
3,Jan,4,2017,
4,Jan,5,2017,


In [12]:
# update date columns using new dataframe from above

df[['month', 'day', 'year', 'event']] = final[['month', 'day', 'year', 'event']]
df.drop(["date_full"], axis = 1, inplace = True)

In [13]:
df.to_csv("movie_data_v1.csv")
df.head()

Unnamed: 0,weekday,day_number,top_10_gross_total,releases,top_release,top_gross,month,day,year,event
0,Sunday,1,56752913,52,Rogue One: A Star Wars Story,16751857,Jan,1,2017,New Year's Day
1,Monday,2,54065844,54,Rogue One: A Star Wars Story,15913674,Jan,2,2017,
2,Tuesday,3,24799823,48,Rogue One: A Star Wars Story,6268921,Jan,3,2017,
3,Wednesday,4,16695754,49,Rogue One: A Star Wars Story,4237535,Jan,4,2017,
4,Thursday,5,14928109,50,Rogue One: A Star Wars Story,3893517,Jan,5,2017,


In [14]:
# Making a list of unique film names (for searching imdb genres) 
# and a dictionary of monthly top films by year for a count

months = ['Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_top_uniq = []
monthly_top_repeat = {}
for year in years:
    for month in months:
        t = df.loc[(df['month'] == month) & (df['year'] == year)]
        ind = year + month
        monthly_top_uniq.append(t.top_release.unique())
        monthly_top_repeat[ind] = t.top_release
monthly_top_repeat

{'2017Jan': 0     Rogue One: A Star Wars Story
 1     Rogue One: A Star Wars Story
 2     Rogue One: A Star Wars Story
 3     Rogue One: A Star Wars Story
 4     Rogue One: A Star Wars Story
 5                   Hidden Figures
 6     Rogue One: A Star Wars Story
 7     Rogue One: A Star Wars Story
 8                   Hidden Figures
 9                   Hidden Figures
 10                  Hidden Figures
 11                  Hidden Figures
 12                  Hidden Figures
 13                  Hidden Figures
 14                  Hidden Figures
 15                  Hidden Figures
 16                  Hidden Figures
 17                  Hidden Figures
 18                  Hidden Figures
 19                           Split
 20                           Split
 21                           Split
 22                           Split
 23                           Split
 24                           Split
 25                           Split
 26                           Split
 27              

In [17]:
# parsing out the unique movies into one array
all_movies = []
for month in monthly_top_uniq:
    for movie in month:
        all_movies.append(movie)
all_movies = set(all_movies)
unique_movies = list(all_movies)
pp.pprint(unique_movies)

[   'Resistance',
    'Fifty Shades Darker',
    'Relic',
    'Star Wars: Episode VIII - The Last Jedi',
    'Frozen II',
    'Fast & Furious Presents: Hobbs & Shaw',
    'Aladdin',
    'Incredibles 2',
    'The Tax Collector',
    'Jurassic World: Fallen Kingdom',
    'Ant-Man and the Wasp',
    'The New Mutants',
    'I Still Believe',
    'Men in Black: International',
    'Swallow',
    'Toy Story 4',
    'Ralph Breaks the Internet',
    'Kingsman: The Golden Circle',
    'Fantastic Beasts: The Crimes of Grindelwald',
    'Abominable',
    'Transformers: The Last Knight',
    'A Bad Moms Christmas',
    'Sonic the Hedgehog',
    'Thor: Ragnarok',
    'Insidious: The Last Key',
    'Hotel Transylvania 3: Summer Vacation',
    'Us',
    'Jumanji: The Next Level',
    'The Invisible Man',
    'Blade Runner 2049',
    'The Other Lamb',
    'Venom',
    'Dragon Ball Super: Broly',
    'Pirates of the Caribbean: Dead Men Tell No Tales',
    'It Chapter Two',
    'Split',
    'Maleficent:

In [18]:
# SANITY CHECK - making sure all movies were included in unique array (we can delete this later)
count = 0
for i in monthly_top_uniq:
    for j in i:
        if j not in unique_movies:
            count+=1
print(count)

0


In [20]:
# reading in imdb csv and creating database with all movies made 2016 and up
imdb = pd.read_csv('IMDb movies.csv')
imdb = imdb.astype({'year':'string'})
imdb['year'] = imdb.year.str.replace(r"[a-zA-Z]", '')
imdb = imdb.loc[imdb['year'] >= '2016']
imdb.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
15675,tt0069049,L'altra faccia del vento,The Other Side of the Wind,2018,2018-11-02,Drama,122,"France, Iran, USA","English, German",Orson Welles,...,"John Huston, Oja Kodar, Peter Bogdanovich, Sus...",A Hollywood director emerges from semi-exile w...,6.8,5887,,,,78.0,81.0,98.0
25479,tt0100275,La Telenovela Errante,La Telenovela Errante,2017,2018-09-06,"Comedy, Drama, Fantasy",80,Chile,Spanish,"Raoul Ruiz, Valeria Sarmiento",...,"Luis Alarcón, Patricia Rivadeneira, Francisco ...",The film revolves around the concept of soap o...,6.6,260,,$ 3624,$ 3624,,,27.0
32465,tt0137204,Joe Finds Grace,Joe Finds Grace,2017,2018-03-20,"Animation, Adventure, Comedy",83,Canada,English,Anthony Harrison,...,"Anthony Harrison, C. Ernst Harth, Ellie Harvie...",Simple-minded Joseph Briteman breaks a golden ...,8.8,264,,,,,2.0,
41749,tt0315642,Wazir,Wazir,2016,2016-01-08,"Action, Crime, Drama",103,India,Hindi,Bejoy Nambiar,...,"Amitabh Bachchan, Farhan Akhtar, Aditi Rao Hyd...",A grief-stricken cop and an amputee grandmaste...,7.1,16992,,$ 1124045,$ 5633588,,118.0,30.0
42166,tt0326716,'77,'77,2017,2017-05-25,"Comedy, Drama",113,USA,English,Patrick Read Johnson,...,"John Francis Daley, Austin Pendleton, Colleen ...","Alienated, hopeful-filmmaker Pat Johnson's epi...",6.1,304,,,,,4.0,4.0


In [21]:
imdb['year'].max()

'2020'

In [22]:
# recording genres for each movie in the unique movie list
not_found = []
genres = {}
for mov in unique_movies:
        genres[mov] = imdb.loc[(imdb['original_title'] == mov)].genre.values
pp.pprint(genres)
print(not_found)

{   '12 Strong': array(['Action, Drama, History'], dtype=object),
    '1917': array(['Drama, War'], dtype=object),
    'A Bad Moms Christmas': array(['Comedy'], dtype=object),
    "A Dog's Purpose": array(['Adventure, Comedy, Drama'], dtype=object),
    'A Madea Family Funeral': array(['Comedy'], dtype=object),
    'A Quiet Place': array(['Drama, Horror, Sci-Fi'], dtype=object),
    'A Simple Favor': array(['Comedy, Crime, Drama'], dtype=object),
    'A Star Is Born': array(['Drama, Music, Romance'], dtype=object),
    'A Wrinkle in Time': array(['Adventure, Family, Fantasy'], dtype=object),
    'Abominable': array(['Horror', 'Animation, Adventure, Comedy'], dtype=object),
    'Aladdin': array(['Adventure, Family, Fantasy'], dtype=object),
    'Alita: Battle Angel': array(['Action, Adventure, Sci-Fi'], dtype=object),
    'American Made': array(['Action, Biography, Comedy'], dtype=object),
    'Angel Has Fallen': array(['Action, Thriller'], dtype=object),
    'Annabelle: Creation': arra

In [23]:
# example of missing data
imdb.loc[(imdb['original_title']=='Birds of Prey')]

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics


In [24]:
# What to do about missing values??
print(genres['The Other Lamb'])
print(genres['Birds of Prey'])

['Drama, Horror']
[]
