In [1]:
import requests
import sys
import numpy as np
import time
import pandas as pd
from bs4 import BeautifulSoup
#time.sleep(1)

In [2]:
def get_links(year):
    """returns a list of link suffixes that connect to every movie in the
    boxofficemojo page for the given year"""
    page = requests.get("https://www.boxofficemojo.com/year/{}/".format(year))
    soup = BeautifulSoup(page.content, 'html.parser')
    if page.status_code >=400:
        return [page.status_code,"https://www.boxofficemojo.com/year/{}/".format(year)]
    links = soup.select(".a-text-left.mojo-field-type-release.mojo-cell-wide a")
    return links


In [3]:
def cash_str_to_int(string):
    #takes string containing a number with a symbol at the front and returns the int
    return int(string[1:].replace(",",""))

In [4]:
def make_movie_dict(links):
    """accepts a list of link suffixes and returns a dictionary that
    has a key for each link. The value of each item is also a dictionary
    containing keys
                    domestic, international, worldwide, budget, genres, and title,
    with types      float     float          float      float   list[strs]   str
    
    if an int is missing its replaced with none and if genres is missing it is
    replaced with an empty list
    """
    movie_dict = {}
    for link in links:
        time.sleep(np.random.random())
        current = requests.get("https://www.boxofficemojo.com" + link["href"])
        currysoup = BeautifulSoup(current.content, "html.parser")
        title = currysoup.find("h1").text
        print("scraping ", title)
        try:
            budget = cash_str_to_int(currysoup.find(text="Budget").next_element.text)
        except AttributeError:
            budget = None
        grossli = currysoup.select(".a-section.a-spacing-none.mojo-performance-summary-table .a-section.a-spacing-none")
        grosses = []
        for i in grossli:
            try:
                grosses.append(cash_str_to_int(i.select(".money")[0].text))
            except IndexError:
                grosses.append(None)
        try:
            genres = list(currysoup.select(".a-section.a-spacing-none.mojo-summary-values.mojo-hidden-from-mobile")[0])[6].find_all("span")[1].text.split()
        except IndexError:
            genres = []
        movie_dict[link["href"]] = {"domestic":grosses[0],
                                    "international":grosses[1],
                                    "worldwide":grosses[2],
                                    "budget":budget,
                                    "genres":genres,
                                    "title":title
                                   }
        print(movie_dict[link["href"]])
        #uncomment the below to see only one pull
        #return(movie_dict)
    return movie_dict
    
    

In [5]:
links_2019 = get_links(2005)
movie_dict_2019 = make_movie_dict(links_2019)

scraping  Star Wars: Episode III - Revenge of the Sith
{'domestic': 380270577, 'international': 488081953, 'worldwide': 868352530, 'budget': 113000000, 'genres': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi'], 'title': 'Star Wars: Episode III - Revenge of the Sith'}
scraping  Harry Potter and the Goblet of Fire
{'domestic': 290013036, 'international': 605908000, 'worldwide': 895921036, 'budget': 150000000, 'genres': ['Adventure', 'Family', 'Fantasy', 'Mystery'], 'title': 'Harry Potter and the Goblet of Fire'}
scraping  War of the Worlds
{'domestic': 234280354, 'international': 369592765, 'worldwide': 603873119, 'budget': 132000000, 'genres': ['Adventure', 'Sci-Fi', 'Thriller'], 'title': 'War of the Worlds'}
scraping  The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
{'domestic': 291710957, 'international': 453302158, 'worldwide': 745013115, 'budget': 180000000, 'genres': ['Adventure', 'Family', 'Fantasy'], 'title': 'The Chronicles of Narnia: The Lion, the Witch and the Wardr

scraping  Red Eye
{'domestic': 57891803, 'international': 38366398, 'worldwide': 96258201, 'budget': 26000000, 'genres': ['Thriller'], 'title': 'Red Eye'}
scraping  White Noise
{'domestic': 56386759, 'international': 34809660, 'worldwide': 91196419, 'budget': None, 'genres': ['359', 'days/51', 'weeks'], 'title': 'White Noise'}
scraping  Be Cool
{'domestic': 56046979, 'international': 39716737, 'worldwide': 95763716, 'budget': 53000000, 'genres': ['Comedy', 'Crime', 'Drama', 'Music', 'Romance'], 'title': 'Be Cool'}
scraping  Wallace & Gromit: The Curse of the Were-Rabbit
{'domestic': 56110897, 'international': 138000274, 'worldwide': 194111171, 'budget': 30000000, 'genres': ['453', 'days/64', 'weeks'], 'title': 'Wallace & Gromit: The Curse of the Were-Rabbit'}
scraping  Fun with Dick and Jane
{'domestic': 110332737, 'international': 94349162, 'worldwide': 204681899, 'budget': 100000000, 'genres': ['Comedy', 'Crime'], 'title': 'Fun with Dick and Jane'}
scraping  Crash
{'domestic': 545803

scraping  Stealth
{'domestic': 32116746, 'international': 47151576, 'worldwide': 79268322, 'budget': 135000000, 'genres': ['Action', 'Adventure', 'Sci-Fi', 'Thriller'], 'title': 'Stealth'}
scraping  House of Wax
{'domestic': 32064800, 'international': 36701321, 'worldwide': 68766121, 'budget': 40000000, 'genres': ['Horror', 'Thriller'], 'title': 'House of Wax'}
scraping  The Wedding Date
{'domestic': 31726995, 'international': 15369211, 'worldwide': 47096206, 'budget': 15000000, 'genres': ['Comedy', 'Romance'], 'title': 'The Wedding Date'}
scraping  Just Friends
{'domestic': 32619671, 'international': 18292763, 'worldwide': 50912434, 'budget': None, 'genres': ['404', 'days/57', 'weeks'], 'title': 'Just Friends'}
scraping  A History of Violence
{'domestic': 31504633, 'international': 29880432, 'worldwide': 61385065, 'budget': 32000000, 'genres': ['Drama', 'Thriller'], 'title': 'A History of Violence'}
scraping  Get Rich or Die Tryin'
{'domestic': 30985352, 'international': 15578609, 'wo

scraping  The Devil's Rejects
{'domestic': 17044981, 'international': 3856878, 'worldwide': 20901859, 'budget': 7000000, 'genres': ['Horror'], 'title': "The Devil's Rejects"}
scraping  Son of the Mask
{'domestic': 17018422, 'international': 42963126, 'worldwide': 59981548, 'budget': 84000000, 'genres': ['Comedy', 'Family', 'Fantasy'], 'title': 'Son of the Mask'}
scraping  Rebound
{'domestic': 16809014, 'international': 683000, 'worldwide': 17492014, 'budget': 33100000, 'genres': ['Comedy', 'Family', 'Sport'], 'title': 'Rebound'}
scraping  The Perfect Man
{'domestic': 16535005, 'international': 3235470, 'worldwide': 19770475, 'budget': 10000000, 'genres': ['Comedy', 'Family', 'Romance'], 'title': 'The Perfect Man'}
scraping  Spanglish
{'domestic': 42726869, 'international': 12743285, 'worldwide': 55470154, 'budget': 80000000, 'genres': ['Comedy', 'Drama', 'Romance'], 'title': 'Spanglish'}
scraping  Waiting...
{'domestic': 16124543, 'international': 2513147, 'worldwide': 18637690, 'budge

scraping  Blade: Trinity
{'domestic': 52411906, 'international': 79565998, 'worldwide': 131977904, 'budget': 65000000, 'genres': ['Action', 'Adventure', 'Fantasy', 'Horror', 'Sci-Fi'], 'title': 'Blade: Trinity'}
scraping  Downfall
{'domestic': 5509040, 'international': 86672534, 'worldwide': 92181574, 'budget': None, 'genres': ['317', 'days/45', 'weeks'], 'title': 'Downfall'}
scraping  Beyond the Sea
{'domestic': 6318709, 'international': 2128906, 'worldwide': 8447615, 'budget': None, 'genres': ['380', 'days/54', 'weeks'], 'title': 'Beyond the Sea'}
scraping  Alone in the Dark
{'domestic': 5178569, 'international': 5264239, 'worldwide': 10442808, 'budget': 20000000, 'genres': ['Action', 'Horror', 'Sci-Fi'], 'title': 'Alone in the Dark'}
scraping  The Squid and the Whale
{'domestic': 7372734, 'international': 3725397, 'worldwide': 11098131, 'budget': 1500000, 'genres': ['Comedy', 'Drama'], 'title': 'The Squid and the Whale'}
scraping  Wild Safari - A South African Adventure
{'domestic':

scraping  Look at Me
{'domestic': 1737308, 'international': 16992443, 'worldwide': 18729751, 'budget': None, 'genres': ['275', 'days/39', 'weeks'], 'title': 'Look at Me'}
scraping  Everything Is Illuminated
{'domestic': 1712337, 'international': 1889637, 'worldwide': 3601974, 'budget': None, 'genres': ['112', 'days/16', 'weeks'], 'title': 'Everything Is Illuminated'}
scraping  State Property: Blood on the Streets
{'domestic': 1691706, 'international': None, 'worldwide': 1691706, 'budget': None, 'genres': ['263', 'days/37', 'weeks'], 'title': 'State Property: Blood on the Streets'}
scraping  Christmas with the Kranks
{'domestic': 73780539, 'international': 22791941, 'worldwide': 96572480, 'budget': 60000000, 'genres': ['Comedy', 'Family'], 'title': 'Christmas with the Kranks'}
scraping  Vera Drake
{'domestic': 3775283, 'international': 9492586, 'worldwide': 13267869, 'budget': None, 'genres': ['450', 'days/64', 'weeks'], 'title': 'Vera Drake'}
scraping  Ma vie en cinémascope
{'domestic'

scraping  Black
{'domestic': 754819, 'international': 508381, 'worldwide': 1263200, 'budget': None, 'genres': ['45', 'theaters'], 'title': 'Black'}
scraping  First Descent
{'domestic': 750805, 'international': 237563, 'worldwide': 988368, 'budget': None, 'genres': ['35', 'days/5', 'weeks'], 'title': 'First Descent'}
scraping  Casanova
{'domestic': 11304403, 'international': 26387241, 'worldwide': 37691644, 'budget': None, 'genres': ['372', 'days/53', 'weeks'], 'title': 'Casanova'}
scraping  Mysterious Skin
{'domestic': 713240, 'international': 811726, 'worldwide': 1524966, 'budget': None, 'genres': ['240', 'days/34', 'weeks'], 'title': 'Mysterious Skin'}
scraping  The Ballad of Jack and Rose
{'domestic': 712275, 'international': 203776, 'worldwide': 916051, 'budget': None, 'genres': ['282', 'days/40', 'weeks'], 'title': 'The Ballad of Jack and Rose'}
scraping  Oldboy
{'domestic': 707481, 'international': 14272524, 'worldwide': 14980005, 'budget': None, 'genres': ['287', 'days/41', 'wee

scraping  Nine Lives
{'domestic': 478830, 'international': 1112693, 'worldwide': 1591523, 'budget': None, 'genres': ['444', 'days/63', 'weeks'], 'title': 'Nine Lives'}
scraping  Green Street Hooligans
{'domestic': 346830, 'international': 3467887, 'worldwide': 3814717, 'budget': None, 'genres': ['119', 'days/17', 'weeks'], 'title': 'Green Street Hooligans'}
scraping  Kaal
{'domestic': 345091, 'international': 364686, 'worldwide': 709777, 'budget': None, 'genres': ['40', 'theaters'], 'title': 'Kaal'}
scraping  Don't Move
{'domestic': 337265, 'international': 11733742, 'worldwide': 12071007, 'budget': None, 'genres': ['6', 'theaters'], 'title': "Don't Move"}
scraping  Shaadi No. 1
{'domestic': 336772, 'international': 165617, 'worldwide': 502389, 'budget': None, 'genres': ['45', 'theaters'], 'title': 'Shaadi No. 1'}
scraping  Breakfast on Pluto
{'domestic': 828699, 'international': 3113555, 'worldwide': 3942254, 'budget': None, 'genres': ['411', 'days/58', 'weeks'], 'title': 'Breakfast o

scraping  Masculin Féminin
{'domestic': 200380, 'international': None, 'worldwide': 200380, 'budget': None, 'genres': ['4', 'theaters'], 'title': 'Masculin Féminin'}
scraping  November
{'domestic': 192186, 'international': None, 'worldwide': 192186, 'budget': 1500000, 'genres': ['Drama', 'Mystery', 'Thriller'], 'title': 'November'}
scraping  Lost Embrace
{'domestic': 190860, 'international': 2107872, 'worldwide': 2298732, 'budget': None, 'genres': ['6', 'theaters'], 'title': 'Lost Embrace'}
scraping  Eros
{'domestic': 188392, 'international': 1364628, 'worldwide': 1553020, 'budget': None, 'genres': ['268', 'days/38', 'weeks'], 'title': 'Eros'}
scraping  The Baxter
{'domestic': 181872, 'international': None, 'worldwide': 181872, 'budget': None, 'genres': ['128', 'days/18', 'weeks'], 'title': 'The Baxter'}
scraping  Hellbent
{'domestic': 183066, 'international': None, 'worldwide': 183066, 'budget': None, 'genres': ['472', 'days/67', 'weeks'], 'title': 'Hellbent'}
scraping  My Date with D

scraping  D.E.B.S.
{'domestic': 97446, 'international': None, 'worldwide': 97446, 'budget': None, 'genres': ['282', 'days/40', 'weeks'], 'title': 'D.E.B.S.'}
scraping  Pure
{'domestic': 102471, 'international': None, 'worldwide': 102471, 'budget': None, 'genres': ['3', 'theaters'], 'title': 'Pure'}
scraping  Postmen in the Mountains
{'domestic': 203975, 'international': None, 'worldwide': 203975, 'budget': None, 'genres': ['See', 'more', 'details', 'at', 'IMDbPro'], 'title': 'Postmen in the Mountains'}
scraping  Ek Khiladi Ek Haseena
{'domestic': 90869, 'international': None, 'worldwide': 90869, 'budget': None, 'genres': ['15', 'theaters'], 'title': 'Ek Khiladi Ek Haseena'}
scraping  Face
{'domestic': 108151, 'international': 54566, 'worldwide': 162717, 'budget': None, 'genres': ['448', 'days/64', 'weeks'], 'title': 'Face'}
scraping  Gay Sex in the 70s
{'domestic': 249565, 'international': None, 'worldwide': 249565, 'budget': None, 'genres': ['5', 'theaters'], 'title': 'Gay Sex in the 

scraping  God's Sandbox
{'domestic': 51246, 'international': None, 'worldwide': 51246, 'budget': None, 'genres': ['3', 'theaters'], 'title': "God's Sandbox"}
scraping  The Warrior
{'domestic': 50257, 'international': None, 'worldwide': 50257, 'budget': None, 'genres': ['170', 'days/24', 'weeks'], 'title': 'The Warrior'}
scraping  The Bridge of San Luis Rey
{'domestic': 49981, 'international': 1750160, 'worldwide': 1800141, 'budget': 24000000, 'genres': ['Drama', 'History', 'Romance'], 'title': 'The Bridge of San Luis Rey'}
scraping  Team America: World Police
{'domestic': 32786074, 'international': 18040824, 'worldwide': 50826898, 'budget': 32000000, 'genres': ['Action', 'Comedy'], 'title': 'Team America: World Police'}
scraping  Assisted Living
{'domestic': 49048, 'international': None, 'worldwide': 49048, 'budget': None, 'genres': ['333', 'days/47', 'weeks'], 'title': 'Assisted Living'}
scraping  Raging Bull
{'domestic': 49034, 'international': None, 'worldwide': 49034, 'budget': Non

scraping  Kamikaze Girls
{'domestic': 34424, 'international': 104715, 'worldwide': 139139, 'budget': None, 'genres': ['3', 'theaters'], 'title': 'Kamikaze Girls'}
scraping  Guerrilla: The Taking of Patty Hearst
{'domestic': 105054, 'international': None, 'worldwide': 105054, 'budget': None, 'genres': ['9', 'theaters'], 'title': 'Guerrilla: The Taking of Patty Hearst'}
scraping  Keane
{'domestic': 33256, 'international': 361134, 'worldwide': 394390, 'budget': 850000, 'genres': ['Drama', 'Mystery', 'Thriller'], 'title': 'Keane'}
scraping  Smile
{'domestic': 32833, 'international': None, 'worldwide': 32833, 'budget': None, 'genres': ['60', 'theaters'], 'title': 'Smile'}
scraping  Lipstick & Dynamite, Piss & Vinegar: The First Ladies of Wrestling
{'domestic': 26073, 'international': None, 'worldwide': 26073, 'budget': None, 'genres': ['3', 'theaters'], 'title': 'Lipstick & Dynamite, Piss & Vinegar: The First Ladies of Wrestling'}
scraping  Going Shopping
{'domestic': 31908, 'international'

scraping  The Great Water
{'domestic': 17257, 'international': None, 'worldwide': 17257, 'budget': None, 'genres': ['2', 'theaters'], 'title': 'The Great Water'}
scraping  Almost Peaceful
{'domestic': 106184, 'international': None, 'worldwide': 106184, 'budget': None, 'genres': ['3', 'theaters'], 'title': 'Almost Peaceful'}
scraping  The Last Dispatch
{'domestic': 20684, 'international': None, 'worldwide': 20684, 'budget': None, 'genres': ['1', 'theater'], 'title': 'The Last Dispatch'}
scraping  Fascination
{'domestic': 16670, 'international': None, 'worldwide': 16670, 'budget': 5000000, 'genres': ['Drama', 'Mystery', 'Romance', 'Thriller'], 'title': 'Fascination'}
scraping  The Last Mogul
{'domestic': 16469, 'international': None, 'worldwide': 16469, 'budget': None, 'genres': ['191', 'days/27', 'weeks'], 'title': 'The Last Mogul'}
scraping  Genesis
{'domestic': 16413, 'international': 3122900, 'worldwide': 3139313, 'budget': None, 'genres': ['219', 'days/31', 'weeks'], 'title': 'Genes

scraping  Living Life
{'domestic': 9064, 'international': None, 'worldwide': 9064, 'budget': None, 'genres': ['268', 'days/38', 'weeks'], 'title': 'Living Life'}
scraping  WMD: Weapons of Mass Deception
{'domestic': 30210, 'international': None, 'worldwide': 30210, 'budget': None, 'genres': ['4', 'theaters'], 'title': 'WMD: Weapons of Mass Deception'}
scraping  The Swenkas
{'domestic': 8860, 'international': None, 'worldwide': 8860, 'budget': None, 'genres': ['1', 'theater'], 'title': 'The Swenkas'}
scraping  Ae Fond Kiss
{'domestic': 30148, 'international': 6517493, 'worldwide': 6547641, 'budget': None, 'genres': ['401', 'days/57', 'weeks'], 'title': 'Ae Fond Kiss'}
scraping  Goodbye, Dragon Inn
{'domestic': 35120, 'international': None, 'worldwide': 35120, 'budget': None, 'genres': ['3', 'theaters'], 'title': 'Goodbye, Dragon Inn'}
scraping  Twist of Faith
{'domestic': 8129, 'international': None, 'worldwide': 8129, 'budget': None, 'genres': ['2', 'theaters'], 'title': 'Twist of Fait

scraping  Stolen Childhoods
{'domestic': 2996, 'international': None, 'worldwide': 2996, 'budget': None, 'genres': ['1', 'theater'], 'title': 'Stolen Childhoods'}
scraping  Three Days of Rain
{'domestic': 2841, 'international': None, 'worldwide': 2841, 'budget': None, 'genres': ['93', 'days/13', 'weeks'], 'title': 'Three Days of Rain'}
scraping  The Beauty Academy of Kabul
{'domestic': 225448, 'international': None, 'worldwide': 225448, 'budget': None, 'genres': ['13', 'theaters'], 'title': 'The Beauty Academy of Kabul'}
scraping  The Tracker
{'domestic': 55188, 'international': None, 'worldwide': 55188, 'budget': None, 'genres': ['2', 'theaters'], 'title': 'The Tracker'}
scraping  The Optimists
{'domestic': 2377, 'international': None, 'worldwide': 2377, 'budget': None, 'genres': ['1', 'theater'], 'title': 'The Optimists'}
scraping  The Overture
{'domestic': 2254, 'international': 3486, 'worldwide': 5740, 'budget': None, 'genres': ['2', 'theaters'], 'title': 'The Overture'}
scraping  

In [6]:
bom_df_2019 = pd.DataFrame.from_dict(movie_dict_2019,orient="index")
bom_df_2019.head()

Unnamed: 0,domestic,international,worldwide,budget,genres,title
/release/rl3698624001/?ref_=bo_yld_table_1,600788188,1250118000.0,1850906470,200000000.0,"[Drama, Romance]",Titanic
/release/rl2973926913/?ref_=bo_yld_table_2,201578182,352131600.0,553709788,140000000.0,"[Action, Adventure, Sci-Fi, Thriller]",Armageddon
/release/rl3396044289/?ref_=bo_yld_table_3,216540909,265300000.0,481840909,70000000.0,"[Drama, War]",Saving Private Ryan
/release/rl172000769/?ref_=bo_yld_table_4,176484651,193400000.0,369884651,23000000.0,"[Comedy, Romance]",There's Something About Mary
/release/rl2037941761/?ref_=bo_yld_table_5,161491646,24500000.0,185991646,23000000.0,"[Comedy, Sport]",The Waterboy


In [7]:
bom_df_2019.to_csv("zippedData/bom-budget-and-genres-2019.csv")