In [1]:
# Load required libraries

from bs4 import BeautifulSoup
import requests

import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import re

import pandas as pd
import numpy as np
import dateutil.parser
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

%config InlineBackend.figure_formats = ['svg']  # or svg
%matplotlib inline

# Location of the chromedriver binary that Selenium drives Chrome through.
# NOTE(review): this is a machine-specific absolute path; adjust it for your environment.
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
# Publish the same path in the process environment under the key "webdriver.chrome.driver".
os.environ["webdriver.chrome.driver"] = chromedriver

In [2]:
# Landing page of Box Office Mojo; the scraping session starts here.
url = "https://www.boxofficemojo.com/?ref_=bo_nb_gs_mojologo"

In [3]:
# To open the website.
# Start a Chrome session via Selenium (the chromedriver path is passed positionally)
# and load the landing page stored in `url`.
# NOTE(review): recent Selenium releases may no longer accept the driver path as a
# positional argument — confirm against the installed Selenium version.
driver = webdriver.Chrome(chromedriver)
driver.get(url) # ex: driver.get(youtube_query)
# Fixed one-second pause before the next cell interacts with the page.
time.sleep(1)

In [4]:
# Select the "Worldwide" tab: locate the first element whose text contains
# 'Worldwide' and click it.
# `find_element("xpath", ...)` is used instead of `find_element_by_xpath`, which was
# removed in Selenium 4.3; this form works on both Selenium 3 and 4.
worldwide_tab = driver.find_element("xpath", "//*[contains(text(), 'Worldwide')]")
worldwide_tab.click()
# Fixed one-second pause to let the page react to the click.
time.sleep(1)

In [5]:
# Select the year to scrape (currently 2012). To scrape a different year, change the
# "2012" sent below and the year-specific variable/file names used in later cells.
# `find_element("xpath", ...)` replaces `find_element_by_xpath`, which was removed in
# Selenium 4.3; this form works on both Selenium 3 and 4.
year_selection = driver.find_element("xpath", '//select[@name = "year-navSelector"]')
# Typing the year into the <select> element picks the matching option.
year_selection.send_keys("2012")
# Fixed one-second pause to let the page reload for the chosen year.
time.sleep(1)

In [17]:
# Snapshot the page the browser is currently showing and parse it with BeautifulSoup.
updated_URL = driver.current_url
soup = BeautifulSoup(driver.page_source, 'html.parser')

# The results are read from the element with id="table"; fail with a clear message
# instead of an AttributeError on None when that element is absent.
table_worldwide = soup.find(id='table')
if table_worldwide is None:
    raise ValueError("No element with id='table' found at {}".format(updated_URL))

# Every <tr> of the table, header row included (later cells skip rows[0]).
rows = table_worldwide.find_all('tr')

In [25]:
# Map each movie title to [link stub, <text of every cell in its table row>].
movies_2012_link = {}

# rows[0] is skipped (presumably the header row).
for row in rows[1:]:
    items = row.find_all('td')
    # The title link is read from the second cell; skip rows that lack the cell or the link.
    anchor = items[1].find('a') if len(items) > 1 else None
    if anchor is None:
        continue
    # Use `link_stub` rather than `url` so the notebook-level `url` (the landing page
    # loaded by the driver cell) is not overwritten when this cell runs.
    title, link_stub = anchor.text, anchor['href']
    movies_2012_link[title] = [link_stub] + [i.text for i in items]

# Show a small sample instead of dumping every entry into the cell output.
dict(list(movies_2012_link.items())[:5])

In [None]:
df_movie_info_2012 = pd.DataFrame(movies_2011_link).T  #transpose
df_movie_info_2012.columns = ['link_stub','Rank', 'Release_Group', 'World_Wide_Profit', 'Domestic', 'Domestic_GToD_Percent', 'Foreign', 'Foreign_GToD_Percent']
df_movie_info_2012.to_csv('2011_WorldWide_Box_Office_Links.csv')
df_movie_info_2012.head(100)

In [None]:
df_movie_info_2012.info()

In [None]:
df_movie_info_2012.columns

In [None]:
# clean up the columns removing $\-,
df_movie_info_2012[df_movie_info_2011.columns[5:]] = df_movie_info_2011[df_movie_info_2011.columns[5:]].replace('[$\-,]', '', regex=True)

In [None]:
df_movie_info_2012['Domestic'] = pd.to_numeric(df_movie_info_2011['Domestic'])
df_movie_info_2012['Foreign'] = pd.to_numeric(df_movie_info_2011['Foreign'])

In [None]:
df_movie_info_2012.head()

In [None]:
df_movie_info_2012.info()

In [None]:
# Helper functions that convert scraped strings to int (money), minutes (runtime), and datetime (dates).

def money_to_int(moneystring):
    """Convert a money string such as '$1,234,567' to an int.

    Every '$' and ',' character is dropped and the remainder is handed to int(),
    which raises ValueError if anything non-numeric is left.
    """
    strip_symbols = str.maketrans('', '', '$,')
    return int(moneystring.translate(strip_symbols))

def runtime_to_minutes(runtimestring):
    """Convert a runtime string to a total number of minutes.

    Accepts an hours part and/or a minutes part, with or without spaces between the
    number and the unit: '2 hr 10 min' -> 130, '2hr 00min' -> 120, '45 min' -> 45,
    '1 hr' -> 60.

    Parameters
    ----------
    runtimestring : str
        Runtime text as scraped from the page.

    Returns
    -------
    int or None
        Total minutes, or None when the input is not a string or contains neither
        an hours nor a minutes component.
    """
    if not isinstance(runtimestring, str):
        return None

    # Look for "<number> h..." and "<number> m..." independently so that
    # minutes-only and hours-only runtimes are handled, not just "H hr M min".
    hours_match = re.search(r'(\d+)\s*h', runtimestring, re.IGNORECASE)
    minutes_match = re.search(r'(\d+)\s*m', runtimestring, re.IGNORECASE)
    if hours_match is None and minutes_match is None:
        return None

    hours = int(hours_match.group(1)) if hours_match else 0
    minutes = int(minutes_match.group(1)) if minutes_match else 0
    return hours * 60 + minutes

def to_date(datestring):
    """Parse a date string with dateutil's parser and return the parsed result."""
    return dateutil.parser.parse(datestring)

In [None]:
def get_movie_dict(url):
    '''
    From a BoxOfficeMojo link stub, load the movie page in the Selenium driver,
    open its "Title Summary" view, parse the HTML with BeautifulSoup, and collect
        - title
        - domestic gross
        - runtime
        - MPAA rating
        - full release date
        - Budget

    Parameters
    ----------
    url : str
        Link stub (path relative to https://www.boxofficemojo.com).

    Returns
    -------
    dict
        Keys: 'movie_title', 'domestic_total_gross', 'runtime_minutes', 'rating',
        'release_date', 'budget'. 'rating' is None when not found on the page;
        'release_date' is None when no release date is found.

    Notes
    -----
    Uses the notebook-level `driver` and sleeps one second after each navigation step.
    '''

    base_url = 'https://www.boxofficemojo.com'

    # Build the full URL from the stub passed in as `url` (not from a notebook global).
    full_url = base_url + url

    # Load the page in the browser
    driver.get(full_url)
    time.sleep(1)

    # Use selenium to navigate to the "Title Summary" view.
    # find_element("xpath", ...) works on Selenium 3 and 4; find_element_by_xpath
    # was removed in Selenium 4.3.
    title_summary = driver.find_element("xpath", "//*[contains(text(), 'Title Summary')]")
    title_summary.click()
    time.sleep(1)

    # Parse whatever page the browser ended up on
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    headers = ['movie_title', 'domestic_total_gross',
               'runtime_minutes', 'rating', 'release_date', 'budget']

    # Get title: drop only the trailing " - <site name>" part of the <title> text, so
    # hyphenated titles (e.g. "Spider-Man") are not cut at their first hyphen.
    title_string = soup.find('title').text
    title = title_string.rsplit(' - ', 1)[0].strip()

    # Get domestic gross
    # NOTE(review): takes the first money value in the performance summary table and
    # assumes it is the domestic figure — confirm for releases without a domestic gross.
    raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
                                .find_all('span', class_='money')[0].text)
    domestic_total_gross = money_to_int(raw_domestic_total_gross)

    # Get runtime
    # NOTE(review): a missing runtime is imputed as 2 hours, not scraped. The default is
    # spelled "2 hr 00 min" so that runtime_to_minutes can actually parse it.
    raw_runtime = get_movie_value(soup, 'Running')
    raw_runtime = "2 hr 00 min" if raw_runtime is None else raw_runtime
    runtime = runtime_to_minutes(raw_runtime)

    # Get rating (None when the page has no MPAA field)
    rating = get_movie_value(soup, 'MPAA')

    # Get release date: keep only the first line of the field; None when it is missing
    raw_release_date = get_movie_value(soup, 'Earliest')
    if raw_release_date:
        release_date = to_date(raw_release_date.split('\n')[0])
    else:
        release_date = None

    # Get Budget
    # NOTE(review): a missing budget is imputed as $500,000, not scraped.
    budget = get_movie_value(soup, 'Budget')
    budget = "$500,000" if budget is None else budget
    movie_budget = money_to_int(budget)

    # Create movie dictionary and return
    movie_dict = dict(zip(headers, [title, domestic_total_gross, runtime, rating, release_date, movie_budget]))

    return movie_dict

In [None]:
def get_movie_value(soup, field_name):
    '''Grab a value from Box Office Mojo HTML.

    Finds the first text node in `soup` matching `field_name` (treated as a regular
    expression) and returns the text of the element that follows it.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page.
    field_name : str
        Regular-expression pattern for the field label, e.g. 'Budget'.

    Returns
    -------
    str or None
        Text of the next element after the label, or None when the label is not
        found or nothing follows it.
    '''

    # `string=` is the current name of the argument formerly spelled `text=`.
    label = soup.find(string=re.compile(field_name))
    if label is None:
        return None

    # this works for most of the values
    # (`find_next` is the current spelling of the deprecated `findNext`)
    next_element = label.find_next()
    if next_element is None:
        return None
    return next_element.text

In [None]:
# Collect one detail dictionary per movie by calling get_movie_dict on every link stub.
# Each call loads a page in the Selenium driver and sleeps, so this cell is slow.
# NOTE(review): expects `df_movies_2012` to be the list-page DataFrame with a
# `link_stub` column — confirm it is defined by an earlier cell before running.
movies_2012_info_list = []

for link in df_movies_2012.link_stub:
    movies_2012_info_list.append(get_movie_dict(link))

In [None]:
# Preview the first six scraped movie dictionaries.
movies_2012_info_list[0:6]

In [None]:
# make a dataframe

df_movie_info_2012 = pd.DataFrame(movies_2011_info_list)# Transpose
df_movie_info_2012.set_index('movie_title', inplace=True)

# df_movie_info_2011.to_csv('2011_WorldWide_Box_Office_Links.csv')

df_movie_info_2012.head()

In [None]:
df_movie_info_2012.info()

In [None]:
# Join the list-page figures (df_movies_2012) with the per-movie detail data
# (df_movie_info_2012) on their title indexes, and save the combined 2012 table.
# NOTE(review): merge defaults to an inner join, so a movie whose title differs
# between the two frames is dropped — compare row counts before relying on the result.
df_movies_2012_full = df_movies_2012.merge(df_movie_info_2012, left_index=True, right_index=True)
df_movies_2012_full.to_csv('2012_Top_WorldWide_Box_Office.csv')
df_movies_2012_full.head()