In [12]:
import re
import string
import time

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

%matplotlib inline

In [2]:
def remove_punctuation(x):
    """Return ``str(x)`` with every ASCII punctuation character deleted.

    The argument is passed through ``str`` first, so non-string values
    (numbers, ``None``, NaN) are accepted and come back as cleaned strings.
    """
    text = str(x)
    # Three-argument maketrans: the third argument lists characters to delete.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

In [34]:
def remove_year_from_title(title):
    """Strip parenthesised digit runs such as ``(2016)`` from a movie title.

    Every ``(`` + two-or-more digits + ``)`` group is removed wherever it
    occurs in the string, then leading/trailing whitespace is trimmed.
    Single-digit groups such as ``(2)`` are left alone.

    ``title`` must be a string; ``re.sub`` raises ``TypeError`` otherwise.
    """
    # Raw string: '\(' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    return re.sub(r'\([0-9]{2,}\)', '', title).strip()

# Sanity check: the "(2016)" suffix should be stripped from the title.
remove_year_from_title('The Jungle Book (2016)')

'The Jungle Book'

In [44]:
def scrape_opening_weekend(year=2015, pages=2, max_retries=5):
    """Scrape Box Office Mojo's yearly chart, sorted by opening gross.

    Parameters
    ----------
    year : int
        Chart year. It is also appended to each ``release_date`` value
        (parsed as month/day) to build a full date.
    pages : int
        Number of result pages to download, starting at page 1. Defaults
        to 2, the value that used to be hard-coded.
    max_retries : int
        Maximum number of attempts for the initial reachability request.

    Returns
    -------
    pandas.DataFrame
        Columns ``title, studio, total_gross, total_theatres, opening,
        opening_theatres, release_date``, with ``release_date`` parsed to
        datetimes and the remaining columns cleaned by ``clean_mojo_data``.
        Row labels from the individual pages are kept as-is (not reset).

    Raises
    ------
    ValueError
        If ``pages`` is less than 1.
    requests.HTTPError
        If the chart page still answers with an error status after
        ``max_retries`` attempts.
    """
    if pages < 1:
        raise ValueError('pages must be at least 1')

    # Check that the site is answering before pulling tables. The retries are
    # bounded (with a timeout and a short pause) so a persistent error status
    # fails loudly instead of looping forever.
    url = 'http://www.boxofficemojo.com/yearly/chart/?yr='+str(year)+'&p=.htm'
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        response = requests.get(url, timeout=30)
        if response.ok:
            break
        if attempt < attempts - 1:
            time.sleep(1)
    else:
        response.raise_for_status()

    column_names = ['title','studio','total_gross','total_theatres','opening','opening_theatres','release_date']

    # Pull the chart table from each page and combine them in one concat.
    frames = []
    for page in range(1, pages+1):
        url = 'http://www.boxofficemojo.com/yearly/chart/?page='+str(page)+'&view=releasedate&view2=domestic&yr='+str(year)+'&sort=opengross&p=.htm'
        tables = pd.read_html(url)
        # Third table on the page, minus the first/last six rows and the outer
        # columns -- presumably header/footer rows and rank columns; TODO
        # confirm against the live page layout.
        # .copy() so the column rename below acts on an independent frame
        # rather than on a slice of the parsed table.
        table = tables[2].iloc[6:-6,2:9].copy()
        table.columns = column_names
        frames.append(table)
    df = pd.concat(frames)

    # Append the year with an explicit separator so the date format is
    # unambiguous, then reuse the shared cleaning helper for the rest.
    df['release_date'] = pd.to_datetime(df['release_date']+'/'+str(year),format='%m/%d/%Y')
    return clean_mojo_data(df)

In [40]:
def clean_mojo_data(df):
    """Normalise a scraped Box Office Mojo table without modifying ``df``.

    Steps:
      * drop rows with a missing ``opening`` or ``opening_theatres`` value;
      * strip the parenthesised year from ``title``;
      * strip punctuation from ``total_gross`` and ``opening`` (both stay
        strings, because ``remove_punctuation`` returns ``str``);
      * convert ``total_theatres`` and ``opening_theatres`` with ``int``.

    ``release_date`` is left untouched.

    Returns a new DataFrame with the same columns in the same order. The
    frame passed in is not changed.

    Raises whatever the per-value conversions raise, e.g. ``ValueError``
    from ``int`` on a missing ``total_theatres`` value.
    """
    # Drop incomplete rows first so the per-value conversions below only see
    # rows that are actually kept.
    kept = df.dropna(subset=['opening','opening_theatres'])
    # assign() builds a new frame, so neither the caller's frame nor the
    # dropna() result is written to in place.
    return kept.assign(
        title=kept.title.map(remove_year_from_title),
        total_gross=kept.total_gross.map(remove_punctuation),
        total_theatres=kept.total_theatres.map(int),
        opening=kept.opening.map(remove_punctuation),
        opening_theatres=kept.opening_theatres.map(int),
    )

In [45]:
# Try the scraper on a single year and preview the first five rows.
data = scrape_opening_weekend(year=2016)
data.head(n=5)

Unnamed: 0,title,studio,total_gross,total_theatres,opening,opening_theatres,release_date
6,Batman v Superman: Dawn of Justice,WB,320495804,4256,166007347,4242,2016-03-25
7,Deadpool,Fox,361274848,3856,132434639,3558,2016-02-12
8,The Jungle Book,BV,202218804,4028,103261464,4028,2016-04-15
9,Zootopia,BV,317554942,3959,75063401,3827,2016-03-04
10,Kung Fu Panda 3,Fox,141692280,3987,41282042,3955,2016-01-29


In [46]:
# Build the multi-year dataset from scratch, one scrape per year. Starting
# from an empty list (instead of appending to the existing ``data``, which
# already holds 2016) keeps 2016 from appearing twice and makes this cell
# safe to re-run. A single concat also avoids re-copying the frame each year.
yearly_frames = []
for year in range(2010,2017):
    yearly_frames.append(scrape_opening_weekend(year))
data = pd.concat(yearly_frames)

In [51]:
# Persist the combined frame to disk as a pickle.
# NOTE(review): the file name says 2010-2015, but the years scraped in the
# preceding cell run through 2016 -- confirm before renaming.
data.to_pickle('2010-2015_mojo.pkl')