In [1]:
import bs4
from bs4 import BeautifulSoup
import requests as rq
import re
import pandas as pd
import numpy as np
import datetime 
import os

In [8]:
def convdollar(x):
    
    """
    Converts box-office gross to easily readable string 
    """
    
    billion = 1000000000
    million = 1000000
    
    x = float(x)*100000
    
    if (x >= 1000000000):
    
        y = f"${str(round(x/billion,2))} billion"
    else:
        y = f"${str(round(x/million,2))} million"
        
    return(y)

In [9]:
def scrape():
    """
    Gets all box office data from 2019 to present from boxofficemojo.com
    """
    years=[str(a) for a in range(2019,2020)]
    df_list=[]
    for year in years:
        r=rq.get('https://www.boxofficemojo.com/yearly/chart/?view2=worldwide&yr=%s&p=.htm' % year)
        print('Box Office data for %s scraped' % year)
        p=BeautifulSoup(r.text,'html.parser')
            ### Look for the table ### 
        b=p.find_all('table')
        
        ### Usually the fourth table object on page ### 
        tb=b[3].find_all('td')
         ## Each data field is found in a <td> element in the fourth table. Store all data in a list ## 
        data=[]
        for i in tb:
            if i.find('a')!=None:
                data.append(i.find('a').contents[0])
            elif i.find('font')!=None:
                 data.append(i.find('font').contents[0])
            elif i.find('b')!=None:
                data.append(i.find('b').contents[0])
                ### Still a <b> tag left for <font> tags ## 
        data=[a.contents[0] if type(a)!=bs4.element.NavigableString else a for a in data]
        
        ### Strip special characters ### 
        data=[re.sub('[^A-Za-z0-9-. ]+', '', a) for a in data]
        
        ### Fill NaNs ### 
        data=[np.nan if a =='na' else a for a in data]
        
        ### Define the feature names ###
        columns=['bo_year_rank','title','studio','worldwide-gross','domestic-gross','domestic-pct','overseas-gross','overseas-pct']
        
        ### First 6 elements are column headers # 
        to_df=data[6:]
        
        ### Escape clause in case the layout changes from year to year ### 
        if len(to_df)%len(columns) != 0:
            print('Possible table misalignment in table for year %s' % year)
            break
            
            ### Convert to pandas dataframe ### 
        
        nrow=int(len(to_df)/len(columns))
        df=pd.DataFrame(np.array(to_df).reshape(nrow,8),columns=columns)
        df[['worldwide-gross','domestic-gross','overseas-gross']]=df[['worldwide-gross','domestic-gross','overseas-gross']].apply(convdollar)
        df['bo_year']=int(year)
        df_list.append(df)

        main=pd.concat(df_list)
    
        # Store data into csv # 
        #main.to_csv(os.path.join("output","current_boxoffice_mojo.csv"))
        return (main)
        print (main)
  

In [10]:
if __name__ == "__main__": 

    dirtymovies_df=scrape()

Box Office data for 2019 scraped


TypeError: ("cannot convert the series to <class 'float'>", 'occurred at index worldwide-gross')

In [37]:
 dirtymovies_df

Unnamed: 0,bo_year_rank,title,studio,worldwide-gross,domestic-gross,domestic-pct,overseas-gross,overseas-pct,bo_year
0,1,Avengers Endgame,BV,1.342700e+09,394000000.0,29.3,948700000.0,70.7,2019
1,2,Captain Marvel,BV,1.111400e+09,414500000.0,37.3,696800000.0,62.7,2019
2,3,The Wandering Earth,CMC,6.998000e+08,5900000.0,0.8,693900000.0,99.2,2019
3,4,How to Train Your Dragon The Hidden World,Uni.,5.164000e+08,159700000.0,30.9,356700000.0,69.1,2019
4,5,Alita Battle Angel,Fox,4.044000e+08,85700000.0,21.2,318700000.0,78.8,2019
5,6,Shazam,WB NL,3.462000e+08,131500000.0,38.0,214700000.0,62.0,2019
6,7,Dumbo 2019,BV,3.287000e+08,107500000.0,32.7,221200000.0,67.3,2019
7,8,Us,Uni.,2.496000e+08,173000000.0,69.3,76600000.0,30.7,2019
8,9,Glass,Uni.,2.470000e+08,111000000.0,45.0,135900000.0,55.0,2019
9,10,The LEGO Movie 2 The Second Part,WB,1.905000e+08,105700000.0,55.5,84800000.0,44.5,2019


In [41]:
## Cleaning data by dropping the unessery rows (10-87) drops the bottom 78 trows
dirtymovies_df[:-78]


Unnamed: 0,bo_year_rank,title,studio,worldwide-gross,domestic-gross,domestic-pct,overseas-gross,overseas-pct,bo_year
0,1,Avengers Endgame,BV,1342700000.0,394000000.0,29.3,948700000.0,70.7,2019
1,2,Captain Marvel,BV,1111400000.0,414500000.0,37.3,696800000.0,62.7,2019
2,3,The Wandering Earth,CMC,699800000.0,5900000.0,0.8,693900000.0,99.2,2019
3,4,How to Train Your Dragon The Hidden World,Uni.,516400000.0,159700000.0,30.9,356700000.0,69.1,2019
4,5,Alita Battle Angel,Fox,404400000.0,85700000.0,21.2,318700000.0,78.8,2019
5,6,Shazam,WB NL,346200000.0,131500000.0,38.0,214700000.0,62.0,2019
6,7,Dumbo 2019,BV,328700000.0,107500000.0,32.7,221200000.0,67.3,2019
7,8,Us,Uni.,249600000.0,173000000.0,69.3,76600000.0,30.7,2019
8,9,Glass,Uni.,247000000.0,111000000.0,45.0,135900000.0,55.0,2019
9,10,The LEGO Movie 2 The Second Part,WB,190500000.0,105700000.0,55.5,84800000.0,44.5,2019


In [42]:
#create new dataframe for top 10 
toptenmovies_df = dirtymovies_df[:-78]


In [43]:
toptenmovies_df

Unnamed: 0,bo_year_rank,title,studio,worldwide-gross,domestic-gross,domestic-pct,overseas-gross,overseas-pct,bo_year
0,1,Avengers Endgame,BV,1342700000.0,394000000.0,29.3,948700000.0,70.7,2019
1,2,Captain Marvel,BV,1111400000.0,414500000.0,37.3,696800000.0,62.7,2019
2,3,The Wandering Earth,CMC,699800000.0,5900000.0,0.8,693900000.0,99.2,2019
3,4,How to Train Your Dragon The Hidden World,Uni.,516400000.0,159700000.0,30.9,356700000.0,69.1,2019
4,5,Alita Battle Angel,Fox,404400000.0,85700000.0,21.2,318700000.0,78.8,2019
5,6,Shazam,WB NL,346200000.0,131500000.0,38.0,214700000.0,62.0,2019
6,7,Dumbo 2019,BV,328700000.0,107500000.0,32.7,221200000.0,67.3,2019
7,8,Us,Uni.,249600000.0,173000000.0,69.3,76600000.0,30.7,2019
8,9,Glass,Uni.,247000000.0,111000000.0,45.0,135900000.0,55.0,2019
9,10,The LEGO Movie 2 The Second Part,WB,190500000.0,105700000.0,55.5,84800000.0,44.5,2019


In [48]:
##clean new dataframe for top 10 by removing rows domestic-pct, overseas-pct
toptenmovies_df.drop("domestic-pct", axis=1).drop("overseas-pct", axis=1)


Unnamed: 0,bo_year_rank,title,studio,worldwide-gross,domestic-gross,overseas-gross,bo_year
0,1,Avengers Endgame,BV,1342700000.0,394000000.0,948700000.0,2019
1,2,Captain Marvel,BV,1111400000.0,414500000.0,696800000.0,2019
2,3,The Wandering Earth,CMC,699800000.0,5900000.0,693900000.0,2019
3,4,How to Train Your Dragon The Hidden World,Uni.,516400000.0,159700000.0,356700000.0,2019
4,5,Alita Battle Angel,Fox,404400000.0,85700000.0,318700000.0,2019
5,6,Shazam,WB NL,346200000.0,131500000.0,214700000.0,2019
6,7,Dumbo 2019,BV,328700000.0,107500000.0,221200000.0,2019
7,8,Us,Uni.,249600000.0,173000000.0,76600000.0,2019
8,9,Glass,Uni.,247000000.0,111000000.0,135900000.0,2019
9,10,The LEGO Movie 2 The Second Part,WB,190500000.0,105700000.0,84800000.0,2019


In [49]:
Cleanedtoptenmovies_df= toptenmovies_df.drop("domestic-pct", axis=1).drop("overseas-pct", axis=1) 

In [50]:
Cleanedtoptenmovies_df

Unnamed: 0,bo_year_rank,title,studio,worldwide-gross,domestic-gross,overseas-gross,bo_year
0,1,Avengers Endgame,BV,1342700000.0,394000000.0,948700000.0,2019
1,2,Captain Marvel,BV,1111400000.0,414500000.0,696800000.0,2019
2,3,The Wandering Earth,CMC,699800000.0,5900000.0,693900000.0,2019
3,4,How to Train Your Dragon The Hidden World,Uni.,516400000.0,159700000.0,356700000.0,2019
4,5,Alita Battle Angel,Fox,404400000.0,85700000.0,318700000.0,2019
5,6,Shazam,WB NL,346200000.0,131500000.0,214700000.0,2019
6,7,Dumbo 2019,BV,328700000.0,107500000.0,221200000.0,2019
7,8,Us,Uni.,249600000.0,173000000.0,76600000.0,2019
8,9,Glass,Uni.,247000000.0,111000000.0,135900000.0,2019
9,10,The LEGO Movie 2 The Second Part,WB,190500000.0,105700000.0,84800000.0,2019


  if x is >= 1000000000 then convert to "1.xx billion"  to make more readable we are going from float to string 
  else convert to xxxmillion 
  
  Next steps for arjun figure out function above then put finished dataframe into mongodbcollection 
  