In [4]:
from dataclasses import dataclass
import pathlib
import pandas as pd
import requests
from requests_html import HTML

In [8]:
@dataclass
class ScrapeBoxOffice:
    """Download a Box Office Mojo yearly table and turn it into a DataFrame.

    Attributes:
        base_endpoint: URL prefix; the year is appended to it when set.
        year: Year to scrape. When it is not an ``int`` the bare
            ``base_endpoint`` is requested and outputs are named ``world``.
        save_raw: If true, the downloaded HTML is written to
            ``<output_dir>/html/<name>.html``.
        save: If true, ``run()`` writes the table to ``<output_dir>/<name>.csv``.
        output_dir: Directory that receives the saved files.
        table_selector: CSS selector used to locate the table element.
    """
    base_endpoint: str = "https://www.boxofficemojo.com/year/world/"
    year: int = None
    save_raw: bool = False
    save: bool = False
    output_dir: str = "."
    table_selector: str = '.imdb-scroll-table'
    # Deliberately unannotated: plain class-level defaults rather than
    # dataclass fields, so they stay out of __init__/__repr__/__eq__.
    table_data = []
    table_header_names = []
    df = pd.DataFrame()

    def __post_init__(self):
        # Give each instance its own containers so that mutating one
        # scraper's results can never leak into another through the
        # class-level defaults above.
        self.table_data = []
        self.table_header_names = []
        self.df = pd.DataFrame()

    @property
    def name(self):
        """Base name for output files: the year if it is an int, else 'world'."""
        return self.year if isinstance(self.year, int) else 'world'

    def get_endpoint(self):
        """Return the URL to request, with ``<year>/`` appended when a year is set."""
        endpoint = self.base_endpoint
        if isinstance(self.year, int):
            endpoint = f"{endpoint}{self.year}/"
        return endpoint

    def get_output_dir(self):
        """Return ``output_dir`` as a ``pathlib.Path``."""
        return pathlib.Path(self.output_dir)

    def extract_html_str(self, endpoint=None):
        """Fetch the page and return ``(html_text, status_code)``.

        Args:
            endpoint: URL to fetch; defaults to ``get_endpoint()``.

        Returns:
            A tuple ``(html_text, status)``. ``html_text`` is ``None`` unless
            the response status is exactly 200.
        """
        url = endpoint if endpoint is not None else self.get_endpoint()
        # With stream=True the connection is only released once the body is
        # consumed or the response is closed; the context manager guarantees
        # the latter even for non-200 responses whose body is never read.
        with requests.get(url, stream=True) as r:
            status = r.status_code
            if status != 200:
                return None, status
            html_text = r.text
        if self.save_raw:
            raw_output_dir = self.get_output_dir() / 'html'
            raw_output_dir.mkdir(exist_ok=True, parents=True)
            raw_path = raw_output_dir / f"{self.name}.html"
            raw_path.write_text(html_text, encoding="utf-8")
        return html_text, status

    def parse_html(self, html_str=''):
        """Extract header names and row values from the first matching table.

        Args:
            html_str: HTML document to parse.

        Returns:
            ``(table_data, table_header_names)`` where ``table_data`` is a
            list of rows (each a list of cell texts), or ``None`` when no
            element matches ``table_selector`` or the table has no rows.
            On success both values are also stored on the instance.
        """
        r_html = HTML(html=html_str)
        tables = r_html.find(self.table_selector)
        if not tables:
            return None
        rows = tables[0].find("tr")
        if not rows:
            # A table without any <tr> has no header row to read.
            return None
        # The first row supplies the column names; the rest are data rows.
        header_names = [th.text for th in rows[0].find('th')]
        table_data = [[td.text for td in row.find("td")] for row in rows[1:]]
        self.table_data = table_data
        self.table_header_names = header_names
        return self.table_data, self.table_header_names

    def to_df(self, data=[], columns=[]):
        """Build a DataFrame from row data and column names.

        The list defaults are kept for interface compatibility; this method
        only passes them through to ``pd.DataFrame``.
        """
        return pd.DataFrame(data, columns=columns)

    def run(self, save=False):
        """Fetch, parse and optionally save the table; return it as a DataFrame.

        Args:
            save: Write the CSV when true. When left as ``False`` the
                instance's ``save`` attribute decides instead.

        Returns:
            The parsed table as a ``pandas.DataFrame`` (also stored in ``self.df``).

        Raises:
            RuntimeError: If the response status is not 2xx, or if no table
                could be found in the returned HTML.
        """
        save = self.save if save is False else save
        endpoint = self.get_endpoint()
        html_str, status = self.extract_html_str(endpoint=endpoint)
        if status not in range(200, 300):
            raise RuntimeError(f"Extraction failed, endpoint status {status} at {endpoint}")
        parsed = self.parse_html(html_str if html_str is not None else '')
        if parsed is None:
            # parse_html signals "nothing found" with None; surface that as a
            # clear error instead of failing on tuple unpacking.
            raise RuntimeError(
                f"No table matching selector {self.table_selector!r} found at {endpoint}"
            )
        data, headers = parsed
        df = self.to_df(data=data, columns=headers)
        self.df = df
        if save:
            out_dir = self.get_output_dir()
            # The directory is otherwise only created as a side effect of
            # save_raw, so make sure it exists before writing the CSV.
            out_dir.mkdir(parents=True, exist_ok=True)
            df.to_csv(out_dir / f'{self.name}.csv', index=False)
        return self.df
    

In [42]:
# Scrape the 2022 chart; with save and save_raw both on, the raw HTML and the
# parsed CSV are written under the 'data' directory.
scraper = ScrapeBoxOffice(
    year=2022,
    save=True,
    save_raw=True,
    output_dir='data',
)
df = scraper.run()

In [43]:
# Show the scraped table as this cell's output.
df

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1
0,1,The Batman,"$603,263,177","$302,863,177",50.2%,"$300,400,000",49.8%
1,2,Water Gate Bridge,"$480,203,271","$117,294",<0.1%,"$480,085,977",100%
2,3,Uncharted,"$337,762,979","$126,362,979",37.4%,"$211,400,000",62.6%
3,4,Too Cool to Kill,"$217,255,728","$185,882",<0.1%,"$217,069,846",99.9%
4,5,Scream,"$139,813,281","$81,413,281",58.2%,"$58,400,000",41.8%
...,...,...,...,...,...,...,...
195,196,Pups Alone,"$146,178",-,-,"$146,178",100%
196,197,Bachchhan Paandey,"$144,003",-,-,"$144,003",100%
197,198,Kernagis,"$143,637",-,-,"$143,637",100%
198,199,Stop-Zemlia,"$143,434",-,-,"$143,434",100%


In [44]:
# Write a copy of the table next to the notebook. index=False keeps the
# positional index out of the file, matching how ScrapeBoxOffice.run() saves
# its CSV; otherwise the index reappears as an "Unnamed: 0" column on re-read.
df.to_csv("./2022.csv", index=False)