### Importing the necessary libraries

In [237]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

### Scraping three months of data for each of three years


In [246]:
# Scrape targets: every (year, month) combination of the two lists is requested.
# Values are kept as strings because they are interpolated straight into the URL.
months = [
    'july',
    'august',
    'september',
]
years = [
    '2023',
    '2022',
    '2021',
]

### Defining the output columns of the dataframe

In [247]:
# Labels applied, in order, to the cells of each scraped table row; the
# trailing 'year' label is not read from the page but set by the scraper.
output_columns = [
    'date',
    'day',
    'day_no',
    'top10_gross',
    'change_yesterday',
    'change_lastweek',
    'release',
    '#1 Release',
    'top_gross',
    'year',
]

# Accumulator that receives one dict per scraped table row.
all_data = list()

### Using Beautiful Soup to extract the data from nine pages (i.e. three months for each of three years)

In [248]:
# Scrape the daily box-office table for every (year, month) combination and
# collect one dict per table row in `all_data`.

# Start from an empty accumulator so that re-running this cell does not append
# a second copy of every row. clear() keeps the same list object.
all_data.clear()

# Labels for the cells read from the page. 'year' is supplied from the loop
# variable instead, so it is excluded by name rather than by position: the
# cell-to-label mapping then stays aligned wherever 'year' sits in
# output_columns.
scraped_columns = [name for name in output_columns if name != 'year']

for year in years:
    for month in months:
        # Define the URL for the specific year and month
        url = f'https://www.boxofficemojo.com/daily/{year}?interval={month}&view=month'

        # Send an HTTP GET request. The timeout (seconds) stops a stalled
        # connection from hanging the notebook; a failed request is reported
        # and skipped, in the same best-effort way as a non-200 response.
        try:
            r = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Error: {exc} for URL: {url}")
            continue

        # Anything other than status code 200 is reported and skipped
        if r.status_code != 200:
            print(f"Error: {r.status_code} for URL: {url}")
            continue

        page_soup = bs(r.content, "html.parser")

        # Find the table containing the data
        tables = page_soup.find_all('table', attrs={'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})
        if not tables:
            print(f"No table found for year: {year}, month: {month}")
            continue

        # Ignore the first row because it has the header data
        for row in tables[0].find_all('tr')[1:]:
            row_cols = row.find_all('td')

            # A row without <td> cells carries no data; skipping it avoids
            # appending a record that holds nothing but the year.
            if not row_cols:
                continue

            # Pair each cell's stripped text with its column label
            row_data = dict(zip(scraped_columns, (col.text.strip() for col in row_cols)))

            # Record which year's page this row came from
            row_data['year'] = year

            all_data.append(row_data)

### Storing the data in a dataframe

In [249]:
# Column order for the final frame: 'year' first, then the remaining labels in
# their existing order. This is built as a new list so that output_columns is
# left untouched -- the scraping cell pairs table cells with those labels, and
# reordering the shared list in place would change what a re-run produces.
column_order = ['year'] + [name for name in output_columns if name != 'year']

# Passing `columns=` selects and orders the record keys in one step, and still
# yields an empty frame with the expected columns when nothing was scraped.
df = pd.DataFrame(all_data, columns=column_order)

In [250]:
# Show the last rows of the frame (tail() defaults to five) as a quick check
# of what was scraped.
df.tail()

Unnamed: 0,year,date,day,day_no,top10_gross,change_yesterday,change_lastweek,release,#1 Release,top_gross
271,2021,Sep 5,Sunday,248,"$34,538,599",-3%,+136.3%,29,Shang-Chi and the Legend of the Ten Rings,"$22,696,386"
272,2021,Sep 4,Saturday,247,"$35,596,201",-5.9%,+53.6%,29,Shang-Chi and the Legend of the Ten Rings,"$23,190,043"
273,2021,Sep 3,Friday,246,"$37,808,515",+906.2%,+103.2%,29,Shang-Chi and the Legend of the Ten Rings,"$29,502,259"
274,2021,Sep 2,Thursday,245,"$3,757,426",-13.1%,-16.9%,31,Candyman,"$1,310,410"
275,2021,Sep 1,Wednesday,244,"$4,324,762",-22.6%,-18.3%,32,Candyman,"$1,470,325"


In [251]:
# (rows, columns) of the assembled frame.
df.shape

(276, 10)