This program scrapes data from boxofficemojo.com, puts it into a dataframe and cleans the dataframe to answer these questions:

1. What is the top-grossing movie in each season per year?

2. What is the top-grossing domestic movie for the whole year, per year?

3. What is the top-grossing movie of all time?

4. On which weekday do people watch the most movies this year?

5. What about during quarantine?

6. In which season do people watch the most movies?

7. On which holiday do the most people watch movies?

In [1]:
## Imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from IPython.display import display

In [8]:
## Main function
def main():
    """Create the master dataframe and show a preview of it.

    The dataframe is created empty with the columns Season, Year, Movie,
    Cummulative Gross, Releases and Average Gross. Filling it from
    get_seasonal_data() is not wired in yet.

    Returns:
        None. The first ten rows are rendered with IPython's display().
    """

    ## Create dataframe
    # Intended layout (season label shown once per group of movies):
    # Season    Year    Movie      Cummulative Gross    Releases    Average Gross
    #                   Movie 1
    #                   Movie 2
    # Winter            Movie 3
    #
    #                   Movie 1
    #                   Movie 2
    # Spring            Movie 3
    # etc...
    complete_df = pd.DataFrame(
        columns=['Season', 'Year', 'Movie', 'Cummulative Gross', 'Releases', 'Average Gross']
    )

    # TODO: populate complete_df from get_seasonal_data()

    # A bare `complete_df.head(10)` inside a function is evaluated and thrown
    # away (only the last expression of a notebook cell is auto-rendered),
    # so the preview has to be displayed explicitly.
    display(complete_df.head(10))

In [3]:
## Method to get soup
def get_soup(url, timeout=10):
    """Download *url* and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed BeautifulSoup object when the server answers with
        HTTP 200; otherwise None, after printing an error message.
    """

    # Get response. Without a timeout requests may wait forever on a stalled
    # connection, and network failures raise instead of returning a response.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print("Error: Request failed:", error)
        return None

    # Check response
    if response.status_code != 200:

        # Print error
        print("Error: Response code", response.status_code)
        return None

    # Get soup
    return BeautifulSoup(response.content, "html.parser")

In [11]:
## Method to get seasonal data
def _cell_texts(soup, css_class):
    """Return the text of every <td> in *soup* found for class *css_class*."""
    return [cell.getText() for cell in soup.find_all("td", class_=css_class)]


def _scrape_season_rows(season):
    """Fetch one season's page and return its table rows as lists of strings.

    Each row is [Season, Year, Movie, Cummulative Gross, Releases,
    Average Gross]. An empty list is returned when the page could not be
    fetched.
    """

    # Base url
    url = "https://www.boxofficemojo.com/season/" + season + "/?grossesOption=calendarGrosses"

    # Get soup
    soup = get_soup(url)
    if soup is None:
        return []

    years = _cell_texts(soup, "a-text-left mojo-header-column mojo-truncate mojo-field-type-year mojo-sort-column")
    releases = _cell_texts(soup, "a-text-right mojo-field-type-positive_integer")
    movies = _cell_texts(soup, "a-text-left mojo-field-type-release mojo-cell-wide")

    # Money on the website has the same class for all three types of money,
    # so the cells come back interleaved: cumulative, average, top release.
    money = _cell_texts(soup, "a-text-right mojo-field-type-money")
    cumulative_gross = money[0::3]
    average_gross = money[1::3]

    # zip pairs the columns positionally, which avoids repeated
    # years.index(...) lookups and stops at the shortest column instead of
    # raising IndexError if the page is missing a cell.
    label = season.capitalize()
    return [
        [label, year, movie, gross, release_count, average]
        for year, movie, gross, release_count, average
        in zip(years, movies, cumulative_gross, releases, average_gross)
    ]


## Method to get seasonal data
def get_seasonal_data():
    """Scrape the seasonal box-office tables and combine them per season.

    Returns:
        A dataframe with the columns Season, Year, Movie, Cummulative Gross,
        Releases and Average Gross. It holds one row per season, where every
        other column contains that season's values joined with '<br>'. If
        nothing could be scraped, an empty dataframe with the same columns
        is returned.
    """

    # Create list of seasons and the column layout of the seasonal dataframe
    seasons = ["winter", "spring", "summer", "fall"]
    columns = ["Season", "Year", "Movie", "Cummulative Gross", "Releases", "Average Gross"]

    # Collect the rows of every season, then build the dataframe in one go
    rows = []
    for season in seasons:
        rows.extend(_scrape_season_rows(season))

    seasonal_df = pd.DataFrame(rows, columns=columns)

    # Nothing scraped: there is nothing to combine
    if seasonal_df.empty:
        return seasonal_df

    # Combine seasons column: one row per season, values joined with <br>.
    # Every row carries a non-empty season label, so grouping on the column
    # directly is enough. This runs once, after all seasons are collected.
    joiners = {column: '<br>'.join for column in columns if column != "Season"}
    seasonal_df = seasonal_df.groupby("Season").agg(joiners).reset_index()

    # Return dataframe
    return seasonal_df

In [10]:
## Run main
if __name__ == "__main__":
    # Only kick off the pipeline when this file is executed directly
    main()

In [None]:
# NOTE(review): `get` is not defined anywhere in the visible code, so this
# line raises NameError when run. It looks like an unfinished call (perhaps
# get_seasonal_data()) -- confirm the intent, then complete or remove it.
print(get)