# Web Scrapping

## Overview

In this project, we will collect box office data for movies from [Box Office Mojo](https://www.boxofficemojo.com/year/) using web scraping. We will use Python libraries such as `requests` for HTTP requests and `BeautifulSoup` for parsing HTML.

#### Import Required Libraries


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from bs4 import BeautifulSoup #to convert dataset into HTML code
import requests

In [2]:
url = "https://www.boxofficemojo.com/year/?ref_=bo_nb_di_secondarytab"

In [3]:
content = requests.get(url)

In [4]:
content

<Response [200]>

In [5]:
data = content.text

In [6]:
data[0:500]

'<!doctype html><html class="a-no-js" data-19ax5a9jf="dingo"><head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<script type=\'text/javascript\'>var ue_t0=ue_t0||+new Date();</script>\n<script type=\'text/javascript\'>\nwindow.ue_ihb = (window.ue_ihb || window.ueinit || 0) + 1;\nif (window.ue_ihb === 1) {\n\nvar ue_csm = window,\n    ue_hob = +new Date();\n(function(d){var e=d.ue=d.ue||{},f=Date.now||function(){return+new Date};e.d=function(b){return f()-(b?0:d.ue_t0)};e.'

In [7]:
soup = BeautifulSoup(data, 'html.parser')

In [8]:
len(soup)

2

In [9]:
# we are collecting all the table rows
tr = soup.find_all('tr')

In [10]:
year = []
totalGross = []
ly = []
totalRelease = []
average = []
bestRelease = []
link = []

In [11]:
# We are going to traverse every row and collect all data from website into lists
for row in tr[1:]:
    li = row.find_all('td')
    year.append(li[0].text)
    totalGross.append(li[1].text)
    ly.append(li[2].text)
    totalRelease.append(li[3].text)
    average.append(li[4].text)
    bestRelease.append(li[5].text)
    link.append("https://boxofficemojo.com/year/"+li[0].text)

In [12]:
# Check data in the list
len(year), len(totalGross), len(ly), len(totalRelease), len(average), len(bestRelease), len(link)

(49, 49, 49, 49, 49, 49, 49)

In [13]:
# Create a proper dataset
data = {
    'Year' : year,
    'Total Gross' : totalGross,
    'LY+-' : ly,
    'Total Release' : totalRelease,
    'Average Gross' : average,
    'Best Movie of Year' : bestRelease,
    'Link' : link
}

In [14]:
# Creating a DataFrame
df = pd.DataFrame(data)

In [15]:
df.head()

Unnamed: 0,Year,Total Gross,LY+-,Total Release,Average Gross,Best Movie of Year,Link
0,2025,"$5,332,398,608",-,404,"$13,199,006",A Minecraft Movie,https://boxofficemojo.com/year/2025
1,2024,"$8,570,012,657",-3.8%,675,"$12,696,315",Inside Out 2,https://boxofficemojo.com/year/2024
2,2023,"$8,907,111,984",+20.9%,592,"$15,045,797",Barbie,https://boxofficemojo.com/year/2023
3,2022,"$7,369,914,732",+64.4%,502,"$14,681,105",Top Gun: Maverick,https://boxofficemojo.com/year/2022
4,2021,"$4,483,016,589",+112.1%,442,"$10,142,571",Spider-Man: No Way Home,https://boxofficemojo.com/year/2021


In [16]:
df.shape

(49, 7)

In [17]:
rank = []
movie = []
gross = []
theater = []
total_gross = []
release_date = []
distributor = []

In [18]:
for url in df['Link']:
    soup = requests.get(url)
    data = BeautifulSoup(soup.text, 'html.parser')
    rows = data.find_all('tr')
    for row in rows[1:]:
        li = row.find_all('td')
        rank.append(li[0].text)
        movie.append(li[1].text)
        gross.append(li[5].text)
        theater.append(li[6].text)
        total_gross.append(li[7].text)
        release_date.append(li[8].text)
        distributor.append(li[9].text)

In [19]:
data = {
    'Rank' : rank,
    'Movie Name' : movie,
    'Gross' : gross,
    'Theater' : theater,
    'Total Gross' : total_gross,
    'Release Date' : release_date,
    'Distributor' : distributor
}

In [20]:
final_df = pd.DataFrame(data)
final_df

Unnamed: 0,Rank,Movie Name,Gross,Theater,Total Gross,Release Date,Distributor
0,1,A Minecraft Movie,"$423,949,195",4289,"$423,949,195",Apr 4,Warner Bros.\n\n
1,2,Lilo & Stitch,"$421,275,270",4410,"$421,275,270",May 23,Walt Disney Studios Motion Pictures\n\n
2,3,Jurassic World: Rebirth,"$317,672,050",4324,"$317,672,050",Jul 2,Universal Pictures\n\n
3,4,Superman,"$316,062,454",4275,"$316,062,454",Jul 11,Warner Bros.\n\n
4,5,Sinners,"$278,578,513",3518,"$278,578,513",Apr 18,Warner Bros.\n\n
...,...,...,...,...,...,...,...
8822,5,Exorcist II: The Heretic,"$30,749,142",703,"$30,749,142",Jun 17,Warner Bros.\n\n
8823,6,The Turning Point,"$25,815,410",5,"$25,933,445",Nov 14,Twentieth Century Fox\n\n
8824,7,Looking for Mr. Goodbar,"$22,512,655",110,"$22,512,655",Oct 19,Paramount Pictures\n\n
8825,8,Saturday Night Fever,"$18,234,852",726,"$94,213,184",Dec 16,Paramount Pictures\n\n


In [21]:
final_df.head()

Unnamed: 0,Rank,Movie Name,Gross,Theater,Total Gross,Release Date,Distributor
0,1,A Minecraft Movie,"$423,949,195",4289,"$423,949,195",Apr 4,Warner Bros.\n\n
1,2,Lilo & Stitch,"$421,275,270",4410,"$421,275,270",May 23,Walt Disney Studios Motion Pictures\n\n
2,3,Jurassic World: Rebirth,"$317,672,050",4324,"$317,672,050",Jul 2,Universal Pictures\n\n
3,4,Superman,"$316,062,454",4275,"$316,062,454",Jul 11,Warner Bros.\n\n
4,5,Sinners,"$278,578,513",3518,"$278,578,513",Apr 18,Warner Bros.\n\n


In [22]:
final_df.shape

(8827, 7)