# FILMS REVENUE DATA SCRAPING FROM WIKIPEDIA.

In [1]:
# Import libraries
import requests
import csv
import json
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Web link
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_highest-grossing_films#endnote_Rentals',
    timeout=30,
)
# Fail here on a 4xx/5xx reply instead of handing an error page to the parser.
response.raise_for_status()

In [3]:
# soup Object
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the parsed tree could differ
# between environments. 'html.parser' matches what extract_wikipedia_table uses.
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
# Scrape the data

def extract_wikipedia_table(url, table_class=None):
    """
    Extracts the contents of a table from a Wikipedia page.

    The page is downloaded, the first matching <table> is located, and every
    row after the first is converted into a dictionary keyed by the column
    names found in the first row.

    Args:
    url (str): The URL of the Wikipedia page.
    table_class (str, optional): The class name of the table to target.
        When omitted, the first table on the page is used. Defaults to None.

    Returns:
    list: A list of dictionaries containing the table data, one per non-empty
        row. Cell text is whitespace-stripped. A row with fewer cells than
        there are column names simply has fewer keys; a cell beyond the last
        column name is stored under a positional 'Column N' key.

    Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    ValueError: If no matching table exists on the page.
    """
    # Bound the wait and surface HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    if table_class:
        table = soup.find('table', {'class': table_class})
    else:
        table = soup.find('table')

    # soup.find returns None when nothing matches; report that clearly.
    if table is None:
        raise ValueError('No matching table found at {}'.format(url))

    table_rows = table.find_all('tr')
    if not table_rows:
        return []

    # Take column names from the header row only. Body rows may also contain
    # <th> cells (row headers), which must not be mistaken for column names.
    headers = [th.text.strip() for th in table_rows[0].find_all('th')]

    rows = []
    for tr in table_rows[1:]:
        cells = tr.find_all(['th', 'td'])
        if len(cells) > 0:
            row_data = {}
            for i, cell in enumerate(cells):
                # Fall back to a positional key rather than failing or
                # mislabelling when a row is wider than the header row.
                key = headers[i] if i < len(headers) else 'Column {}'.format(i + 1)
                row_data[key] = cell.text.strip()
            rows.append(row_data)

    return rows

# Page to scrape and the CSS class identifying the table on it
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_films'
table_class = 'wikitable'
table_data = extract_wikipedia_table(url=url, table_class=table_class)

# Show every extracted record, one dictionary per line
for row in table_data:
    print(row)


{'Rank': '1', 'Peak': '1', 'Title': 'Avatar', 'Worldwide gross': '$2,923,706,026', 'Year': '2009', 'Ref': '[# 1][# 2]'}
{'Rank': '2', 'Peak': '1', 'Title': 'Avengers: Endgame', 'Worldwide gross': '$2,797,501,328', 'Year': '2019', 'Ref': '[# 3][# 4]'}
{'Rank': '3', 'Peak': '3', 'Title': 'Avatar: The Way of Water', 'Worldwide gross': '$2,320,250,281', 'Year': '2022', 'Ref': '[# 5][# 6]'}
{'Rank': '4', 'Peak': '1', 'Title': 'Titanic', 'Worldwide gross': 'T$2,257,844,554', 'Year': '1997', 'Ref': '[# 7][# 8]'}
{'Rank': '5', 'Peak': '3', 'Title': 'Star Wars: The Force Awakens', 'Worldwide gross': '$2,068,223,624', 'Year': '2015', 'Ref': '[# 9][# 10]'}
{'Rank': '6', 'Peak': '4', 'Title': 'Avengers: Infinity War', 'Worldwide gross': '$2,048,359,754', 'Year': '2018', 'Ref': '[# 11][# 12]'}
{'Rank': '7', 'Peak': '6', 'Title': 'Spider-Man: No Way Home', 'Worldwide gross': '$1,922,598,800', 'Year': '2021', 'Ref': '[# 13][# 14]'}
{'Rank': '8', 'Peak': '3', 'Title': 'Jurassic World', 'Worldwide gros

In [5]:
# Build a DataFrame from the scraped records (one dictionary per table row)
df = pd.DataFrame(data=table_data)

# Persist the raw scrape to CSV, leaving the index out of the file
df.to_csv(path_or_buf='highest_grossing_films.csv', index=False)

# Preview the first 5 rows
df.head(n=5)

Unnamed: 0,Rank,Peak,Title,Worldwide gross,Year,Ref
0,1,1,Avatar,"$2,923,706,026",2009,[# 1][# 2]
1,2,1,Avengers: Endgame,"$2,797,501,328",2019,[# 3][# 4]
2,3,3,Avatar: The Way of Water,"$2,320,250,281",2022,[# 5][# 6]
3,4,1,Titanic,"T$2,257,844,554",1997,[# 7][# 8]
4,5,3,Star Wars: The Force Awakens,"$2,068,223,624",2015,[# 9][# 10]


## Data Cleaning

In [6]:
# Build the cleaned frame without the `Ref` column; `df` itself is left unchanged
df_clean = df.drop(columns=['Ref'])
df_clean.head()

Unnamed: 0,Rank,Peak,Title,Worldwide gross,Year
0,1,1,Avatar,"$2,923,706,026",2009
1,2,1,Avengers: Endgame,"$2,797,501,328",2019
2,3,3,Avatar: The Way of Water,"$2,320,250,281",2022
3,4,1,Titanic,"T$2,257,844,554",1997
4,5,3,Star Wars: The Force Awakens,"$2,068,223,624",2015


In [7]:
# Count the missing values in each column
df_clean.isna().sum()

Rank               0
Peak               0
Title              0
Worldwide gross    0
Year               0
dtype: int64

In [8]:
# Print a summary of df_clean: index range, column names, non-null counts,
# dtypes and memory usage
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Rank             50 non-null     object
 1   Peak             50 non-null     object
 2   Title            50 non-null     object
 3   Worldwide gross  50 non-null     object
 4   Year             50 non-null     object
dtypes: object(5)
memory usage: 2.1+ KB


In [9]:
# Distinct film titles present in the dataset
df_clean.loc[:, 'Title'].unique()

array(['Avatar', 'Avengers: Endgame', 'Avatar: The Way of Water',
       'Titanic', 'Star Wars: The Force Awakens',
       'Avengers: Infinity War', 'Spider-Man: No Way Home',
       'Jurassic World', 'The Lion King', 'The Avengers', 'Furious 7',
       'Top Gun: Maverick', 'Frozen II', 'Barbie',
       'Avengers: Age of Ultron', 'Inside Out 2 †',
       'The Super Mario Bros. Movie', 'Black Panther',
       'Harry Potter and the Deathly Hallows – Part 2',
       'Star Wars: The Last Jedi', 'Jurassic World: Fallen Kingdom',
       'Frozen', 'Beauty and the Beast', 'Incredibles 2',
       'The Fate of the Furious', 'Iron Man 3', 'Minions',
       'Captain America: Civil War', 'Aquaman',
       'The Lord of the Rings: The Return of the King',
       'Spider-Man: Far From Home', 'Captain Marvel',
       'Transformers: Dark of the Moon', 'Skyfall',
       'Transformers: Age of Extinction', 'The Dark Knight Rises',
       'Joker', 'Star Wars: The Rise of Skywalker', 'Toy Story 4',
       'T

In [12]:
# Rename 'Rank' to 'Position' and 'Worldwide gross' to 'Revenue'.
# The result is reassigned instead of using inplace=True, so the frame shown by
# earlier cells is not mutated behind the reader's back.
df_clean = df_clean.rename(columns={'Rank': 'Position', 'Worldwide gross': 'Revenue'})
# NOTE(review): 'Revenue' is still text (object dtype) and at least one value
# carries a stray prefix ('T$2,257,844,554'); it needs parsing before any
# numeric analysis.
df_clean.head()

Unnamed: 0,Position,Peak,Title,Revenue,Year
0,1,1,Avatar,"$2,923,706,026",2009
1,2,1,Avengers: Endgame,"$2,797,501,328",2019
2,3,3,Avatar: The Way of Water,"$2,320,250,281",2022
3,4,1,Titanic,"T$2,257,844,554",1997
4,5,3,Star Wars: The Force Awakens,"$2,068,223,624",2015


In [13]:
# Write the cleaned DataFrame to its own CSV file, leaving the index out
df_clean.to_csv(path_or_buf='highest_grossing_films_clean.csv', index=False)