In [None]:
import re  # Regular expressions for text processing
import csv  # CSV module to save extracted data
import requests  # For making HTTP requests to Wikipedia
from bs4 import BeautifulSoup  # BeautifulSoup for web scraping

In [None]:
# Wikipedia URL for highest-grossing films
URL = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"
# Base Wikipedia URL for deep links to movie pages
urlfordeep = "https://en.wikipedia.org/wiki/"

# Fetch the list page. Without a timeout, requests waits indefinitely on a
# stalled connection, which would freeze the notebook; 30 seconds bounds it.
response = requests.get(URL, timeout=30)
response.raise_for_status()  # Raise an error if the request fails

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Take the first table carrying the "wikitable" class.
# NOTE(review): assumes the first such table is the ranking table - confirm
# if the page layout changes.
table = soup.find("table", class_="wikitable")
if table is None:
    # Fail here with a clear message instead of a later AttributeError on None
    raise RuntimeError(f"No table with class 'wikitable' found at {URL}")

# List to store extracted movie data (one [title, year, gross, director, country] per film)
movies = []


In [None]:
def clean_director(director) -> str:
    """Normalise raw director text scraped from an infobox cell.

    Footnote markers in square brackets (``[1]``, ``[a]``) are removed, every
    line of the input is treated as one name, and the names are joined with
    ``", "`` so that co-directors listed on separate lines stay separated.
    Within a name only letters (including accented ones), spaces, commas,
    periods, hyphens and apostrophes are kept; runs of whitespace collapse to
    a single space.

    Args:
        director: Raw text of the "Directed by" cell.

    Returns:
        The cleaned name, or several names separated by ``", "``. An empty
        string is returned when nothing usable remains.
    """
    # Remove bracketed footnotes as a whole; filtering character by character
    # would leave the letter of a marker such as "[a]" glued to the name.
    without_refs = re.sub(r"\[[^\]]*\]", "", director)

    names = []
    for line in without_refs.splitlines():
        kept = "".join(
            ch for ch in line
            if ch.isalpha() or ch.isspace() or ch in ",.-'’"
        )
        # Collapse tabs, non-breaking spaces and repeated blanks
        kept = " ".join(kept.split())
        if kept:
            names.append(kept)
    return ", ".join(names)


def cleangross(gross) -> str:
    """Extract the dollar amount from a raw box-office cell.

    The first ``$`` followed by comma-grouped ASCII digits is returned as is,
    so digits that belong to surrounding text (footnote markers such as
    ``[1]`` or a prefix before the dollar sign) do not end up in the figure.
    When the text contains no such amount, the function falls back to keeping
    only digits, commas and dollar signs.

    Args:
        gross: Raw text of the gross cell.

    Returns:
        The amount such as ``"$2,923,706,026"``, or the filtered characters
        when no ``$``-amount is present.
    """
    amount = re.search(r"\$[0-9]+(?:,[0-9]+)*", gross)
    if amount:
        return amount.group(0)

    # Fallback for text without a "$<digits>" amount
    return "".join(ch for ch in gross if ch in "1234567890,$")


def clean_title(title) -> str:
    """Remove the dagger marker (†) from a movie title.

    The result is stripped afterwards: a marker at the end of the title would
    otherwise leave behind the whitespace that separated it from the title.

    Args:
        title: Raw title text.

    Returns:
        The title without any "†" and without leading or trailing whitespace.
    """
    return title.replace("†", "").strip()

In [None]:
def takedata(url, timeout=30) -> list:
    """Fetch a movie's Wikipedia page and read director and country from its infobox.

    Args:
        url: Absolute URL of the movie's article.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole scrape.

    Returns:
        A two-element list ``[director, country]``. An entry is ``"N/A"`` when
        the page has no infobox or the infobox has no matching row.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)  # Fetch the movie's page
    response.raise_for_status()  # Raise an error if the request fails
    soup = BeautifulSoup(response.text, "html.parser")

    # A multi-word class_ string only matches that exact attribute value, so
    # fall back to any table carrying the "infobox" class when it is absent.
    table = soup.find("table", class_="infobox vevent")
    if table is None:
        table = soup.find("table", class_="infobox")

    # Default values in case the data is missing
    director = "N/A"
    country = "N/A"

    if table is not None:
        for row in table.find_all("tr"):
            namecols = row.find("th")  # Header column (e.g., "Directed by", "Country")
            cols = row.find("td")  # Value column

            if namecols is None or cols is None:
                continue

            key = namecols.text.strip()

            if key == "Directed by":
                director = cols.text.strip()
            elif key in ("Countries", "Country"):
                # Remove reference numbers (e.g., [1], [2]) and flatten
                # multi-line values onto a single line
                country_raw = cols.text.strip()
                country = re.sub(r'\[\d+\]', '', country_raw).replace('\n', ' ').strip()

    return [director, country]  # Return the extracted director and country


# Start from an empty list so re-running this cell does not append the same
# movies a second time to the results of an earlier run.
movies.clear()

# Loop through the rows of the table (excluding the header)
for row in table.find_all("tr")[1:51]:  # Only process the top 50 movies
    cols = row.find_all("td")  # Extract all <td> elements (data cells)
    namecols = row.find_all("th")  # Extract all <th> elements (title column)

    # cols[2] and cols[3] are read below, so a usable row needs at least four
    # data cells plus the header cell that holds the title.
    if len(cols) < 4 or not namecols:
        continue

    title_cell = namecols[0]
    title = clean_title(title_cell.text.strip())  # Movie title
    gross = cleangross(cols[2].text.strip())  # Box office revenue
    year = cols[3].text.strip()  # Release year

    # Follow the deep link to the movie's own page when the title is linked;
    # the href is expected to look like "/wiki/<Article_name>".
    movurl = title_cell.find("a")
    if movurl is not None and movurl.get("href", "").startswith("/wiki/"):
        link = urlfordeep + movurl["href"][6:]  # Drop the leading "/wiki/"
        res = takedata(link)
    else:
        # No article to visit: keep the row with placeholder details
        res = ["N/A", "N/A"]

    # Leave the "N/A" placeholder untouched; the name cleaner would strip
    # the slash and turn it into "NA".
    director = res[0] if res[0] == "N/A" else clean_director(res[0])

    # Append cleaned data to the list
    movies.append([title, year, gross, director, res[1]])


In [None]:
# Echo every collected record to the console, one movie per line
for i in movies:
    if len(i) == 0:
        continue  # Nothing to show for an empty record
    print(f"{i[0]} {i[1]} {i[2]} {i[3]} {i[4]} ")

# Persist the records as CSV: the header row first, then one row per movie
with open("top_movies.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows([["Title", "Year", "Gross", "Director", "Country"], *movies])

print("Data successfully saved to 'top_movies.csv'")