In [None]:
import os
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Initialize a headless Selenium Chrome WebDriver with a custom User-Agent.
# (Options is already imported with the other selenium imports at the top of the cell.)
options = Options()
options.add_argument("--headless")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# One list per output column. All fields of a movie are appended together at
# the end of each loop iteration, so the lists always have the same length,
# even if scraping stops part-way through.
movie_url = []
movie_title = []
movie_year = []
movie_rating = []
movie_votecount = []
movie_timerate = []


def _text_or_nan(matches, index=0):
    """Return the stripped text of ``matches[index]``, or NaN if there are too few matches."""
    if len(matches) <= index:
        return np.nan
    return matches[index].text.strip()


try:
    browser.get("https://www.imdb.com/chart/top/")
    # Wait (up to 15 s) until at least one chart row is present instead of
    # relying on a fixed sleep.
    WebDriverWait(browser, 15).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "li.ipc-metadata-list-summary-item")
        )
    )

    # Parse the rendered page once; everything below works on this in-memory
    # snapshot, so no per-row delay is needed.
    page_soup = soup(browser.page_source, "html.parser")

    for item in page_soup.find_all("li", {"class": "ipc-metadata-list-summary-item"}):
        # Absolute URL of the movie page (NaN when the link or its href is missing)
        links = item.find_all("a", {"class": "ipc-title-link-wrapper"})
        href = links[0].get("href") if links else None
        url = "https://www.imdb.com" + href if href else np.nan

        # Title text as shown in the chart
        title = _text_or_nan(
            item.find_all("h3", {"class": "ipc-title__text ipc-title__text--reduced"})
        )

        # The metadata spans hold the release year first and the running time
        # second. Match only on the "cli-title-metadata-item" class: the extra
        # generated class names the original selector required are not stable.
        metadata = item.find_all("span", {"class": "cli-title-metadata-item"})
        year = _text_or_nan(metadata, 0)
        timerate = _text_or_nan(metadata, 1)

        # Rating and vote count
        rating = _text_or_nan(item.find_all("span", {"class": "ipc-rating-star--rating"}))
        votecount = _text_or_nan(item.find_all("span", {"class": "ipc-rating-star--voteCount"}))

        movie_url.append(url)
        movie_title.append(title)
        movie_year.append(year)
        movie_rating.append(rating)
        movie_votecount.append(votecount)
        movie_timerate.append(timerate)

except Exception as e:
    # Best effort: report the problem and keep whatever rows were collected.
    print(f"Error during scraping: {e}")
finally:
    # Always close the browser, even if loading or parsing failed.
    browser.quit()

# Assemble the scraped columns into a DataFrame and drop repeated rows
df = pd.DataFrame(
    {
        "URL": movie_url,
        "Title": movie_title,
        "Year": movie_year,
        "Rating": movie_rating,
        "VoteCount": movie_votecount,
        "TimeRate": movie_timerate,
    }
).drop_duplicates()

# Write the result to CSV without the index column
df.to_csv("IMDB_250.csv", index=False)

# Preview the first 30 rows of the scraped data
print(df.head(30))


# DATA CLEANING

In [None]:
import pandas as pd
# Read back the CSV written by the scraping step. The file name is spelled
# exactly as it was saved ("IMDB_250.csv") so the read also works on
# case-sensitive file systems.
imdb = pd.read_csv('IMDB_250.csv')
imdb.head(5)

In [None]:
# Number of missing values in each column
imdb.isnull().sum()

In [None]:
# Remove leading/trailing parentheses from the vote-count strings
imdb['VoteCount'] = (
    imdb['VoteCount']
    .str.strip(to_strip='()')
)
imdb.head(5)


In [None]:
# Remove a leading "<digits>." prefix (and any whitespace after it) from each title
imdb['Title'] = (
    imdb['Title']
    .str.replace(pat=r'^\d+\.\s*', repl='', regex=True)
)
imdb.head(5)


In [None]:
# Write the cleaned frame back to the CSV file (index column omitted)
imdb.to_csv(path_or_buf="IMDB_250.csv", index=False)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Use the first 30 rows for the rendered table
df = imdb.head(30)

# Fixed width of 12 in; height grows by 0.4 in per row
fig, ax = plt.subplots(figsize=(12, len(df) * 0.4))

# Hide the frame and both axes so only the table is drawn
ax.set_frame_on(False)
ax.yaxis.set_visible(False)
ax.xaxis.set_visible(False)

# Draw the table: centered in the axes, cell text left-aligned
table = ax.table(
    cellText=df.values,
    colLabels=df.columns,
    cellLoc='left',
    loc='center',
)
# Use a fixed 10 pt font and stretch the rows vertically by 1.5x
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 1.5)

# Save the figure to PDF with a tight bounding box
fig.savefig("imdb_data.pdf", bbox_inches="tight")
print("PDF saved successfully.")


In [None]:
import pandas as pd
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4, landscape
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle

# Read the complete dataset back from the CSV file
df = pd.read_csv("IMDB_250.csv")

# Table content: one header row of column names followed by one row per record
data = [df.columns.tolist(), *df.values.tolist()]

# Build the table; repeatRows=1 marks the first (header) row as a repeating row
table = Table(data, repeatRows=1)
table.setStyle(
    TableStyle(
        [
            # Header row: white bold text on a black background, extra bottom padding
            ("BACKGROUND", (0, 0), (-1, 0), colors.black),
            ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
            # Every cell: left-aligned
            ("ALIGN", (0, 0), (-1, -1), "LEFT"),
            ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
            # Every cell: 8 pt font
            ("FONTSIZE", (0, 0), (-1, -1), 8),
            ("BOTTOMPADDING", (0, 0), (-1, 0), 6),
            # Every cell: thin grey grid lines
            ("GRID", (0, 0), (-1, -1), 0.25, colors.grey),
        ]
    )
)

# Lay the table out on landscape A4 pages and write the PDF
pdf_file = "IMDB_250_Report.pdf"
elements = [table]
document = SimpleDocTemplate(pdf_file, pagesize=landscape(A4))
document.build(elements)

print("✅ PDF saved as:", pdf_file)
