# Scraping IMDb Website

Step 1: Inspecting the website and its features using Developer Tools

Step 2: Importing required libraries

In [3]:
# importing required libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [37]:
# Defining the Webpage and Headers

url = "https://www.imdb.com/chart/top/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer": "https://www.imdb.com/"  
}

response = requests.get(url, headers=headers)
print("Status code:", response.status_code)

# Checking if the webpage is accessible
if response.status_code ==200:
    print("Success! First 500 chars of content:", response.text[:500])
else:
    print("Failed:", response.text[:500])

Status code: 200
Success! First 500 chars of content: <!DOCTYPE html><html lang="en-US" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><script>if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }</script><script>window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
             


In [None]:
# Parsing  HTML
soup = BeautifulSoup(response.text, "html.parser")

# Creating an empty list to store the data
movie_list = []

# Finding All movies on the page
movies = soup.select("li.ipc-metadata-list-summary-item")

for movie in movies[:100]:  
    
    title_tag = movie.find("h3", class_="ipc-title__text")
    full_title = title_tag.get_text(strip=True) if title_tag else "N/A"
    
    # Removing rank prefix and getting just the Movie name
    if ". " in full_title:
        title = full_title.split(". ", 1)[1]
    else:
        title = full_title

    # Rating
    rating_tag = movie.find("span", class_="ipc-rating-star--rating")
    rating = rating_tag.get_text(strip=True) if rating_tag else "N/A"

    # Dictionary (inside loop!)
    movie_data = {
        "Title": title,
        "Rating": rating
    }
    
    # Appending movie data to the list
    movie_list.append(movie_data)

print(movie_list)

[{'Title': 'The Shawshank Redemption', 'Rating': '9.3'}, {'Title': 'The Godfather', 'Rating': '9.2'}, {'Title': 'The Dark Knight', 'Rating': '9.1'}, {'Title': 'The Godfather Part II', 'Rating': '9.0'}, {'Title': '12 Angry Men', 'Rating': '9.0'}, {'Title': 'The Lord of the Rings: The Return of the King', 'Rating': '9.0'}, {'Title': "Schindler's List", 'Rating': '9.0'}, {'Title': 'The Lord of the Rings: The Fellowship of the Ring', 'Rating': '8.9'}, {'Title': 'Pulp Fiction', 'Rating': '8.8'}, {'Title': 'The Good, the Bad and the Ugly', 'Rating': '8.8'}]


In [28]:
# Converting Movie List to a DataFrame
df = pd.DataFrame(movie_list)
df

Unnamed: 0,Title,Rating
0,The Shawshank Redemption,9.3
1,The Godfather,9.2
2,The Dark Knight,9.1
3,The Godfather Part II,9.0
4,12 Angry Men,9.0
5,The Lord of the Rings: The Return of the King,9.0
6,Schindler's List,9.0
7,The Lord of the Rings: The Fellowship of the Ring,8.9
8,Pulp Fiction,8.8
9,"The Good, the Bad and the Ugly",8.8


In [32]:
# Saving DataFrame to CSV
df.to_csv("TopIDMb_Movies.csv", index=False)

Section 2: Handling Pagination

In [None]:
import time
# To handle Pagination, we scrape 3 IMDb webpages for highest-rated action movies
base_url = "https://www.imdb.com/search/title/"

# defining the header
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer": "https://www.imdb.com/"  
}

#defining search parameters 
params = {
    "genres":"action",
    "sort":"user_rating, desc",
    "title_type":"feature",
    "count":50
}

# creating an empty list to store movies and a variable to store page number
movie_list=[]
page_start = 1

for page in range(1,4):
    params["start"] = page_start
    response = requests.get(base_url, headers=headers, params=params)
    
    # checking status code
    if response.status_code !=200:
        print(f"Failed on page {page}:{response.status_code}")
        
    soup = BeautifulSoup(response.text, "html.parser")
    movies = soup.select("li.ipc-metadata-list-summary-item")
    
    if not movies:
        print(f"No more movies on page {page}")
        break
    print(f"Scraped page {page}:{len(movies)} movies")
    
    for movie in movies:
        title_tag = movie.select_one("h3.ipc-title__text")
        full_title = title_tag.get_text(strip=True) if title_tag else "N/A"
        
        if ". " in full_title:
            title = full_title.split(". ",1)[1]
        else:
            title = full_title
        
        rating_tag = movie.select_one("span.ipc-rating-star-rating")
        rating = rating_tag.get_text(strip=True) if rating_tag else "N/A"
        
        year_tag = movie.select_one("span.cli-title-metadata-item")
        year = year_tag.get_text(strip=True) if year_tag else "N/A"
        
        # Appending movies into movie_list
        movie_list.append(
            {"Title": title,
            "Rating":rating,
            "Year":year}
        )
    # adding interval to scraping to avoid getting blocked
    time.sleep(3)
    

        

Scraped page 1:25 movies
Scraped page 2:25 movies
Scraped page 3:25 movies


In [36]:
# Saving Results to a DataFrame and CSV
df= pd.DataFrame(movie_list)
print(df.head(10))

df.to_csv("imdb_top_action.csv", index=False)

                                      Title Rating Year
0                                     Hands    N/A  N/A
1             Lesbian Western - The Feature    N/A  N/A
2                  Escuadrón Anticorrupción    N/A  N/A
3                        Escuadrón Especial    N/A  N/A
4  Not Welcome Mexicano Caceria de Ilegales    N/A  N/A
5                                    Badang    N/A  N/A
6  Vixen Highway 2006: It Came from Uranus!    N/A  N/A
7              Browncoats: Independence War    N/A  N/A
8                    Mstitel chelovechestva    N/A  N/A
9                                Desh Drohi    N/A  N/A


Handling Dynamic Content on IMDb Website using Selenium

In [38]:


from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Opening browser
driver = webdriver.Chrome()

# Opening IMDb movie page on IMDb (Avatar: Fire and Ash)
driver.get("https://m.imdb.com/title/tt1757678/?ref_=hm_tenup_t_4")

# Wait for JavaScript to load
time.sleep(5)

# Find movie title
title = driver.find_element(By.TAG_NAME, "h1").text
print("Title:", title)

driver.quit()


Title: Avatar: Fire and Ash
