# IMDb Bollywood Movie Data Scraper

This notebook scrapes data about Hindi-language films from India on IMDb in two phases:
1. Collecting basic information from the search results page
2. Collecting detailed information from individual movie pages

In [1]:
# Import required libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
import numpy as np
import logging
from datetime import datetime

In [2]:
# Set up logging
# File output: basicConfig only configures the root logger when it has no
# handlers yet, so re-running this cell does not attach a second file handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='imdb_scraping.log',
    filemode='w'
)

# Console output: unlike basicConfig, addHandler is not idempotent. Without a
# guard, every re-run of this cell stacks another StreamHandler on the root
# logger and each message is echoed once per run. The handler is given a name
# so an earlier instance can be found and reused.
_root_logger = logging.getLogger('')
console = next(
    (h for h in _root_logger.handlers if h.get_name() == 'imdb_console'),
    None,
)
if console is None:
    console = logging.StreamHandler()
    console.set_name('imdb_console')
    console.setLevel(logging.INFO)
    _root_logger.addHandler(console)

In [3]:
# Initialize variables and constants
# Search listing for feature films from India in Hindi; pagination parameters
# are appended to this URL by the scraping functions.
SEARCH_URL = 'https://www.imdb.com/search/title/?title_type=feature&countries=IN&languages=hi'
# Prefix for individual title pages; the IMDb id (tt...) is appended to it.
MOVIE_BASE_URL = 'https://www.imdb.com/title/'
# Actual UTC time at which this cell runs, stored in each row as `scrape_time`.
# A hard-coded timestamp would label every later run with a stale time.
CURRENT_TIME = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
# Stored in each row as `user_login`.
USER_LOGIN = "Ramji-Purwar"

# Create a session with browser-like headers
# One shared Session is reused for every request made by the scraping functions.
session = requests.Session()
# These headers are NOT installed on the session itself; each call passes them
# explicitly via `headers=headers`.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Referer': 'https://www.imdb.com/',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'max-age=0',
}

In [4]:
def _parse_search_item(item):
    """Extract one movie's basic fields from a single search-result element.

    Returns a dict with the row fields, or None when the element contains no
    link to an IMDb title id (``/title/tt<digits>``).
    """
    # Extract movie ID
    link = item.select_one('a[href*="/title/tt"]')
    id_match = re.search(r'/title/(tt\d+)', link.get('href', '')) if link else None
    if not id_match:
        return None

    # Extract movie name
    name_elem = item.select_one('h3 a')
    movie_name = name_elem.get_text(strip=True) if name_elem else None

    # Extract year (first four-digit run inside the year element)
    year = None
    year_elem = item.select_one('.lister-item-year')
    if year_elem:
        year_match = re.search(r'(\d{4})', year_elem.get_text(strip=True))
        if year_match:
            year = year_match.group(1)

    # Extract description, collapsing newlines and runs of whitespace
    description = None
    desc_elem = item.select_one('p.text-muted:nth-of-type(2)')
    if desc_elem:
        description = re.sub(r'\s+', ' ', desc_elem.get_text(strip=True)).strip()

    return {
        'movie_id': id_match.group(1),
        'movie_name': movie_name,
        'description': description,
        'year': year,
        'scrape_time': CURRENT_TIME,
        'user_login': USER_LOGIN
    }


def scrape_search_page(url, max_movies=None, timeout=30):
    """
    PHASE 1: Scrape basic movie information from IMDb search results page

    Args:
        url: Full URL of the search results page to fetch.
        max_movies: Optional cap on how many result elements are processed;
            a falsy value (None or 0) means no cap.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        A list of dicts with the keys movie_id, movie_name, description, year,
        scrape_time and user_login. An empty list is returned on any failure
        (non-200 status, no result elements found, or an unexpected error);
        failures are logged rather than raised.
    """
    logging.info(f"Scraping search results from: {url}")

    try:
        response = session.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            logging.error(f"Failed to retrieve search page: Status code {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find movie items.
        # Match the exact `lister-item` class token. A substring test such as
        # `'lister-item' in c` is evaluated against every class token, so it also
        # matches the nested `lister-item-image` / `lister-item-content` divs and
        # produces several (partly empty) rows per movie.
        movie_items = soup.select('div.lister-item')

        if not movie_items:
            logging.warning("No movie items found with primary selector. Trying alternative selectors...")
            movie_items = soup.select(".lister-list .lister-item")

        if not movie_items:
            logging.warning("Still no movie items found. Saving HTML for debugging...")
            with open("imdb_search_page.html", "w", encoding="utf-8") as f:
                f.write(str(soup))
            return []

        logging.info(f"Found {len(movie_items)} movie items")

        # Limit number of movies if specified (slicing is a no-op when the
        # limit exceeds the number of items)
        if max_movies:
            movie_items = movie_items[:max_movies]

        movies_data = []
        seen_ids = set()  # guards against the same title appearing twice on a page
        for item in movie_items:
            # One malformed result must not abort the whole page.
            try:
                movie = _parse_search_item(item)
                if movie is None or movie['movie_id'] in seen_ids:
                    continue

                seen_ids.add(movie['movie_id'])
                movies_data.append(movie)
                logging.info(f"Extracted basic info for: {movie['movie_name']} ({movie['movie_id']})")

            except Exception as e:
                logging.error(f"Error processing movie item: {e}")

        return movies_data

    except Exception as e:
        logging.error(f"Error scraping search page: {e}")
        return []

In [5]:
def _load_json_ld(soup):
    """Parse every JSON-LD <script> block on the page once; return the dict payloads."""
    import json  # local import: json is not among the notebook's top-level imports

    payloads = []
    for script in soup.select('script[type="application/ld+json"]'):
        raw = script.string
        if not raw:
            # Empty script tag: nothing to parse
            continue
        try:
            data = json.loads(raw)
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            logging.warning(f"Error parsing JSON-LD: {e}")
            continue
        if isinstance(data, dict):
            payloads.append(data)
    return payloads


def _aggregate_rating_field(payloads, field):
    """Return the first ``aggregateRating[field]`` found, as a string, or None."""
    for data in payloads:
        aggregate = data.get('aggregateRating')
        if isinstance(aggregate, dict) and field in aggregate:
            return str(aggregate[field])
    return None


def _person_names(payloads, key, limit):
    """Return up to `limit` person names from the first payload with usable `key` data."""
    for data in payloads:
        if key not in data:
            continue
        people = data[key]
        # The value may be a single object or a list of objects
        if not isinstance(people, list):
            people = [people]
        names = [
            person['name']
            for person in people[:limit]
            if isinstance(person, dict) and person.get('name')
        ]
        if names:
            return names
    return []


def scrape_movie_details(movie_id, timeout=30):
    """
    PHASE 2: Scrape detailed information from a movie's individual page

    Each field is first read from the page markup (``data-testid`` selectors)
    and, if that yields nothing, from the page's JSON-LD metadata.

    Args:
        movie_id: IMDb title id (e.g. "tt1234567") appended to MOVIE_BASE_URL.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        A tuple ``(rating, votes, director, actors)``. rating, votes and
        director are strings or None; actors is a list of at most five names
        (possibly empty). When movie_id is falsy, the request fails, or an
        unexpected error occurs, all four values are None.
    """
    if not movie_id:
        return None, None, None, None

    movie_url = f"{MOVIE_BASE_URL}{movie_id}/"
    logging.info(f"Fetching details from: {movie_url}")

    try:
        response = session.get(movie_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            logging.error(f"Failed to retrieve movie page: {response.status_code}")
            return None, None, None, None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Parsed once and shared by all four fallbacks below
        json_ld = _load_json_ld(soup)

        # Extract rating
        rating = None
        # Method 1: New IMDb layout
        rating_elem = soup.select_one('[data-testid="hero-rating-bar__aggregate-rating__score"] span')
        if rating_elem:
            rating = rating_elem.get_text(strip=True)
        # Method 2: Check JSON-LD data
        if not rating:
            rating = _aggregate_rating_field(json_ld, 'ratingValue')

        # Extract votes
        votes = None
        # Method 1: New IMDb layout
        votes_elem = soup.select_one('[data-testid="hero-rating-bar__aggregate-rating__count"]')
        if votes_elem:
            votes_match = re.search(r'([\d,]+)', votes_elem.get_text(strip=True))
            if votes_match:
                # Drop thousands separators
                votes = votes_match.group(1).replace(',', '')
        # Method 2: Check JSON-LD data
        if not votes:
            votes = _aggregate_rating_field(json_ld, 'ratingCount')

        # Extract director
        director = None
        # Method 1: Look for director section (first principal-credit entry)
        director_section = soup.select_one('[data-testid="title-pc-principal-credit"]')
        if director_section and 'Director' in director_section.get_text():
            director_links = director_section.select('a')
            if director_links:
                director = director_links[0].get_text(strip=True)
        # Method 2: Look in JSON-LD data
        if not director:
            director_names = _person_names(json_ld, 'director', 1)
            director = director_names[0] if director_names else None

        # Extract top 5 actors
        actors = []
        # Method 1: New IMDb layout
        cast_section = soup.select_one('[data-testid="title-cast"]')
        if cast_section:
            actor_links = cast_section.select('a[data-testid="title-cast-item__actor"]')
            actors = [actor_link.get_text(strip=True) for actor_link in actor_links[:5]]
        # Method 2: Look in JSON-LD data
        if not actors:
            actors = _person_names(json_ld, 'actor', 5)

        return rating, votes, director, actors

    except Exception as e:
        logging.error(f"Error extracting details for movie {movie_id}: {e}")
        return None, None, None, None

In [6]:
def scrape_with_pagination(start_page=1, end_page=2, max_movies_per_page=None):
    """
    Walk search result pages start_page..end_page (inclusive) and collect their movies.

    Each page is fetched through scrape_search_page, with max_movies_per_page
    passed along as the per-page cap. Pages that yield nothing are logged and
    skipped. A random pause of 2-4 seconds follows every page request.
    Returns one flat list with the movie dicts of all pages.
    """
    collected = []

    for page_num in range(start_page, end_page + 1):
        # 50 results per page: page 1 -> start=1, page 2 -> start=51, page 3 -> start=101, ...
        first_result = (page_num - 1) * 50 + 1
        page_url = f"{SEARCH_URL}&start={first_result}&ref_=adv_nxt"

        found = scrape_search_page(page_url, max_movies_per_page)

        if not found:
            logging.warning(f"No movies found on page {page_num}, skipping")
        else:
            collected.extend(found)
            logging.info(f"Added {len(found)} movies from page {page_num}")

        # Pause between page requests
        pause_seconds = 2 + np.random.rand() * 2
        time.sleep(pause_seconds)

    return collected

In [7]:
# PHASE 1: Scrape search results pages
# Use a small number for testing
# Only page 1 is requested and at most 5 result items from it are processed.
# `movies_data` is a list of dicts that the Phase 2 cell enriches in place.
movies_data = scrape_with_pagination(start_page=1, end_page=1, max_movies_per_page=5)

# Save initial data to CSV
# Snapshot of the basic fields only, written before the detail pages are fetched.
initial_df = pd.DataFrame(movies_data)
initial_df.to_csv('bollywood_movies_initial.csv', index=False)

2025-05-15 10:20:15 - INFO - Scraping search results from: https://www.imdb.com/search/title/?title_type=feature&countries=IN&languages=hi&start=1&ref_=adv_nxt
2025-05-15 10:20:16 - INFO - Found 50 movie items
2025-05-15 10:20:16 - INFO - Extracted basic info for: The Diplomat (tt26229612)
2025-05-15 10:20:16 - INFO - Extracted basic info for: Kesari Chapter 2: The Untold Story of Jallianwala Bagh (tt3562110)
2025-05-15 10:20:16 - INFO - Extracted basic info for: The Bhootnii (tt27162102)
2025-05-15 10:20:16 - INFO - Extracted basic info for: Good Bad Ugly (tt27540217)
2025-05-15 10:20:16 - INFO - Extracted basic info for: Odela 2 (tt31529147)
2025-05-15 10:20:16 - INFO - Added 5 movies from page 1


In [8]:
# PHASE 2: Scrape individual movie pages
# Every entry of movies_data is a dict, so each one is enriched in place with
# the fields read from its title page.
for movie in movies_data:
    # Get detailed movie information
    rating, votes, director, actors = scrape_movie_details(movie['movie_id'])

    # The cast list is flattened to one comma-separated string; None when empty
    actor_text = ", ".join(actors) if actors else None

    # Update movie data with additional details
    movie.update({
        'rating': rating,
        'votes': votes,
        'director': director,
        'actors': actor_text,
    })

    # Pause 1-2 seconds between requests
    time.sleep(1 + np.random.rand())

2025-05-15 10:20:19 - INFO - Fetching details from: https://www.imdb.com/title/tt26229612/
2025-05-15 10:20:20 - INFO - Fetching details from: https://www.imdb.com/title/tt3562110/
2025-05-15 10:20:22 - INFO - Fetching details from: https://www.imdb.com/title/tt27162102/
2025-05-15 10:20:23 - INFO - Fetching details from: https://www.imdb.com/title/tt27540217/
2025-05-15 10:20:25 - INFO - Fetching details from: https://www.imdb.com/title/tt31529147/


In [9]:
# Create final DataFrame and save to CSV
# movies_data now also carries the rating, votes, director and actors fields
# added by the Phase 2 cell.
final_df = pd.DataFrame(movies_data)
final_df.to_csv('bollywood_movies_complete.csv', index=False)

# Show the results
# A bare expression on the last line of a notebook cell renders the DataFrame.
final_df

Unnamed: 0,movie_id,movie_name,description,year,scrape_time,user_login,rating,votes,director,actors
0,tt26229612,The Diplomat,An Indian diplomat who tries to repatriate an ...,2025,2025-05-15 10:20:15,Ramji-Purwar,7.2,8822,Shivam Nair,"Kubra Sait, Arunoday Singh, Harsh Chhaya, Soni..."
1,tt3562110,Kesari Chapter 2: The Untold Story of Jallianw...,A dramatization of the life story of C. Sankar...,2025,2025-05-15 10:20:15,Ramji-Purwar,8.3,6668,Vinod Taliwal,"Simrat Kaur, Anushka Soni, Harbhajan Mann, Pun..."
2,tt27162102,The Bhootnii,A female ghost who gets her heart broken by a ...,2025,2025-05-15 10:20:15,Ramji-Purwar,6.5,10021,Mrighdeep Singh Lamba,"Varun Sharma, Huma Qureshi, Sunny Kaushal, Ajm..."
3,tt27540217,Good Bad Ugly,A story of 3 strangers who come together to so...,2025,2025-05-15 10:20:15,Ramji-Purwar,6.0,9644,Vijay Kumar Arora,"Preity Zinta, Jimmy Sheirgill, Guddu Gill, Nav..."
4,tt31529147,Odela 2,In a rural village known for supernatural beli...,2024,2025-05-15 10:20:15,Ramji-Purwar,5.9,3516,Ashok Teja,"Vasishta N. Simha, Hebah Patel, Yuva Chandraa,..."


In [10]:
# Summary of results
# Reports the row count of final_df, i.e. one row per scraped movie.
print("Scraping complete! Data saved to CSV files.")
print(f"Number of movies scraped: {len(final_df)}")

Scraping complete! Data saved to CSV files.
Number of movies scraped: 5


In [11]:
# Function to scrape more pages if needed
def scrape_more_pages(start_page=3, end_page=5, max_movies_per_page=50):
    """
    Scrape additional pages of search results and update the CSV files

    Movies whose id is already present in 'bollywood_movies_complete.csv', or
    that appear more than once in the newly scraped pages, are skipped. The
    remaining movies get their detail fields fetched and are appended to the
    CSV.

    Args:
        start_page: First search results page to scrape (inclusive).
        end_page: Last search results page to scrape (inclusive).
        max_movies_per_page: Per-page cap passed to scrape_with_pagination.

    Returns:
        A DataFrame holding the existing rows followed by the new rows.

    Raises:
        Any error from reading the existing CSV other than "file missing" or
        "file empty" (for example a parse error or a missing 'movie_id'
        column). Treating such errors as "no existing data" would overwrite
        the existing file with only the newly scraped rows.
    """
    # Load existing data if available
    try:
        existing_df = pd.read_csv('bollywood_movies_complete.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        existing_df = pd.DataFrame()
        existing_ids = set()
        logging.info("No existing data found, starting fresh")
    else:
        existing_ids = set(existing_df['movie_id'])
        logging.info(f"Loaded existing data with {len(existing_ids)} movies")

    # Scrape new pages
    new_movies = scrape_with_pagination(start_page, end_page, max_movies_per_page)

    # Filter out movies we already have, and repeats within the new batch
    # (the same title can show up on more than one results page)
    seen_ids = set(existing_ids)
    unique_new_movies = []
    for movie in new_movies:
        if movie['movie_id'] in seen_ids:
            continue
        seen_ids.add(movie['movie_id'])
        unique_new_movies.append(movie)
    logging.info(f"Found {len(unique_new_movies)} new unique movies")

    # Get detailed information for new movies
    for position, movie in enumerate(unique_new_movies, start=1):
        movie_id = movie['movie_id']
        logging.info(f"Processing new movie {position}/{len(unique_new_movies)}: {movie['movie_name']} ({movie_id})")

        rating, votes, director, actors = scrape_movie_details(movie_id)

        # Update movie data with additional details (dicts are enriched in place)
        movie['rating'] = rating
        movie['votes'] = votes
        movie['director'] = director
        movie['actors'] = ", ".join(actors) if actors else None

        # Add a delay between requests
        time.sleep(1 + np.random.rand())

    # Create DataFrame for new movies
    new_df = pd.DataFrame(unique_new_movies)

    # Combine with existing data; empty frames are left out of the concat so
    # they cannot influence the resulting column dtypes
    frames = [frame for frame in (existing_df, new_df) if not frame.empty]
    combined_df = pd.concat(frames, ignore_index=True) if frames else new_df

    # Save updated data
    combined_df.to_csv('bollywood_movies_complete.csv', index=False)
    logging.info(f"Updated CSV with {len(combined_df)} total movies")

    return combined_df

# Uncomment to run this function if you want to scrape more pages
# more_movies_df = scrape_more_pages(start_page=2, end_page=3, max_movies_per_page=10)