In [59]:
# Cell 1: Import required libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
import time
from datetime import datetime
import logging

In [60]:
# Cell 2: Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='imdb_scraping.log',
    filemode='w'
)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logging.getLogger('').addHandler(console)

In [61]:
# Cell 3: Initialize variables and set up session
# Note that we're NOT including the start parameter in the base URL
root = 'https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi'
movie_base_url = 'https://www.imdb.com/title/'

# Create a session with browser-like headers
session = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Referer': 'https://www.imdb.com/',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'max-age=0',
}

In [62]:
# Cell 4: Function to extract movie details from individual pages
def extract_movie_details(movie_id):
    """Extract detailed information from a movie's individual page."""
    if not movie_id:
        return None, None, None
        
    movie_url = f"{movie_base_url}{movie_id}/"
    logging.info(f"Fetching details from: {movie_url}")
    
    try:
        response = session.get(movie_url, headers=headers)
        if response.status_code != 200:
            logging.error(f"Failed to retrieve movie page: {response.status_code}")
            return None, None, None
            
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract genres
        genres = []
        # Method 1: Look for genre links in the new IMDb layout
        genre_links = soup.select('a[href*="genres="]')
        if genre_links:
            for link in genre_links:
                genre_text = link.get_text(strip=True)
                if genre_text and len(genre_text) < 20:  # Avoid non-genre links
                    genres.append(genre_text)
        
        # Method 2: Look for genre section with data-testid
        if not genres:
            genre_section = soup.select_one('[data-testid="genres"]')
            if genre_section:
                genre_spans = genre_section.select('span')
                for span in genre_spans:
                    genre_text = span.get_text(strip=True)
                    if genre_text and len(genre_text) < 20:
                        genres.append(genre_text)
        
        # Method 3: Look for genres in structured data
        if not genres:
            script_tags = soup.select('script[type="application/ld+json"]')
            for script in script_tags:
                try:
                    import json
                    json_data = json.loads(script.string)
                    if 'genre' in json_data:
                        if isinstance(json_data['genre'], list):
                            genres = json_data['genre']
                        else:
                            genres = [json_data['genre']]
                except Exception as e:
                    logging.warning(f"Error parsing JSON-LD: {e}")
        
        genre_value = ", ".join(genres) if genres else None
        
        # Extract rating
        rating_value = None
        # Method 1: Look for rating in the new IMDb layout
        rating_element = soup.select_one('[data-testid="hero-rating-bar__aggregate-rating__score"] span')
        if rating_element:
            rating_value = rating_element.get_text(strip=True)
        
        # Method 2: Check structured data for rating
        if not rating_value:
            for script in soup.select('script[type="application/ld+json"]'):
                try:
                    import json
                    json_data = json.loads(script.string)
                    if 'aggregateRating' in json_data and 'ratingValue' in json_data['aggregateRating']:
                        rating_value = str(json_data['aggregateRating']['ratingValue'])
                except Exception as e:
                    logging.warning(f"Error parsing JSON-LD for rating: {e}")
        
        # Method 3: Look for common rating patterns
        if not rating_value:
            for element in soup.select('[class*="rating"], [class*="score"]'):
                text = element.get_text(strip=True)
                rating_match = re.search(r'(\d+\.\d+)\/10', text)
                if rating_match:
                    rating_value = rating_match.group(1)
                    break
        
        # Extract vote count
        vote_value = None
        # Method 1: Look for vote count in the new IMDb layout
        vote_element = soup.select_one('[data-testid="hero-rating-bar__aggregate-rating__count"]')
        if vote_element:
            vote_text = vote_element.get_text(strip=True)
            vote_match = re.search(r'([\d,]+)', vote_text)
            if vote_match:
                vote_value = vote_match.group(1).replace(',', '')
        
        # Method 2: Check structured data for vote count
        if not vote_value:
            for script in soup.select('script[type="application/ld+json"]'):
                try:
                    import json
                    json_data = json.loads(script.string)
                    if 'aggregateRating' in json_data and 'ratingCount' in json_data['aggregateRating']:
                        vote_value = str(json_data['aggregateRating']['ratingCount'])
                except Exception as e:
                    logging.warning(f"Error parsing JSON-LD for votes: {e}")
        
        # Method 3: Look for vote count near rating
        if not vote_value:
            for element in soup.select('span, div'):
                if 'votes' in element.get_text().lower():
                    vote_match = re.search(r'([\d,]+)\s+votes', element.get_text(), re.I)
                    if vote_match:
                        vote_value = vote_match.group(1).replace(',', '')
                        break
        
        return genre_value, rating_value, vote_value
        
    except Exception as e:
        logging.error(f"Error extracting details for movie {movie_id}: {e}")
        return None, None, None

In [63]:
# Cell 5: Function to scrape a specific page with proper pagination
def scrape_page(page_num, max_movies_per_page=50):
    """Scrape a specific IMDb search results page."""
    # Calculate the start parameter based on page number
    # Page 1 starts at 1, Page 2 starts at 51, Page 3 starts at 101, etc.
    start_idx = 1 + (page_num - 1) * 50
    
    # Construct the URL with the proper start parameter
    page_url = f"{root}&start={start_idx}&ref_=adv_nxt"
    
    logging.info(f"Scraping page {page_num} (start={start_idx}): {page_url}")
    
    try:
        response = session.get(page_url, headers=headers)
        if response.status_code != 200:
            logging.error(f"Failed to retrieve page {page_num}: Status code {response.status_code}")
            return []
            
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find movie items with different possible selectors
        movie_items = soup.find_all('div', class_=lambda c: c and 'lister-item' in c)
        
        if not movie_items:
            movie_items = soup.find_all('li', class_=lambda c: c and 'ipc-metadata-list-summary-item' in c)
            
        if not movie_items:
            movie_items = soup.find_all(attrs={'data-testid': 'title'})
            
        logging.info(f"Found {len(movie_items)} movie items on page {page_num}")
        
        if not movie_items:
            logging.warning("No movie items found. Saving HTML for debugging...")
            with open(f"imdb_page_{page_num}.html", "w", encoding="utf-8") as f:
                f.write(str(soup))
            return []
            
        # Save the page HTML for debugging if needed
        # with open(f"imdb_page_{page_num}.html", "w", encoding="utf-8") as f:
        #     f.write(str(soup))
        
        # Limit movies per page for testing
        if max_movies_per_page < len(movie_items):
            movie_items = movie_items[:max_movies_per_page]
        
        # Process each movie item and extract data
        page_movies = []
        for item in movie_items:
            try:
                # Movie ID extraction
                movie_id_value = None
                img_with_data = item.select_one('img[data-tconst]')
                if img_with_data:
                    movie_id_value = img_with_data.get('data-tconst')
                
                if not movie_id_value:
                    link = item.select_one('a[href*="/title/tt"]')
                    if link:
                        href = link.get('href', '')
                        id_match = re.search(r'/title/(tt\d+)', href)
                        if id_match:
                            movie_id_value = id_match.group(1)
                
                # Skip if no ID
                if not movie_id_value:
                    logging.warning("Could not find movie ID, skipping item")
                    continue
                
                # Movie Name Extraction
                name = None
                for title_selector in [
                    item.select_one('h3 a'),
                    item.select_one('.ipc-title__text'),
                    item.select_one('[data-testid="title"]'),
                    item.select_one('a[href*="title/tt"]'),
                    item.select_one('h3, h2')
                ]:
                    if title_selector:
                        name = title_selector.get_text(strip=True)
                        break
                
                if not name:
                    logging.warning("Could not find movie name, skipping item")
                    continue
                
                # Year extraction
                year_value = None
                for year_selector in [
                    item.select_one('.lister-item-year'),
                    item.select_one('.release_date'),
                    item.select_one('[data-testid="releaseYear"]'),
                ]:
                    if year_selector:
                        year_text = year_selector.get_text(strip=True)
                        year_match = re.search(r'(\d{4})', year_text)
                        if year_match:
                            year_value = year_match.group(1)
                            break
                
                # If still not found, try to find any span with 4 digits (year)
                if not year_value:
                    for span in item.select('span'):
                        text = span.get_text(strip=True)
                        year_match = re.search(r'(\d{4})', text)
                        if year_match:
                            year_value = year_match.group(1)
                            break
                
                # Overview extraction
                overview_value = None
                for overview_selector in [
                    item.select_one('p.text-muted:nth-of-type(2)'),
                    item.select_one('[data-testid="plot"]'),
                    item.select_one('.ipc-html-content-inner-div')
                ]:
                    if overview_selector:
                        overview_value = overview_selector.get_text(strip=True).replace('\n', '')
                        break
                        
                # Alternative overview extraction for new layout
                if not overview_value:
                    paragraphs = item.select('p')
                    for p in paragraphs:
                        p_text = p.get_text(strip=True)
                        # Overview is typically a longer paragraph without specific labels
                        if len(p_text) > 50 and not p_text.startswith('Director'):
                            overview_value = p_text
                            break
                
                # Director and cast extraction
                director_value = None
                cast_value = None
                
                # Look for specific text patterns
                for p_tag in item.select('p'):
                    p_text = p_tag.get_text()
                    if 'Director' in p_text or 'Directors' in p_text:
                        if '|' in p_text:
                            parts = p_text.split('|')
                            director_part = parts[0]
                            director_value = director_part.replace('Director:', '').replace('Directors:', '').strip()
                            
                            if len(parts) > 1 and ('Stars' in parts[1] or 'Star:' in parts[1]):
                                cast_part = parts[1]
                                cast_value = cast_part.replace('Stars:', '').replace('Star:', '').strip()
                        else:
                            director_value = p_text.replace('Director:', '').replace('Directors:', '').strip()
                
                # Alternative approach
                if not director_value:
                    director_section = item.select_one('[data-testid="directors"]')
                    if director_section:
                        director_value = director_section.get_text(strip=True).replace('Directors', '').replace('Director', '').strip()
                
                if not cast_value:
                    cast_section = item.select_one('[data-testid="cast"]')
                    if cast_section:
                        cast_value = cast_section.get_text(strip=True).replace('Stars', '').replace('Star', '').strip()
                
                page_movies.append({
                    'movie_id': movie_id_value,
                    'movie_name': name,
                    'year': year_value,
                    'overview': overview_value,
                    'director': director_value,
                    'cast': cast_value,
                })
                
                logging.info(f"Extracted basic info for: {name} ({movie_id_value})")
                
            except Exception as e:
                logging.error(f"Error processing movie item: {e}")
        
        return page_movies
        
    except Exception as e:
        logging.error(f"Error processing page {page_num}: {e}")
        return []

In [64]:
# Cell 6: Main function to scrape movie list and then get detailed info
def scrape_imdb_movies(start_page=1, end_page=4, max_movies_per_page=50):
    """Scrape IMDb movies across multiple pages with proper pagination."""
    # Use a dictionary to store unique movies by ID
    unique_movies = {}
    user_login = "Ramji-Purwar"
    current_time = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    
    # Process each page in sequence
    for page_num in range(start_page, end_page + 1):
        try:
            # Get basic movie info from the current page
            page_movies = scrape_page(page_num, max_movies_per_page)
            
            if not page_movies:
                logging.warning(f"No movies found on page {page_num}, skipping")
                continue
                
            logging.info(f"Found {len(page_movies)} movies on page {page_num}")
            
            # For each movie on the page, get detailed info
            for movie in page_movies:
                movie_id = movie['movie_id']
                
                # Skip if we've already processed this movie
                if movie_id in unique_movies:
                    logging.info(f"Skipping duplicate movie ID: {movie_id}")
                    continue
                
                # Get detailed info from the movie's page
                logging.info(f"Getting detailed info for movie: {movie['movie_name']} ({movie_id})")
                genre_value, rating_value, vote_value = extract_movie_details(movie_id)
                
                # Store complete movie data
                unique_movies[movie_id] = {
                    'movie_id': movie_id,
                    'movie_name': movie['movie_name'],
                    'year': movie['year'],
                    'genre': genre_value,
                    'overview': movie['overview'],
                    'director': movie['director'],
                    'cast': movie['cast'],
                    'rating': rating_value,
                    'votes': vote_value,
                    'scrape_time': current_time,
                    'user_login': user_login
                }
                
                logging.info(f"Completed: {movie['movie_name']} | Genre: {genre_value or 'Not found'} | Rating: {rating_value or 'Not found'} | Votes: {vote_value or 'Not found'}")
                
                # Add random delay between 1-3 seconds to avoid being blocked
                time.sleep(1 + np.random.rand() * 2)
            
            # Add a longer delay between pages
            time.sleep(3 + np.random.rand() * 2)  # Random delay between 3-5 seconds
            
        except Exception as e:
            logging.error(f"Error processing page {page_num}: {e}")
    
    # Convert the dictionary to a DataFrame
    df = pd.DataFrame(list(unique_movies.values()))
    
    logging.info(f"Total unique movies scraped: {len(df)}")
    return df

# Run a test on first few pages
test_df = scrape_imdb_movies(start_page=1, end_page=2, max_movies_per_page=10)

  current_time = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Found 25 movie items on page 1
Found 25

In [65]:
# Cell 7: Display the test results
test_df.head()

Unnamed: 0,movie_id,movie_name,year,genre,overview,director,cast,rating,votes,scrape_time,user_login
0,tt26229612,1. The Diplomat,2025,"Drama, Thriller",An Indian diplomat who tries to repatriate an ...,,,7.2,8822,2025-05-15 09:48:11,Ramji-Purwar
1,tt3562110,2. Kesari Chapter 2: The Untold Story of Jalli...,2025,"Biography, Drama, History",A dramatization of the life story of C. Sankar...,,,8.3,6668,2025-05-15 09:48:11,Ramji-Purwar
2,tt27162102,3. The Bhootnii,2025,"Action, Comedy, Horror",St Vincent college has a tree in their compoun...,,,6.5,10021,2025-05-15 09:48:11,Ramji-Purwar
3,tt27540217,4. Good Bad Ugly,2025,"Action, Crime, Drama",A fearless don tries to change his ruthless wa...,,,6.0,9644,2025-05-15 09:48:11,Ramji-Purwar
4,tt31529147,5. Odela 2,2025,"Adventure, Crime, Drama","In a remote village, steeped in rich cultural ...",,,5.9,3516,2025-05-15 09:48:11,Ramji-Purwar


In [66]:
# Cell 8: Check for missing values and duplicates
print("Missing values per column:")
print(test_df.isnull().sum())

print("\nTotal rows:", len(test_df))
print("Unique movie IDs:", test_df['movie_id'].nunique())

# Check for duplicate movie IDs (should be 0)
duplicate_ids = test_df[test_df.duplicated(subset=['movie_id'], keep=False)]
print(f"\nDuplicate movie IDs: {len(duplicate_ids)}")
if len(duplicate_ids) > 0:
    print(duplicate_ids[['movie_id', 'movie_name']])

# Check unique page numbers (based on movie position numbers)
if 'movie_name' in test_df.columns:
    # Extract position numbers from movie names (if they have format like "1. Movie Title")
    position_pattern = re.compile(r'^\d+\.')
    has_position = test_df['movie_name'].str.contains(position_pattern, regex=True)
    if has_position.any():
        positions = test_df.loc[has_position, 'movie_name'].str.extract(r'^(\d+)\.', expand=False).astype(int)
        unique_positions = sorted(positions.unique())
        print(f"\nUnique position numbers: {unique_positions}")
        print(f"Min position: {min(positions) if len(positions) > 0 else 'N/A'}")
        print(f"Max position: {max(positions) if len(positions) > 0 else 'N/A'}")

Missing values per column:
movie_id        0
movie_name      0
year            0
genre           0
overview        0
director       10
cast           10
rating          0
votes           0
scrape_time     0
user_login      0
dtype: int64

Total rows: 10
Unique movie IDs: 10

Duplicate movie IDs: 0

Unique position numbers: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10)]
Min position: 1
Max position: 10


In [67]:
# Cell 9: Save the test results
test_df.to_csv('Movie_Dataset_Test_MultiPage.csv', index=False)
print(f"Saved {len(test_df)} unique movie records to Movie_Dataset_Test_MultiPage.csv")

Saved 10 unique movie records to Movie_Dataset_Test_MultiPage.csv


In [68]:
# Cell 10: Run the full scraping with more pages
# Uncomment this code when you're ready for the full scrape

full_df = scrape_imdb_movies(
    start_page=1,       # Starting page
    end_page=4,         # Ending page - increase this for more results
    max_movies_per_page=50  # Maximum movies to process per page
)

# Save to CSV with timestamp
import datetime
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'Movie_Dataset_Complete_{timestamp}.csv'
full_df.to_csv(filename, index=False)
print(f"Saved {len(full_df)} unique movie records to {filename}")

  current_time = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Scraping page 1 (start=1): https://www.imdb.com/search/title/?title_type=feature&countries=in&languages=hi&start=1&ref_=adv_nxt
Found 25 movie items on page 1
Found 25

Saved 25 unique movie records to Movie_Dataset_Complete_20250515_095200.csv
