# Crawl the IMDb website for the top-grossing movies of each year and their details

In [5]:
# Install the two third-party packages this notebook imports below
# (selenium for browser automation, bs4 for HTML parsing) into the kernel's environment.
# NOTE(review): versions are not pinned, so a later re-run may install different releases — consider pinning.
%pip install selenium
%pip install bs4

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import pandas as pd
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.options import Options

In [7]:
# Column order of the per-year listing CSV. Passing it explicitly keeps the
# "Movie Link" column present even when no films were parsed.
# NOTE: "méta_score" (accented) is kept as-is because it is an output column name.
_LISTING_COLUMNS = [
    "Title",
    "Year",
    "Duration",
    "MPA",
    "Rating",
    "Votes",
    "méta_score",
    "description",
    "Movie Link",
]

# Column order of the per-movie detail CSV (same reasoning as above).
_DETAIL_COLUMNS = ["link", "writers", "director", "budget"]

# Class shared by the value elements inside IMDb's metadata list items.
_LIST_CONTENT_ITEM_CLASS = "ipc-metadata-list-item__list-content-item"


def _text_or_none(element):
    """Return ``element.text``, or None when the element was not found."""
    return element.text if element is not None else None


def _find_by_class(parent, tag_name, exact_class, stable_class):
    """Find the first ``tag_name`` under ``parent``, tolerating class-hash changes.

    The exact class strings used by this crawler contain tokens that look
    auto-generated (e.g. ``sc-748571c8-0 gFCVNT``). The exact string is tried
    first so existing matches are unchanged; if it no longer matches, the
    lookup falls back to the single descriptive class token.
    """
    found = parent.find(tag_name, class_=exact_class)
    if found is None:
        found = parent.find(tag_name, class_=stable_class)
    return found


def _parse_film_item(film):
    """Convert one search-result ``<li>`` into a row dict for the listing CSV."""
    metadata_div = _find_by_class(
        film, "div", "sc-300a8231-6 dBUjvq dli-title-metadata", "dli-title-metadata"
    )
    # A missing metadata block must not abort the whole year: treat it as
    # "no spans" so year/duration/MPA simply come out as None.
    spans = metadata_div.find_all("span") if metadata_div is not None else []
    span_texts = [span.text for span in spans[:3]]
    year_text, duration, mpa = (span_texts + [None] * 3)[:3]

    link_tag = film.find("a", class_="ipc-lockup-overlay ipc-focusable")
    href = link_tag.get("href") if link_tag is not None else None
    movie_link = f"https://www.imdb.com{href}" if href else None

    vote_count_info = film.find("span", class_="ipc-rating-star--voteCount")
    # Drop non-breaking spaces, then the first and last characters
    # (presumably the surrounding parentheses, e.g. "(1.2M)" -> "1.2M").
    vote_count = (
        vote_count_info.text.strip().replace("\xa0", "")[1:-1]
        if vote_count_info is not None
        else None
    )

    meta_score_info = _find_by_class(
        film,
        "span",
        "sc-b0901df4-0 bXIOoF metacritic-score-box",
        "metacritic-score-box",
    )

    description_div = film.find("div", class_="ipc-html-content-inner-div")
    description = description_div.text.strip() if description_div is not None else None

    return {
        "Title": _text_or_none(film.find("h3", class_="ipc-title__text")),
        "Year": year_text,
        "Duration": duration,
        "MPA": mpa,
        "Rating": _text_or_none(film.find("span", class_="ipc-rating-star--rating")),
        "Votes": vote_count,
        "méta_score": _text_or_none(meta_score_info),
        "description": description,
        "Movie Link": movie_link,
    }


def _extract_movie_details(soup, url):
    """Extract writers, director and budget from a parsed movie detail page.

    Each lookup works on the located *element* (not its text), and every
    missing element yields None instead of raising.
    """
    # NOTE(review): selectors are kept from the original code; "first <li> with
    # role=presentation" may not be the writers row on every page — confirm
    # against the live markup.
    writers = None
    writers_item = soup.find("li", {"role": "presentation"})
    if writers_item is not None:
        writers = [
            anchor.text
            for anchor in writers_item.find_all("a", class_=_LIST_CONTENT_ITEM_CLASS)
        ]

    director = None
    director_item = soup.find("li", {"class": "ipc-metadata-list__item"})
    if director_item is not None:
        director = _text_or_none(
            director_item.find("a", class_=_LIST_CONTENT_ITEM_CLASS)
        )

    budget = None
    budget_item = soup.find("li", {"data-testid": "title-boxoffice-budget"})
    if budget_item is not None:
        budget_span = budget_item.find("span", {"class": _LIST_CONTENT_ITEM_CLASS})
        if budget_span is not None:
            # Narrow no-break spaces become commas; plain non-breaking spaces are dropped.
            budget = budget_span.text.replace("\u202f", ",").replace("\xa0", "")

    return {"link": url, "writers": writers, "director": director, "budget": budget}


def crawl_imdb_movies(
    year: int, max_movies: int = 500, driver_path: str = "edgedriver.exe"
):
    """
    Crawl IMDb movie data for a specific year

    Opens the IMDb feature-film search for ``year`` sorted by US box-office
    gross, clicks "100 more" until ``max_movies`` rows are requested (or the
    button is gone), parses the listing, then visits each movie page for
    writers / director / budget.

    Three CSV files are written under ``Data/<year>/``:
    ``imdb_movies_<year>.csv``, ``advanced_movies_details_<year>.csv`` and
    ``merged_movies_data_<year>.csv``.

    Args:
        year (int): Year to crawl movie data for
        max_movies (int): Upper bound on listing rows to load, in steps of 100
            (the search page starts with 100). Defaults to 500.
        driver_path (str): Path to the Edge WebDriver executable.

    Returns:
        pandas.DataFrame | None: Listing rows inner-joined with their detail
        rows on "Movie Link", or None if the crawl failed (the error is printed).
    """
    # Create output directories
    output_dir = os.path.join("Data", str(year))
    os.makedirs(output_dir, exist_ok=True)

    # Construct URL with dynamic year
    search_url = f"https://www.imdb.com/search/title/?title_type=feature&release_date={year}-01-01,{year}-12-31&count=100&sort=boxoffice_gross_us,desc"

    # Configure Edge options
    options = Options()
    for argument in (
        "--lang=en-US",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--disable-gpu",
        "--disable-extensions",
        "--disable-browser-side-navigation",
        "--disable-infobars",
        "--disable-blink-features=AutomationControlled",
    ):
        options.add_argument(argument)

    # Setup WebDriver
    service = Service(executable_path=driver_path)
    driver = webdriver.Edge(service=service, options=options)

    try:
        # Navigate to page
        driver.get(search_url)
        time.sleep(3)  # Initial page load wait

        # Load more movies, 100 at a time, until the cap is reached
        loaded_data = 100
        while loaded_data < max_movies:
            try:
                # Locate and click "Load More" button
                load_more_button = WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located(
                        (
                            By.XPATH,
                            "//button[contains(@class, 'ipc-btn') and .//span[contains(text(), '100 more')]]",
                        )
                    )
                )

                # Scroll to the button and click it via JavaScript
                driver.execute_script(
                    "arguments[0].scrollIntoView(true);", load_more_button
                )
                driver.execute_script("arguments[0].click();", load_more_button)

                time.sleep(10)  # Wait for new data
                loaded_data += 100

            except Exception as e:
                # Best effort: a missing button just means fewer results exist.
                print(f"No more 'Load More' button or error: {e}")
                break

        # Parse page content
        listing_soup = BeautifulSoup(driver.page_source, "html.parser")
        films = _find_by_class(
            listing_soup,
            "ul",
            "ipc-metadata-list ipc-metadata-list--dividers-between sc-748571c8-0 gFCVNT detailed-list-view ipc-metadata-list--base",
            "detailed-list-view",
        )
        if films is None:
            raise RuntimeError(f"movie list not found on search page for {year}")

        # Extract movie metadata
        films_data = [
            _parse_film_item(film)
            for film in films.find_all("li", class_="ipc-metadata-list-summary-item")
        ]

        # Save initial movies data
        initial_movies_df = pd.DataFrame(films_data, columns=_LISTING_COLUMNS)
        initial_movies_path = os.path.join(output_dir, f"imdb_movies_{year}.csv")
        initial_movies_df.to_csv(initial_movies_path, index=False)

        # Advanced movie details collection (rows without a link are skipped)
        all_movie_data = []
        for movie_link in initial_movies_df["Movie Link"].dropna():
            try:
                driver.get(movie_link)
                detail_soup = BeautifulSoup(driver.page_source, "html.parser")
                all_movie_data.append(_extract_movie_details(detail_soup, movie_link))
            except Exception as e:
                # One failing page must not stop the remaining movies.
                print(f"Error processing {movie_link}: {e}")

        # Save advanced movie details (written with the original "link" column name)
        advanced_movies_df = pd.DataFrame(all_movie_data, columns=_DETAIL_COLUMNS)
        advanced_movies_path = os.path.join(
            output_dir, f"advanced_movies_details_{year}.csv"
        )
        advanced_movies_df.to_csv(advanced_movies_path, index=False)

        # Merge datasets
        merged_data = pd.merge(
            initial_movies_df,
            advanced_movies_df.rename(columns={"link": "Movie Link"}),
            how="inner",
            on="Movie Link",
        )
        merged_path = os.path.join(output_dir, f"merged_movies_data_{year}.csv")
        merged_data.to_csv(merged_path, index=False)

        return merged_data

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    finally:
        # Always release the browser, even after a failure.
        driver.quit()

In [8]:
# Years to crawl: 1966-2013 followed by 2019-2023 (2014-2018 are deliberately
# left out here, exactly as in the original two loops).
# NOTE(review): each year drives a real browser and can take a long time;
# the whole cell re-crawls every listed year on each run.
years_to_crawl = [*range(1966, 2014), *range(2019, 2024)]
for year in years_to_crawl:
    print(f"Crawling data for year {year}")
    crawl_imdb_movies(year)

Crawling data for year 1960
Crawling data for year 1961
Crawling data for year 1962
Crawling data for year 1963
Crawling data for year 1964
Crawling data for year 1965
Crawling data for year 1966
Error processing https://www.imdb.com/title/tt0144934/?ref_=sr_i_342: object of type 'NoneType' has no len()
Error processing https://www.imdb.com/title/tt0060689/?ref_=sr_i_343: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: MicrosoftEdge=131.0.2903.86)
Stacktrace:
	(No symbol) [0x00007FF6415C6B05]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6418EF474+1437348]
	sqlite3_dbdata_init [0x00007FF641992DA6+643174]
	(No symbol) [0x00007FF641487657]
	(No symbol) [0x00007FF641511CE5]
	(No symbol) [0x00007FF64152670A]
	(No symbol) [0x00007FF64150BE03]
	(No symbol) [0x00007FF6414E2984]
	(No symbol) [0x00007FF6414E1E30]
	(No symbol) [0x00007FF6414E2571]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6418

KeyboardInterrupt: 