In [4]:
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [5]:
# Search-results page to scrape: feature films with a release date in 2024.
IMDB_URL = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature"
    "&release_date=2024-01-01,2024-12-31"
)

# CSV file the scraped records are written to.
output_file = "imdb_2024_movies.csv"

# IMDb selectors
# XPath of the button wrapping the "50 more" label (clicked to page in results).
load_more_xpath = "//span[text()='50 more']/ancestor::button"
# CSS selector matching one list item per movie on the results page.
movie_card_selector = "li.ipc-metadata-list-summary-item"
# CSS selectors resolved inside a single movie card.
title_selector = "h3.ipc-title__text"
storyline_selector = "div.ipc-html-content-inner-div"

In [6]:
def _strip_rank_prefix(title_text):
    """Return ``title_text`` without a leading list rank such as ``"12. "``.

    Only a purely numeric prefix (thousands separators allowed) is removed, so
    a title that merely contains ". " (e.g. "Mr. Nobody") is returned intact.
    """
    rank, separator, remainder = title_text.partition(". ")
    if separator and rank.replace(",", "").isdigit():
        return remainder
    return title_text


def scrape_imdb_movies(url):
    """Scrape movie titles and storylines from an IMDb search-results page.

    Opens ``url`` in a Chrome WebDriver session, keeps clicking the "50 more"
    button until it can no longer be found (or stops adding results), then
    reads the title and storyline out of every movie card on the page.

    Parameters
    ----------
    url : str
        Address of the IMDb search-results page to load.

    Returns
    -------
    list[dict]
        One ``{"Movie Title": ..., "Storyline": ...}`` dict per card that could
        be parsed. Cards that fail to parse are reported on stdout and skipped.

    Notes
    -----
    The browser is always closed on exit, including when an exception
    propagates out of the scrape.
    """
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)

    movies_data = []

    try:
        driver.get(url)
        driver.maximize_window()
        time.sleep(3)

        # ------------------------------
        # CLICK "50 MORE" UNTIL FINISHED
        # ------------------------------
        # Counting cards in the page is much cheaper than shipping every
        # element handle back through WebDriver on each iteration.
        count_cards_js = "return document.querySelectorAll(arguments[0]).length;"
        # Give up after this many consecutive clicks that add no new cards, so
        # a button that stays clickable but loads nothing cannot loop forever.
        max_stalled_clicks = 5
        stalled_clicks = 0
        loaded_count = driver.execute_script(count_cards_js, movie_card_selector)

        while True:
            try:
                load_more_btn = wait.until(
                    EC.element_to_be_clickable((By.XPATH, load_more_xpath))
                )

                driver.execute_script(
                    "arguments[0].scrollIntoView({block:'center'});",
                    load_more_btn
                )
                time.sleep(1)

                # JavaScript click avoids overlay issues
                driver.execute_script("arguments[0].click();", load_more_btn)
                time.sleep(2)

            except TimeoutException:
                # The button never became clickable: treat as end of results.
                print("No more movies to load")
                break

            current_count = driver.execute_script(
                count_cards_js, movie_card_selector
            )
            if current_count > loaded_count:
                loaded_count = current_count
                stalled_clicks = 0
            else:
                stalled_clicks += 1
                if stalled_clicks >= max_stalled_clicks:
                    print("Load-more button stopped adding movies; stopping")
                    break

        print("All movies loaded")

        # ------------------------------
        # EXTRACT ALL MOVIE DATA
        # ------------------------------
        movie_cards = driver.find_elements(By.CSS_SELECTOR, movie_card_selector)
        print(f"Total movies extracted: {len(movie_cards)}")

        for movie in movie_cards:
            try:
                # Movie Title
                title_text = movie.find_element(By.CSS_SELECTOR, title_selector).text
                title = _strip_rank_prefix(title_text)
                if not title:
                    print("Error extracting movie: empty title")
                    continue

                # Storyline: find_elements returns an empty list when nothing
                # matches, so a missing storyline needs no exception handling.
                storyline_matches = movie.find_elements(
                    By.CSS_SELECTOR, storyline_selector
                )
                if storyline_matches:
                    storyline = storyline_matches[0].text
                else:
                    storyline = "No storyline available"

                movies_data.append({
                    "Movie Title": title,
                    "Storyline": storyline
                })

            except Exception as e:
                # Best effort: one broken card must not abort the whole scrape.
                print("Error extracting movie:", e)

        return movies_data

    finally:
        driver.quit()


# ==============================
# SAVE TO CSV
# ==============================
def save_to_csv(data, filename):
    """Write the scraped records to a UTF-8 CSV file and report the row count.

    Parameters
    ----------
    data : list[dict]
        Records to store; each dict becomes one row.
    filename : str
        Path of the CSV file to create or overwrite.
    """
    frame = pd.DataFrame(data)
    row_total = len(frame)
    # index=False keeps the DataFrame's positional index out of the file.
    frame.to_csv(filename, encoding="utf-8", index=False)
    print(f"Saved {row_total} movies to {filename}")


# ==============================
# MAIN
# ==============================
if __name__ == "__main__":
    # Run the scrape, then persist the results only when something came back.
    movies = scrape_imdb_movies(IMDB_URL)

    if not movies:
        print("No movies scraped")
    else:
        save_to_csv(movies, output_file)

No more movies to load
All movies loaded
Total movies extracted: 10000
Saved 10000 movies to imdb_2024_movies.csv
