In [46]:
import csv
import html
import json
import os

import requests
from bs4 import BeautifulSoup

# Request headers passed to every requests.get call in this notebook.
HEADERS = {"User-Agent": "Mozilla/5.0"}

In [47]:
# Extract additional details from individual movie page
def _person_names(people):
    """Return the names found in a JSON-LD person field.

    The field may be a single dict, a list of dicts, or missing entirely;
    entries without a string ``name`` are skipped. HTML entities in names
    (e.g. ``&apos;``) are decoded.
    """
    if isinstance(people, dict):
        people = [people]
    elif not isinstance(people, list):
        return []
    return [
        html.unescape(person["name"])
        for person in people
        if isinstance(person, dict) and isinstance(person.get("name"), str)
    ]


def get_detail_info(url):
    """Fetch a movie page and read release year, director and cast from its JSON-LD.

    Args:
        url: Address of the individual movie page.

    Returns:
        dict with the keys "Release Year", "Director" and "Main Cast".
        Each value is "N/A" when the corresponding field is missing from the
        page; all three are "N/A" when the page cannot be fetched or its
        JSON-LD block cannot be parsed (a warning is printed in that case).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Surface 4xx/5xx responses as a clear error instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        script_tag = soup.find("script", type="application/ld+json")
        if script_tag is None or not script_tag.string:
            raise ValueError("no application/ld+json script found")
        data = json.loads(script_tag.string)
        if not isinstance(data, dict):
            raise ValueError("JSON-LD payload is not an object")
    except Exception as e:
        # Best-effort scrape: one bad page must not abort the whole run.
        print(f"⚠️ Failed to fetch details for {url}: {e}")
        return {
            "Release Year": "N/A",
            "Director": "N/A",
            "Main Cast": "N/A"
        }

    # Each field is extracted independently so one malformed value
    # does not discard the others.
    published = data.get("datePublished")
    if isinstance(published, str) and published:
        release_year = published.split("-")[0]
    else:
        release_year = "N/A"

    directors = _person_names(data.get("director"))
    cast = _person_names(data.get("actor"))

    return {
        "Release Year": release_year,
        "Director": directors[0] if directors else "N/A",
        "Main Cast": ", ".join(cast) or "N/A"
    }

In [48]:
def _unescape_text(value):
    """Decode HTML entities (e.g. ``&apos;``) in strings; return other values untouched."""
    return html.unescape(value) if isinstance(value, str) else value


def get_movies(page):
    """Scrape one page of the IMDb list and return a record per movie.

    The list page's ``application/ld+json`` scripts are searched for an
    ``ItemList``; for each entry the basic fields are read from the list
    and ``get_detail_info`` is called on the entry's URL for the rest.

    Args:
        page: Page number of the list, as a str or an int.

    Returns:
        list of dicts with the keys "Title", "Genre", "Rating",
        "Description", "Release Year", "Director", "Main Cast" and "Url".
        Missing list fields are reported as "N/A".

    Raises:
        requests.RequestException: if the list page cannot be fetched or
            answers with an HTTP error status.
    """
    # IMDb list page URL (the f-string accepts the page as str or int)
    list_url = f"https://www.imdb.com/list/ls057823854/?sort=popularity%2Casc&page={page}"
    # Same timeout as the detail-page requests so a stalled connection cannot hang the cell.
    response = requests.get(list_url, headers=HEADERS, timeout=10)
    # Fail loudly on 4xx/5xx rather than returning an empty list that looks like success.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    scripts = soup.find_all("script", type="application/ld+json")

    # Look for movie entries within scripts
    movies = []

    for script in scripts:
        # Only the JSON decoding is guarded: a script with no text (None) or
        # invalid JSON is skipped, but errors while processing entries are
        # no longer swallowed along with the rest of the list.
        try:
            data = json.loads(script.string)
        except (json.JSONDecodeError, TypeError):
            continue

        if not isinstance(data, dict) or data.get("@type") != "ItemList":
            continue

        for entry in data.get("itemListElement") or []:
            movie = entry.get("item") or {}
            rating_info = movie.get("aggregateRating") or {}
            movie_url = movie.get("url", "N/A")

            movie_detail = get_detail_info(movie_url)

            movie_info = {
                # Text fields arrive HTML-escaped in the JSON-LD payload.
                "Title": _unescape_text(movie.get("name", "N/A")),
                "Genre": _unescape_text(movie.get("genre", "N/A")),
                "Rating": rating_info.get("ratingValue", "N/A"),
                "Description": _unescape_text(movie.get("description", "N/A")),
                **movie_detail,
                "Url": movie_url,
            }

            movies.append(movie_info)
            print("✅ Extracted and saved", movie_info["Title"])

    print("✅ Extracted movies")
    return movies

In [49]:
def save_movies(page, movies):
    """Write the scraped movie records to a per-page CSV file.

    The file is ./scrap_datasets/imdb_movies_page_<page>.csv; the directory
    is created when it does not exist yet, and an existing file is overwritten.

    Args:
        page: Page number used in the file name, as a str or an int.
        movies: Iterable of dicts whose keys are the CSV fieldnames below.
    """
    # CSV fieldnames
    fieldnames = ["Title", "Genre", "Rating", "Description", "Release Year", "Director", "Main Cast", "Url"]

    output_dir = "./scrap_datasets"
    # open() does not create parent directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    filename = f"imdb_movies_page_{page}.csv"

    # Write to CSV
    with open(os.path.join(output_dir, filename), "w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(movies)

    print(f"✅ Extracted and saved to {filename}")

In [56]:
# Page of the IMDb list to scrape; given as a string and passed to get_movies and save_movies.
page = "40"

In [57]:
# Scrape the selected list page; get_movies also requests each movie's own page for details.
movies = get_movies(page)

✅ Extracted and saved If I Should Fall from Grace: The Shane MacGowan Story
✅ Extracted and saved Atatürk: Founder of Modern Turkey
✅ Extracted and saved Elia Kazan: A Director&apos;s Journey
✅ Extracted and saved The Scrapper
✅ Extracted and saved Govt. vs Green
✅ Extracted and saved Kevin James: Sweat the Small Stuff
✅ Extracted and saved Big Boys Gone Bananas!*
✅ Extracted and saved Arabian Nights
✅ Extracted and saved Septimus Heap: Magyk
✅ Extracted and saved The City That Sailed
✅ Extracted and saved Magic Kingdom for Sale
✅ Extracted and saved The Man from Primrose Lane
✅ Extracted and saved The Dive
✅ Extracted and saved Attack of La Niña
✅ Extracted and saved The Tramp and the Dictator
✅ Extracted and saved Life of Python
✅ Extracted and saved Patton Oswalt: No Reason to Complain
✅ Extracted and saved Chronicles of Nick
✅ Extracted and saved The Tragedy of Coriolanus
✅ Extracted and saved IBM and the Holocaust
✅ Extracted and saved For Neda
✅ Extracted and saved King of Heists

In [58]:
# Write the scraped rows to ./scrap_datasets/imdb_movies_page_<page>.csv.
save_movies(page, movies)

✅ Extracted and saved to imdb_movies_page_40.csv
