In [1]:
import requests
import time
import os
import json
import pandas as pd
from dotenv import load_dotenv

# Load environment variables (python-dotenv) before reading any of them below.
load_dotenv()

# --- TMDB access -----------------------------------------------------------
# The key is read from the environment; it is None when TMDB_API_KEY is unset.
API_KEY = os.environ.get("TMDB_API_KEY")
BASE_URL = "https://api.themoviedb.org/3"

# --- Local artefacts (all paths are relative to the working directory) ------
CACHE_DIR = "tmdb_cache"                 # one JSON file per fetched page
OUTPUT_FILE = "../data/movie_ids.csv"    # collected movie IDs
CHECKPOINT_FILE = "checkpoint.txt"       # next page number to process

# The cache directory must exist before any page is written into it.
os.makedirs(CACHE_DIR, exist_ok=True)


def fetch_page(page, max_retries=5, timeout=10):
    """Fetch one page of the TMDB ``/discover/movie`` listing, using a disk cache.

    A page found in ``CACHE_DIR`` is returned without touching the network.
    Otherwise the page is requested with a bounded number of attempts:
    rate-limit responses (HTTP 429) and network-level errors are retried,
    any other non-200 status is reported and gives up immediately.

    Args:
        page: 1-based page number to request.
        max_retries: Maximum number of HTTP attempts before giving up.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The decoded JSON payload (a dict) on success, or ``None`` when the
        page could not be fetched.
    """
    cache_path = os.path.join(CACHE_DIR, f"page_{page}.json")

    if os.path.exists(cache_path):
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except ValueError:
            # A truncated or corrupt cache entry must not break every later
            # run; fall through and download the page again.
            print(f"Cache file for page {page} is unreadable; refetching")

    if not API_KEY:
        print(f"TMDB_API_KEY is not set; cannot fetch page {page}")
        return None

    url = f"{BASE_URL}/discover/movie"
    params = {
        "api_key": API_KEY,
        "language": "en-US",
        "sort_by": "popularity.desc",
        "page": page
    }

    for attempt in range(1, max_retries + 1):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, params=params, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Only the exception class is printed: the full message can embed
            # the request URL, which carries the API key as a query parameter.
            print(
                f"Request for page {page} failed ({type(exc).__name__}); "
                f"attempt {attempt}/{max_retries}"
            )
            time.sleep(min(2 ** attempt, 30))  # capped exponential backoff
            continue

        if response.status_code == 200:
            data = response.json()

            # Save to cache atomically: write a sibling temp file, then rename,
            # so an interrupted write never leaves a half-written cache entry.
            tmp_path = cache_path + ".tmp"
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(data, f)
            os.replace(tmp_path, cache_path)

            time.sleep(0.25)  # Safe delay
            return data

        if response.status_code == 429:
            # Honour a numeric Retry-After header when the server sends one;
            # otherwise fall back to the fixed 5 second pause.
            try:
                wait_seconds = float(response.headers.get("Retry-After", ""))
            except (TypeError, ValueError):
                wait_seconds = 5.0
            wait_seconds = min(max(wait_seconds, 1.0), 60.0)
            print(f"Rate limit hit. Sleeping {wait_seconds:g} seconds...")
            time.sleep(wait_seconds)
            continue

        print(f"Error {response.status_code} on page {page}")
        return None

    print(f"Giving up on page {page} after {max_retries} attempts")
    return None


def get_movie_ids():
    """Collect movie IDs page by page into ``OUTPUT_FILE``, resuming if possible.

    ``CHECKPOINT_FILE`` holds the next page to process. When resuming from a
    page greater than 1, the IDs already stored in ``OUTPUT_FILE`` are loaded
    first so that rewriting the CSV does not discard earlier pages. If they
    cannot be restored, collection restarts from page 1.

    Collection stops at the first page that cannot be fetched. The checkpoint
    is left pointing at that page, so the next run retries it rather than
    skipping it for good.

    Returns:
        The number of IDs held after this run (restored plus newly collected),
        or ``None`` when page 1 could not be fetched to learn the page count.
    """
    all_ids = []

    # Resume from checkpoint
    start_page = 1
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r") as f:
            raw_checkpoint = f.read().strip()
        try:
            start_page = max(int(raw_checkpoint), 1)
        except ValueError:
            print(f"Ignoring unreadable checkpoint {raw_checkpoint!r}; starting from page 1")

    if start_page > 1:
        # The CSV is rewritten from all_ids after every page, so the IDs from
        # pages before the checkpoint must be loaded back in first.
        try:
            all_ids = pd.read_csv(OUTPUT_FILE)["Movie_id"].tolist()
        except (OSError, KeyError, ValueError):
            print("Could not restore previously saved IDs; restarting from page 1")
            all_ids = []
            start_page = 1

    print(f"Starting from page {start_page}")

    # Page 1 always exists, whereas a checkpoint left by a finished run points
    # one past the last page; probing page 1 keeps re-runs from failing.
    first_data = fetch_page(1)
    if not first_data:
        print("Failed to fetch first page.")
        return None

    # NOTE(review): the 1000-page cap is kept from the original code; confirm
    # it against the API's current maximum page number.
    total_pages = min(first_data.get("total_pages", 1), 1000)
    print(f"Total pages available: {total_pages}")

    # Make sure the CSV's parent directory exists before the first write.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for page in range(start_page, total_pages + 1):

        data = fetch_page(page)
        if not data or "results" not in data:
            # Do not advance past a failed page: the checkpoint still names
            # it, so a later run picks up exactly here.
            print(f"Stopping at page {page}: fetch failed. Re-run to resume.")
            return len(all_ids)

        all_ids.extend(movie["id"] for movie in data["results"])

        # Save progress
        pd.DataFrame({"Movie_id": all_ids}).to_csv(OUTPUT_FILE, index=False)

        with open(CHECKPOINT_FILE, "w") as f:
            f.write(str(page + 1))

        print(f"Page {page}/{total_pages} | Total IDs: {len(all_ids)}")

    print("Completed successfully.")
    return len(all_ids)

In [None]:
# Entry point: run the collection and report how many IDs it returned.
if __name__ == "__main__":
    collected_count = get_movie_ids()
    print(f"Final total IDs collected: {collected_count}")

Starting from page 1
Total pages available: 1000
Page 1/1000 | Total IDs: 20
Page 2/1000 | Total IDs: 40
Page 3/1000 | Total IDs: 60
Page 4/1000 | Total IDs: 80
Page 5/1000 | Total IDs: 100
Page 6/1000 | Total IDs: 120
Page 7/1000 | Total IDs: 140
Page 8/1000 | Total IDs: 160
Page 9/1000 | Total IDs: 180
Page 10/1000 | Total IDs: 200
Page 11/1000 | Total IDs: 220
Page 12/1000 | Total IDs: 240
Page 13/1000 | Total IDs: 260
