In [None]:
import requests
import json
import pandas as pd
import time

# --- TMDb API configuration ---
# NOTE(review): "api_key" looks like a placeholder — confirm a real key is
# supplied before running.
API_KEY = "api_key"
BASE_URL = "https://api.themoviedb.org/3"
DISCOVER_URL = BASE_URL + "/discover/movie"

# Destination CSV for the collected movie records
OUTPUT_FILE = "tmdb_movies.csv"

# --- Crawl settings ---
TOTAL_PAGES = 500  # discover pages to walk (1..TOTAL_PAGES inclusive)
BATCH_SAVE_PAGES = 10  # rewrite the CSV after every 10 pages
SLEEP_BETWEEN_MOVIES = 0.15  # pause (seconds) after each movie detail request
SLEEP_BETWEEN_PAGES = 0.2  # pause (seconds) after each discover page

# Accumulates one flat record (dict) per successfully fetched movie
all_movies = []

def fetch_movies(page, timeout=10):
    """Fetch one page of movies from /discover/movie, most popular first.

    Args:
        page: 1-based page number to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The list stored under ``"results"`` in the JSON response, or an
        empty list when that key is missing or null.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        requests.ConnectionError: If the connection cannot be established.
    """
    params = {
        "api_key": API_KEY,
        "language": "en-US",
        "page": page,
        "sort_by": "popularity.desc",
    }
    response = requests.get(DISCOVER_URL, params=params, timeout=timeout)
    response.raise_for_status()
    # `or []` also covers an explicit null, not just a missing key.
    return response.json().get("results") or []

def fetch_movie_details(movie_id, timeout=10):
    """Fetch full details for one movie, including credits and keywords.

    Credits and keywords are requested in the same call through
    ``append_to_response``. Nested list fields (genres, keywords, cast,
    crew) are serialized to JSON strings so the record is flat.

    Args:
        movie_id: TMDb movie identifier.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A dict with keys ``movie_id``, ``title``, ``genres``, ``overview``,
        ``keywords``, ``cast``, ``crew``, ``vote_average`` and ``vote_count``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        requests.ConnectionError: If the connection cannot be established.
    """
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {
        "api_key": API_KEY,
        "append_to_response": "credits,keywords",
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # `or {}` tolerates a null sub-object as well as a missing key, so the
    # chained lookups below cannot fail with AttributeError.
    credits = data.get("credits") or {}
    keywords = data.get("keywords") or {}

    return {
        "movie_id": data.get("id"),
        "title": data.get("title"),
        "genres": json.dumps(data.get("genres", [])),
        "overview": data.get("overview"),
        "keywords": json.dumps(keywords.get("keywords", [])),
        "cast": json.dumps(credits.get("cast", [])),
        "crew": json.dumps(credits.get("crew", [])),
        "vote_average": data.get("vote_average"),
        "vote_count": data.get("vote_count"),
    }

def _save_movies(movies, path):
    """Write the collected movie records to *path* as CSV, overwriting it."""
    pd.DataFrame(movies).to_csv(path, index=False)


# Crawl the discover pages, fetching full details for every listed movie.
# The CSV is rewritten every BATCH_SAVE_PAGES pages and once more on exit, so
# an aborted run keeps everything collected up to the point of failure.
crawl_completed = False
try:
    for page in range(1, TOTAL_PAGES + 1):
        movies = fetch_movies(page)
        for movie in movies:
            movie_id = movie["id"]
            try:
                all_movies.append(fetch_movie_details(movie_id))
            except requests.HTTPError as e:
                # One failing movie should not abort the whole crawl.
                print(f"Failed to fetch movie {movie_id}: {e}")
            time.sleep(SLEEP_BETWEEN_MOVIES)

        print(f"Page {page} done, total movies collected: {len(all_movies)}")
        time.sleep(SLEEP_BETWEEN_PAGES)

        # Save batch every BATCH_SAVE_PAGES pages
        if page % BATCH_SAVE_PAGES == 0:
            _save_movies(all_movies, OUTPUT_FILE)
            print(f"Saved batch after page {page} to {OUTPUT_FILE}")

    crawl_completed = True

except requests.ConnectionError as conn_err:
    print(f"Connection Error : {conn_err}")
except requests.HTTPError as http_err:
    print(f"HTTP Error : {http_err}")
except requests.Timeout as timeout_err:
    print(f"Timeout Error : {timeout_err}")
finally:
    # Runs on success, on the handled request errors above, and on
    # interruption (e.g. Ctrl-C), so records gathered since the last batch
    # save are not lost.
    if crawl_completed:
        _save_movies(all_movies, OUTPUT_FILE)
        print(f"All done! Total movies collected: {len(all_movies)}. Saved to {OUTPUT_FILE}")
    elif all_movies:
        # Skip the write when nothing was collected, so a failed first
        # request does not overwrite an existing CSV with an empty one.
        _save_movies(all_movies, OUTPUT_FILE)
        print(f"Stopped early. Saved {len(all_movies)} movies collected so far to {OUTPUT_FILE}")
