In [3]:
import os
import json
import time
import gzip
import requests
import pandas as pd
from datetime import datetime
from config import api_key

# Names of today's TMDb daily movie-ID export; the date is formatted as MM_DD_YYYY.
# NOTE(review): the date comes from the local clock - confirm the export for that
# date has already been published at the time this notebook is run.
EXPORT_DATE = f"{datetime.now():%m_%d_%Y}"
EXPORT_FILE = "movie_ids_" + EXPORT_DATE + ".json.gz"
EXPORT_URL = "http://files.tmdb.org/p/exports/" + EXPORT_FILE

## We used ChatGPT's help to download the available movie list from TMDb, load the gzip-compressed file, and create a request function for each movie (the following 3 functions).

In [4]:
# downloads the movie list (status code = 200 - means request approved)
def download_export_file():
    """Download the TMDb export at EXPORT_URL and store it as EXPORT_FILE.

    The body is streamed to a temporary "<EXPORT_FILE>.part" file and only
    renamed to EXPORT_FILE once the download has finished, so an interrupted
    download never leaves a truncated file that a later run would mistake
    for a complete one.

    Raises:
        SystemExit: with code 1 when the server answers with a non-200 status.
        Exception: any network/file error is re-raised after the partial
            file has been removed.
    """
    # timeout keeps the notebook from hanging forever on a stalled connection
    with requests.get(EXPORT_URL, stream=True, timeout=30) as response:
        if response.status_code != 200:
            print(f"Failed to download export file. Status code: {response.status_code}")
            # explicit SystemExit instead of the interactive-only `exit` helper
            raise SystemExit(1)

        partial_path = f"{EXPORT_FILE}.part"
        try:
            with open(partial_path, 'wb') as f:
                # write chunk by chunk instead of loading the whole file in memory
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
            os.replace(partial_path, EXPORT_FILE)
        except BaseException:
            # also covers KeyboardInterrupt: never leave a half-written file behind
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    print(f"Downloaded export file: {EXPORT_FILE}")

In [5]:
# loads the id's of the movies from the downloaded list
def load_movie_ids():
    """Return the 'id' of every entry in the gzip-compressed EXPORT_FILE.

    The file is read as UTF-8 text with one JSON object per line.
    Blank or whitespace-only lines are skipped so that a stray empty line
    does not abort the whole load.

    Returns:
        list: the 'id' values in file order.
    """
    with gzip.open(EXPORT_FILE, 'rt', encoding='utf-8') as f:
        return [json.loads(line)['id'] for line in f if line.strip()]

In [6]:
# requests the movie data from the API and returns data/error according to status code
def query_tmdb(movie_id, max_retries=5):
    """Fetch the details of one movie from the TMDb API.

    Args:
        movie_id: TMDb movie identifier inserted into the request URL.
        max_retries: how many times a rate-limited (HTTP 429) request is
            retried before giving up.

    Returns:
        The decoded JSON body on HTTP 200, otherwise None (non-200 status,
        rate limit still active after `max_retries` retries, or any
        exception raised while requesting/decoding).
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language=en-US"
    try:
        for attempt in range(max_retries + 1):
            # timeout so one stalled connection cannot block the whole scrape
            res = requests.get(url, timeout=10)
            if res.status_code == 200:
                return res.json()
            if res.status_code != 429:
                print(f"Failed to fetch movie ID {movie_id}. Status code: {res.status_code}")
                return None
            if attempt == max_retries:
                break

            # Retry-After is not guaranteed to be an integer number of seconds;
            # fall back to 1 second instead of failing the request.
            try:
                retry_after = max(0, int(res.headers.get("Retry-After", 1)))
            except (TypeError, ValueError):
                retry_after = 1
            print(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
            time.sleep(retry_after)

        print(f"Failed to fetch movie ID {movie_id}. Still rate limited after {max_retries} retries.")
    except Exception as e:
        # the exception text may contain the request URL, which includes the
        # API key - redact it before it ends up in the notebook output
        message = str(e)
        if api_key:
            message = message.replace(str(api_key), "***")
        print(f"Exception occurred while fetching movie ID {movie_id}: {message}")
    return None

In [7]:
# get all available features from the API
def _join_names(items, key='name'):
    """Join the `key` value of every dict in `items` with ', ' (None/empty -> '')."""
    return ', '.join(entry[key] for entry in (items or []))


def get_movie_info(data):
    """Flatten one TMDb movie payload into a single-level dict (one CSV row).

    Scalar fields are copied with `data.get`, so missing keys become None.
    List-of-dict fields (genres, production companies/countries, spoken
    languages) are collapsed to a comma-separated string of their names;
    a missing key or an explicit null yields an empty string.
    'belongs_to_collection' is reduced to the collection's name, or None
    when the movie has no collection.

    Args:
        data: dict decoded from the API response.

    Returns:
        dict with the keys listed below, in this order.
    """
    # `or {}` covers both a missing key and an explicit null value
    collection = data.get('belongs_to_collection') or {}
    return {
        'id': data.get('id'),
        'imdb_id': data.get('imdb_id'),
        'title': data.get('title'),
        'original_title': data.get('original_title'),
        'overview': data.get('overview'),
        'tagline': data.get('tagline'),
        'status': data.get('status'),
        'runtime': data.get('runtime'),
        'release_date': data.get('release_date'),
        'budget': data.get('budget'),
        'revenue': data.get('revenue'),
        'popularity': data.get('popularity'),
        'vote_average': data.get('vote_average'),
        'vote_count': data.get('vote_count'),
        'genres': _join_names(data.get('genres')),
        'production_companies': _join_names(data.get('production_companies')),
        'production_countries': _join_names(data.get('production_countries')),
        'spoken_languages': _join_names(data.get('spoken_languages'), key='english_name'),
        'homepage': data.get('homepage'),
        'adult': data.get('adult'),
        'original_language': data.get('original_language'),
        'belongs_to_collection': collection.get('name'),
        'poster_path': data.get('poster_path'),
        'backdrop_path': data.get('backdrop_path')
    }

In [8]:
def _as_int(value):
    """Coerce an API field to int; missing or unparseable values count as 0."""
    try:
        return int(value or 0)
    except (TypeError, ValueError):
        return 0


def scrape_tmdb_movies(no_bud_rev_limit=1000, max_consecutive_errors=5,
                       request_delay=0.25, output_path="tmdb_movies.csv"):
    """Query TMDb for every ID in the export file and save the results as CSV.

    Phase 1 collects every movie with a runtime > 0 and counts how many of
    them lack revenue or budget. Once that count reaches `no_bud_rev_limit`
    the scrape switches to phase 2, which only keeps movies whose runtime,
    revenue and budget are all > 0.

    Whatever has been collected is written to `output_path` when the loop
    finishes, when the user interrupts it, and also when an unexpected
    exception aborts it (the exception is re-raised after saving).

    Args:
        no_bud_rev_limit: number of movies without revenue/budget accepted
            before switching to phase 2.
        max_consecutive_errors: stop after this many failed movies in a row.
        request_delay: seconds to sleep between two API requests.
        output_path: destination of the CSV file.
    """
    # check if movie list already downloaded, if not then download now
    if not os.path.exists(EXPORT_FILE):
        download_export_file()

    movie_ids = load_movie_ids()
    collected = []

    # number of movies in a row whose request failed; the scrape stops once
    # it reaches max_consecutive_errors (the API is probably unreachable)
    consecutive_errors = 0

    # limit the amount of data with missing Revenue/Budget we collect
    no_bud_rev_count = 0
    phase = 1  # phase 1 = collect freely, phase 2 = collect only movies w/o missing revenue/budget

    try:
        for movie_id in movie_ids:
            data = query_tmdb(movie_id)
            if data:
                consecutive_errors = 0

                # revenue, budget and run-time sometimes appear as strings;
                # missing or unparseable values default to 0
                revenue = _as_int(data.get('revenue'))
                budget = _as_int(data.get('budget'))
                run_time = _as_int(data.get('runtime'))

                if phase == 1:
                    # due to many movies missing run-time info, we filter only those that do have
                    if run_time > 0:
                        collected.append(get_movie_info(data))
                        print(f"✅ (Phase 1) Collected {len(collected)} movies — Current movie ID: {movie_id}")

                        if revenue == 0 or budget == 0:
                            no_bud_rev_count += 1
                            print(f"⚠️ No revenue/budget movie count: {no_bud_rev_count}/{no_bud_rev_limit}")

                        if no_bud_rev_count >= no_bud_rev_limit:
                            phase = 2
                            print("\n🚀 Switching to Phase 2: Only collecting movies with runtime, revenue, and budget > 0\n")
                    else:
                        print(f"❌ Skipped movie ID {movie_id} due to zero runtime in Phase 1.")

                elif phase == 2:
                    if run_time > 0 and revenue > 0 and budget > 0:
                        collected.append(get_movie_info(data))
                        print(f"✅ (Phase 2) Collected {len(collected)} movies — Current movie ID: {movie_id}")
                    else:
                        print(f"❌ Skipped movie ID {movie_id} due to zero revenue/budget/runtime in Phase 2.")
            else:
                print(f"❌ Skipped movie ID {movie_id} due to error.")
                consecutive_errors += 1

            if consecutive_errors >= max_consecutive_errors:
                print(f"⛔ Stopped after {max_consecutive_errors} consecutive errors.")
                break

            # creating short time intervals to avoid API abuse (rate limiting)
            time.sleep(request_delay)

    # stops scraping
    except KeyboardInterrupt:
        print("\n⏹ Scraping interrupted by user. Saving Progress...")

    # saves the scraped data to csv file; `finally` so that an unexpected
    # exception in the loop does not throw away everything collected so far
    finally:
        df = pd.DataFrame(collected)
        df.to_csv(output_path, index=False)
        print(f"\n📄 Saved {len(df)} TMDb movies.")

# Run the scraper; the collected movies are written to "tmdb_movies.csv" (relative path).
scrape_tmdb_movies()

Downloaded export file: movie_ids_04_28_2025.json.gz
✅ (Phase 1) Collected 1 movies — Current movie ID: 3924
⚠️ No revenue/budget movie count: 1/1000
✅ (Phase 1) Collected 2 movies — Current movie ID: 6124
⚠️ No revenue/budget movie count: 2/1000
✅ (Phase 1) Collected 3 movies — Current movie ID: 8773
⚠️ No revenue/budget movie count: 3/1000
✅ (Phase 1) Collected 4 movies — Current movie ID: 25449
⚠️ No revenue/budget movie count: 4/1000
✅ (Phase 1) Collected 5 movies — Current movie ID: 31975
⚠️ No revenue/budget movie count: 5/1000
✅ (Phase 1) Collected 6 movies — Current movie ID: 2
⚠️ No revenue/budget movie count: 6/1000
✅ (Phase 1) Collected 7 movies — Current movie ID: 3
⚠️ No revenue/budget movie count: 7/1000
✅ (Phase 1) Collected 8 movies — Current movie ID: 5
✅ (Phase 1) Collected 9 movies — Current movie ID: 6
✅ (Phase 1) Collected 10 movies — Current movie ID: 8
⚠️ No revenue/budget movie count: 8/1000