In [7]:
import pandas as pd
import time
import csv
import requests
import os
import requests

Functions for getting data from the TMDB API

In [11]:
def fetch_movie_data(tmdb_id, i, TMDB_API_KEY, timeout=30):
    '''
    Fetches movie data (including credits) from the TMDB API given a tmdb_id.

    Parameters
    ----------
    tmdb_id : TMDB identifier of the movie, placed in the request path.
    i : running index of the request, used only for avoiding the rate limit.
    TMDB_API_KEY : API key sent as the `api_key` query parameter.
    timeout : seconds to wait for the server before requests raises a
        timeout error, so a stalled connection cannot hang the crawl forever.

    Returns
    -------
    The decoded JSON body of the response, whatever the HTTP status was.
    '''

    # rate limit: 40 requests every 10 seconds.
    # i == 0 is the very first request, so there is nothing to wait for yet.
    if i > 0 and i % 39 == 0:
        time.sleep(10)

    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
    # Let requests build and escape the query string instead of formatting it by hand.
    params = {
        "api_key": TMDB_API_KEY,
        "language": "en-US",
        "append_to_response": "credits",
    }
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

def write_to_csv(row, filename="ml-latest/overview.csv"):
    '''
    Adds one record to the end of a csv file.

    The file is opened in append mode, so it is created if it does not exist
    and earlier rows are left untouched.
    '''
    handle = open(filename, 'a', newline='', encoding='utf-8')
    with handle:
        csv.writer(handle).writerow(row)

def get_last_row_movie(filename="ml-latest/overview.csv"):
    '''
    Returns the movieId (first column) of the last record in the csv file.

    Returns 0 when the file holds no data rows yet, i.e. it is empty or
    contains only the header line. That is the state left behind when a run
    is interrupted before the first movie is written.

    Raises ValueError if the last record of a file with data rows does not
    start with an integer.
    '''
    last_record = None
    record_count = 0

    # newline='' lets the csv module handle line endings itself, so a quoted
    # field containing a stray '\r' or '\n' is read as part of one record.
    with open(filename, 'r', newline='', encoding='utf-8') as f:
        # Stream the file; only the most recent non-empty record is kept.
        for record in csv.reader(f):
            if record:
                last_record = record
                record_count += 1

    if last_record is None:
        return 0

    try:
        return int(last_record[0])
    except ValueError:
        # A single non-numeric record is the header: nothing processed yet.
        if record_count == 1:
            return 0
        raise
    
# TMDB status_code reported when the requested movie does not exist
# (NOTE(review): value taken from the TMDB error-code list - confirm).
_TMDB_STATUS_NOT_FOUND = 34


def _build_overview_row(movieId, movie):
    '''
    Flattens one TMDB movie payload into a row matching the overview.csv header:
    [movieId, overview, cast_ids, cast_names, director, director_id].
    '''
    # `or ""` also covers an explicit null overview in the payload.
    overview = (movie.get("overview") or "").replace("\n", " ").strip()
    # Commas and apostrophes are dropped so new rows look like the rows
    # already stored in the file (csv quoting would handle them anyway).
    overview = overview.replace(",", "").replace("'", "")

    # get ready for cast and crew
    credits = movie.get("credits") or {}
    cast = credits.get("cast") or []
    crew = credits.get("crew") or []

    # Director: first crew member whose job is exactly "Director"
    director_entry = next((member for member in crew if member.get("job") == "Director"), None)
    director = director_entry.get("name") if director_entry else None
    director_id = director_entry.get("id") if director_entry else None

    # Top 10 cast, names and ids pipe-separated in the same order
    top_cast = cast[:10]
    cast_names = "|".join(c.get("name") or "" for c in top_cast)
    cast_ids = "|".join(str(c.get("id", "")) for c in top_cast)

    return [movieId, overview, cast_ids, cast_names, director, director_id]


def process_movie_data(data, TMDB_API_KEY, file_path="ml-latest/overview.csv"):
    '''
    Fetches TMDB details for every movie in `data` and appends them to a csv file.

    `data` is iterated with itertuples and must have exactly three columns,
    in the order (movieId, imdbId, tmdbId).

    If the csv at `file_path` already exists, movies whose movieId is not
    greater than the last movieId in that file are skipped, so an interrupted
    run can be resumed without re-pulling data. Otherwise the file is created
    with a header row.

    Rows with a missing tmdbId are skipped. A movie that TMDB reports as
    not found is written with empty fields. Any other error payload from
    the API (for example an invalid API key) raises RuntimeError instead of
    being stored as a blank row that a later resume would skip for good.
    '''

    # checking if overview.csv exists
    if os.path.exists(file_path):
        last_movie_id = get_last_row_movie(file_path)
        print(f"Resuming from movieId: {last_movie_id}")
    else:
        last_movie_id = 0
        # write header
        with open(file_path, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(["movieId", "overview", "cast_ids", "cast_names", "director", "director_id"])
        print("Creating new overview.csv file")

    for i, (movieId, imdbId, tmdbId) in enumerate(data.itertuples(index=False)):

        # skip already processed movies
        if movieId <= last_movie_id:
            continue

        # check for nan tmdbId
        if pd.isna(tmdbId):
            continue

        tmdb_id = int(tmdbId)

        # get movie data (own name, so the `data` argument is not shadowed)
        movie = fetch_movie_data(tmdb_id, i, TMDB_API_KEY)

        # An explicit success=false that is not "not found" means the request
        # itself failed; stop so the movie is retried on the next run.
        if movie.get("success") is False and movie.get("status_code") != _TMDB_STATUS_NOT_FOUND:
            raise RuntimeError(
                f"TMDB request for tmdbId {tmdb_id} (movieId {movieId}) failed: "
                f"{movie.get('status_message')} (status_code {movie.get('status_code')})"
            )

        # write to csv
        write_to_csv(_build_overview_row(movieId, movie), file_path)
        

# Get the extra data

IMPORTANT: A TMDB API key is required. It is free for non-commercial applications.


We have included the data we pulled, so you don't need to run this part; the data is already present.

In [None]:
# Read the key from the TMDB_API_KEY environment variable so it never has to
# be saved inside the notebook; falls back to "" when the variable is unset.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "")

# links.csv supplies the (movieId, imdbId, tmdbId) columns consumed by process_movie_data.
movies_df = pd.read_csv("ml-latest/links.csv")
process_movie_data(movies_df, TMDB_API_KEY=TMDB_API_KEY, file_path="ml-latest/overview.csv")

Resuming from movieId: 288983
