In [52]:
import os
import time

import pandas as pd
import requests

In [53]:
# API KEY and URL
# The key is read from the TMDB_API_KEY environment variable when it is set, so the
# credential does not have to be stored in the notebook. The literal is kept only as a
# fallback so existing runs keep working; rotate it and drop it before sharing the file.
api_key = os.environ.get('TMDB_API_KEY') or '75efdc5dabc6d61d0c10ee525b7e62b7'
# Root of the TMDB v3 REST API; endpoint paths are appended to it.
base_url = 'https://api.themoviedb.org/3'

In [54]:
# This function gets full movie details from TMDB API using the movie ID
def get_movie_details(movie_id):
    """Fetch the full TMDB record for a single movie.

    Requests ``/movie/{movie_id}`` and asks for the ``videos`` and
    ``watch/providers`` sub-resources to be appended to the same response.

    Args:
        movie_id: TMDB movie identifier, interpolated into the request URL.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` for any other status code, for a network error or timeout,
        or when the body cannot be decoded as JSON.
    """
    url = f'{base_url}/movie/{movie_id}?api_key={api_key}&language=en-US&append_to_response=videos,watch/providers'
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.json()  # the request was successful
    except (requests.RequestException, ValueError):
        # Best-effort lookup: connection problems, timeouts and undecodable bodies
        # are reported as "no details". Anything else (including KeyboardInterrupt)
        # is deliberately allowed to propagate instead of being swallowed.
        return None
    return None

In [55]:
# Target number of movie records to gather before collection stops
max_movies = 10

# Shared accumulators used by the collection helpers:
# the collected movie rows and the TMDB IDs already handled
movies_list, movie_ids = [], set()

# Listing page that will be requested next
page = 1

In [56]:
def _joined_names(list_key):
    """Build an extractor that joins the ``name`` of every item stored under ``list_key``."""
    def extract(details):
        # `or []` also covers a key that is present but holds None
        return ', '.join(item['name'] for item in details.get(list_key) or [])
    return extract


def _trailer_url(details):
    """Return a YouTube watch URL for the movie's trailer, or None when there is none.

    Only videos hosted on YouTube that carry a key are considered, because the
    resulting link is a YouTube URL. A video of type ``Trailer`` is preferred;
    otherwise the first YouTube video is used.
    """
    videos = (details.get('videos') or {}).get('results') or []
    youtube_videos = [v for v in videos if v.get('site') == 'YouTube' and v.get('key')]
    if not youtube_videos:
        return None
    chosen = next((v for v in youtube_videos if v.get('type') == 'Trailer'), youtube_videos[0])
    return f"https://www.youtube.com/watch?v={chosen['key']}"


def _watch_provider_regions(details):
    """Join the keys of the ``watch/providers`` -> ``results`` mapping."""
    results = (details.get('watch/providers') or {}).get('results') or {}
    return ', '.join(results.keys())


def _collection_name(details):
    """Return the ``name`` of the collection the movie belongs to, or None."""
    return (details.get('belongs_to_collection') or {}).get('name')


# Output column -> how to obtain it from a movie-details dict.
# A string value is a key looked up directly in the details; a callable value is
# called with the whole details dict and returns the cell value.
movie_fields = {
    'title': 'title',  # Movie name
    'overview': 'overview',  # Plot summary
    'release_date': 'release_date',  # Date the movie was released
    'runtime': 'runtime',  # Length of the movie in minutes
    'genres': _joined_names('genres'),  # Genres (e.g. Action, Drama)
    'production_companies': _joined_names('production_companies'),  # Studios that produced the movie
    'popularity': 'popularity',  # Popularity score on TMDB
    'vote_average': 'vote_average',  # Average rating from users
    'vote_count': 'vote_count',  # Number of users who rated
    'budget': 'budget',  # Movie production budget (in USD)
    'revenue': 'revenue',  # Total box office revenue (in USD)
    'poster_path': 'poster_path',  # Path to the movie poster image
    'spoken_languages': _joined_names('spoken_languages'),  # Languages spoken in the movie
    'trailer': _trailer_url,  # Link to the trailer on YouTube
    # Keys of the provider `results` mapping (TMDB keys it by country code - confirm
    # against the API docs); note these are NOT provider/platform names
    'watch_providers': _watch_provider_regions,
    'homepage': 'homepage',  # Official movie website
    'tagline': 'tagline',  # Short marketing slogan or phrase
    'belongs_to_collection': _collection_name,  # Name of the movie series (if part of one)
    'adult': 'adult',  # True if it's adult content (18+)
    'imdb_id': 'imdb_id',  # IMDb ID
    'production_countries': _joined_names('production_countries'),  # Countries where the movie was produced
    'original_language': 'original_language',  # Original language of the movie
    'original_title': 'original_title',  # Original title (before translation)
    'status': 'status'  # Production status (e.g. Released, Canceled)
}


| Field Name             | Description                                     |
|------------------------|-------------------------------------------------|
| title                  | Movie name                                      |
| overview               | Plot summary of the movie                       |
| release_date           | Official release date                           |
| runtime                | Duration in minutes                             |
| genres                 | Movie genres (e.g. Action, Comedy)              |
| production_companies   | Studios that produced the movie                 |
| popularity             | Popularity score calculated by TMDB            |
| vote_average           | Average rating by users                         |
| vote_count             | Number of user ratings                          |
| budget                 | Production budget (USD)                         |
| revenue                | Total box office revenue (USD)                  |
| poster_path            | Path to poster image (for display)              |
| spoken_languages       | Languages spoken in the movie                   |
| trailer                | Link to YouTube trailer                         |
| watch_providers        | Country codes with watch-provider listings      |
| homepage               | Official website of the movie                   |
| tagline                | Marketing slogan or phrase                      |
| belongs_to_collection  | Movie series name (if applicable)               |
| adult                  | True if movie is adult content (18+)            |
| imdb_id                | Unique movie ID from IMDb                       |
| production_countries   | Countries where the movie was produced          |
| original_language      | Original language of the movie                  |
| original_title         | Original title before translation               |
| status                 | Production status (e.g. Released, Canceled)     |


In [57]:
# collects list of movies from multiple TMDB categories
def collect_movies_from_categories(max_empty_rounds=3):
    """Fill ``movies_list`` from the TMDB listing endpoints until ``max_movies`` is reached.

    Each round requests the current ``page`` of every category listing and hands
    every returned movie ID to ``add_movie_by_id``. After a round the global
    ``page`` counter is advanced, so a later call continues from where the
    previous one stopped.

    Args:
        max_empty_rounds: Number of consecutive rounds in which no category
            returned any movie after which collection gives up. This keeps the
            loop from running forever when the API keeps failing or the
            listings are exhausted.

    Returns:
        None. Results are accumulated in the global ``movies_list``.
    """
    global page  # Allow change of the global page variable
    categories = ['popular', 'top_rated', 'now_playing', 'upcoming']
    empty_rounds = 0  # consecutive rounds that produced no listing results

    # Keep collecting until reaching the target number of movies
    while len(movies_list) < max_movies:
        got_results = False

        for category in categories:
            # Build the API URL for the current category and page
            url = f'{base_url}/movie/{category}?api_key={api_key}&language=en-US&page={page}'
            try:
                response = requests.get(url, timeout=10)  # GET request to fetch the movie list
                if response.status_code != 200:
                    continue  # If the request failed
                # Decoding stays inside the try so a malformed body skips the category
                movies = response.json().get('results', [])
            except (requests.RequestException, ValueError):
                # Network failure, timeout or undecodable body: skip this category
                continue

            if movies:
                got_results = True

            # For each movie in the list, fetch and save full details using its ID
            for movie in movies:
                add_movie_by_id(movie['id'])

                if len(movies_list) >= max_movies:  # stop processing this page
                    break

            if len(movies_list) >= max_movies:  # stop processing this category
                break

        page += 1

        # Give up after several rounds without any listing data instead of looping forever
        empty_rounds = 0 if got_results else empty_rounds + 1
        if empty_rounds >= max_empty_rounds and len(movies_list) < max_movies:
            print(f'\nStopped early: no results in {empty_rounds} consecutive rounds '
                  f'({len(movies_list)} of {max_movies} movies collected)')
            break

        time.sleep(0.5)  # short pause between rounds of requests

In [None]:
#receives a movie ID and collect full details
def add_movie_by_id(movie_id):
    """Fetch one movie's details and append its extracted fields to ``movies_list``.

    Args:
        movie_id: TMDB movie identifier, passed to ``get_movie_details``.

    Returns:
        None. On success a dict with one entry per key of ``movie_fields`` is
        appended to the global ``movies_list`` and the ID is recorded in the
        global ``movie_ids``. Nothing is stored when the ID was already
        collected or when no details could be fetched.
    """
    if movie_id in movie_ids:  # If the movie was already collected – skip to avoid duplicates
        return

    details = get_movie_details(movie_id)
    if not details:
        # The ID is intentionally NOT recorded here: a failed lookup must not
        # permanently exclude the movie if it shows up again in another listing.
        return

    movie_ids.add(movie_id)  # Record the ID only once its details were actually obtained

    # A callable extractor (like a lambda) is applied to the details;
    # a string extractor is used as a key to read the value directly.
    movie_data = {
        key: extractor(details) if callable(extractor) else details.get(extractor)
        for key, extractor in movie_fields.items()
    }

    movies_list.append(movie_data)
    print(f'Collected: {len(movies_list)} movies', end='\r') # Show progress of collected movies


In [None]:
# Start the process of collecting movies from all categories and pages.
# This performs the network requests and fills the global movies_list.
collect_movies_from_categories()

Collected: 10 movies

In [None]:
# Build a pandas DataFrame with one row per collected movie
df = pd.DataFrame(data=movies_list)

# Persist the table as CSV, leaving the row index out of the file
df.to_csv(path_or_buf='movies_data.csv', index=False)

print('Done!🎉 \nFile name: movies_data.csv')


Done!🎉 
File name: movies_data.csv


In [61]:
# Preview the first rows, transposed so that each field is shown as a row
df.head().T

Unnamed: 0,0,1,2,3,4
title,In the Lost Lands,A Minecraft Movie,G20,Novocaine,Gunslingers
overview,A queen sends the powerful and feared sorceres...,Four misfits find themselves struggling with o...,After the G20 Summit is overtaken by terrorist...,"When the girl of his dreams is kidnapped, ever...",When the most wanted man in America surfaces i...
release_date,2025-02-27,2025-03-31,2025-04-09,2025-03-12,2025-04-11
runtime,102,101,110,110,104
genres,"Fantasy, Adventure, Action","Family, Comedy, Adventure, Fantasy","Action, Mystery, Drama","Action, Comedy, Thriller","Western, Action"
production_companies,"Constantin Film, Spark Productions","Warner Bros. Pictures, Legendary Pictures, Moj...","JuVee Productions, Mad Chance, MRC, Amazon MGM...","Circle of Confusion, Safehouse Pictures, Param...","5150 Action, Skibavision, ZTA Entertainment, Z..."
popularity,727.9871,673.4901,639.4622,488.1797,448.398
vote_average,6.374,6.1,6.537,6.861,6.765
vote_count,178,555,217,427,17
budget,55000000,150000000,0,18000000,0


In [62]:
# Sanity check: number of distinct movie titles in the collected data
df['title'].nunique()


10

In [63]:

# Sanity check: number of distinct IMDb IDs in the collected data
df['imdb_id'].nunique()

10

In [64]:
# List titles that occur more than once; an empty Series means no duplicate titles
df['title'].value_counts().loc[lambda x: x > 1]



Series([], Name: count, dtype: int64)