# 01 Movie Data Collector from TMDb API

This script collects movie data from The Movie Database (TMDb) API for various languages and genres released between 2015 and 2023. It uses concurrent requests to efficiently fetch movie details such as title, release year, language, genre, overview, vote average, vote count, and popularity. The collected data is then saved into a CSV file for further analysis.

In [4]:
# Import necessary libraries
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor

In [5]:
# Load environment variables from .env file
load_dotenv()

# TMDb API Key (loaded from environment variable)
api_key = os.getenv('TMDB_API_KEY')
if not api_key:
    # Fail fast: a None value is dropped from the query string, so every
    # request would be sent without credentials and rejected by the API.
    raise RuntimeError(
        "TMDB_API_KEY is not set; define it in the environment or in a .env file."
    )

# Base URL for the API requests
base_url = 'https://api.themoviedb.org/3/discover/movie'

# Shared query parameters for every discover request.
# 'page', 'with_original_language' and 'with_genres' are placeholders that
# the fetch functions below fill in for each query.
params = {
    'api_key': api_key,
    'language': 'en-US',
    'sort_by': 'popularity.desc',
    'include_adult': 'false',
    'include_video': 'false',
    'page': 1,
    'primary_release_date.gte': '2015-01-01',
    'primary_release_date.lte': '2023-12-31',
    'with_original_language': None,  # Language will be set later
    'with_genres': None  # Genre will be set later
}

# Function to fetch movies from a specific page
def get_movies(page):
    """Fetch one page of TMDb discover results using the shared query filters.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The list stored under ``results`` in the JSON response, or an empty
        list when the request fails or the status code is not 200.
    """
    # Build a per-call copy instead of writing ``page`` into the shared
    # ``params`` dict. This function runs on several worker threads at once,
    # and mutating the shared dict lets one thread overwrite another thread's
    # page number before its request is sent.
    request_params = {**params, 'page': page}

    try:
        # Without a timeout a stalled connection would block a worker forever.
        response = requests.get(base_url, params=request_params, timeout=30)
    except requests.RequestException as exc:
        # Treat network failures like a bad status: report and skip the page.
        print(f"Error fetching page {page}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error fetching page {page}: {response.status_code}")
        return []

    return response.json().get('results', [])

# Function to fetch movies concurrently for a given language and genre
def fetch_movies_concurrent(lang_code, lang_name, genre_name, genre_id, total_pages):
    """Fetch pages 1..total_pages for one language/genre pair and flatten them.

    Args:
        lang_code: Value stored in the shared ``with_original_language`` filter.
        lang_name: Label written to the ``language`` field of every row.
        genre_name: Label written to the ``genre`` field of every row.
        genre_id: Value stored in the shared ``with_genres`` filter.
        total_pages: Number of result pages to request, starting at page 1.

    Returns:
        A list of dicts with the keys ``title``, ``release_year``,
        ``language``, ``genre``, ``overview``, ``vote_average``,
        ``vote_count`` and ``popularity``, in page order.
    """
    # get_movies reads its filters from the shared params dict, so they must
    # be set before any worker thread starts.
    params['with_original_language'] = lang_code
    params['with_genres'] = genre_id

    def to_row(movie):
        # Convert one API result into the flat record stored in the CSV.
        return {
            'title': movie.get('title'),
            # ``or ''`` also covers a key that is present but null, which
            # would otherwise make the slice raise TypeError.
            'release_year': (movie.get('release_date') or '')[:4],
            'language': lang_name,
            'genre': genre_name,
            'overview': movie.get('overview', ''),
            'vote_average': movie.get('vote_average', 0),
            'vote_count': movie.get('vote_count', 0),
            'popularity': movie.get('popularity', 0),
        }

    with ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map yields results in page order regardless of which
        # request finishes first.
        pages = list(executor.map(get_movies, range(1, total_pages + 1)))

    movies = []
    for movies_on_page in pages:
        movies.extend(to_row(movie) for movie in movies_on_page)
    return movies

# Movie data storage
movies = []

# Number of pages to be fetched for each language/genre combination
total_pages = 50

# Extended language list: original-language code -> German label written
# to the 'language' column
languages = {
    'hi': 'Indisch',
    'en': 'Amerikanisch',
    'de': 'Deutsch',
    'fr': 'Französisch',
    'es': 'Spanisch',
    'it': 'Italienisch'
}

# Extended genre list: German label written to the 'genre' column -> genre id
# passed to the 'with_genres' filter
genres = {
    'Humor': 35,      # Comedy
    'Romanze': 10749,  # Romance
    'Action': 28,      # Action
    'Drama': 18,       # Drama
    'Horror': 27       # Horror
}

# Collect movies for each language and genre
for lang_code, lang_name in languages.items():
    for genre_name, genre_id in genres.items():
        print(f"Processing movies for Language: {lang_name}, Genre: {genre_name}")
        movies.extend(fetch_movies_concurrent(lang_code, lang_name, genre_name, genre_id, total_pages))

# Save to CSV
df = pd.DataFrame(movies)
output_path = 'data/movies_2015_2023_genres.csv'
# Create the target directory if needed so the results of the whole run are
# not lost to a missing 'data/' folder.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Total number of movies: {len(df)}")

Processing movies for Language: Indisch, Genre: Humor
Processing movies for Language: Indisch, Genre: Romanze
Processing movies for Language: Indisch, Genre: Action
Processing movies for Language: Indisch, Genre: Drama
Processing movies for Language: Indisch, Genre: Horror
Processing movies for Language: Amerikanisch, Genre: Humor
Processing movies for Language: Amerikanisch, Genre: Romanze
Processing movies for Language: Amerikanisch, Genre: Action
Processing movies for Language: Amerikanisch, Genre: Drama
Processing movies for Language: Amerikanisch, Genre: Horror
Processing movies for Language: Deutsch, Genre: Humor
Processing movies for Language: Deutsch, Genre: Romanze
Processing movies for Language: Deutsch, Genre: Action
Processing movies for Language: Deutsch, Genre: Drama
Processing movies for Language: Deutsch, Genre: Horror
Processing movies for Language: Französisch, Genre: Humor
Processing movies for Language: Französisch, Genre: Romanze
Processing movies for Language: Fra

#### In the end, we collected a total of 19,610 movies released between 2015 and 2023, filtered by language and genre, and saved them to a CSV file.