# TMDB MOVIES EXTRACTION 

In [1]:
# Locate the local Spark installation before any pyspark import is attempted.
# NOTE(review): findspark.init() is called with no arguments, so it presumably
# relies on SPARK_HOME or its default search locations - confirm for this host.
import findspark
findspark.init()

In [2]:
# Import the libraries for the extraction
import os
import requests
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import *



In [3]:
# Obtain the Spark session for this notebook (an already-running session is
# reused by getOrCreate instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("TMDbMovieFetcher")
    .getOrCreate()
)

In [None]:
def extract_movies():
    
    # Start timing
    start_time = time.time()

    # TMDb API Setup
    API_KEY = ""
    if not API_KEY:
        raise ValueError("TMDB_API_KEY environment variable is not set")

    BASE_URL = "https://api.themoviedb.org/3/movie/{}?api_key={}"
    CREDITS_URL = "https://api.themoviedb.org/3/movie/{}/credits?api_key={}"

    # Movie IDs
    movie_ids = [
        299534, 19995, 140607, 299536, 597, 135397, 420818, 24428, 168259,
        99861, 284054, 12445, 181808, 330457, 351286, 109445, 321612, 260513
    ]

    movie_rows = []

    for mid in movie_ids:
        try:
            print(f"Fetching data for movie ID: {mid}")
            movie_response = requests.get(BASE_URL.format(mid, API_KEY))
            credits_response = requests.get(CREDITS_URL.format(mid, API_KEY))

            if movie_response.status_code != 200 or credits_response.status_code != 200:
                print(f"Failed to fetch data for movie ID {mid}")
                continue

            movie = movie_response.json()
            credits = credits_response.json()

            movie_rows.append((
                movie.get("id"),
                movie.get("title"),
                movie.get("original_language"),
                movie.get("release_date"),
                movie.get("runtime"),
                movie.get("popularity"),
                movie.get("vote_average"),
                movie.get("vote_count"),
                movie.get("overview"),
                movie.get("genres"),
                movie.get("production_companies"),
                movie.get("belongs_to_collection"),
                movie.get("production_countries"),
                movie.get("spoken_languages"),
                credits.get("cast"),
                credits.get("crew"),
                movie.get("tagline"),
                movie.get("poster_path"),
                movie.get("revenue"),
                movie.get("budget")
            ))

            time.sleep(0.25)  # Respect API rate limits

        except Exception as e:
            print(f"Error processing movie ID {mid}: {e}")

    # Define schema with structured types
    schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("title", StringType(), True),
        StructField("original_language", StringType(), True),
        StructField("release_date", StringType(), True),
        StructField("runtime", IntegerType(), True),
        StructField("popularity", DoubleType(), True),
        StructField("vote_average", DoubleType(), True),
        StructField("vote_count", IntegerType(), True),
        StructField("overview", StringType(), True),
        StructField("genres", ArrayType(
            StructType([
                StructField("id", IntegerType(), True),
                StructField("name", StringType(), True)
            ])
        ), True),
        StructField("production_companies", ArrayType(
            StructType([
                StructField("id", IntegerType(), True),
                StructField("name", StringType(), True),
                StructField("origin_country", StringType(), True)
            ])
        ), True),
        StructField("belongs_to_collection", StructType([
            StructField("id", IntegerType(), True),
            StructField("name", StringType(), True),
            StructField("poster_path", StringType(), True),
            StructField("backdrop_path", StringType(), True)
        ]), True),
        StructField("production_countries", ArrayType(
            StructType([
                StructField("iso_3166_1", StringType(), True),
                StructField("name", StringType(), True)
            ])
        ), True),
        StructField("spoken_languages", ArrayType(
            StructType([
                StructField("iso_639_1", StringType(), True),
                StructField("name", StringType(), True)
            ])
        ), True),
        StructField("cast", ArrayType(
            StructType([
                StructField("cast_id", IntegerType(), True),
                StructField("character", StringType(), True),
                StructField("credit_id", StringType(), True),
                StructField("gender", IntegerType(), True),
                StructField("id", IntegerType(), True),
                StructField("name", StringType(), True),
                StructField("order", IntegerType(), True),
                StructField("profile_path", StringType(), True)
            ])
        ), True),
        StructField("crew", ArrayType(
            StructType([
                StructField("credit_id", StringType(), True),
                StructField("department", StringType(), True),
                StructField("gender", IntegerType(), True),
                StructField("id", IntegerType(), True),
                StructField("job", StringType(), True),
                StructField("name", StringType(), True),
                StructField("profile_path", StringType(), True)
            ])
        ), True),
        StructField("tagline", StringType(), True),
        StructField("poster_path", StringType(), True),
        StructField("revenue", LongType(), True),
        StructField("budget", LongType(), True)
    ])

    # Create PySpark DataFrame
    movies_df = spark.createDataFrame(movie_rows, schema=schema)

    end_time = time.time()
    print(f"Script completed in {round(end_time - start_time, 2)} seconds")

    return movies_df


In [10]:
# Run the extraction; the result is the Spark DataFrame returned by
# extract_movies().
movies_df = extract_movies()

Fetching data for movie ID: 299534
Fetching data for movie ID: 19995
Fetching data for movie ID: 140607
Fetching data for movie ID: 299536
Fetching data for movie ID: 597
Fetching data for movie ID: 135397
Fetching data for movie ID: 420818
Fetching data for movie ID: 24428
Fetching data for movie ID: 168259
Fetching data for movie ID: 99861
Fetching data for movie ID: 284054
Fetching data for movie ID: 12445
Fetching data for movie ID: 181808
Fetching data for movie ID: 330457
Fetching data for movie ID: 351286
Fetching data for movie ID: 109445
Fetching data for movie ID: 321612
Fetching data for movie ID: 260513
Script completed in 47.92 seconds


In [11]:
# Preview the first five rows, truncating long cell values
movies_df.show(n=5, truncate=True)

+------+--------------------+-----------------+------------+-------+----------+------------+----------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+---------+
|    id|               title|original_language|release_date|runtime|popularity|vote_average|vote_count|            overview|              genres|production_companies|belongs_to_collection|production_countries|    spoken_languages|                cast|                crew|             tagline|         poster_path|   revenue|   budget|
+------+--------------------+-----------------+------------+-------+----------+------------+----------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+---

In [12]:
# Persist the extracted movies as Parquet, overwriting any earlier output
(
    movies_df.write
    .mode("overwrite")
    .parquet("../data/extracted_movies_df.parquet")
)