# TMDB MOVIES EXTRACTION 

In [None]:
# Locate the local Spark installation.
# NOTE(review): findspark.init() presumably makes the pyspark package
# importable for this kernel — confirm against the findspark docs. The
# pyspark imports live in the next cell, so this cell is expected to run first.
import findspark
findspark.init()

In [3]:
# Import the libraries for the extraction
import os
import requests
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType


In [4]:
# Create the Spark session for this notebook, or reuse one that is already
# running under the same builder.
spark = (
    SparkSession.builder
    .appName("TMDbMovieFetcher")
    .getOrCreate()
)

In [5]:
# Helpers and entry point for extracting movie data from the TMDb API
def _coerce(value, cast):
    """Return ``cast(value)``, letting a missing value (``None``) pass through."""
    return None if value is None else cast(value)


def extract_movies(movie_ids=None, api_key=None, timeout=10):
    """Fetch movie details and credits from TMDb into a Spark DataFrame.

    For every movie ID two endpoints are queried (movie details and credits).
    IDs whose requests fail, or whose processing raises, are reported on
    stdout and skipped, so one bad ID never aborts the batch.

    Args:
        movie_ids: Iterable of TMDb movie IDs. Defaults to the notebook's
            original list of IDs.
        api_key: TMDb API key. When omitted, the ``TMDB_API_KEY`` environment
            variable is used, so the secret is never stored in the notebook.
        timeout: Per-request timeout in seconds passed to ``requests``; without
            it a stalled connection would block the run indefinitely.

    Returns:
        A DataFrame built with the module-level ``spark`` session, one row per
        successfully fetched movie, with columns ``id``, ``title``,
        ``original_language``, ``release_date``, ``runtime``, ``popularity``,
        ``vote_average``, ``vote_count``, ``overview``, ``genres``,
        ``production_companies``, ``cast`` and ``crew``. The last four columns
        hold the Python ``str()`` form of the nested API values.

    Raises:
        RuntimeError: If no API key is passed and ``TMDB_API_KEY`` is unset.
    """
    # Start timing
    start_time = time.time()

    # TMDb API setup: the key is a credential, so it comes from the caller or
    # the environment rather than from source code.
    if api_key is None:
        api_key = os.environ.get("TMDB_API_KEY")
    if not api_key:
        raise RuntimeError(
            "TMDb API key not provided: pass api_key=... or set the "
            "TMDB_API_KEY environment variable"
        )
    base_url = "https://api.themoviedb.org/3/movie/{}"
    credits_url = "https://api.themoviedb.org/3/movie/{}/credits"
    auth_params = {"api_key": api_key}

    # Movie IDs
    if movie_ids is None:
        # NOTE(review): ID 0 fails in the recorded run of this notebook;
        # presumably it is kept to exercise the failure path — confirm.
        movie_ids = [
            0, 299534, 19995, 140607, 299536, 597, 135397, 420818, 24428, 168259,
            99861, 284054, 12445, 181808, 330457, 351286, 109445, 321612, 260513
        ]

    # Storage for rows
    movie_rows = []

    # Fetch loop. A single Session reuses the HTTPS connection across calls
    # instead of opening a new one for every request.
    with requests.Session() as http:
        for mid in movie_ids:
            try:
                print(f"Fetching data for movie ID: {mid}")
                movie_response = http.get(
                    base_url.format(mid), params=auth_params, timeout=timeout
                )
                # Skip the credits call when the movie lookup already failed.
                credits_response = (
                    http.get(
                        credits_url.format(mid), params=auth_params, timeout=timeout
                    )
                    if movie_response.status_code == 200
                    else None
                )

                if credits_response is None or credits_response.status_code != 200:
                    print(f"Failed to fetch movie or credits for ID {mid}")
                    continue

                movie = movie_response.json()
                credits_data = credits_response.json()

                # Numeric fields are coerced to the Python type matching the
                # schema below: a whole-number JSON value parses as int, which
                # Spark's schema verification rejects for a DoubleType column.
                movie_rows.append((
                    _coerce(movie.get("id"), int),
                    movie.get("title"),
                    movie.get("original_language"),
                    movie.get("release_date"),
                    _coerce(movie.get("runtime"), int),
                    _coerce(movie.get("popularity"), float),
                    _coerce(movie.get("vote_average"), float),
                    _coerce(movie.get("vote_count"), int),
                    movie.get("overview"),
                    # Nested values are stored as their Python repr (not JSON),
                    # unchanged from the original output format.
                    str(movie.get("genres")),
                    str(movie.get("production_companies")),
                    str(credits_data.get("cast")),
                    str(credits_data.get("crew"))
                ))

            except Exception as e:
                # Deliberately best-effort per movie. The message may embed the
                # request URL (query string included), so mask the key before
                # it reaches the notebook output.
                print(f"Error processing movie ID {mid}: {str(e).replace(api_key, '***')}")

    # Define schema
    schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("title", StringType(), True),
        StructField("original_language", StringType(), True),
        StructField("release_date", StringType(), True),
        StructField("runtime", IntegerType(), True),
        StructField("popularity", DoubleType(), True),
        StructField("vote_average", DoubleType(), True),
        StructField("vote_count", IntegerType(), True),
        StructField("overview", StringType(), True),
        StructField("genres", StringType(), True),
        StructField("production_companies", StringType(), True),
        StructField("cast", StringType(), True),
        StructField("crew", StringType(), True)
    ])

    # Create PySpark DataFrame (an explicit schema also covers the empty case)
    movies_df = spark.createDataFrame(movie_rows, schema=schema)

    # Timing
    end_time = time.time()
    print(f"Script completed in {round(end_time - start_time, 2)} seconds")

    return movies_df


In [6]:
# Fetch movies: run the extraction and keep the returned Spark DataFrame in
# `movies_df`, which the display and Parquet-export cells below read.
movies_df = extract_movies()



Fetching data for movie ID: 0
Failed to fetch movie or credits for ID 0
Fetching data for movie ID: 299534
Fetching data for movie ID: 19995
Fetching data for movie ID: 140607
Fetching data for movie ID: 299536
Fetching data for movie ID: 597
Fetching data for movie ID: 135397
Fetching data for movie ID: 420818
Fetching data for movie ID: 24428
Fetching data for movie ID: 168259
Fetching data for movie ID: 99861
Fetching data for movie ID: 284054
Fetching data for movie ID: 12445
Fetching data for movie ID: 181808
Fetching data for movie ID: 330457
Fetching data for movie ID: 351286
Fetching data for movie ID: 109445
Fetching data for movie ID: 321612
Fetching data for movie ID: 260513
Script completed in 99.04 seconds


In [7]:
# Preview the first five rows, with long cell values truncated for display
movies_df.show(n=5, truncate=True)

+------+--------------------+-----------------+------------+-------+----------+------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    id|               title|original_language|release_date|runtime|popularity|vote_average|vote_count|            overview|              genres|production_companies|                cast|                crew|
+------+--------------------+-----------------+------------+-------+----------+------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|299534|   Avengers: Endgame|               en|  2019-04-24|    181|   53.8808|       8.237|     26250|After the devasta...|[{'id': 12, 'name...|[{'id': 420, 'log...|[{'adult': False,...|[{'adult': False,...|
| 19995|              Avatar|               en|  2009-12-15|    162|   29.1285|       7.588|     32162|In the 22nd centu...|[{'id': 28, 'name...|[{'id': 444, 'log..

In [None]:
# Persist the extracted movies as Parquet, replacing any previous output
(
    movies_df.write
    .mode("overwrite")
    .parquet("../data/extracted_movies_df.parquet")
)