In [40]:
import os
import sys
# Append the absolute path of the parent directory to the module search path.
# Presumably this is what lets the project-local `Functions` package (imported
# in the next cell) resolve when the notebook runs from a subfolder — confirm
# against the repository layout.
sys.path.append(os.path.abspath('..'))
import json
# `builtins` gives explicit access to Python's own built-in functions
# (used later as `builtins.filter`).
import builtins

In [41]:
from pyspark.sql import SparkSession
from Functions.data_extraction import movie
from Functions.Schema import get_tmdb_raw_schema
from Functions.data_cleaning import extract_data, extract_name,get_director,separate_data
from pyspark.sql.functions import col, lit, size, when, coalesce

In [42]:
# Create the Spark session used by every cell below. getOrCreate() hands back
# an already-running session if one exists instead of building a new one.
spark = (
    SparkSession.builder
    .appName("TMDB Movie Analysis")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

# Report the session status and the Spark version in use.
print("✓ PySpark initialized successfully!")
print(f"   Spark Version: {spark.version}")

✓ PySpark initialized successfully!
   Spark Version: 3.5.0


In [43]:
# Extract the preferred movie data from the API.
# The API key is read from the environment variable named "key".
key = os.getenv("key")

# IDs of the movies to request. The leading 0 does not resolve to a movie:
# the run below reports a 404 for it and skips it.
movie_ids = [
    0, 299534, 19995, 140607, 299536, 597, 135397, 420818,
    24428, 168259, 99861, 284054, 12445, 181808, 330457,
    351286, 109445, 321612, 260513,
]

# One movie() call per ID, in list order; every result is kept here, including
# whatever movie() returns for an ID it could not find.
movies = [movie(movie_id, key) for movie_id in movie_ids]

Movie ID 0 not found (404). Skipping.


In [44]:
# Keep only truthy results, dropping the placeholders left by failed lookups.
movies = [fetched for fetched in movies if fetched]

In [45]:
# Load the raw movie JSON and let Spark infer the schema.
#   multiLine                 - a JSON record may span several lines
#   mode=PERMISSIVE           - malformed records are kept instead of failing the read
#   columnNameOfCorruptRecord - column that receives the raw text of malformed records
# NOTE(review): no visible cell writes 'tmdb_movies.json', so this reads a file
# produced elsewhere rather than the `movies` list fetched above — confirm.
df = (
    spark.read
    .options(
        multiLine="true",
        mode="PERMISSIVE",
        columnNameOfCorruptRecord="_corrupt_record",
    )
    .json('tmdb_movies.json')
)

In [46]:
# Replace the nested TMDB structures with the values produced by the project
# helpers: extract_data for the list-valued columns, extract_name for the
# collection (the field to pull, 'name', must be passed as a literal column).
df = (
    df
    .withColumn("genres", extract_data(col("genres")))
    .withColumn("belongs_to_collection", extract_name(col("belongs_to_collection"), lit("name")))
)

# Production countries, production companies and spoken languages all get the
# same extract_data treatment, applied in place under the same column name.
for list_column in ("production_countries", "production_companies", "spoken_languages"):
    df = df.withColumn(list_column, extract_data(col(list_column)))

In [47]:
# Pull the cast and crew entries out of `credits`: extract_name selects the
# requested field, then extract_data is applied to the result. Both steps
# write to the same column, named after the field.
for credit_field in ("cast", "crew"):
    df = (
        df
        .withColumn(credit_field, extract_name(col("credits"), lit(credit_field)))
        .withColumn(credit_field, extract_data(col(credit_field)))
    )

# Add cast_size / crew_size after both columns exist.
# size() gives the array length but yields -1 for a null array, so null is
# mapped to 0 explicitly.
for credit_field in ("cast", "crew"):
    members = col(credit_field)
    df = df.withColumn(
        f"{credit_field}_size",
        when(members.isNull(), 0).otherwise(size(members)),
    )

In [48]:
# Sizes of the cast and crew arrays, with 0 for a missing (null) array.
#
# `crew` is not re-extracted here: the previous cell already derives it from
# `credits` with the same extract_name -> extract_data steps, so repeating them
# only added two redundant projections to the plan.
#
# The null check must be explicit. With Spark's default settings size() returns
# -1 (not NULL) for a NULL array, so coalesce(size(...), lit(0)) never falls
# back to 0 and would store -1. The isNotNull guard gives 0 regardless of the
# spark.sql.legacy.sizeOfNull / spark.sql.ansi.enabled settings.
df = df.withColumn("cast_size", when(col("cast").isNotNull(), size(col("cast"))).otherwise(lit(0)))
df = df.withColumn("crew_size", when(col("crew").isNotNull(), size(col("crew"))).otherwise(lit(0)))
