In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import (
    StructType, StructField, IntegerType, StringType,
    FloatType, ArrayType, DateType, LongType,
    BooleanType
)

from pyspark.sql import SparkSession

# Connect to the Spark master at spark://spark-master:7077; getOrCreate()
# hands back an already-active session when one exists.
spark = (
    SparkSession.builder
    .appName("model-app")
    .master("spark://spark-master:7077")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/23 11:11:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def read_parquet_file(file_path):
    """Load a Parquet dataset into a Spark DataFrame.

    Args:
        file_path: Path passed unchanged to ``DataFrameReader.parquet``.

    Returns:
        The DataFrame read through the notebook-level ``spark`` session.
    """
    # NOTE(review): "mode" is the malformed-record policy documented for the
    # CSV/JSON readers; confirm it has any effect on a Parquet read.
    reader = spark.read.option("mode", "PERMISSIVE")
    return reader.parquet(file_path)

# Snapshot date (yyyy-MM-dd string) that the join cell casts to a date and
# attaches to every movie row.
# NOTE(review): this rebinds `current_date`, hiding the pyspark.sql.functions
# helper of the same name pulled in by the wildcard import at the top.
current_date = "2025-06-21"

In [3]:
# Movie metadata with nested genre / production-company structs.
movie_details_df = read_parquet_file("movie_details.parquet")

# printSchema() prints and returns None, so it is called as a plain statement;
# the row count is then the only value the cell displays (instead of a
# `(None, <count>)` tuple).
movie_details_df.printSchema()
movie_details_df.count()

                                                                                

root
 |-- movie_id: integer (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- genre_id: integer (nullable = true)
 |    |    |-- genre_name: string (nullable = true)
 |-- origin_country: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- popularity: float (nullable = true)
 |-- production_companies: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- company_id: integer (nullable = true)
 |    |    |-- company_name: string (nullable = true)
 |    |    |-- company_country: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: long (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- vote_average: float (nullable = true)
 |-- vote_count: integer (nullable = true)



                                                                                

(None, 9203)

In [4]:
# Flatten the nested arrays on the movie table: genres become an array of
# genre names, production companies an array of company ids.
movie_details_sep = (
    movie_details_df
    .withColumn(
        "genres",
        transform(col("genres"), lambda genre: genre["genre_name"]),
    )
    .withColumn(
        "production_companies",
        transform(col("production_companies"), lambda company: company["company_id"]),
    )
)

# Company lookup table: explode every movie's production_companies array,
# expand the struct into columns, and keep each distinct row once.
companies_df = (
    movie_details_df
    .select(explode(col("production_companies")).alias("production_companies"))
    .select("production_companies.*")
    .dropDuplicates()
)

# printSchema() returns None; calling each as its own statement avoids the
# meaningless `(None, None)` cell output.
companies_df.printSchema()
movie_details_sep.printSchema()

root
 |-- company_id: integer (nullable = true)
 |-- company_name: string (nullable = true)
 |-- company_country: string (nullable = true)

root
 |-- movie_id: integer (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- origin_country: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- popularity: float (nullable = true)
 |-- production_companies: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- release_date: date (nullable = true)
 |-- revenue: long (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- vote_average: float (nullable = true)
 |-- vote_count: integer (nullable = true)



(None, None)

In [5]:
# Cast credits read from cast.parquet.
cast_df = read_parquet_file("cast.parquet")

# printSchema() returns None, so keep it out of the displayed expression and
# show only the row count.
cast_df.printSchema()
cast_df.count()

root
 |-- movie_id: integer (nullable = true)
 |-- person_id: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- order: integer (nullable = true)
 |-- department: string (nullable = true)



                                                                                

(None, 160565)

In [6]:
# Per-movie cast list: every (person_id, order) pair of a movie gathered into
# one `casts` array. collect_list does not guarantee element order after a
# shuffle, so sort on `order` downstream if billing order matters.
cast_movie_df = (
    cast_df
    .groupBy(col("movie_id"))
    .agg(
        collect_list(
            struct(col("person_id"), col("order"))
        ).alias("casts")
    )
)

# Person lookup built from the cast rows, one row per person_id.
# NOTE(review): when a person_id appears with differing name/gender/department
# values, which row dropDuplicates keeps is not specified — confirm that is
# acceptable.
cast_details_df = (
    cast_df
    .select("person_id", "name", "gender", "department")
    .dropDuplicates(["person_id"])
)

# printSchema() returns None; separate statements avoid a `(None, None)`
# cell output.
cast_details_df.printSchema()
cast_movie_df.printSchema()

root
 |-- person_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: integer (nullable = true)
 |-- department: string (nullable = true)

root
 |-- movie_id: integer (nullable = true)
 |-- casts: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- person_id: integer (nullable = true)
 |    |    |-- order: integer (nullable = true)



(None, None)

In [7]:
# Crew credits read from crew.parquet.
crew_df = read_parquet_file("crew.parquet")

# Per-movie crew list: every (person_id, job) pair of a movie gathered into
# one `crews` array. collect_list does not guarantee element order.
crew_movie_id = (
    crew_df
    .groupBy(col("movie_id"))
    .agg(
        collect_list(
            struct(col("person_id"), col("job"))
        ).alias("crews")
    )
)

# Person lookup built from the crew rows, one row per person_id.
# NOTE(review): when a person_id appears with differing attribute values,
# which row dropDuplicates keeps is not specified — confirm that is acceptable.
crew_details_df = (
    crew_df
    .select("person_id", "name", "gender", "department")
    .dropDuplicates(["person_id"])
)

# printSchema() returns None; separate statements avoid a `(None, None)`
# cell output.
crew_details_df.printSchema()
crew_movie_id.printSchema()

root
 |-- person_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: integer (nullable = true)
 |-- department: string (nullable = true)

root
 |-- movie_id: integer (nullable = true)
 |-- crews: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- person_id: integer (nullable = true)
 |    |    |-- job: string (nullable = true)



(None, None)

In [8]:
# Box-office figures keyed by imdb_id, with the three money columns cast to
# 64-bit integers.
box_office_df = (
    read_parquet_file("box_office.parquet")
    .select(
        "imdb_id",
        *[
            col(money_column).cast(LongType())
            for money_column in ("budget", "domestic_opening", "worldwide_gross")
        ],
    )
)

box_office_df.printSchema()

root
 |-- imdb_id: string (nullable = true)
 |-- budget: long (nullable = true)
 |-- domestic_opening: long (nullable = true)
 |-- worldwide_gross: long (nullable = true)



In [51]:
## join movie df
# Left joins keep every movie row even when it has no cast, crew or
# box-office match. Join conditions are written against the source DataFrames
# so that the duplicated key / money columns can be addressed unambiguously
# below.
movie_join_df = (
    movie_details_sep
    .join(cast_movie_df, movie_details_sep.movie_id == cast_movie_df.movie_id, "left")
    .join(crew_movie_id, movie_details_sep.movie_id == crew_movie_id.movie_id, "left")
    .join(box_office_df, movie_details_sep.imdb_id == box_office_df.imdb_id, "left")
    .withColumn("opening_gross", col("domestic_opening").cast(LongType()))
    # Prefer the box-office budget and fall back to the movie-details budget.
    .withColumn(
        "movie_budget",
        coalesce(box_office_df.budget, movie_details_sep.budget).cast(LongType()),
    )
    # Same preference for gross; when both sources are NULL the result is a
    # one-element array holding a struct with a NULL `value`.
    .withColumn(
        "gross",
        array(
            struct(
                coalesce(box_office_df.worldwide_gross, movie_details_sep.revenue)
                .cast(LongType())
                .alias("value")
            )
        ),
    )
    # `current_date` here is the notebook-level date string, not the Spark
    # function of the same name.
    .withColumn(
        "current_date",
        array(lit(current_date).cast(DateType())).cast(ArrayType(DateType())),
    )
    # Remove the duplicated join keys and the source columns that were folded
    # into the derived columns above.
    .drop(
        crew_movie_id.movie_id, cast_movie_df.movie_id, box_office_df.imdb_id,
        "budget", box_office_df.worldwide_gross, movie_details_sep.revenue, "domestic_opening",
    )
    # Wrap the per-snapshot metrics in one-element arrays.
    # NOTE(review): presumably so later snapshots can be appended — confirm.
    .withColumn(
        "popularity",
        array(col("popularity")).cast(ArrayType(FloatType())),
    )
    .withColumn(
        "vote_average",
        array(col("vote_average")).cast(ArrayType(FloatType())),
    )
    .withColumn(
        "vote_count",
        array(col("vote_count")).cast(ArrayType(IntegerType())),
    )
)

# printSchema() returns None; keep it as a statement so the cell displays only
# the row count rather than a `(None, <count>)` tuple.
movie_join_df.printSchema()
movie_join_df.count()

root
 |-- movie_id: integer (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- origin_country: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- popularity: array (nullable = false)
 |    |-- element: float (containsNull = true)
 |-- production_companies: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- release_date: date (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- vote_average: array (nullable = false)
 |    |-- element: float (containsNull = true)
 |-- vote_count: array (nullable = false)
 |    |-- element: integer (containsNull = true)
 |-- casts: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- person_id: integer (nullable = true)
 |    |    |-- order: integer (nullable = true)
 |-- crews: arra

(None, 9203)

In [52]:
# Spot-check the gross / snapshot-date arrays for a single movie (id 147).
(
    movie_join_df
    .select("movie_id", "gross", "current_date")
    .filter(col("movie_id") == 147)
    .show()
)

+--------+--------+------------+
|movie_id|   gross|current_date|
+--------+--------+------------+
|     147|[{NULL}]|[2025-06-21]|
+--------+--------+------------+



In [129]:
# Combined person lookup across cast and crew, one row per person_id.
# unionByName matches columns by name, so the result stays correct even if the
# two lookups' select lists ever drift out of the same column order (plain
# union() pairs columns purely by position).
# NOTE(review): for a person present in both inputs, which row (and therefore
# which `department`) dropDuplicates keeps is not specified — confirm.
credit_df = (
    cast_details_df
    .unionByName(crew_details_df)
    .dropDuplicates(["person_id"])
)

# printSchema() returns None; keep it as a statement so the cell displays only
# the row count.
credit_df.printSchema()
credit_df.count()

root
 |-- person_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: integer (nullable = true)
 |-- department: string (nullable = true)



(None, 94371)

In [116]:
# Stop the SparkSession (and its underlying SparkContext). Run this last:
# DataFrames created from `spark` cannot be evaluated once it is stopped.
spark.stop()