In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, array, when, count, lit, coalesce , isnull, array_contains


# Spark session for this job, attached to the standalone master.
#
# fs.defaultFS has to be the NameNode RPC endpoint. Port 9870 is the default
# NameNode web-UI (HTTP) port in Hadoop 3, so any unqualified path resolved
# against it would fail. 8020 is the port the fully-qualified hdfs:// paths in
# this notebook already use for reading and writing.
spark = (
    SparkSession.builder
    .appName("Sliver Process")
    .master("spark://spark-master:7077")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:8020")
    .getOrCreate()
)

In [19]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, FloatType, ArrayType

# NOTE(review): this schema is NOT applied to the bronze read at the end of this
# cell. It describes a `movie_results` / `person_results` / ... layout, while the
# data that is actually loaded has flat movie columns (adult, budget, genres, ...).
# The definition is kept only so the name `schema` stays defined — confirm
# whether it can be deleted.
schema = StructType([
    StructField("movie_results", ArrayType(StructType([
        StructField("adult", BooleanType(), True),
        StructField("backdrop_path", StringType(), True),
        StructField("id", IntegerType(), True),
        StructField("title", StringType(), True),
        StructField("original_language", StringType(), True),
        StructField("original_title", StringType(), True),
        StructField("overview", StringType(), True),
        StructField("poster_path", StringType(), True),
        StructField("media_type", StringType(), True),
        StructField("genre_ids", ArrayType(IntegerType(), True), True),
        StructField("popularity", FloatType(), True),
        StructField("release_date", StringType(), True),
        StructField("video", BooleanType(), True),
        StructField("vote_average", FloatType(), True),
        StructField("vote_count", IntegerType(), True)
    ]), True)),
    StructField("person_results", ArrayType(StructType([]), True), True),
    StructField("tv_results", ArrayType(StructType([]), True), True),
    StructField("tv_episode_results", ArrayType(StructType([]), True), True),
    StructField("tv_season_results", ArrayType(StructType([]), True), True)
])

# Load the bronze layer. DataFrameReader.parquet() accepts paths plus reader
# options only; the `schema=schema` keyword that used to be passed here is not
# one of those options and was silently ignored, so the column types always
# came from the Parquet files themselves. Reading without it makes the code say
# what actually happens. (An explicit schema would go through
# spark.read.schema(...) instead.)
df = spark.read.parquet("hdfs://namenode:8020/movies_data/bronze.parquet")

                                                                                

#### Xử lý các giá trị null

In [20]:
# Print the column names, types and nullability of the DataFrame that was loaded.
df.printSchema()

root
 |-- adult: boolean (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- belongs_to_collection: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: long (valueContainsNull = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- origin_country: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: long (valueContainsNull

In [21]:
# Build cleaned_df from the original df: in each text column below, a NULL
# value is replaced by the placeholder string paired with it.
text_placeholders = {
    "backdrop_path": "default_backdrop.jpg",
    "overview": "No overview available",
    "poster_path": "default_poster.jpg",
    "homepage": "No homepage",
}

cleaned_df = df
for text_col, placeholder in text_placeholders.items():
    current = col(text_col)
    cleaned_df = cleaned_df.withColumn(
        text_col,
        when(current.isNull(), placeholder).otherwise(current),
    )

In [22]:
# Replace NULL with 0 in the numeric columns.
# NOTE(review): `revenue` is present in the DataFrame (it appears in the show()
# header) but is not listed here, so its NULLs are left as-is — confirm whether
# that is intentional.
numeric_cols = ["budget", "popularity", "vote_count", "vote_average", "runtime"]
for name in numeric_cols:
    current = col(name)
    zero_if_null = when(current.isNull(), 0).otherwise(current)
    cleaned_df = cleaned_df.withColumn(name, zero_if_null)

In [23]:
# Replace NULL with False in the boolean columns.
boolean_cols = ["adult", "video"]
for name in boolean_cols:
    current = col(name)
    false_if_null = when(current.isNull(), False).otherwise(current)
    cleaned_df = cleaned_df.withColumn(name, false_if_null)

In [24]:
# Replace NULL with an empty array in the array columns.
# NOTE(review): `production_countries` is present in the DataFrame (it appears
# in the show() header) but is not listed here — confirm whether it should get
# the same treatment.
array_cols = ["genres", "origin_country", "production_companies", "spoken_languages"]
for name in array_cols:
    current = col(name)
    empty_if_null = when(current.isNull(), array()).otherwise(current)
    cleaned_df = cleaned_df.withColumn(name, empty_if_null)

In [25]:
from pyspark.sql.functions import create_map, lit

# Rows with no collection get a placeholder map holding the keys "id" and
# "name", each mapped to NULL; rows that already have a map keep it.
# NOTE(review): the two placeholder values are cast to different types (long
# and string). Spark has to settle on a single value type for the map, which
# may differ from the value type of the existing column — confirm the
# resulting column type is the one downstream consumers expect.
empty_collection = create_map(
    lit("id"), lit(None).cast("long"),
    lit("name"), lit(None).cast("string"),
)
collection = col("belongs_to_collection")

cleaned_df = cleaned_df.withColumn(
    "belongs_to_collection",
    when(collection.isNull(), empty_collection).otherwise(collection),
)

In [26]:
# Fill NULLs in release_date, tagline and imdb_id (in that order) with a fixed
# sentinel value per column.
sentinel_by_column = {
    "release_date": "1900-01-01",
    "tagline": "No tagline",
    "imdb_id": "tt0000000",
}

for name, sentinel in sentinel_by_column.items():
    current = col(name)
    cleaned_df = cleaned_df.withColumn(
        name,
        when(current.isNull(), sentinel).otherwise(current),
    )

In [27]:
# Preview the cleaned DataFrame. This is an action, so it triggers a Spark job.
cleaned_df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-----+--------------------+---------------------+---------+--------------------+--------------------+-------+----------+--------------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+----------+-------+--------------------+--------+--------------------+--------------------+-----+------------+----------+
|adult|       backdrop_path|belongs_to_collection|   budget|              genres|            homepage|     id|   imdb_id|origin_country|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date|   revenue|runtime|    spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|
+-----+--------------------+---------------------+---------+--------------------+--------------------+-------+----------+--------------+-----------------+--------------------+---------------

                                                                                

In [28]:
# Persist the cleaned DataFrame to HDFS as Parquet, replacing any existing
# output at the same location.
silver_path = "hdfs://namenode:8020/movies_data/sliver.parquet"
cleaned_df.write.format("parquet").mode("overwrite").save(silver_path)

In [29]:
# Shut down the Spark session now that the output has been written.
spark.stop()