# Silver Data Transformations

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window as w

In [0]:
# Load the bronze-layer Netflix titles table (stored in Delta format).
# A Delta table carries its own schema in its transaction log, so the CSV-style
# reader options "inferSchema" and "header" had no effect on this read and have
# been removed to avoid suggesting that the source is a headered text file.
df = spark.read.format("delta").load(
    "abfss://bronze@netflixstorageaccount.dfs.core.windows.net/netflix_titles"
)

In [0]:
# Preview the frame exactly as it was read from the bronze container.
display(df)

# Fill Null Values

In [0]:
# Replace nulls in the two duration columns with 0; every other column is left untouched.
df = df.na.fill({"duration_minutes": 0, "duration_seasons": 0})

In [0]:
# Inspect the frame after the null fill on the duration columns.
display(df)

In [0]:
# Cast both duration columns to IntegerType, one column at a time.
# NOTE(review): a value that cannot be parsed as an integer presumably becomes
# null again here, after the earlier null fill — confirm against the bronze data.
for duration_col in ("duration_minutes", "duration_seasons"):
    df = df.withColumn(duration_col, col(duration_col).cast(IntegerType()))

In [0]:
# Print the current schema to check the column types after the integer casts.
df.printSchema()

In [0]:
# Derive "short-title": the portion of the title before the first ":".
# A title without a colon is kept whole, because the split then yields a single element.
title_parts = split(col("title"), ":")
df = df.withColumn("short-title", title_parts.getItem(0))
df.display()

In [0]:
# Rewrite release_year as an integer year: stringify the value, parse it as a
# date with the "yyyy" pattern, then extract the year component of that date.
release_date = to_date(col("release_year").cast("string"), "yyyy")
df = df.withColumn("release_year", year(release_date))
df.display()

In [0]:
# Encode the title type as an integer flag:
#   "Movie" -> 1, "TV Show" -> 2, anything else -> 0.
is_movie = col("type") == "Movie"
is_tv_show = col("type") == "TV Show"
df = df.withColumn("type_flag", when(is_movie, 1).when(is_tv_show, 2).otherwise(0))

In [0]:
# Rank every row by duration_minutes, longest first. dense_rank gives tied rows
# the same rank and leaves no gaps between consecutive ranks.
# NOTE(review): the window has no partitionBy, so Spark evaluates it over a
# single partition; acceptable for a small table, worth revisiting if it grows.
windowSpec = w.orderBy(desc("duration_minutes"))
df = df.withColumn("duration_rank", dense_rank().over(windowSpec))
display(df)

In [0]:
# Count the rows for each value of "type"; the result column is named "count".
df_visualise = df.groupBy("type").count()
display(df_visualise)

Databricks visualization. Run in Databricks to view.

In [0]:
# Persist the transformed frame to the silver container as a Delta table.
# This notebook re-reads the whole bronze table on every run, so mode "append"
# would add another full copy of every row each time it is executed. Mode
# "overwrite" replaces the previous output, keeping re-runs idempotent.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("path", "abfss://silver@netflixstorageaccount.dfs.core.windows.net/netflix_titles")
    .save()
)