In [0]:
# Load the Netflix titles Delta table from the bronze container.
# The `header` / `inferSchema` options were removed: they are CSV-reader
# options, and a Delta table's schema comes from its transaction log, so they
# had no effect on this read and only suggested the source was a CSV file.
df = (
    spark.read.format('delta')
    .load('abfss://bronze@netflixprojectsan.dfs.core.windows.net/netflix_titles')
)

In [0]:
from pyspark.sql.functions import regexp_extract, col, when
from pyspark.sql.types import IntegerType

# Pull the first run of digits out of each raw duration string and store it as
# an integer in a matching "*_clean" column. Rows without digits get null here.
for raw_name, clean_name in (
    ("duration_minutes", "duration_minutes_clean"),
    ("duration_seasons", "duration_seasons_clean"),
):
    first_digits = regexp_extract(col(raw_name), r"(\d+)", 1)
    df = df.withColumn(
        clean_name,
        # regexp_extract yields "" when the pattern does not match, so the
        # cast is only applied when something was actually captured.
        when(first_digits != "", first_digits.cast(IntegerType())).otherwise(None),
    )

# Replace the remaining nulls with defaults: 0 for minutes, 1 for seasons.
df = df.na.fill({'duration_minutes_clean': 0, 'duration_seasons_clean': 1})

In [0]:
# Swap the raw duration columns for their cleaned versions: drop the originals
# first so the "*_clean" columns can take over their names.
# Note: re-running this cell on its own output would drop the already-renamed
# columns, because the renames become no-ops once the "*_clean" names are gone.
df = df.drop("duration_minutes", "duration_seasons")
df = df.withColumnRenamed("duration_minutes_clean", "duration_minutes")
df = df.withColumnRenamed("duration_seasons_clean", "duration_seasons")

In [0]:
from pyspark.sql.functions import split

# Derive two columns from text fields by keeping only the first split segment:
#   shortMovieName - the part of `title` before the first ':'
#   rating_id      - the part of `rating` before the first '-'
# When the delimiter is absent the whole value is kept.
df = (
    df
    .withColumn('shortMovieName', split(col('title'), ':').getItem(0))
    .withColumn('rating_id', split(col('rating'), '-').getItem(0))
)

In [0]:
# Encode `type` as a numeric id: 'Movie' -> 1, 'TV Show' -> 2, and anything
# else (including null) -> 0.
type_id_expr = (
    when(col('type') == 'Movie', 1)
    .when(col('type') == 'TV Show', 2)
    .otherwise(0)
)
df = df.withColumn('type_id', type_id_expr)

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

# Rank rows by `duration_minutes` within each `type`, longest first.
# dense_rank gives tied durations the same rank and leaves no gaps afterwards.
windowSpec = (
    Window
    .partitionBy('type')
    .orderBy(col('duration_minutes').desc())
)
df = df.withColumn(
    'duration_rank',
    dense_rank().over(windowSpec),
)

In [0]:
from pyspark.sql.functions import count

# Count the rows for each `type` and render the summary in the notebook.
df_vis = (
    df.groupBy('type')
    .agg(count('*').alias('total_count'))
)
display(df_vis)

Databricks visualization. Run in Databricks to view.

In [0]:
# Persist the transformed DataFrame in Delta format to the silver container,
# overwriting whatever is already stored at the target path.
(
    df.write
    .format('delta')
    .mode('overwrite')
    .option('path', 'abfss://silver@netflixprojectsan.dfs.core.windows.net/netflix_titles')
    .save()
)