In [0]:
import dlt
from pyspark.sql import functions as F

def read_bronze(table_name: str):
    """Return the DLT dataset named ``table_name``.

    Thin wrapper around ``dlt.read`` used by the silver table definitions
    to load their upstream input.
    """
    bronze_df = dlt.read(table_name)
    return bronze_df

def clean_int(col_name: str):
    r"""Build a column expression that yields ``col_name`` as an ``int``.

    NULL inputs and the literal ``\N`` placeholder map to NULL; every other
    value is cast to ``int``.
    """
    raw = F.col(col_name)
    is_missing = raw.isNull() | (raw == "\\N")
    return F.when(is_missing, None).otherwise(raw.cast("int"))

def clean_double(col_name: str):
    r"""Build a column expression that yields ``col_name`` as a ``double``.

    NULL inputs and the literal ``\N`` placeholder map to NULL; every other
    value is cast to ``double``.
    """
    raw = F.col(col_name)
    is_missing = raw.isNull() | (raw == "\\N")
    return F.when(is_missing, None).otherwise(raw.cast("double"))

def clean_string(col_name: str):
    r"""Build a column expression that nulls out placeholder strings.

    NULL inputs and the literal ``\N`` placeholder map to NULL; every other
    value is passed through unchanged.
    """
    raw = F.col(col_name)
    is_missing = raw.isNull() | (raw == "\\N")
    return F.when(is_missing, None).otherwise(raw)
def clean_name_or_title(col_name: str):
    r"""Build a column expression that tidies a free-text name or title.

    NULL inputs and the literal ``\N`` placeholder map to NULL. For any other
    value, leading characters that are neither letters nor digits are removed
    and the result is trimmed. Only the start of the string is touched.

    Letters and digits are matched with the Unicode classes ``\p{L}`` and
    ``\p{N}`` rather than ``[A-Za-z0-9]``, so a value that begins with an
    accented or non-Latin letter (for example ``Élodie``) keeps its first
    characters instead of having them stripped as junk.

    A value made up entirely of stripped characters becomes an empty string,
    not NULL.
    """
    raw = F.col(col_name)
    is_missing = raw.isNull() | (raw == "\\N")
    # Anchored with ^ so only junk at the START is dropped.
    without_leading_junk = F.regexp_replace(raw, r'^[^\p{L}\p{N}]+', '')
    return F.when(is_missing, None).otherwise(F.trim(without_leading_junk))


def split_list(col_name: str):
    r"""Build a column expression that splits ``col_name`` on commas.

    NULL inputs and the literal ``\N`` placeholder map to NULL; every other
    value is split on ``,`` into an array. Elements are not trimmed.
    """
    raw = F.col(col_name)
    is_missing = raw.isNull() | (raw == "\\N")
    items = F.split(raw, ",")
    return F.when(is_missing, None).otherwise(items)

def add_metadata(df, source_table: str):
    """Return ``df`` with three load-metadata columns added.

    Columns are added in this order:
      * ``src_table`` - the ``source_table`` argument as a literal
      * ``load_ts``   - ``current_timestamp()``
      * ``loaded_by`` - ``current_user()``
    """
    metadata_columns = [
        ("src_table", F.lit(source_table)),
        ("load_ts", F.current_timestamp()),
        ("loaded_by", F.current_user()),
    ]
    result = df
    for column_name, expression in metadata_columns:
        result = result.withColumn(column_name, expression)
    return result


In [0]:
@dlt.table(
    name="silver_name_basics",
    comment="Cleaned personnel master (names, birth/death, professions, known-for titles)."
)
def silver_name_basics():
    """Silver table of people built from ``bronze_name_basics``.

    Tidies ``primaryName``, casts ``birthYear``/``deathYear`` to int, exposes
    the profession and known-for lists both as arrays (``*_arr``) and as
    ", "-joined strings, keeps one row per ``nconst`` and appends the load
    metadata columns.
    """

    def joined_or_null(arr_col: str):
        # Re-join an array column with ", "; a NULL array stays NULL.
        items = F.col(arr_col)
        return F.when(items.isNull(), None).otherwise(F.concat_ws(", ", items))

    bronze = read_bronze("bronze_name_basics")

    # Scalar fields: tidy the display name, make the years numeric.
    typed = (
        bronze
        .withColumn("primaryName", clean_name_or_title("primaryName"))
        .withColumn("birthYear", clean_int("birthYear"))
        .withColumn("deathYear", clean_int("deathYear"))
    )

    # List fields: add the array form first, then overwrite the original
    # column with its ", "-joined string form.
    list_columns = (
        ("primaryProfession", "primaryProfession_arr"),
        ("knownForTitles", "knownForTitles_arr"),
    )
    with_lists = typed
    for string_col, arr_col in list_columns:
        with_lists = (
            with_lists
            .withColumn(arr_col, split_list(string_col))
            .withColumn(string_col, joined_or_null(arr_col))
        )

    deduped = with_lists.dropDuplicates(["nconst"])
    return add_metadata(deduped, "bronze_name_basics")


In [0]:
@dlt.table(
    name="silver_title_basics",
    comment="Cleaned title master (type, titles, years, runtime, genres)."
)
def silver_title_basics():
    """Silver table of titles built from ``bronze_title_basics``.

    Nulls out placeholder title types, tidies both title columns, casts the
    numeric columns to int, exposes genres as an array (``genres_arr``) and
    as a ", "-joined string, keeps one row per ``tconst`` and appends the
    load metadata columns.
    """
    bronze = read_bronze("bronze_title_basics")

    cleaned = bronze.withColumn("titleType", clean_string("titleType"))

    for title_col in ("primaryTitle", "originalTitle"):
        cleaned = cleaned.withColumn(title_col, clean_name_or_title(title_col))

    for int_col in ("isAdult", "startYear", "endYear", "runtimeMinutes"):
        cleaned = cleaned.withColumn(int_col, clean_int(int_col))

    # genres_arr holds the array form; genres is then overwritten with the
    # ", "-joined string form (NULL when the array is NULL).
    genres_items = F.col("genres_arr")
    genres_joined = (
        F.when(genres_items.isNull(), None)
         .otherwise(F.concat_ws(", ", genres_items))
    )
    with_genres = (
        cleaned
        .withColumn("genres_arr", split_list("genres"))
        .withColumn("genres", genres_joined)
    )

    deduped = with_genres.dropDuplicates(["tconst"])
    return add_metadata(deduped, "bronze_title_basics")


In [0]:
@dlt.expect_or_drop(
    "valid_tconst_in_crew",
    "tconst IS NOT NULL"
)
@dlt.table(
    name="silver_title_crew",
    comment="Cleaned title crew with director and writer lists."
)
def silver_title_crew():
    """Silver table of crew lists built from ``bronze_title_crew``.

    Rewrites ``directors`` and ``writers`` as ", "-joined strings (NULL for
    missing lists), keeps one row per ``tconst`` and appends the load
    metadata columns. The intermediate array columns are not kept.
    """

    def joined_or_null(arr_col: str):
        # Re-join an array column with ", "; a NULL array stays NULL.
        items = F.col(arr_col)
        return F.when(items.isNull(), None).otherwise(F.concat_ws(", ", items))

    list_columns = (
        ("directors", "directors_arr"),
        ("writers", "writers_arr"),
    )

    crew = read_bronze("bronze_title_crew")

    # Step 1: split each raw list into a temporary array column.
    for string_col, arr_col in list_columns:
        crew = crew.withColumn(arr_col, split_list(string_col))

    # Step 2: overwrite each raw column with the joined form of its array.
    for string_col, arr_col in list_columns:
        crew = crew.withColumn(string_col, joined_or_null(arr_col))

    # Step 3: discard the temporary arrays and de-duplicate on the title key.
    tidy = crew.drop("directors_arr", "writers_arr")
    deduped = tidy.dropDuplicates(["tconst"])

    return add_metadata(deduped, "bronze_title_crew")


In [0]:
@dlt.expect_or_drop(
    "valid_tconst_in_episode",
    "tconst IS NOT NULL"
)
@dlt.table(
    name="silver_title_episode",
    comment="Cleaned episode information (parent series, season, episode numbers)."
)
def silver_title_episode():
    """Silver table of episodes built from ``bronze_title_episode``.

    Nulls out placeholder ``parentTconst`` values, casts the season and
    episode numbers to int, removes rows that repeat the same
    (tconst, parentTconst, seasonNumber, episodeNumber) combination and
    appends the load metadata columns.
    """
    episodes = read_bronze("bronze_title_episode")

    cleaned = episodes.withColumn("parentTconst", clean_string("parentTconst"))
    for number_col in ("seasonNumber", "episodeNumber"):
        cleaned = cleaned.withColumn(number_col, clean_int(number_col))

    identity_cols = ["tconst", "parentTconst", "seasonNumber", "episodeNumber"]
    deduped = cleaned.dropDuplicates(identity_cols)

    return add_metadata(deduped, "bronze_title_episode")


In [0]:
import dlt
from pyspark.sql.functions import regexp_replace, col, trim, when

@dlt.expect_or_drop(
    "valid_tconst_in_principals",
    "tconst IS NOT NULL"
)
@dlt.expect_or_drop(
    "valid_nconst_in_principals",
    "nconst IS NOT NULL"
)
@dlt.table(
    name="silver_title_principals",
    comment="Cleaned title principals with sanitized job and characters columns."
)
def silver_title_principals():
    r"""Silver table of title principals built from ``bronze_title_principals``.

    * ``characters``: backslashes, double quotes and square brackets are
      removed and the result trimmed. The value becomes NULL when what is
      left is empty, the string ``null``, or an ``N`` that came from the
      ``\N`` placeholder.
    * ``job`` / ``category``: trimmed; ``\N``, ``N``, the empty string and
      ``null`` become NULL.
    * ``ordering``: cast to int.

    Rows are not de-duplicated here. The load metadata columns are appended
    before returning.
    """

    def blank_to_null(name: str):
        # Trim the column, then map the known junk markers to NULL.
        trimmed = trim(col(name))
        return when(trimmed.isin("\\N", "N", "", "null"), None).otherwise(trimmed)

    principals = read_bronze("bronze_title_principals")

    raw_characters = col("characters")
    # A single pass removes every backslash, double quote and square bracket.
    stripped_characters = trim(regexp_replace(raw_characters, r'[\\\[\]"]', ''))
    # Stripping turns the \N placeholder into a bare "N". Treat "N" as missing
    # only when the raw value really contained a backslash; otherwise a
    # character genuinely called "N" (e.g. ["N"]) would be wiped out.
    from_placeholder = (stripped_characters == "N") & raw_characters.contains("\\")
    characters_missing = stripped_characters.isin("null", "") | from_placeholder

    df_clean = (
        principals
        # Single withColumn so the raw value is still visible to the
        # placeholder check above.
        .withColumn(
            "characters",
            when(characters_missing, None).otherwise(stripped_characters)
        )
        .withColumn("job", blank_to_null("job"))
        .withColumn("category", blank_to_null("category"))
        # Make sure ordering is numeric
        .withColumn("ordering", col("ordering").cast("int"))
    )

    return add_metadata(df_clean, "bronze_title_principals")


In [0]:
@dlt.expect_or_drop(
    "valid_tconst_in_ratings",
    "tconst IS NOT NULL"
)
@dlt.table(
    name="silver_title_ratings",
    comment="Cleaned ratings (average rating and vote counts) per title."
)
def silver_title_ratings():
    """Silver table of ratings built from ``bronze_title_ratings``.

    Casts ``averageRating`` to double and ``numVotes`` to int, keeps one row
    per ``tconst`` and appends the load metadata columns.
    """
    ratings = read_bronze("bronze_title_ratings")

    typed = ratings.withColumn("averageRating", clean_double("averageRating"))
    typed = typed.withColumn("numVotes", clean_int("numVotes"))

    deduped = typed.dropDuplicates(["tconst"])
    return add_metadata(deduped, "bronze_title_ratings")


In [0]:
@dlt.expect_or_drop(
    "valid_titleId_in_akas",
    "titleId IS NOT NULL"
)
@dlt.table(
    name="silver_title_akas",
    comment="Cleaned alternate titles with region/language codes and attributes."
)
def silver_title_akas():
    """Silver table of alternate titles built from ``bronze_title_akas``.

    Casts ``ordering`` to int, nulls out placeholder ``region``/``language``
    values, rewrites ``types`` and ``attributes`` as ", "-joined strings
    (NULL for missing lists), keeps one row per (titleId, ordering) and
    appends the load metadata columns. The intermediate array columns are
    not kept.
    """

    def joined_or_null(arr_col: str):
        # Re-join an array column with ", "; a NULL array stays NULL.
        items = F.col(arr_col)
        return F.when(items.isNull(), None).otherwise(F.concat_ws(", ", items))

    list_columns = (
        ("types", "types_arr"),
        ("attributes", "attributes_arr"),
    )

    akas = read_bronze("bronze_title_akas")

    # Scalar fields.
    akas = akas.withColumn("ordering", clean_int("ordering"))
    for code_col in ("region", "language"):
        akas = akas.withColumn(code_col, clean_string(code_col))

    # Split each raw list into a temporary array column ...
    for string_col, arr_col in list_columns:
        akas = akas.withColumn(arr_col, split_list(string_col))

    # ... then overwrite each raw column with the joined form of its array.
    for string_col, arr_col in list_columns:
        akas = akas.withColumn(string_col, joined_or_null(arr_col))

    tidy = akas.drop("types_arr", "attributes_arr")
    deduped = tidy.dropDuplicates(["titleId", "ordering"])

    return add_metadata(deduped, "bronze_title_akas")
