In [0]:
import dlt
from pyspark.sql import functions as F

# =====================================================
# HELPERS
# =====================================================

def read_bronze(table_name: str):
    """Return the pipeline dataset named ``table_name`` as read by ``dlt.read``."""
    bronze_df = dlt.read(table_name)
    return bronze_df

def clean_int(col_name: str):
    """Build a column expression that casts ``col_name`` to ``int``.

    Values that are NULL or equal to the literal '\\N' placeholder become
    NULL; everything else goes through ``cast("int")``.
    """
    value = F.col(col_name)
    is_missing = value.isNull() | (value == "\\N")
    return F.when(is_missing, None).otherwise(value.cast("int"))

def clean_double(col_name: str):
    """Build a column expression that casts ``col_name`` to ``double``.

    Values that are NULL or equal to the literal '\\N' placeholder become
    NULL; everything else goes through ``cast("double")``.
    """
    value = F.col(col_name)
    is_missing = value.isNull() | (value == "\\N")
    return F.when(is_missing, None).otherwise(value.cast("double"))

def clean_string(col_name: str):
    """Build a column expression that trims ``col_name`` and nulls out blanks.

    The value is trimmed first, and the trimmed result is mapped to NULL when
    it is NULL, empty, or the literal '\\N' placeholder. Checking after the
    trim means a padded placeholder or a whitespace-only value cannot slip
    through as data, which matches how ``split_list`` treats blank input.

    Args:
        col_name: Name of the string column to clean.

    Returns:
        A Column holding the trimmed text, or NULL for missing/blank input.
    """
    trimmed = F.trim(F.col(col_name))
    # isNull() is listed first so a NULL input short-circuits to "missing"
    # instead of producing a NULL condition.
    is_missing = trimmed.isNull() | trimmed.isin("", "\\N")
    return F.when(is_missing, None).otherwise(trimmed)

def clean_name_or_title(col_name: str):
    """Build a column expression that tidies a name/title column.

    NULL and the literal '\\N' placeholder become NULL. For any other value
    the leading run of characters that are neither letters nor digits is
    removed and the result is trimmed.

    Args:
        col_name: Name of the string column to clean.

    Returns:
        A Column with the cleaned text, or NULL when the input is missing.
    """
    raw = F.col(col_name)
    is_missing = raw.isNull() | (raw == "\\N")

    # Use Unicode categories (any letter / any number) rather than the ASCII
    # class [A-Za-z0-9]: the ASCII class treats accented and non-Latin letters
    # as junk, so "Élite" lost its first letter and titles written entirely in
    # another script were reduced to an empty string.
    # NOTE(review): a value made up only of punctuation still collapses to ''
    # here, as it did before - confirm whether that should become NULL.
    without_leading_junk = F.regexp_replace(raw, r'^[^\p{L}\p{N}]+', '')

    return F.when(is_missing, None).otherwise(F.trim(without_leading_junk))

def split_list(col_name: str):
    """
    Turn a comma-separated string column into an array column.

    - NULL, '\\N', and values that are blank after trimming yield NULL
    - The raw value is split on ',' and empty-string elements are dropped
      (elements themselves are not trimmed)
    - An array left with no elements yields NULL
    """
    source = F.col(col_name)
    pieces = F.array_remove(F.split(source, ","), "")

    is_missing = (
        source.isNull()
        | (source == "\\N")
        | (F.trim(source) == "")
    )

    # Branches are evaluated in order, so the size check only matters for
    # rows that were not already classified as missing.
    return (
        F.when(is_missing, None)
         .when(F.size(pieces) == 0, None)
         .otherwise(pieces)
    )

def add_metadata(df, source):
    """Append the audit columns ``src_table``, ``load_ts`` and ``loaded_by``.

    ``src_table`` is the literal ``source``; the other two come from
    ``current_timestamp()`` and ``current_user()``.
    """
    audit_columns = (
        ("src_table", F.lit(source)),
        ("load_ts", F.current_timestamp()),
        ("loaded_by", F.current_user()),
    )

    stamped = df
    for column_name, expression in audit_columns:
        stamped = stamped.withColumn(column_name, expression)
    return stamped

# =====================================================
# SILVER: TITLE BASICS  (PK = tconst)
# =====================================================

@dlt.expect_or_drop(
    "valid_pk_tconst",
    "tconst IS NOT NULL AND tconst <> '' AND tconst LIKE 'tt%'"
)
# The upper bound follows the current calendar year instead of a literal year,
# so the rule does not start dropping newly released titles once the year
# rolls over.
@dlt.expect_or_drop(
    "valid_years",
    """
    (startYear IS NULL OR startYear BETWEEN 1870 AND year(current_date()))
    AND
    (endYear IS NULL OR endYear BETWEEN 1870 AND year(current_date()))
    AND
    (endYear IS NULL OR startYear IS NULL OR endYear >= startYear)
    """
)
@dlt.table(
    name="silver_title_basics",
    comment="Validated title master with clean dates and genres."
)
def silver_title_basics():
    """Build the silver title table from ``bronze_title_basics``.

    Cleans the text and numeric columns, adds ``genres_arr`` (array form of
    ``genres``), rewrites ``genres`` as a ", "-joined string, keeps one row
    per ``tconst`` and appends the audit columns from ``add_metadata``.
    Rows that fail the ``valid_pk_tconst`` or ``valid_years`` expectations
    are dropped by the decorators above.
    """
    df = read_bronze("bronze_title_basics")

    df_clean = (
        df
        .withColumn("titleType",      clean_string("titleType"))
        .withColumn("primaryTitle",   clean_name_or_title("primaryTitle"))
        .withColumn("originalTitle",  clean_name_or_title("originalTitle"))
        .withColumn("isAdult",        clean_int("isAdult"))
        .withColumn("startYear",      clean_int("startYear"))
        .withColumn("endYear",        clean_int("endYear"))
        .withColumn("runtimeMinutes", clean_int("runtimeMinutes"))

        # genres_arr is derived from the raw string before "genres" itself is
        # overwritten with the normalised ", "-joined form.
        .withColumn("genres_arr", split_list("genres"))
        .withColumn(
            "genres",
            # concat_ws would turn a NULL array into '', so keep NULL explicit.
            F.when(F.col("genres_arr").isNull(), None)
             .otherwise(F.concat_ws(", ", F.col("genres_arr")))
        )

        .dropDuplicates(["tconst"])
    )

    return add_metadata(df_clean, "bronze_title_basics")

# =====================================================
# SILVER: NAME BASICS  (PK = nconst)
# =====================================================

@dlt.expect_or_drop(
    "valid_pk_nconst",
    "nconst IS NOT NULL AND nconst <> '' AND nconst LIKE 'nm%'"
)
@dlt.table(
    name="silver_name_basics",
    comment="Validated people dimension with enforced PK."
)
def silver_name_basics():
    """Build the silver people table from ``bronze_name_basics``.

    Cleans the name and year columns, adds ``<col>_arr`` array columns for
    the two list-valued fields and rewrites those fields as ", "-joined
    strings, keeps one row per ``nconst`` and appends the audit columns.
    """
    people = read_bronze("bronze_name_basics")

    people = people.withColumn("primaryName", clean_name_or_title("primaryName"))
    for year_col in ("birthYear", "deathYear"):
        people = people.withColumn(year_col, clean_int(year_col))

    for list_col in ("primaryProfession", "knownForTitles"):
        arr_col = list_col + "_arr"
        # The array is built from the raw string first; only then is the
        # string column replaced by its normalised form.
        people = people.withColumn(arr_col, split_list(list_col))
        joined = F.concat_ws(", ", F.col(arr_col))
        people = people.withColumn(
            list_col,
            F.when(F.col(arr_col).isNull(), None).otherwise(joined),
        )

    deduped = people.dropDuplicates(["nconst"])
    return add_metadata(deduped, "bronze_name_basics")

# =====================================================
# SILVER: TITLE PRINCIPALS (FK = tconst, nconst)
# =====================================================

@dlt.expect_or_drop("valid_fk_tconst", "tconst IS NOT NULL")
@dlt.expect_or_drop("valid_fk_nconst", "nconst IS NOT NULL")
@dlt.table(
    name="silver_title_principals",
    comment="Validated cast/crew mapping with non-null FKs."
)
def silver_title_principals():
    """Build the silver cast/crew table from ``bronze_title_principals``.

    ``characters`` has backslashes, square brackets and double quotes removed
    and is trimmed; the '\\N' placeholder, the empty string and the text
    'null' become NULL. ``job`` / ``category`` / ``ordering`` go through the
    shared cleaners, one row is kept per (tconst, nconst, category), and the
    audit columns are appended.
    """
    df = read_bronze("bronze_title_principals")

    raw_characters = F.col("characters")

    # Detect the placeholder BEFORE backslashes are stripped. Once the
    # backslash is gone the placeholder is indistinguishable from a character
    # literally named "N", which the previous post-strip check nulled out.
    # Brackets/quotes are removed for this check so a wrapped placeholder is
    # still recognised.
    unwrapped = F.trim(F.regexp_replace(raw_characters, r'[\[\]\"]', ''))
    is_placeholder = unwrapped == "\\N"

    # strip brackets/quotes/backslashes, then trim
    stripped = F.trim(F.regexp_replace(raw_characters, r'[\\\[\]\"]', ''))

    characters_clean = (
        F.when(is_placeholder, None)
         .when(stripped.isin("", "null"), None)
         .otherwise(stripped)
    )

    df_clean = (
        df
        .withColumn("characters", characters_clean)

        .withColumn("job",      clean_string("job"))
        .withColumn("category", clean_string("category"))
        .withColumn("ordering", clean_int("ordering"))

        .dropDuplicates(["tconst", "nconst", "category"])
    )

    return add_metadata(df_clean, "bronze_title_principals")

# =====================================================
# SILVER: TITLE CREW (FK = tconst)
# =====================================================

@dlt.expect_or_drop("valid_fk_tconst", "tconst IS NOT NULL")
@dlt.table(
    name="silver_title_crew",
    comment="Validated director/writer mapping."
)
def silver_title_crew():
    """Build the silver crew table from ``bronze_title_crew``.

    ``directors`` and ``writers`` are normalised through ``split_list`` and
    written back as ", "-joined strings (NULL when the list is missing); the
    temporary array columns are dropped, one row is kept per ``tconst`` and
    the audit columns are appended.
    """
    list_cols = ("directors", "writers")
    crew = read_bronze("bronze_title_crew")

    # First pass: build every temporary array from the raw strings.
    for list_col in list_cols:
        crew = crew.withColumn(list_col + "_arr", split_list(list_col))

    # Second pass: overwrite each string column with its normalised form.
    for list_col in list_cols:
        arr = F.col(list_col + "_arr")
        crew = crew.withColumn(
            list_col,
            F.when(arr.isNull(), None).otherwise(F.concat_ws(", ", arr)),
        )

    crew = crew.drop("directors_arr", "writers_arr")
    crew = crew.dropDuplicates(["tconst"])

    return add_metadata(crew, "bronze_title_crew")

# =====================================================
# SILVER: TITLE AKAS (FK = titleId)
# =====================================================

@dlt.expect_or_drop("valid_fk_titleId", "titleId IS NOT NULL")
@dlt.table(
    name="silver_title_akas",
    comment="Validated alternate titles with regions & languages."
)
def silver_title_akas():
    """Build the silver alternate-title table from ``bronze_title_akas``.

    Cleans ``ordering`` / ``region`` / ``language``, normalises the
    list-valued ``types`` and ``attributes`` columns into ", "-joined
    strings (NULL when missing), drops the temporary array columns, keeps one
    row per (titleId, ordering) and appends the audit columns.
    """
    list_cols = ("types", "attributes")
    akas = read_bronze("bronze_title_akas")

    akas = akas.withColumn("ordering", clean_int("ordering"))
    for text_col in ("region", "language"):
        akas = akas.withColumn(text_col, clean_string(text_col))

    # First pass: build every temporary array from the raw strings.
    for list_col in list_cols:
        akas = akas.withColumn(list_col + "_arr", split_list(list_col))

    # Second pass: overwrite each string column with its normalised form.
    for list_col in list_cols:
        arr = F.col(list_col + "_arr")
        akas = akas.withColumn(
            list_col,
            F.when(arr.isNull(), None).otherwise(F.concat_ws(", ", arr)),
        )

    akas = akas.drop("types_arr", "attributes_arr")
    akas = akas.dropDuplicates(["titleId", "ordering"])

    return add_metadata(akas, "bronze_title_akas")

# =====================================================
# SILVER: TITLE EPISODE (FK = tconst)
# =====================================================

@dlt.expect_or_drop("valid_fk_tconst", "tconst IS NOT NULL")
@dlt.table(
    name="silver_title_episode",
    comment="Validated episodes with season/episode numbers."
)
def silver_title_episode():
    """Build the silver episode table from ``bronze_title_episode``.

    Cleans ``parentTconst`` as a string and the season/episode numbers as
    ints, removes rows duplicated across all four key columns and appends the
    audit columns.
    """
    dedupe_key = ["tconst", "parentTconst", "seasonNumber", "episodeNumber"]

    episodes = read_bronze("bronze_title_episode")
    episodes = episodes.withColumn("parentTconst", clean_string("parentTconst"))
    for number_col in ("seasonNumber", "episodeNumber"):
        episodes = episodes.withColumn(number_col, clean_int(number_col))

    deduped = episodes.dropDuplicates(dedupe_key)
    return add_metadata(deduped, "bronze_title_episode")

# =====================================================
# SILVER: TITLE RATINGS (FK = tconst)
# =====================================================

@dlt.expect_or_drop("valid_fk_tconst", "tconst IS NOT NULL")
@dlt.table(
    name="silver_title_ratings",
    comment="Validated ratings per title."
)
def silver_title_ratings():
    """Build the silver ratings table from ``bronze_title_ratings``.

    Casts ``averageRating`` to double and ``numVotes`` to int (placeholders
    become NULL), keeps one row per ``tconst`` and appends the audit columns.
    """
    ratings = read_bronze("bronze_title_ratings")
    ratings = ratings.withColumn("averageRating", clean_double("averageRating"))
    ratings = ratings.withColumn("numVotes", clean_int("numVotes"))

    deduped = ratings.dropDuplicates(["tconst"])
    return add_metadata(deduped, "bronze_title_ratings")
