# In [0]:
# Generic Silver Loader: NO HARDCODED COLUMNS
import datetime
import json
import traceback

from delta.tables import DeltaTable
from pyspark.sql.functions import (
    coalesce, col, concat_ws, current_timestamp, lit, sha2, to_timestamp,
    trim, year,
)

# ---------------------------
# WIDGETS (ADF can override these)
# ---------------------------
# Every widget is a free-text field, so all values arrive as strings and are
# normalised (stripped / parsed) immediately below.
dbutils.widgets.text("Source_path", "")
dbutils.widgets.text("Target_path", "")
dbutils.widgets.text("pk_columns", "")         # JSON list: ["id"] or ["code"]
dbutils.widgets.text("year_column", "")        # optional, blank allowed
dbutils.widgets.text("merge_flag", "true")
dbutils.widgets.text("batch_name", "Batch-1")

Source_path = dbutils.widgets.get("Source_path").strip()
Target_path = dbutils.widgets.get("Target_path").strip()
year_column = dbutils.widgets.get("year_column").strip()
batch_name = dbutils.widgets.get("batch_name").strip()

# Parse PK columns.
# A blank widget means "no primary key". Anything else must be a JSON list of
# column names; a single JSON string is accepted and wrapped in a list so it
# is never iterated character by character further down.
# Invalid input keeps the lenient fallback to "no PK", but is reported loudly
# instead of being swallowed silently.
pk_columns = []
_pk_raw = dbutils.widgets.get("pk_columns").strip()
if _pk_raw:
    try:
        _pk_parsed = json.loads(_pk_raw)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f"WARNING: pk_columns is not valid JSON ({e}); continuing without PK columns.")
        _pk_parsed = []

    if isinstance(_pk_parsed, str):
        _pk_parsed = [_pk_parsed]

    if isinstance(_pk_parsed, list):
        # Keep only non-blank string entries.
        pk_columns = [c.strip() for c in _pk_parsed if isinstance(c, str) and c.strip()]
    else:
        print("WARNING: pk_columns must be a JSON list of column names; continuing without PK columns.")

# Only the listed spellings enable MERGE; any other value disables it.
merge_flag = dbutils.widgets.get("merge_flag").strip().lower() in ("true", "1", "yes")

print("Source:", Source_path)
print("Target:", Target_path)
print("PK:", pk_columns)


# ---------------------------
# READ SOURCE parquet (generic)
# ---------------------------
# Loads every parquet file under Source_path into `df`.
#   mergeSchema         - asks Spark to reconcile differing file schemas
#   recursiveFileLookup - also picks up files in nested sub-folders
# Any failure is printed in full and re-raised with the same generic message
# the pipeline has always emitted; the original error is chained as the cause.
try:
    df = (spark.read
          .option("mergeSchema", "true")
          .option("recursiveFileLookup", "true")
          .parquet(Source_path))
    print("Source columns:", df.columns)
except Exception as e:
    traceback.print_exc()
    raise Exception("Failed to read parquet.") from e

# Preview of the raw input.
display(df.limit(5))

# ---------------------------
# GENERIC CLEANING (NO HARDCODING)
# ---------------------------
# Transforms `df` in place (by rebinding): trims strings, derives the `_year`
# partition column, adds audit columns and computes `__row_hash`.

# 1) Trim all string columns (universal)
string_cols = [c for c, t in df.dtypes if t == "string"]
for c in string_cols:
    df = df.withColumn(c, trim(col(c)))

# 2) Convert year column only if user passed it
#    Fallback partition value is the current UTC year.
_current_year = datetime.datetime.now(datetime.timezone.utc).year
if year_column and year_column in df.columns:
    try:
        # Both steps are chained so `df` is only rebound when the whole
        # conversion succeeds (no half-converted frame on failure).
        df = (df
              .withColumn(year_column, to_timestamp(col(year_column)))
              .withColumn("_year", year(col(year_column))))
    except Exception:
        # Best-effort: keep the load going, but make the reason visible.
        traceback.print_exc()
        df = df.withColumn("_year", lit(_current_year))
else:
    df = df.withColumn("_year", lit(_current_year))

# 3) Add audit columns if missing
if "__ingest_ts" not in df.columns:
    df = df.withColumn("__ingest_ts", current_timestamp())

if "__source_path" not in df.columns:
    df = df.withColumn("__source_path", lit(Source_path))

if "__batch_id" not in df.columns:
    df = df.withColumn("__batch_id", lit(batch_name))

# 4) Compute row hash (generic, no hardcoding)
#    Use PK if provided, else use ALL business columns.
#    Run-dependent / derived columns are excluded from the fallback: hashing
#    the ingest timestamp, batch id, source path or wall-clock year would give
#    the same source row a different hash on every run, so de-duplication and
#    MERGE on __row_hash could never match.
_NON_HASHED_COLS = {"_year", "__ingest_ts", "__source_path", "__batch_id", "__row_hash"}
# Placeholder for NULL cells: concat_ws silently skips NULLs, which would make
# ("a", NULL, "b") and ("a", "b", NULL) hash identically.
_NULL_TOKEN = "__NULL__"

hash_cols = [c for c in (pk_columns or []) if c in df.columns]
if not hash_cols:
    hash_cols = [c for c in df.columns if c not in _NON_HASHED_COLS]

df = df.withColumn(
    "__row_hash",
    sha2(
        concat_ws(
            "||",
            *[coalesce(col(c).cast("string"), lit(_NULL_TOKEN)) for c in hash_cols]
        ),
        256,
    ),
)

# ---------------------------
# REMOVE DUPLICATES (generic)
# ---------------------------
# De-duplicate on the PK columns that actually exist in the frame; when no PK
# was supplied, or none of the supplied names is present, de-duplicate on the
# computed row hash instead.
dedupe_keys = [name for name in (pk_columns or []) if name in df.columns] or ["__row_hash"]
df = df.dropDuplicates(dedupe_keys)

# Preview of the cleaned, de-duplicated frame.
display(df.limit(5))

# ---------------------------
# WRITE / MERGE INTO DELTA (generic)
# ---------------------------
# Three outcomes:
#   merge_flag off            -> overwrite the target, partitioned by _year
#   merge_flag on, no table   -> create the target with the same overwrite
#   merge_flag on, table      -> MERGE (update matched rows, insert the rest)
from delta.tables import DeltaTable

# Ask Delta directly whether Target_path is a Delta table. Probing with a read
# wrapped in a catch-all would treat ANY failure (permissions, network, ...) as
# "table missing" and then overwrite existing data.
delta_exists = DeltaTable.isDeltaTable(spark, Target_path)

if not merge_flag:
    print("MERGE disabled → simple overwrite")
    df.write.format("delta").mode("overwrite").partitionBy("_year").save(Target_path)

else:
    if not delta_exists:
        print("Delta not found → creating initial table")
        df.write.format("delta").mode("overwrite").partitionBy("_year").save(Target_path)
    else:
        print("Delta found → MERGE")
        dt = DeltaTable.forPath(spark, Target_path)

        # MERGE key: pk_columns present in the frame → fallback __row_hash.
        # The fallback also applies when PK names were supplied but none of
        # them exists in the data, which would otherwise yield an empty
        # (invalid) join condition.
        merge_keys = [p for p in (pk_columns or []) if p in df.columns] or ["__row_hash"]

        # Backtick-quote identifiers so names with special characters stay
        # valid SQL; embedded backticks are escaped by doubling.
        quoted_keys = ["`" + c.replace("`", "``") + "`" for c in merge_keys]
        join_cond = " AND ".join(f"target.{q} = source.{q}" for q in quoted_keys)

        dt.alias("target").merge(
            df.alias("source"),
            join_cond
        ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

# ---------------------------
# SHOW TARGET
# ---------------------------
# Best-effort sanity check after the write: print the target row count and
# preview a few rows. A failure here is reported but does not fail the run.
try:
    tgt = spark.read.format("delta").load(Target_path)
    print("Target count:", tgt.count())
    display(tgt.limit(5))
except Exception:
    # Catch Exception only, so KeyboardInterrupt / SystemExit still propagate,
    # and show the reason instead of hiding it.
    traceback.print_exc()
    print("Target read failed.")
