In [0]:
# =====================================================
# SIMPLE SILVER LAYER - FIXED (no merge, beginner friendly)
# =====================================================

import json, datetime, traceback
from pyspark.sql.functions import (
    col, trim, current_timestamp, lit,
    concat_ws, sha2, to_timestamp, year
)

# ---------------------------
# Widgets (ADF will pass these)
# ---------------------------
# Declare the notebook's text widgets, each with an empty default value.
#   Source_path        - path the parquet input is read from
#   Target_path        - path the Delta output is written to
#   pk_columns         - example: ["CurrencyAlternateKey"] OR CurrencyAlternateKey
#   table_name         - table name (only echoed to the log by this notebook)
#   direct_account_key - optional: pass storage key if needed
for _widget_name in (
    "Source_path",
    "Target_path",
    "pk_columns",
    "table_name",
    "direct_account_key",
):
    dbutils.widgets.text(_widget_name, "")

# ---------------------------
# Read widgets
# ---------------------------
def _widget_value(widget_name):
    """Return the named widget's current value with surrounding whitespace removed."""
    return dbutils.widgets.get(widget_name).strip()


Source_path = _widget_value("Source_path")
Target_path = _widget_value("Target_path")
pk_raw = _widget_value("pk_columns")
table_name = _widget_value("table_name")
direct_key = _widget_value("direct_account_key")

# Echo the inputs to the run log. The storage key is not printed.
for _label, _shown_value in (
    ("Source_path :", Source_path),
    ("Target_path :", Target_path),
    ("PK Columns  :", pk_raw),
    ("Table Name  :", table_name),
):
    print(_label, _shown_value)

# ---------------------------
# Quick validation
# ---------------------------
# Both paths are mandatory. Source_path is checked first, so its error wins
# when both are empty.
for _required_value, _missing_message in (
    (Source_path, "Source_path widget is required (e.g. abfss://.../bronze/.../)."),
    (Target_path, "Target_path widget is required (e.g. abfss://.../silver/.../)."),
):
    if not _required_value:
        raise RuntimeError(_missing_message)

# ---------------------------
# OPTIONAL: set storage key if provided (for ABFSS)
# ---------------------------
if direct_key:
    # Remove whitespace and any surrounding double/single quote characters.
    k = direct_key.strip().strip('"').strip("'")

    # Infer the storage account from the first path that contains "@": the
    # account name is the text between "@" and the following ".".
    # The split cannot fail once "@" is known to be present, so no exception
    # handling is needed here.
    acct = next(
        (
            p.split("@", 1)[1].split(".")[0]
            for p in (Source_path, Target_path)
            if "@" in p
        ),
        None,
    )
    if not acct:
        # NOTE(review): hard-coded fallback account - change it if you use a
        # different storage account.
        acct = "scrgvkrmade"

    print("Configuring storage key for account:", acct)
    # NOTE(review): the account key arrives as a plain widget value; consider
    # sourcing it from a secret store instead.
    spark.conf.set(f"fs.azure.account.key.{acct}.dfs.core.windows.net", k)
else:
    print("No direct_account_key passed. Ensure cluster has storage permissions (MSI / mount / secret).")

# ---------------------------
# parse pk_columns inline (JSON array or CSV)
# ---------------------------
def _parse_pk_columns(raw):
    """Turn the pk_columns widget text into a list of column names.

    Accepted forms:
      * a JSON array, e.g. ["ColA", "ColB"] - each item is stringified and
        trimmed, blank items are dropped;
      * a JSON string, e.g. "ColA,ColB" - the decoded text is split on commas
        (so the surrounding quotes do not end up inside the column name);
      * anything else - the raw text is split on commas.

    Returns an empty list when *raw* is empty.
    """
    if not raw:
        return []

    try:
        parsed = json.loads(raw)
    except ValueError:
        # json.JSONDecodeError is a ValueError: not JSON, treat as plain CSV.
        parsed = raw

    if isinstance(parsed, list):
        return [str(item).strip() for item in parsed if str(item).strip()]

    # Non-list JSON values other than strings (numbers, objects, ...) fall
    # back to splitting the raw widget text.
    csv_text = parsed if isinstance(parsed, str) else str(raw)
    return [part.strip() for part in csv_text.split(",") if part.strip()]


pk_cols = _parse_pk_columns(pk_raw)

print("Parsed PK columns:", pk_cols)

# ---------------------------
# READ PARQUET FROM BRONZE (recursive)
# ---------------------------
print("Reading parquet from Source_path (recursive):", Source_path)
try:
    # mergeSchema: combine the schemas of the parquet files that are read.
    # recursiveFileLookup: also pick up files in nested sub-folders.
    # NOTE(review): per the Spark docs recursiveFileLookup disables partition
    # inference, so folder names like key=value do not become columns - confirm
    # this is intended.
    df = (
        spark.read
        .option("mergeSchema", "true")
        .option("recursiveFileLookup", "true")
        .parquet(Source_path)
    )
    # count() forces an actual read, so problems with the data surface here.
    print("Read OK. Rows:", df.count(), "Columns:", df.columns)
    display(df.limit(5))
except Exception as err:
    print("Failed to read parquet from Source_path. Full error:")
    traceback.print_exc()
    # Chain the original error so it stays attached as the direct cause.
    raise RuntimeError("Parquet read failed. Check Source_path and storage permissions.") from err

# ---------------------------
# BASIC CLEANING
# - trim strings
# - add audit fields if missing
# - create __row_hash
# - add _year (from ModifiedDate if present else fallback)
# ---------------------------
# trim all string columns
# Trim leading/trailing whitespace in every string-typed column.
string_cols = [c for c, t in df.dtypes if t == "string"]
for c in string_cols:
    df = df.withColumn(c, trim(col(c)))

# Audit columns: each one is added only when the input does not already
# carry a column with that name.
if "__ingest_ts" not in df.columns:
    df = df.withColumn("__ingest_ts", current_timestamp())

if "__source_path" not in df.columns:
    df = df.withColumn("__source_path", lit(Source_path))

if "__batch_id" not in df.columns:
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # formatted value is still the current UTC time.
    _batch_stamp = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d%H%M%S')
    df = df.withColumn("__batch_id", lit("Batch-" + _batch_stamp))

# __row_hash: SHA-256 over the PK columns that exist in the DataFrame, or over
# all columns when no usable PK column was supplied.
from pyspark.sql.functions import coalesce, concat_ws, sha2

cols_for_hash = [c for c in pk_cols if c in df.columns] or df.columns
# NOTE(review): the all-columns fallback includes the audit columns above, so
# the hash then varies with their values - confirm that is acceptable.

# protect against empty columns list
if not cols_for_hash:
    raise RuntimeError("No columns available to compute __row_hash (empty schema?).")

# concat_ws silently skips NULL inputs, which would make rows that differ only
# in WHICH column is NULL hash identically (e.g. ("a", NULL) vs (NULL, "a")).
# Substituting a marker keeps every column in its own position.
_NULL_MARKER = "<NULL>"
_hash_inputs = [
    coalesce(col(c).cast("string"), lit(_NULL_MARKER))
    for c in cols_for_hash
]
df = df.withColumn("__row_hash", sha2(concat_ws("||", *_hash_inputs), 256))

print("After basic cleaning. Columns now:", df.columns)
display(df.limit(5))

# ---------------------------
# DEDUPE
# - use provided PK columns if they exist in DF
# - else dedupe by __row_hash
# ---------------------------
# Choose the dedupe key: the supplied PK columns that actually exist in the
# DataFrame, otherwise the __row_hash column.
_present_columns = set(df.columns)
valid_pk = [c for c in pk_cols if c in _present_columns]
if valid_pk:
    print("Dropping duplicates using PK columns:", valid_pk)
    _dedupe_keys, _removed_suffix = valid_pk, "."
else:
    # The arrow in this message was mis-encoded in the file; plain ASCII is
    # used so it cannot be garbled again.
    print("No valid PK found -> deduping by __row_hash")
    _dedupe_keys, _removed_suffix = ["__row_hash"], " by __row_hash."

# Row counts before and after are taken only to report how many rows were removed.
before = df.count()
# dropDuplicates keeps one row per key; it does not specify which one.
df = df.dropDuplicates(_dedupe_keys)
after = df.count()
print(f"Removed {before - after} duplicates{_removed_suffix}")

display(df.limit(5))

# ---------------------------
# ADD _year
# ---------------------------
# Year used when ModifiedDate is absent or cannot be converted: the current
# UTC year (timezone-aware replacement for the deprecated datetime.utcnow()).
_fallback_year = datetime.datetime.now(datetime.timezone.utc).year

if "ModifiedDate" in df.columns:
    try:
        # Build the converted frame separately so a failure leaves df untouched.
        _with_year = df.withColumn("ModifiedDate", to_timestamp(col("ModifiedDate")))
        _with_year = _with_year.withColumn("_year", year(col("ModifiedDate")))
    except Exception:
        # Best effort: report the problem instead of hiding it, then fall back.
        print("Could not derive _year from ModifiedDate; using current UTC year. Error:")
        traceback.print_exc()
        df = df.withColumn("_year", lit(_fallback_year))
    else:
        df = _with_year
        # NOTE(review): rows whose ModifiedDate is NULL after conversion get a
        # NULL _year - confirm that is acceptable for the partition column.
else:
    df = df.withColumn("_year", lit(_fallback_year))

# ---------------------------
# WRITE TO SILVER (overwrite)
# ---------------------------
print("Writing Delta to Target_path (overwrite):", Target_path)
try:
    # Overwrite replaces whatever is already stored at Target_path, and
    # overwriteSchema lets the stored schema be replaced as well.
    # The output is partitioned by the _year column.
    (
        df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .partitionBy("_year")
        .save(Target_path)
    )
    print("Write completed.")
except Exception as err:
    print("Failed to write Delta to Target_path. Error:")
    traceback.print_exc()
    # Chain the original error so it stays attached as the direct cause.
    raise RuntimeError("Delta write failed. Check Target_path and storage permissions.") from err

# ---------------------------
# VALIDATE: read back small sample from target
# ---------------------------
print("Validating output by reading back Target_path:", Target_path)
try:
    tgt = spark.read.format("delta").load(Target_path)
    print("Rows in Silver:", tgt.count())
    display(tgt.limit(10))
except Exception:
    # Best-effort check: a failure here does not fail the run, but the reason
    # is reported rather than discarded.
    traceback.print_exc()
    print("Could not read back Delta target. Listing files in target path for debugging:")
    try:
        for f in dbutils.fs.ls(Target_path):
            print("-", f.path)
    except Exception:
        traceback.print_exc()
        print("Also failed to list target path. Check path correctness and permissions.")

print("Done.")
