In [0]:

# ---------- WIDGETS / DEFAULTS (ADF overrides these) ----------
# Create these widgets in the notebook UI once (or ADF passes them).
# Every widget is a text widget; the second argument is only the default that is
# used when the caller does not supply a value.
dbutils.widgets.text("Source_path", "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Sales.Currency/")
dbutils.widgets.text("Target_path", "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/")
dbutils.widgets.text("column_list", '["CurrencyCode","CurrencyName","ModifiedDate","__ingest_ts","__source_file","__source_path","__batch_id","__row_hash"]')
# SECURITY: a storage account key must never be committed as a widget default.
# The default is intentionally empty; pass the key at run time or, preferably,
# resolve it from a secret scope (dbutils.secrets.get) where it is needed.
# NOTE(review): the key that used to be hard-coded on this line has been exposed in
# source control and should be treated as compromised and rotated.
dbutils.widgets.text("direct_account_key", "")   # optional
dbutils.widgets.text("domain", "ResellerSales")
dbutils.widgets.text("file_name", "Sales.Currency/*/*.parquet")
dbutils.widgets.text("table_name", "stg.currency")
dbutils.widgets.text("year_column", "ModifiedDate")          # e.g. ModifiedDate
dbutils.widgets.text("incremental_flag", "false")
dbutils.widgets.text("merge_flag", "true")
# NOTE(review): "CurrencyAlternateKey" is not among the default column_list entries above — confirm the intended key.
dbutils.widgets.text("pk_columns", "CurrencyAlternateKey")
dbutils.widgets.text("batch_name", "Batch_B")
dbutils.widgets.text("watermark", "")


In [0]:
# Linear end-to-end (NO select_columns variable)
# Read parquet -> clean/transform -> create/merge Delta
import traceback, datetime
from pyspark.sql.functions import col, trim, to_timestamp, year, lit, concat_ws, sha2, current_timestamp, coalesce, upper
from delta.tables import DeltaTable

# ---------------------------
# Inputs
# - Source: one explicitly pinned parquet part file
# - Target: Delta location taken from the "Target_path" widget
# NOTE(review): the source path is hard-coded here and does not come from a widget — confirm that is intended.
# ---------------------------
src_parquet_path = (
    "wasbs://project@scrgvkrmade.blob.core.windows.net"
    "/bronze/ResellerSales/Sales.Currency/2019"
    "/part-00000-tid-4070521650027828931-365a17b5-0732-4562-a52e-a83db34c5cfe-119-1.c000.snappy.parquet"
)

# Register the widget with its default (ADF can override), then read the effective value
dbutils.widgets.text("Target_path", "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/")
raw_target = dbutils.widgets.get("Target_path")
Target_path = raw_target.strip()  # drop accidental surrounding whitespace

# Echo both locations so the run log shows exactly what was used
for label, location in (
    ("Source parquet (explicit):", src_parquet_path),
    ("Target delta path:", Target_path),
):
    print(label, location)

# ---------------------------
# READ SOURCE parquet (single file)
# Loads src_parquet_path into `df`; any failure aborts the run with a RuntimeError.
# ---------------------------
print("Reading parquet file...")
try:
    df = spark.read.parquet(src_parquet_path)
    print("Read OK.")
except Exception as read_err:
    # Print the full traceback into the cell output, then re-raise with the original
    # error attached as the explicit cause so the chain reads "direct cause" rather
    # than "another exception occurred during handling".
    traceback.print_exc()
    raise RuntimeError("Failed to read the specified parquet file.") from read_err

# Show the columns found in the parquet file and a five-row preview
print("Source columns:", df.columns)
display(df.limit(5))

# ---------------------------
# CLEAN / TRANSFORM using ALL available columns
# - Trim string columns
# - Uppercase CurrencyCode (if present)
# - Cast ModifiedDate to timestamp and add _year (if present)
# - Fill simple nulls (e.g., CurrencyName)
# - Add audit columns if missing
# - Compute deterministic __row_hash
# ---------------------------

# One timezone-aware UTC timestamp for the whole section, so the fallback _year and
# the generated __batch_id come from the same instant (datetime.utcnow() is deprecated).
run_ts_utc = datetime.datetime.now(datetime.timezone.utc)

# 1) Trim all string columns in a single projection.
#    A select keeps the column order and avoids stacking one plan node per column,
#    which is what repeated withColumn calls in a loop do.
string_cols = [c for c, d in df.dtypes if d == "string"]
if string_cols:
    trim_targets = set(string_cols)
    df = df.select([trim(col(c)).alias(c) if c in trim_targets else col(c) for c in df.columns])

# 2) Uppercase CurrencyCode if exists
if "CurrencyCode" in df.columns:
    df = df.withColumn("CurrencyCode", upper(col("CurrencyCode")))

# 3) Cast ModifiedDate -> timestamp and add _year (fallback to current year if missing)
#    Spark builds these columns lazily, so the except branch only fires for errors
#    raised while the expression is resolved; values that merely fail to parse do
#    not raise at this point.
if "ModifiedDate" in df.columns:
    try:
        df = df.withColumn("ModifiedDate", to_timestamp(col("ModifiedDate")))
        df = df.withColumn("_year", year(col("ModifiedDate")))
        print("Converted ModifiedDate to timestamp and added _year.")
    except Exception:
        print("Could not cast ModifiedDate to timestamp; adding fallback _year.")
        if "_year" not in df.columns:
            df = df.withColumn("_year", lit(run_ts_utc.year))
elif "_year" not in df.columns:
    df = df.withColumn("_year", lit(run_ts_utc.year))
    print("Added fallback _year with current year.")

# 4) Fill some sensible nulls
if "CurrencyName" in df.columns:
    df = df.withColumn("CurrencyName", coalesce(col("CurrencyName"), lit("")))

# 5) Add audit columns if missing (existing values coming from the source are kept)
if "__ingest_ts" not in df.columns:
    df = df.withColumn("__ingest_ts", current_timestamp())
if "__source_file" not in df.columns:
    df = df.withColumn("__source_file", lit(src_parquet_path.split("/")[-1]))
if "__source_path" not in df.columns:
    df = df.withColumn("__source_path", lit(src_parquet_path))
if "__batch_id" not in df.columns:
    dbatch = "Batch-" + run_ts_utc.strftime("%Y%m%dT%H%M%S")
    df = df.withColumn("__batch_id", lit(dbatch))

# 6) Compute deterministic row hash (__row_hash)
#    - Key column(s) present: hash exactly those columns. This expression is left
#      unchanged so hashes stay comparable with rows already stored in the target.
#    - No key column: hash the business columns only. Audit/derived columns are
#      excluded because they change from run to run (ingest timestamp, batch id, ...)
#      and would make the hash differ for the same source row. NULLs are replaced by
#      a sentinel because concat_ws skips NULLs, which would let rows whose values
#      are shifted between columns produce the same hash.
run_scoped_cols = {"__ingest_ts", "__source_file", "__source_path", "__batch_id", "__row_hash", "_year"}
pk_candidates = [c for c in ["CurrencyAlternateKey","CurrencyCode"] if c in df.columns]
if pk_candidates:
    cols_for_hash = pk_candidates
    hash_inputs = [col(c).cast("string") for c in cols_for_hash]
else:
    # Fall back to every column only if nothing but audit columns exists
    cols_for_hash = [c for c in df.columns if c not in run_scoped_cols] or df.columns
    hash_inputs = [coalesce(col(c).cast("string"), lit("<NULL>")) for c in cols_for_hash]

concat_expr = concat_ws("||", *hash_inputs)
df = df.withColumn("__row_hash", sha2(concat_expr, 256))

print("After cleaning/transforms. Columns now:", df.columns)
display(df.limit(5))

# ---------------------------
# DROP DUPLICATES by PK if available, otherwise by __row_hash
# NOTE: dropDuplicates keeps one row per key, but Spark does not guarantee which
# one; add an explicit ordering/window if a specific row must win.
# NOTE(review): the key is hard-coded here rather than read from a widget — confirm.
# ---------------------------
pk_for_merge = ["CurrencyAlternateKey"]
pk_in_df = [p for p in pk_for_merge if p in df.columns]
if pk_in_df:
    # The before/after counts exist only for the log line below; a failing count
    # must not abort the run, but it should be visible instead of silently ignored.
    before = None
    try:
        before = df.count()
    except Exception as count_err:
        print("Could not count rows before deduplication; skipping removed-rows report:", count_err)
    df = df.dropDuplicates(pk_in_df)
    if before is not None:
        after = df.count()
        print(f"Dropped duplicates by PK {pk_in_df}: {before-after} rows removed.")
elif "__row_hash" in df.columns:
    # No CurrencyAlternateKey: dedupe by __row_hash
    df = df.dropDuplicates(["__row_hash"])
    print("Dropped duplicates by __row_hash.")
else:
    print("No PK or __row_hash available for deduplication; skipping dropDuplicates.")

# ---------------------------
# WRITE / MERGE into Delta
# - If target doesn't exist: create initial delta partitioned by _year
# - If target exists: MERGE using CurrencyAlternateKey if present else __row_hash
# ---------------------------
print("Preparing to write/merge to target Delta:", Target_path)

# Probe the target by trying to load it as Delta. Any failure is treated as
# "not created yet", but the reason is logged so that a credential or network
# problem is not mistaken for a missing table without anyone noticing.
delta_exists = True
try:
    spark.read.format("delta").load(Target_path)
except Exception as probe_err:
    delta_exists = False
    print(f"Target not readable as Delta ({type(probe_err).__name__}); treating it as not yet created: {probe_err}")

if not delta_exists:
    print("Target Delta not found -> creating initial Delta.")
    # "errorifexists" instead of "overwrite": if the probe above failed for a reason
    # other than a missing table, this write fails instead of replacing the data of
    # a Delta table that is actually there.
    (df.write
        .format("delta")
        .mode("errorifexists")
        .partitionBy("_year")
        .save(Target_path))
    print("Initial Delta created at:", Target_path)
else:
    # choose merge PK: CurrencyAlternateKey if present, else __row_hash
    if "CurrencyAlternateKey" in df.columns:
        merge_pks = ["CurrencyAlternateKey"]
    elif "__row_hash" in df.columns:
        merge_pks = ["__row_hash"]
    else:
        raise RuntimeError("No column available for MERGE PK (neither CurrencyAlternateKey nor __row_hash).")

    # Build the ON clause: equality on every key column, names back-quoted
    join_cond = " AND ".join([f"target.`{c}` = source.`{c}`" for c in merge_pks])
    print("Merging using keys:", merge_pks)
    dt = DeltaTable.forPath(spark, Target_path)
    # Upsert: matched rows are overwritten with all source columns, unmatched rows are inserted
    dt.alias("target").merge(
        df.alias("source"),
        join_cond
    ).whenMatchedUpdateAll() \
     .whenNotMatchedInsertAll() \
     .execute()
    print("MERGE completed into:", Target_path)

# ---------------------------
# FINAL: read the target back, report its row count and preview five rows.
# If the read fails, fall back to listing the files under the target path.
# ---------------------------
try:
    tgt = spark.read.format("delta").load(Target_path)
    target_rows = tgt.count()
    print("Target approx row count:", target_rows)
    preview = tgt.limit(5)
    display(preview)
except Exception as read_back_err:
    print("Could not read target after write/merge:", read_back_err)
    try:
        listing = dbutils.fs.ls(Target_path)
        for entry in listing:
            print("-", entry.path)
    except Exception as list_err:
        print("Also failed to list target path:", list_err)

print("End-to-end completed.")
