In [0]:
abfss://project@scrgvkrmade.dfs.core.windows.net/silver/stg_product_subcategory/
/mnt/adls/project/gold/dim_employee/
abfss://project@scrgvkrmade.dfs.core.windows.net/gold/dim_employee/

["EmployeeKey","EmployeeAlternateKey","EmployeeNationalIDAlternateKey","JobTitle","HireDate","BirthDate","MaritalStatus","Gender","SalariedFlag","VacationHours","SickLeaveHours","CurrentFlag","EffectiveDate","EndDate","IsCurrent","__source_path","LoadTS","RowHash"]

(["__ingest_ts","_ingest_ts","__source_path","_source_path","_batch_id","__row_hash"])

In [0]:
# =========================
# Silver_To_Gold - simple upsert (beginner)
# =========================

import json, datetime, traceback
from pyspark.sql.functions import col, trim, current_timestamp, lit, concat_ws, sha2, to_timestamp
from delta.tables import DeltaTable

# Widgets: name -> default text. Registered in the order listed here.
_WIDGET_DEFAULTS = {
    "Source_path": "",            # silver delta path
    "Target_path": "",            # gold delta path
    "pk_columns": "",             # JSON array or CSV e.g. ["CurrencyAlternateKey"]
    "column_list": "",            # optional business columns list
    "table_name": "",             # friendly name
    "merge_flag": "true",         # true -> MERGE upsert, false -> overwrite
    "incremental_flag": "false",
    "watermark": "",              # e.g. "2025-12-02T07:40:46"
    "direct_account_key": "",
}
for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
    dbutils.widgets.text(_widget_name, _widget_default)

# Spellings (compared in lower case) that switch a flag widget on.
_TRUTHY = ("true", "1", "yes", "y")


def _widget_text(name):
    """Return the current text of widget `name` with surrounding whitespace removed."""
    return dbutils.widgets.get(name).strip()


def _widget_flag(name):
    """Return True when widget `name` holds one of the accepted truthy spellings."""
    return _widget_text(name).lower() in _TRUTHY


# Read widgets
Source_path = _widget_text("Source_path")
Target_path = _widget_text("Target_path")
pk_raw = _widget_text("pk_columns")
col_list_raw = _widget_text("column_list")
merge_flag = _widget_flag("merge_flag")
incremental_flag = _widget_flag("incremental_flag")
watermark = _widget_text("watermark")
direct_key = _widget_text("direct_account_key")

# Basic validation: both paths are mandatory.
if not (Source_path and Target_path):
    raise RuntimeError("Provide Source_path and Target_path widgets.")

# Optional storage key
if not direct_key:
    print("No storage key passed; rely on cluster permission.")
else:
    # Remove whitespace and any quotes wrapped around the key text.
    k = direct_key.strip().strip('"').strip("'")
    # Account name = text between "@" and the next "." in the first path
    # that contains an "@"; otherwise the hard-coded default account is used.
    acct = next(
        (
            path.split("@", 1)[1].split(".")[0]
            for path in (Source_path, Target_path)
            if path and "@" in path
        ),
        None,
    ) or "scrgvkrmade"
    spark.conf.set(f"fs.azure.account.key.{acct}.dfs.core.windows.net", k)
    print("Configured storage key for account:", acct)

# parse lists
def parse_list(txt):
    """Parse a widget value into a list of non-empty, trimmed strings.

    Accepted spellings:
      * JSON array            -> '["A", "B"]'
      * JSON string with CSV  -> '"A, B"'
      * bare JSON number      -> '123' (kept as the text that was typed)
      * plain CSV             -> 'A, B'

    Args:
        txt: Raw widget text; may be empty or None.

    Returns:
        list[str]: Entries in input order, whitespace-trimmed, blanks removed.
        Empty/None input and JSON values that carry no names (null, booleans,
        objects) give an empty list.
    """
    if not txt:
        return []
    try:
        parsed = json.loads(txt)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError: not JSON, so treat as CSV.
        parsed = txt
    if isinstance(parsed, list):
        items = parsed
    elif isinstance(parsed, str):
        items = parsed.split(",")
    elif isinstance(parsed, (int, float)) and not isinstance(parsed, bool):
        # A single all-digit name parses as a JSON number; keep it as written.
        items = [txt]
    else:
        return []
    cleaned = [str(item).strip() for item in items]
    return [item for item in cleaned if item]

pk_columns, business_columns = parse_list(pk_raw), parse_list(col_list_raw)
for _label, _names in (("PK columns:", pk_columns), ("Business columns:", business_columns)):
    print(_label, _names)

# Read source: try Delta first; if that raises, read the same path as Parquet.
_loaded_msg = "Loaded Silver Delta:"
try:
    src = spark.read.format("delta").load(Source_path)
except Exception:
    src = spark.read.parquet(Source_path)
    _loaded_msg = "Loaded Silver Parquet:"
print(_loaded_msg, Source_path)

print("Source columns:", src.columns)
_source_preview = src.limit(5)
display(_source_preview)

# Incremental filter if requested: keep only rows newer than the watermark.
if incremental_flag:
    if not watermark:
        raise RuntimeError("incremental_flag=true requires watermark.")
    # First candidate present in the source wins. "_ingest_ts" is tried last
    # because the audit-column exclude list in this notebook uses that spelling.
    ts_col = next(
        (cand for cand in ("ModifiedDate", "__ingest_ts", "_ingestion_ts", "_ingest_ts") if cand in src.columns),
        None,
    )
    if not ts_col:
        raise RuntimeError("No timestamp column found for incremental filtering.")
    try:
        src = src.withColumn(ts_col, to_timestamp(col(ts_col)))
    except Exception as cast_err:
        # Still best effort (the filter runs on the raw column), but the
        # failure is reported instead of being dropped silently.
        print(f"Warning: could not cast {ts_col} to timestamp; filtering on the raw column. Reason: {cast_err}")
    # The watermark is compared as a string literal against the column.
    src = src.filter(col(ts_col) > lit(watermark))
    print("Rows after watermark filter:", src.count())
    display(src.limit(5))
else:
    print("Full load (no incremental filter).")

# Compute business hash for change detection.
# Hash the caller-supplied business columns when a column_list was given;
# otherwise hash every source column except the audit/lineage columns.
if business_columns:
    hash_cols = [c for c in business_columns if c in src.columns]
else:
    exclude = set(["__ingest_ts","_ingest_ts","__source_path","_source_path","_batch_id","__row_hash"])
    hash_cols = [c for c in src.columns if c not in exclude]

if not hash_cols:
    raise RuntimeError("No columns available for business hash.")
# NOTE(review): concat_ws skips NULL inputs, so two rows that differ only in
# which column is NULL can produce the same hash. Left unchanged here because
# altering the formula would stop matching hashes already stored in Gold.
src = src.withColumn("__business_hash", sha2(concat_ws("||", *[col(c).cast("string") for c in hash_cols]), 256))
print("Computed __business_hash on:", hash_cols)
display(src.limit(5))

# Choose match key(s): the requested PK columns that exist in the source,
# or the business hash when none of them do.
valid_pk = [p for p in pk_columns if p in src.columns]
if not valid_pk:
    print("No valid PK found in source; falling back to __business_hash for matching.")
    valid_pk = ["__business_hash"]

# Write initial or merge/upsert.
# Ask Delta directly whether a table exists at the path. Probing with a read
# inside a catch-all treated every failure as "missing" and then went on to
# overwrite the target.
target_exists = DeltaTable.isDeltaTable(spark, Target_path)

# Every write path stamps LoadTS. The merge source must carry it too:
# updateAll/insertAll expect the source to have the target's columns, and
# without automatic schema evolution a missing column is an analysis error.
out = src.withColumn("LoadTS", current_timestamp())

if not target_exists:
    print("Target not found. Writing initial Gold table.")
    out.write.format("delta").mode("overwrite").save(Target_path)
    print("Initial Gold written.")
elif not merge_flag:
    print("merge_flag=false -> overwrite target.")
    out.write.format("delta").mode("overwrite").save(Target_path)
    print("Overwrite done.")
else:
    print("merge_flag=true -> performing upsert MERGE using PKs:", valid_pk)
    dt = DeltaTable.forPath(spark, Target_path)
    # Equality on every key column; names are backtick-quoted for SQL.
    join_cond = " AND ".join([f"target.`{c}` = source.`{c}`" for c in valid_pk])
    (
        dt.alias("target")
        .merge(out.alias("source"), join_cond)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
    print("Merge completed.")

# Validate: read the target back, print its row count and show a sample.
try:
    tgt = spark.read.format("delta").load(Target_path)
    print("Gold rows approx:", tgt.count())
    display(tgt.limit(5))
except Exception as e:
    # Validation stays non-fatal, but the cause is reported so a failed
    # read-back can be diagnosed (same as the dim loader's validation).
    print("Could not read gold target for validation:", e)

print("Silver_To_Gold finished.")


In [0]:
# ====================================================
# SIMPLE DIM LOADER (SCD2 friendly) - beginner style
# - Reads one or more Silver sources (semicolon-separated)
# - Builds business columns, surrogate key, SCD2 fields
# - MERGES into Gold target using pk_columns (or __business_hash fallback)
# ====================================================

import json, datetime, uuid, traceback
from pyspark.sql.functions import (
    col, trim, current_timestamp, lit, concat_ws, sha2, to_timestamp, year, row_number, monotonically_increasing_id
)
from pyspark.sql import Window
from delta.tables import DeltaTable

# --------- WIDGETS (set these in notebook or from ADF) ----------
# (name, default) pairs, registered in this order.
_DIM_WIDGETS = (
    ("Source_path", ""),         # semicolon-separated Silver path(s)
    ("Target_path", ""),         # Gold path where dim will be written
    ("pk_columns", ""),          # JSON array or CSV, e.g. '["CurrencyAlternateKey"]'
    ("column_list", ""),         # JSON array or CSV of business columns to keep in dim
    ("table_name", ""),          # friendly name, e.g., dbo.DimCurrency
    ("merge_flag", "true"),      # true -> MERGE (SCD2 upsert)
    ("direct_account_key", ""),  # optional
)
for _dim_widget, _dim_default in _DIM_WIDGETS:
    dbutils.widgets.text(_dim_widget, _dim_default)

# --------- read widgets ----------
# Fetch every widget once, whitespace-trimmed, then bind the script variables.
_dim_values = {name: dbutils.widgets.get(name).strip() for name, _ in _DIM_WIDGETS}

Source_path_raw = _dim_values["Source_path"]
Target_path = _dim_values["Target_path"]
pk_raw = _dim_values["pk_columns"]
col_list_raw = _dim_values["column_list"]
table_name = _dim_values["table_name"]
merge_flag = _dim_values["merge_flag"].lower() in ("true","1","yes","y")
direct_key = _dim_values["direct_account_key"]

# Echo the effective parameters for the run log.
print("Dim loader params:")
for _param_label, _param_value in (
    (" Source:", Source_path_raw),
    (" Target:", Target_path),
    (" table_name:", table_name),
    (" merge_flag:", merge_flag),
):
    print(_param_label, _param_value)

# --------- optional: configure storage key if passed ----------
if direct_key:
    # Remove whitespace and any quotes wrapped around the key text.
    k = direct_key.strip().strip('"').strip("'")
    acct = None
    for p in (Source_path_raw, Target_path):
        if p and "@" in p:
            # "@" is known to be present, so the split cannot fail; the former
            # bare `except: pass` around it only risked hiding unrelated errors.
            # Source_path_raw may hold several ';'-separated paths: the account
            # is taken from the first "@" found in the whole string.
            acct = p.split("@", 1)[1].split(".")[0]
            break
    if not acct:
        # No "@" in either path: use the hard-coded default account.
        acct = "scrgvkrmade"
    spark.conf.set(f"fs.azure.account.key.{acct}.dfs.core.windows.net", k)
    print("Configured storage key for", acct)

# --------- helpers to parse lists ----------
def parse_list(txt):
    """Parse a widget value into a list of non-empty, trimmed strings.

    Accepted spellings:
      * JSON array            -> '["A", "B"]'
      * JSON string with CSV  -> '"A, B"'
      * bare JSON number      -> '123' (kept as the text that was typed)
      * plain CSV             -> 'A, B'

    Args:
        txt: Raw widget text; may be empty or None.

    Returns:
        list[str]: Entries in input order, whitespace-trimmed, blanks removed.
        Empty/None input and JSON values that carry no names (null, booleans,
        objects) give an empty list.
    """
    if not txt:
        return []
    try:
        parsed = json.loads(txt)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError: not JSON, so treat as CSV.
        parsed = txt
    if isinstance(parsed, list):
        items = parsed
    elif isinstance(parsed, str):
        items = parsed.split(",")
    elif isinstance(parsed, (int, float)) and not isinstance(parsed, bool):
        # A single all-digit name parses as a JSON number; keep it as written.
        items = [txt]
    else:
        return []
    cleaned = [str(item).strip() for item in items]
    return [item for item in cleaned if item]

from functools import reduce

pk_columns = parse_list(pk_raw)
business_columns = parse_list(col_list_raw)

# --------- read and union source paths ----------
# Split on ';', trim each piece, and discard the empty ones.
src_paths = list(filter(None, (piece.strip() for piece in Source_path_raw.split(";"))))
if len(src_paths) == 0:
    raise RuntimeError("Provide Source_path (one or more paths separated by ';').")


def _load_silver(path):
    """Read `path` as Delta; if that raises, read the same path as Parquet."""
    try:
        return spark.read.format("delta").load(path)
    except Exception:
        return spark.read.parquet(path)


def _union_by_name(left, right):
    """Union two frames by column name, allowing columns missing on one side."""
    return left.unionByName(right, allowMissingColumns=True)


df_list = [_load_silver(path) for path in src_paths]
df = reduce(_union_by_name, df_list)

print("Source columns:", df.columns)
_union_preview = df.limit(5)
display(_union_preview)

# --------- cleaning: trim strings ----------
string_cols = [c for c,t in df.dtypes if t == "string"]
for c in string_cols:
    df = df.withColumn(c, trim(col(c)))

# --------- select business columns (only when a column_list was provided) ----------
# With no column_list every source column is kept. Previously the PK and audit
# columns were appended to an empty keep-list, which narrowed the frame to just
# those and dropped all business attributes before hashing and loading.
keep_cols = []
if business_columns:
    # dict.fromkeys de-duplicates while preserving order, so a name repeated
    # in column_list is selected once.
    keep_cols = list(dict.fromkeys(c for c in business_columns if c in df.columns))
    # ensure PKs present
    for k in pk_columns:
        if k in df.columns and k not in keep_cols:
            keep_cols.append(k)
    # always keep the audit columns that exist in the source
    for a in ["__ingest_ts","__source_file","__source_path","__batch_id","__row_hash"]:
        if a in df.columns and a not in keep_cols:
            keep_cols.append(a)

if keep_cols:
    df = df.select(*keep_cols)

print("Columns used for dim:", df.columns)
display(df.limit(5))

# --------- compute business hash for change detection ----------
# Hash inputs: the business columns present in the frame when a list was
# provided, otherwise every column that is not one of the audit columns below.
_AUDIT_COLS = frozenset(("__ingest_ts", "__source_path", "__batch_id", "__row_hash"))
hash_cols = (
    [name for name in business_columns if name in df.columns]
    if business_columns
    else [name for name in df.columns if name not in _AUDIT_COLS]
)

if len(hash_cols) == 0:
    raise RuntimeError("No columns available for business hash.")

# Cast each input to string, join with "||", then take the SHA-256 digest.
_hash_inputs = [col(name).cast("string") for name in hash_cols]
_joined_inputs = concat_ws("||", *_hash_inputs)
df = df.withColumn("__business_hash", sha2(_joined_inputs, 256))
print("Computed business hash on:", hash_cols)
_hash_preview = df.limit(5)
display(_hash_preview)

# --------- prepare merge key(s) ----------
# Requested PK columns that exist in the frame; the business hash otherwise.
valid_pk = list(filter(lambda name: name in df.columns, pk_columns))
if len(valid_pk) == 0:
    print("No valid PK found, falling back to __business_hash for key")
    valid_pk = ["__business_hash"]
print("Merge keys:", valid_pk)

# --------- create surrogate_key for incoming rows (simple approach) ----------
# Number the incoming rows 1..N in a temporary "_rn" column; the write step
# below turns it into surrogate_key (initial create or insert).
win = Window.orderBy(col("_miid"))
df = (
    df.withColumn("_miid", monotonically_increasing_id())
      .withColumn("_rn", row_number().over(win))
      .drop("_miid")
)

# --------- SCD2 row builder ----------
def _as_current_rows(frame, key_offset=0):
    """Turn staged rows into current SCD2 rows.

    Args:
        frame: DataFrame carrying the temporary `_rn` row number (1..N).
        key_offset: Value added to `_rn` to form `surrogate_key`
            (0 for a fresh table, the current max key when appending).

    Returns:
        The frame without `_rn`, with `surrogate_key` (long),
        `effective_from` (current timestamp), `effective_to` (NULL timestamp)
        and `current_flag` (True) added.
    """
    return (
        frame.withColumn("surrogate_key", (col("_rn") + lit(key_offset)).cast("long"))
             .drop("_rn")
             .withColumn("effective_from", current_timestamp())
             .withColumn("effective_to", lit(None).cast("timestamp"))
             .withColumn("current_flag", lit(True))
    )


# --------- does target exist? ----------
# Ask Delta directly. Probing with a read inside a catch-all treated every
# failure as "missing" and then overwrote the target with an initial load.
target_exists = DeltaTable.isDeltaTable(spark, Target_path)

if not target_exists:
    print("Target not found -> creating initial Gold dimension.")
    # assign surrogate keys starting at 1
    df = _as_current_rows(df)
    df.write.format("delta").mode("overwrite").save(Target_path)
    print("Initial Gold dimension created at:", Target_path)
elif not merge_flag:
    print("merge_flag=false -> overwrite target")
    # Rebuild the dimension with the same SCD2 columns the initial create
    # writes. The previous LoadTS-only output did not match that schema and
    # left the table without the current_flag column the merge path relies on.
    out = _as_current_rows(df)
    out.write.format("delta").mode("overwrite").save(Target_path)
    print("Overwrite completed.")
else:
    # SCD2-ish simple approach: expire current rows where business hash changed, then insert new rows
    print("merge_flag=true -> performing SCD2 steps (expire + insert).")
    dt = DeltaTable.forPath(spark, Target_path)

    # expire: set current_flag=false where pk matches and business hash differs
    # NOTE(review): if the source holds several rows for one key, this merge can
    # match a target row more than once - confirm the sources are de-duplicated.
    match_on_pk = " AND ".join([f"target.`{p}` = source.`{p}`" for p in valid_pk])
    expire_cond = match_on_pk + " AND target.current_flag = true AND target.__business_hash <> source.__business_hash"
    (
        dt.alias("target")
        .merge(df.alias("source"), expire_cond)
        .whenMatchedUpdate(set={"current_flag": lit(False), "effective_to": "current_timestamp()"})
        .execute()
    )
    print("Expired previous records where business changed.")

    # Highest surrogate key already stored. Only a target without the column
    # starts from 0; a failed read now raises instead of silently restarting
    # the keys at 1 and colliding with keys that already exist.
    target_df = spark.read.format("delta").load(Target_path)
    if "surrogate_key" in target_df.columns:
        max_sk_row = target_df.selectExpr("max(surrogate_key) as m").collect()[0]
        max_sk = int(max_sk_row["m"]) if max_sk_row["m"] is not None else 0
    else:
        max_sk = 0

    # assign new surrogate keys to incoming rows (reuses the existing _rn numbering)
    new_rows = _as_current_rows(df, max_sk)

    # insert new rows (only those with no matching *current* row in the target)
    tmp_view = "_tmp_inserts_" + uuid.uuid4().hex[:8]
    new_rows.createOrReplaceTempView(tmp_view)
    insert_cols = ", ".join(f"`{c}`" for c in new_rows.columns)
    insert_vals = ", ".join(f"source.`{c}`" for c in new_rows.columns)
    merge_sql = f"""
        MERGE INTO delta.`{Target_path}` AS target
        USING (SELECT * FROM {tmp_view}) AS source
        ON {match_on_pk} AND target.current_flag = true
        WHEN NOT MATCHED THEN
          INSERT ({insert_cols})
          VALUES ({insert_vals})
    """
    try:
        spark.sql(merge_sql)
    finally:
        # The uniquely named view is only needed for this statement.
        spark.catalog.dropTempView(tmp_view)
    print("Inserted new SCD rows.")
# final validation: read the target back, report its row count, show a sample.
# Any failure here is reported and does not stop the run.
try:
    tgt = spark.read.format("delta").load(Target_path)
    _gold_row_count = tgt.count()
    print("Gold rows approx:", _gold_row_count)
    _gold_preview = tgt.limit(5)
    display(_gold_preview)
except Exception as read_err:
    print("Could not read target for validation:", read_err)

print("Dim loader finished for", table_name)
