In [0]:
import datetime
import os
import sys
import traceback

from delta.tables import DeltaTable
from pyspark.sql.functions import (
    col, lit, sha2, concat_ws, current_timestamp, coalesce, trim,
    to_timestamp, year
)

# Bronze input location (parquet) and silver output location (Delta) for the currency dimension.
SOURCE_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Sales.Currency/"
TARGET_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/dim/dim_currency/"

# Composite primary key, supplied as one comma-separated string and split into
# a list of non-empty, whitespace-stripped column names.
PK_RAW = "CurrencyAlternateKey,CurrencyName"
PK_COLS = [name for name in map(str.strip, PK_RAW.split(",")) if name]
print(f"Composite Primary Key for MERGE: {PK_COLS}")

# Column renames applied when selecting from the source.
# Each entry is (source column name, target column name).
CURRENCY_MAPPING = [
    ("CurrencyCode", "CurrencyAlternateKey"),
    ("Name", "CurrencyName"),
    ("ModifiedDate", "ModifiedDate"),
]

# Configure Spark authentication for the ADLS Gen2 storage account.
#
# SECURITY: the storage account key is a secret and must never be embedded in
# the notebook. It is read from the AZURE_STORAGE_KEY environment variable,
# which should be populated by the runtime (cluster configuration or a secrets
# manager). Any key that was previously committed in this cell must be treated
# as compromised and rotated in Azure.
storage_account_name = "scrgvkrmade"
account_key = os.environ.get("AZURE_STORAGE_KEY")
if not account_key:
    # Fail fast with a clear message instead of letting the first read fail
    # later with an opaque authorization error.
    raise RuntimeError(
        "AZURE_STORAGE_KEY is not set; cannot authenticate against storage "
        f"account '{storage_account_name}'."
    )
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    account_key
)
print("Configured Spark authentication.")


In [0]:

# Read every parquet file under the bronze source path, including nested folders.
df_source = (
    spark.read
    .option("mergeSchema", "true")
    .option("recursiveFileLookup", "true")
    .parquet(SOURCE_PATH)
)
print(f"Read OK. Rows: {df_source.count()} Columns: {df_source.columns}")

# Fail fast when a mapped source column is absent. Every mapped column is
# referenced by the transformations below, so silently skipping one would only
# postpone the failure to a less obvious place.
missing_cols = [src for src, _ in CURRENCY_MAPPING if src not in df_source.columns]
if missing_cols:
    raise ValueError(
        f"Source at {SOURCE_PATH} is missing mapped column(s) {missing_cols}; "
        f"available columns: {df_source.columns}"
    )

# Project and rename the source columns to their target names.
df_source = df_source.select(*[col(src).alias(tgt) for src, tgt in CURRENCY_MAPPING])

# Clean the business columns: trim the key strings, replace NULL keys with
# fixed defaults, and parse ModifiedDate as a timestamp.
df_source = (
    df_source
    .withColumn("CurrencyAlternateKey", coalesce(trim(col("CurrencyAlternateKey")), lit("N/A")))
    .withColumn("CurrencyName", coalesce(trim(col("CurrencyName")), lit("Unknown Currency")))
    .withColumn("ModifiedDate", to_timestamp(col("ModifiedDate")))
)

# Keep one row per composite key and report how many rows were dropped.
# NOTE(review): dropDuplicates keeps an arbitrary row per key, so when duplicates
# differ in ModifiedDate the retained value is not deterministic.
initial_count = df_source.count()
df_source = df_source.dropDuplicates(PK_COLS)
final_count = df_source.count()
print(
    f"Deduplicated on {PK_COLS}: {initial_count} -> {final_count} rows "
    f"({initial_count - final_count} dropped)."
)

# Audit columns. The batch id is a UTC timestamp; a timezone-aware "now" replaces
# the deprecated datetime.utcnow() and yields the same formatted string.
batch_id = "Batch-" + datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d%H%M%S')
df_source = (
    df_source
    .withColumn("__ingest_ts", current_timestamp())
    .withColumn("__source_path", lit(SOURCE_PATH))
    .withColumn("__target_path", lit(TARGET_PATH))
    .withColumn("__batch_id", lit(batch_id))
)

# Row hash for change detection. It must cover at least one non-key column:
# a hash built only from the key columns can never differ between two rows that
# already match on those keys, so a hash comparison on matched rows would never
# detect a change. ModifiedDate is therefore included.
# NOTE: rows hashed before this change carry key-only hashes and will compare as
# changed once against newly computed hashes.
hash_cols = ["CurrencyAlternateKey", "CurrencyName", "ModifiedDate"]
df_source = df_source.withColumn(
    "__row_hash",
    sha2(concat_ws("||", *[coalesce(col(c).cast("string"), lit("")) for c in hash_cols]), 256)
)

# Remaining DimCurrency columns: load timestamp, current-row flag, a NULL
# surrogate-key placeholder, and the year of ModifiedDate.
df_source = (
    df_source
    .withColumn("LoadTS", current_timestamp())
    .withColumn("IsCurrent", lit(True))
    .withColumn("CurrencyKey", lit(None).cast("long"))  # Placeholder for surrogate key
    .withColumn("_year", year(col("ModifiedDate")))
)


In [0]:
# Load df_source into the Delta table at TARGET_PATH.
#
# First run (no Delta table at TARGET_PATH yet): write the full dataset,
# partitioned by _year. Later runs: MERGE on the composite key so existing rows
# are preserved. Without this branch the overwrite would run on every execution,
# wiping the table and leaving the MERGE with nothing to do.
target_exists = DeltaTable.isDeltaTable(spark, TARGET_PATH)

# List of all final columns for the DimCurrency schema
ALL_TARGET_COLS = [
    "CurrencyAlternateKey", "CurrencyName", "ModifiedDate", "LoadTS",
    "__row_hash", "IsCurrent", "_year", "__ingest_ts", "__source_path", "__target_path", "__batch_id"
]
# Only columns actually present on df_source are written or inserted.
target_cols = [c for c in ALL_TARGET_COLS if c in df_source.columns]

# Rows match on every primary-key column. A matched row is updated only when its
# row hash differs, so an update can only fire if __row_hash covers at least one
# non-key column.
join_cond = " AND ".join([f"target.{c} = source.{c}" for c in PK_COLS])
change_cond = "target.__row_hash != source.__row_hash"

if not target_exists:
    (
        df_source.select(*target_cols)
        .write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .partitionBy("_year")
        .save(TARGET_PATH)
    )
    print("Initial write completed.")
else:
    dt_target = DeltaTable.forPath(spark, TARGET_PATH)
    print(f"MERGE Join Condition: {join_cond}")

    (
        dt_target.alias("target")
        .merge(df_source.alias("source"), join_cond)
        .whenMatchedUpdate(
            condition=change_cond,
            set={
                # Update mutable data and audit columns
                "CurrencyName": "source.CurrencyName",
                "ModifiedDate": "source.ModifiedDate",
                # _year is derived from ModifiedDate, so it is refreshed with it
                # to keep the partition value consistent with the row.
                "_year": "source._year",
                "LoadTS": "source.LoadTS",
                "__row_hash": "source.__row_hash",
                "__ingest_ts": "source.__ingest_ts",
                "__batch_id": "source.__batch_id"
            }
        )
        .whenNotMatchedInsert(
            # Insert every target column for new records.
            values={c: f"source.{c}" for c in target_cols}
        )
        .execute()
    )

    print("MERGE (Incremental Load) complete. ")