In [0]:
# 1. SETUP: Import the necessary tools 
import datetime
import os
import sys
import traceback

from delta.tables import DeltaTable
from pyspark.sql import Window
from pyspark.sql.functions import (
    col, lit, current_timestamp, trim, to_timestamp, coalesce, sha2,
    concat_ws, year, when, row_number
)


# 1a. Bronze (input) and silver (output) locations in the storage account.
SOURCE_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/HumanResources.Employee/"
TARGET_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/dim/dim_employee/"

# 1b. Key columns (silver names) used for de-duplication and as the MERGE join key.
PK_RAW = "EmployeeNationalIDAlternateKey,EmployeeAlternateKey"
PRIMARY_KEYS = [c.strip() for c in PK_RAW.split(",") if c.strip()]
PK_COLS = PRIMARY_KEYS  # alias used by the transform and merge steps

# 1c. Define Column Renaming and Mapping (Bronze to Silver)
COLUMN_MAP = {
    "BusinessEntityID": "EmployeeAlternateKey",
    "NationalIDNumber": "EmployeeNationalIDAlternateKey",
    "JobTitle": "Title",
    "HireDate": "HireDate",
    "BirthDate": "BirthDate",
    "MaritalStatus": "MaritalStatus",
    "Gender": "Gender",
    "SalariedFlag": "SalariedFlag",
    "VacationHours": "VacationHours",
    "SickLeaveHours": "SickLeaveHours",
    "LoginID": "LoginID",
    "CurrentFlag": "CurrentFlag_Source",
    "ModifiedDate": "ModifiedDate"
}

# 1d. Setup Storage Access (Authentication)
# The account key must never be stored in the notebook. It is read from an
# environment variable, which should itself be populated from a secret store
# (NOTE(review): choose the mechanism your platform provides, e.g. a secret
# scope referenced from the cluster environment - confirm).
# Any key that was previously committed in this notebook must be treated as
# compromised and rotated.
ACCOUNT_KEY_ENV_VAR = "AZURE_STORAGE_ACCOUNT_KEY"

storage_account_name = "scrgvkrmade"
account_key = os.environ.get(ACCOUNT_KEY_ENV_VAR)
if not account_key:
    # Fail here with a clear message instead of later with an opaque
    # authentication error on the first read.
    raise RuntimeError(
        f"Storage account key not found: set the {ACCOUNT_KEY_ENV_VAR} "
        f"environment variable for account '{storage_account_name}'."
    )
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", account_key)


In [0]:

# --- 2. READ BRONZE ---
# Read every parquet file below SOURCE_PATH (sub-folders included) and merge
# the schemas of the individual files into one DataFrame.
bronze_reader = spark.read.option("mergeSchema", "true").option("recursiveFileLookup", "true")
df_source = bronze_reader.parquet(SOURCE_PATH)

# count() is a Spark action: it scans the source once just for this message.
source_row_count = df_source.count()
source_columns = df_source.columns
print(f"Read OK. Rows: {source_row_count} Columns: {source_columns}")



# --- 3a. RENAME AND CLEANSE ---
# Rename bronze columns to their silver names; columns that are absent from
# the source are skipped.
for source_col, target_col in COLUMN_MAP.items():
    if source_col in df_source.columns:
        df_source = df_source.withColumnRenamed(source_col, target_col)

df_source = (
    df_source
    # Replace NULL text values with a placeholder, then strip whitespace.
    .withColumn("EmployeeNationalIDAlternateKey", trim(coalesce(col("EmployeeNationalIDAlternateKey"), lit("N/A"))))
    .withColumn("Title", trim(coalesce(col("Title"), lit("Unknown Title"))))
    # Normalise the date columns to timestamps and the hour counters to integers.
    .withColumn("HireDate", to_timestamp(col("HireDate")))
    .withColumn("ModifiedDate", to_timestamp(col("ModifiedDate")))
    .withColumn("BirthDate", to_timestamp(col("BirthDate")))
    .withColumn("VacationHours", col("VacationHours").cast("integer"))
    .withColumn("SickLeaveHours", col("SickLeaveHours").cast("integer"))
)

# Keep exactly one row per key: the one with the most recent ModifiedDate.
# dropDuplicates(PK_COLS) would keep an arbitrary row per key, so when the
# source holds several versions of an employee an older one could win.
# Rows with a NULL ModifiedDate are only kept when no dated row exists.
latest_first = Window.partitionBy(*PK_COLS).orderBy(col("ModifiedDate").desc_nulls_last())
df_source = (
    df_source
    .withColumn("__dedup_rank", row_number().over(latest_first))
    .filter(col("__dedup_rank") == 1)
    .drop("__dedup_rank")
)


# --- 3b. AUDIT, CHANGE-HASH AND STATUS COLUMNS ---

# One batch id per run, built from the current UTC time. A timezone-aware
# now() is used because datetime.utcnow() is deprecated; the formatted value
# is the same.
batch_id = "Batch-" + datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d%H%M%S')

# Columns whose values feed the change-detection hash.
hash_cols = ["EmployeeNationalIDAlternateKey","EmployeeAlternateKey","Title", "MaritalStatus", "Gender", "VacationHours", "SickLeaveHours"]

# NULLs are replaced by "" before joining: concat_ws skips NULL arguments,
# which would otherwise shift the remaining values one position to the left.
row_hash = sha2(concat_ws("||", *[coalesce(col(c).cast("string"), lit("")) for c in hash_cols]), 256)

df_source = (
    df_source
    # Load / lineage metadata.
    .withColumn("LoadTS", current_timestamp())
    .withColumn("__ingest_ts", current_timestamp())
    .withColumn("__source_path", lit(SOURCE_PATH))
    .withColumn("__target_path", lit(TARGET_PATH))
    .withColumn("__batch_id", lit(batch_id))
    # Year of the last source modification; used as the partition column on write.
    .withColumn("_year", year(col("ModifiedDate")))
    .withColumn("__row_hash", row_hash)
    # No surrogate EmployeeKey column is generated in this step.
    # Validity columns: every loaded row is marked current and open-ended.
    .withColumn("IsCurrent", lit(True))
    .withColumn("StartDate", col("HireDate"))
    .withColumn("EndDate", lit(None).cast("timestamp"))
    # Only an explicit False flag yields INACTIVE; True and NULL both map to ACTIVE.
    .withColumn("Status", when(col("CurrentFlag_Source") == lit(False), lit("INACTIVE")).otherwise(lit("ACTIVE")))
)
# CurrentFlag_Source is intentionally kept: it is written to the target table.


# --- 4. LOAD INTO DELTA LAKE USING MERGE ---

# 4a. Columns persisted in the silver table, in write order.
ALL_TARGET_COLS = [
    "EmployeeAlternateKey", "EmployeeNationalIDAlternateKey",
    "Title", "HireDate", "BirthDate", "MaritalStatus", "Gender", "SalariedFlag",
    "VacationHours", "SickLeaveHours", "LoginID", "ModifiedDate",
    "StartDate", "EndDate", "IsCurrent", "Status","CurrentFlag_Source",
    "LoadTS", "__row_hash", "_year", "__ingest_ts", "__source_path",
    "__target_path", "__batch_id"
]

# Columns of ALL_TARGET_COLS that are actually present in the source frame.
# The same list drives the initial write and the MERGE insert, so both paths
# always write the same set of columns.
target_cols = [c for c in ALL_TARGET_COLS if c in df_source.columns]

# Columns refreshed on an existing row when a change is detected. The key
# columns are part of the join condition, so assigning them is a no-op.
update_cols = [
    "EmployeeAlternateKey", "EmployeeNationalIDAlternateKey",
    "Title", "MaritalStatus", "Gender", "VacationHours", "SickLeaveHours",
    "Status", "LoadTS", "__row_hash", "CurrentFlag_Source",
]

target_exists = DeltaTable.isDeltaTable(spark, TARGET_PATH)

if not target_exists:
    print(f"\n4. Target table not found. Creating initial table at: {TARGET_PATH}")

    # 4b. Initial Write: create the Delta table, partitioned by _year.
    (
        df_source.select(*target_cols)
        .write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .partitionBy("_year")
        .save(TARGET_PATH)
    )
    print("Initial table created. ✅")

else:
    print("\n4. Target table exists. Performing Incremental MERGE...")
    dt_target = DeltaTable.forPath(spark, TARGET_PATH)

    # Rows match when every key column is equal.
    join_cond = " AND ".join([f"target.{c} = source.{c}" for c in PK_COLS])

    # A matched row is updated when its attribute hash changed OR when the
    # source flag changed. The hash does not cover CurrentFlag_Source, so
    # without the second test a status-only change would never be applied.
    # <=> is the NULL-safe equality operator, so a NULL flag on either side
    # is compared correctly.
    change_cond = (
        "target.__row_hash != source.__row_hash"
        " OR NOT (target.CurrentFlag_Source <=> source.CurrentFlag_Source)"
    )

    (
        dt_target.alias("target")
        .merge(df_source.alias("source"), join_cond)
        .whenMatchedUpdate(
            condition=change_cond,
            set={c: f"source.{c}" for c in update_cols},
        )
        # Insert every persisted column, including CurrentFlag_Source, so
        # newly inserted rows do not end up with a NULL flag.
        .whenNotMatchedInsert(
            values={c: f"source.{c}" for c in target_cols},
        )
        .execute()
    )

    print("MERGE (Incremental Load) complete.")