### **dim_currency**

In [0]:
from pyspark.sql.functions import col, lit, sha2, concat_ws, current_timestamp
from delta.tables import DeltaTable

import os

# --- 1. Define Paths and Mappings ---

# Silver staging table (input) and gold dimension table (output), both Delta.
CURRENCY_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/stg_currency/"
TARGET_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/gold/dim/dim_currency/"

# Business key used to match source rows against existing target rows.
# The match must be on the currency code, not the name: the update clause below
# never writes CurrencyAlternateKey, so matching on the name would (a) insert a
# duplicate row whenever a currency is renamed and (b) refresh the row hash
# without changing the code whenever the code changes.
PK_COL = "CurrencyAlternateKey"

# (source column in stg_currency, target column in DimCurrency)
CURRENCY_MAPPING = [
    ("CurrencyCode", "CurrencyAlternateKey"),  # Source PK becomes Target Alternate Key
    ("Name", "CurrencyName"),
    ("ModifiedDate", "ModifiedDate")
]

# Configure Authentication (use the DFS endpoint).
# The account key is read from the environment instead of being stored in the
# notebook; on Databricks the variable can be populated from a secret scope in
# the cluster configuration.
# NOTE(review): the key that used to be hard-coded here should be rotated.
storage_account_name = "scrgvkrmade"
account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not account_key:
    raise RuntimeError(
        "AZURE_STORAGE_ACCOUNT_KEY is not set; provide the storage account key "
        "through the environment (e.g. backed by a secret scope) before running this cell."
    )
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    account_key
)
print("Configured Spark authentication.")


# --- 2. Read and Prepare Source Data (df_source) ---
print(f"\nReading Currency data from: {CURRENCY_PATH}")
df_source = spark.read.format("delta").load(CURRENCY_PATH)

# Every mapped column is needed further down (hash, initial write, merge), so
# fail here with a clear message rather than later with an unresolved-column error.
missing_cols = [src for src, _ in CURRENCY_MAPPING if src not in df_source.columns]
if missing_cols:
    raise ValueError(f"stg_currency is missing required column(s): {missing_cols}")

# Select and rename source columns to their target names
selected_expr = [col(src).alias(tgt) for src, tgt in CURRENCY_MAPPING]
df_source = df_source.select(*selected_expr)

# Row hash over the core business columns, used to detect changed rows
hash_cols = ["CurrencyAlternateKey", "CurrencyName"]
df_source = df_source.withColumn(
    "__row_hash",
    sha2(concat_ws("||", *[col(c).cast("string") for c in hash_cols]), 256)
)

# Add the remaining DimCurrency columns (placeholders / audit values)
df_source = df_source.withColumn("LoadTS", current_timestamp())
df_source = df_source.withColumn("IsCurrent", lit(True))
df_source = df_source.withColumn("CurrencyKey", lit(None).cast("long"))  # Placeholder for surrogate key


# --- 3. Check and Create Target Table (if needed) ---
target_exists = DeltaTable.isDeltaTable(spark, TARGET_PATH)

if not target_exists:
    print(f"Target table not found. Creating initial DimCurrency table at: {TARGET_PATH}")
    # The first write defines the table schema, so list every DimCurrency column
    final_cols = ["CurrencyKey", "CurrencyAlternateKey", "CurrencyName", "ModifiedDate", "LoadTS", "__row_hash", "IsCurrent"]

    # Select only the required columns and write the initial table
    df_source.select(*final_cols).write \
        .format("delta") \
        .mode("overwrite") \
        .save(TARGET_PATH)
    print("Initial DimCurrency table created. Skipping merge.")

else:
    # --- 4. Perform MERGE (Incremental Upsert) ---
    print("\nTarget table exists. Performing MERGE (Incremental Load)...")

    dt_target = DeltaTable.forPath(spark, TARGET_PATH)

    # Match existing rows on the business key
    join_cond = f"target.{PK_COL} = source.{PK_COL}"

    # A matched row is rewritten only when its hash differs from the source hash
    change_cond = "target.__row_hash != source.__row_hash"

    dt_target.alias("target").merge(
        df_source.alias("source"),
        join_cond
    ) \
    .whenMatchedUpdate(
        condition=change_cond,
        set={
            # The business key itself is never rewritten; only mutable attributes are
            "CurrencyName": "source.CurrencyName",
            "ModifiedDate": "source.ModifiedDate",
            "LoadTS": "source.LoadTS",
            "__row_hash": "source.__row_hash"
        }
    ) \
    .whenNotMatchedInsert(
        values={
            "CurrencyKey": "NULL",  # Key will be populated elsewhere or remains NULL
            "CurrencyAlternateKey": "source.CurrencyAlternateKey",
            "CurrencyName": "source.CurrencyName",
            "ModifiedDate": "source.ModifiedDate",
            "LoadTS": "source.LoadTS",
            "__row_hash": "source.__row_hash",
            "IsCurrent": "source.IsCurrent"
        }
    ) \
    .execute()

    print("MERGE (Incremental Load) complete. ✅")

# --- 5. Final Validation (Optional) ---
# Read the final Gold table to check row count
# tgt = spark.read.format("delta").load(TARGET_PATH)
# print(f"\nFinal DimCurrency rows: {tgt.count()}")
# display(tgt.limit(5))

In [0]:
# Gold currency dimension, addressed through the Blob (wasbs) endpoint
_dim_currency_root = "wasbs://project@scrgvkrmade.blob.core.windows.net/gold/dim/dim_currency"

# Load the Delta table from its root folder
df = spark.read.load(_dim_currency_root, format="delta")

# Second load of the same root path (no partition sub-folder is given here),
# then render the result
df_part = spark.read.load(_dim_currency_root, format="delta")
display(df_part)

### **dim_employee**

In [0]:
import os

# Account key for the ABFSS (dfs) endpoint - needed for writing Delta.
# It is read from the environment instead of being stored in the notebook; on
# Databricks the variable can be populated from a secret scope in the cluster
# configuration.
# NOTE(review): the key that used to be hard-coded here should be rotated.
account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not account_key:
    raise RuntimeError(
        "AZURE_STORAGE_ACCOUNT_KEY is not set; provide the storage account key "
        "through the environment (e.g. backed by a secret scope) before running this cell."
    )
spark.conf.set(
    "fs.azure.account.key.scrgvkrmade.dfs.core.windows.net",
    account_key
)

# Read the raw bronze Employee parquet files through the Blob (wasbs) endpoint.
# NOTE(review): this cell only configures the dfs endpoint; the wasbs read
# presumably relies on credentials configured elsewhere - confirm.
df = spark.read.parquet(
    "wasbs://project@scrgvkrmade.blob.core.windows.net/bronze/ResellerSales/HumanResources.Employee/*/part-*.parquet"
)

display(df)

In [0]:
# Silver employee staging table, addressed through the Blob (wasbs) endpoint
_stg_employee_root = "wasbs://project@scrgvkrmade.blob.core.windows.net/silver/stg_employee"

# Load the Delta table from its root folder
df = spark.read.load(_stg_employee_root, format="delta")

# Second load of the same root path (no partition sub-folder is given here),
# then render the result
df_part = spark.read.load(_stg_employee_root, format="delta")
display(df_part)

In [0]:
from pyspark.sql.functions import col, lit, sha2, concat_ws, current_timestamp, year, to_timestamp, coalesce
from delta.tables import DeltaTable

import os

# --- 1. Define Paths and Mappings ---

# Silver staging table (input) and gold dimension table (output), both Delta.
EMPLOYEE_SOURCE_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/stg_employee/"
TARGET_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/gold/dim/dim_employee/"

# Business (alternate) key used to match source rows against existing target rows
PK_COL = "EmployeeNationalIDAlternateKey"  # Target Column Name

# Columns needed for the Gold table (Source Column, Target Column)
# NOTE: We map BusinessEntityID to ParentEmployeeNationalIDAlternateKey as we don't have
# parent data in this simple source. We also use placeholders for missing columns.
EMPLOYEE_MAPPING = [
    # Alternate Keys (for matching)
    ("NationalIDNumber", "EmployeeNationalIDAlternateKey"),
    ("BusinessEntityID", "ParentEmployeeNationalIDAlternateKey"),

    # Core Attributes
    # (Missing from source: FirstName, LastName, MiddleName, NameStyle, Phone, etc.;
    #  those are filled with placeholders in section 2.)
    ("JobTitle", "Title"),
    ("HireDate", "HireDate"),
    ("BirthDate", "BirthDate"),
    ("LoginID", "LoginID"),
    ("MaritalStatus", "MaritalStatus"),
    ("SalariedFlag", "SalariedFlag"),
    ("Gender", "Gender"),
    ("VacationHours", "VacationHours"),
    ("SickLeaveHours", "SickLeaveHours"),
    ("CurrentFlag", "CurrentFlag"),
    ("ModifiedDate", "ModifiedDate"),
    # Audit Columns
    ("_ingest_ts", "_ingest_ts"),
    ("__source_path", "__source_path")
]

# Mapped columns that may be absent from the source; when absent they are
# created as typed NULLs (name -> Spark SQL type).
OPTIONAL_SOURCE_COLS = {
    "ModifiedDate": "timestamp",
    "_ingest_ts": "timestamp",
    "__source_path": "string",
}

# Configure Authentication (use the DFS endpoint).
# The account key is read from the environment instead of being stored in the
# notebook; on Databricks the variable can be populated from a secret scope in
# the cluster configuration.
# NOTE(review): the key that used to be hard-coded here should be rotated.
storage_account_name = "scrgvkrmade"
account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not account_key:
    raise RuntimeError(
        "AZURE_STORAGE_ACCOUNT_KEY is not set; provide the storage account key "
        "through the environment (e.g. backed by a secret scope) before running this cell."
    )
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    account_key
)
print("Configured Spark authentication.")


# --- 2. Read and Prepare Source Data (df_source) ---
print(f"\nReading Employee data from: {EMPLOYEE_SOURCE_PATH}")
df_source = spark.read.format("delta").load(EMPLOYEE_SOURCE_PATH)

# All non-optional mapped columns are referenced later (hash, StartDate, _year,
# initial write, merge), so fail here with a clear message if any is missing.
missing_cols = [
    src for src, _ in EMPLOYEE_MAPPING
    if src not in df_source.columns and src not in OPTIONAL_SOURCE_COLS
]
if missing_cols:
    raise ValueError(f"stg_employee is missing required column(s): {missing_cols}")

# Select and Rename columns (optional columns are skipped when absent)
selected_expr = [col(src).alias(tgt) for src, tgt in EMPLOYEE_MAPPING if src in df_source.columns]
df_source = df_source.select(*selected_expr)

# Placeholder values for DimEmployee attributes that the source does not carry.
# None of these names appear in EMPLOYEE_MAPPING, so nothing mapped is overwritten.
placeholder_cols = {
    "EmployeeKey": lit(None).cast("long"),
    "ParentEmployeeKey": lit(None).cast("long"),
    "SalesTerritoryKey": lit(-1).cast("integer"),
    "FirstName": lit("N/A"),
    "LastName": lit("N/A"),
    "MiddleName": lit(None).cast("string"),
    "NameStyle": lit(0).cast("integer"),  # 0 for Western, 1 for Eastern
    "EmailAddress": lit(None).cast("string"),
    "Phone": lit(None).cast("string"),
    "EmergencyContactName": lit("N/A"),
    "EmergencyContactPhone": lit("N/A"),
    "PayFrequency": lit(1).cast("integer"),  # Placeholder
    "BaseRate": lit(0.0).cast("decimal(18,4)"),
    "SalesPersonFlag": lit(False),
    "DepartmentName": lit(None).cast("string"),
    "StartDate": to_timestamp(col("HireDate")),
    "EndDate": lit(None).cast("timestamp"),
    "Status": lit("Current"),
    "EmployeePhoto": lit(None).cast("binary"),
}
for placeholder_name, placeholder_expr in placeholder_cols.items():
    df_source = df_source.withColumn(placeholder_name, placeholder_expr)

# Keep the real source values of ModifiedDate / _ingest_ts / __source_path when
# the source provides them (cast to the declared type); only fall back to a
# typed NULL when the column is absent. Unconditionally calling withColumn with
# a NULL literal here would replace the mapped source values.
for optional_name, optional_type in OPTIONAL_SOURCE_COLS.items():
    if optional_name in df_source.columns:
        df_source = df_source.withColumn(optional_name, col(optional_name).cast(optional_type))
    else:
        df_source = df_source.withColumn(optional_name, lit(None).cast(optional_type))

# Create a hash for change detection (using core business keys and mutable attributes).
# NULLs are coalesced to "" so a NULL attribute still occupies its position in the hash input.
hash_cols = [
    "EmployeeNationalIDAlternateKey", "Title", "MaritalStatus",
    "Gender", "SalariedFlag", "VacationHours", "SickLeaveHours"
]
df_source = df_source.withColumn(
    "__row_hash",
    sha2(concat_ws("||", *[coalesce(col(c).cast("string"), lit("")) for c in hash_cols]), 256)
)

# Add audit columns
df_source = df_source.withColumn("LoadTS", current_timestamp())
df_source = df_source.withColumn("IsCurrent", lit(True))  # Using IsCurrent for SCD Type 1/Type 2 flag
df_source = df_source.withColumn("_year", year(to_timestamp(col("HireDate"))))  # For partitioning


# --- 3. Check and Create Target Table (if needed) ---
target_exists = DeltaTable.isDeltaTable(spark, TARGET_PATH)

# List of ALL columns in the target DimEmployee table
GOLD_COLUMNS = [
    "EmployeeKey", "ParentEmployeeKey", "EmployeeNationalIDAlternateKey", "ParentEmployeeNationalIDAlternateKey",
    "SalesTerritoryKey", "FirstName", "LastName", "MiddleName", "NameStyle", "Title", "HireDate",
    "BirthDate", "LoginID", "EmailAddress", "Phone", "MaritalStatus", "EmergencyContactName",
    "EmergencyContactPhone", "SalariedFlag", "Gender", "PayFrequency", "BaseRate", "VacationHours",
    "SickLeaveHours", "CurrentFlag", "SalesPersonFlag", "DepartmentName", "StartDate", "EndDate",
    "Status", "EmployeePhoto", "ModifiedDate", "LoadTS", "__row_hash", "IsCurrent", "_year"
]


if not target_exists:
    print(f"Target table not found. Creating initial DimEmployee table at: {TARGET_PATH}")

    # Select only the required columns and write the initial table
    df_source.select(*GOLD_COLUMNS).write \
        .format("delta") \
        .mode("overwrite") \
        .partitionBy("_year") \
        .save(TARGET_PATH)
    print("Initial DimEmployee table created. Skipping merge.")

else:
    # --- 4. Perform MERGE (Incremental Upsert) ---
    print("\nTarget table exists. Performing MERGE (Incremental Load)...")

    dt_target = DeltaTable.forPath(spark, TARGET_PATH)

    # Match existing rows on the alternate key
    join_cond = f"target.{PK_COL} = source.{PK_COL}"

    # A matched row is rewritten only when its hash differs from the source hash
    change_cond = "target.__row_hash != source.__row_hash"

    # Columns that an update must leave untouched:
    #  - EmployeeKey / ParentEmployeeKey: the source only carries NULL placeholders
    #    for these surrogate keys, so updating them would erase any populated value
    #  - EmployeeNationalIDAlternateKey: the match key itself
    #  - _year / IsCurrent: partition column and current-row flag
    no_update_cols = {
        "EmployeeKey", "ParentEmployeeKey", "EmployeeNationalIDAlternateKey", "_year", "IsCurrent"
    }
    update_set = {
        col_name: f"source.{col_name}" for col_name in GOLD_COLUMNS if col_name not in no_update_cols
    }

    # New rows receive every gold column from the source
    insert_values = {col_name: f"source.{col_name}" for col_name in GOLD_COLUMNS}

    dt_target.alias("target").merge(
        df_source.alias("source"),
        join_cond
    ) \
    .whenMatchedUpdate(
        condition=change_cond,
        set=update_set
    ) \
    .whenNotMatchedInsert(
        values=insert_values
    ) \
    .execute()

    print("MERGE (Incremental Load) complete. ✅")

print("DimEmployee job finished.")

In [0]:
# Gold currency dimension, addressed through the Blob (wasbs) endpoint
_dim_currency_root = "wasbs://project@scrgvkrmade.blob.core.windows.net/gold/dim/dim_currency"

# Load the Delta table from its root folder
df = spark.read.load(_dim_currency_root, format="delta")

# Second load of the same root path (no partition sub-folder is given here),
# then render the result
df_part = spark.read.load(_dim_currency_root, format="delta")
display(df_part)

In [0]:
# Gold employee dimension, addressed through the Blob (wasbs) endpoint
_dim_employee_root = "wasbs://project@scrgvkrmade.blob.core.windows.net/gold/dim/dim_employee"

# Load the Delta table from its root folder
df = spark.read.load(_dim_employee_root, format="delta")

# Second load of the same root path (no partition sub-folder is given here),
# then render the result
df_part = spark.read.load(_dim_employee_root, format="delta")
display(df_part)