In [0]:
#dbo.DimCurrency	[CurrencyKey] , [CurrencyAlternateKey] , [CurrencyName] 

In [0]:
from pyspark.sql.functions import col, lit, sha2, concat_ws, current_timestamp
from delta.tables import DeltaTable

# --- 1. Define Paths and Mappings ---

# Silver (input) and Gold (output) Delta locations.
CURRENCY_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/stg_currency/"
TARGET_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/gold/dim/dim_currency/"

# Business key used to match incoming rows against existing target rows.
# The source primary key (CurrencyCode) is mapped to CurrencyAlternateKey below,
# so that is the stable identifier to merge on. Matching on the descriptive
# CurrencyName would insert a duplicate row whenever a currency is renamed.
PK_COL = "CurrencyAlternateKey"

# (source column in stg_currency, target column in dbo.DimCurrency)
CURRENCY_MAPPING = [
    ("CurrencyCode", "CurrencyAlternateKey"),  # Source PK becomes Target Alternate Key
    ("Name", "CurrencyName"),
    ("ModifiedDate", "ModifiedDate"),
]

# Configure Authentication (use the DFS endpoint).
# The account key must never live in the notebook source. It is read from the
# environment instead (e.g. a cluster environment variable backed by a secret
# scope). NOTE(review): the key that was previously committed here should be
# rotated, since it remains in version history.
import os

storage_account_name = "scrgvkrmade"
account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not account_key:
    raise RuntimeError(
        "AZURE_STORAGE_ACCOUNT_KEY is not set; cannot authenticate to "
        f"storage account '{storage_account_name}'."
    )
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    account_key,
)
print("Configured Spark authentication.")


# --- 2. Read and Prepare Source Data (df_source) ---
print(f"\nReading Currency data from: {CURRENCY_PATH}")
df_source = spark.read.format("delta").load(CURRENCY_PATH)

# Every mapped column is required further down (hash, final select, MERGE).
# Fail here with a clear message instead of silently dropping a column and
# hitting an unrelated-looking error later.
missing_src_cols = [src for src, _ in CURRENCY_MAPPING if src not in df_source.columns]
if missing_src_cols:
    raise ValueError(
        f"Source at {CURRENCY_PATH} is missing mapped column(s): {missing_src_cols}. "
        f"Available columns: {df_source.columns}"
    )

# Select and rename source columns to their DimCurrency names.
selected_expr = [col(src).alias(tgt) for src, tgt in CURRENCY_MAPPING]
df_source = df_source.select(*selected_expr)

# Row hash over the core business columns, used for change detection in the MERGE.
# NOTE: concat_ws skips NULL inputs, so rows that differ only in *which* hashed
# column is NULL produce the same hash.
hash_cols = ["CurrencyAlternateKey", "CurrencyName"]
df_source = df_source.withColumn(
    "__row_hash",
    sha2(concat_ws("||", *[col(c).cast("string") for c in hash_cols]), 256),
)

# Gold-only columns: load timestamp, current-row flag, and a NULL surrogate key
# placeholder (typed as long so the target schema is stable).
df_source = (
    df_source
    .withColumn("LoadTS", current_timestamp())
    .withColumn("IsCurrent", lit(True))
    .withColumn("CurrencyKey", lit(None).cast("long"))
)


# --- 3. Check and Create Target Table (if needed) ---
target_exists = DeltaTable.isDeltaTable(spark, TARGET_PATH)

if not target_exists:
    print(f"Target table not found. Creating initial DimCurrency table at: {TARGET_PATH}")
    # The first write defines the target schema, so list every DimCurrency column explicitly.
    final_cols = ["CurrencyKey", "CurrencyAlternateKey", "CurrencyName", "ModifiedDate", "LoadTS", "__row_hash", "IsCurrent"]

    (
        df_source.select(*final_cols)
        .write
        .format("delta")
        .mode("overwrite")
        .save(TARGET_PATH)
    )
    print("Initial DimCurrency table created. Skipping merge.")

else:
    # --- 4. Perform MERGE (Incremental Upsert) ---
    print("\nTarget table exists. Performing MERGE (Incremental Load)...")

    dt_target = DeltaTable.forPath(spark, TARGET_PATH)

    # Rows are paired on the configured key column.
    join_cond = f"target.{PK_COL} = source.{PK_COL}"

    # A matched row is rewritten only when its hash differs. `<=>` is the
    # NULL-safe equality operator, so a target row whose hash is NULL is
    # treated as changed instead of being skipped (NULL != x evaluates to NULL).
    change_cond = "NOT (target.__row_hash <=> source.__row_hash)"

    (
        dt_target.alias("target")
        .merge(df_source.alias("source"), join_cond)
        .whenMatchedUpdate(
            condition=change_cond,
            set={
                # The hash covers both CurrencyAlternateKey and CurrencyName, so
                # both are written back; otherwise the stored hash could describe
                # values the row does not actually hold. Whichever of the two is
                # the join key is simply re-assigned its existing value.
                "CurrencyAlternateKey": "source.CurrencyAlternateKey",
                "CurrencyName": "source.CurrencyName",
                "ModifiedDate": "source.ModifiedDate",
                "LoadTS": "source.LoadTS",
                "__row_hash": "source.__row_hash",
            },
        )
        .whenNotMatchedInsert(
            values={
                "CurrencyKey": "NULL",  # Key will be populated elsewhere or remains NULL
                "CurrencyAlternateKey": "source.CurrencyAlternateKey",
                "CurrencyName": "source.CurrencyName",
                "ModifiedDate": "source.ModifiedDate",
                "LoadTS": "source.LoadTS",
                "__row_hash": "source.__row_hash",
                "IsCurrent": "source.IsCurrent",
            }
        )
        .execute()
    )

    print("MERGE (Incremental Load) complete. ✅")

# --- 5. Final Validation (Optional) ---
# Read the final Gold table to check row count
# tgt = spark.read.format("delta").load(TARGET_PATH)
# print(f"\nFinal DimCurrency rows: {tgt.count()}")
# display(tgt.limit(5))

In [0]:
# Read the Gold DimCurrency Delta table from its root path.
# The abfss:// (DFS endpoint) URI is used because that is the endpoint the
# notebook configures the storage account key for; the wasbs:// blob endpoint
# would need its own fs.azure.account.key.<account>.blob.core.windows.net entry.
df = spark.read.format("delta").load(
    "abfss://project@scrgvkrmade.dfs.core.windows.net/gold/dim/dim_currency/"
)

# Same table, kept under the second name this cell has always exposed.
# (No partition filter is applied - the whole table is read.)
df_part = df
display(df_part)

In [0]:
# from pyspark.sql.functions import col, trim, current_timestamp, lit, year, to_timestamp

# # --- 1. Define Paths and Mappings (The Setup) ---
# # NOTE: In Spark, it's easiest to define multiple paths as separate strings.

# # Source Paths (Input)
# CURRENCY_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/stg_currency/"
# CURRENCY_RATE_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/stg_currency_rate/"

# # Target Path (Output - Gold Layer)
# TARGET_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/gold/dim/dim_currency/"

# # Define the join key (must be present in both source DataFrames)
# JOIN_KEY = "CurrencyCode" # Assuming this is the common field

# # Define Mappings (Source Column Name, Target Column Name)

# # Columns to select from the Currency table
# CURRENCY_MAPPING = [
#     ("CurrencyCode", "CurrencyAlternateKey"), # Used as the Primary Key
#     ("CurrencyName", "CurrencyName"),
#     ("ModifiedDate", "ModifiedDate"),
#     ("CurrentFlag", "IsCurrent") # Placeholder: assuming IsCurrent logic later
# ]

# # Columns to select from the Currency Rate table
# CURRENCY_RATE_MAPPING = [
#     ("FromCurrencyCode", "CurrencyCode"), # Rename this to match the join key
#     ("AverageRate", "LatestRateToUSD_AverageRate"),
#     ("CurrencyRateDate", "LatestRateToUSD_Date")
# ]

# # --- 2. Read and Prepare DataFrames ---

# # A) Read Currency Data (stg_currency)
# print(f"Reading Currency data from: {CURRENCY_PATH}")
# df_currency = spark.read.format("delta").load(CURRENCY_PATH)

# # Apply Currency Mapping (Select and Rename)
# selected_currency_expr = [col(src).alias(tgt) for src, tgt in CURRENCY_MAPPING if src in df_currency.columns]
# df_currency_prep = df_currency.select(*selected_currency_expr)

# # B) Read Currency Rate Data (stg_currency_rate)
# print(f"Reading Currency Rate data from: {CURRENCY_RATE_PATH}")
# df_rate = spark.read.format("delta").load(CURRENCY_RATE_PATH)

# # Apply Currency Rate Mapping (Select and Rename)
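# # NOTE: MAX(AverageRate) and MAX(CurrencyRateDate) are aggregated independently per currency, so the rate returned is the highest rate ever seen, not necessarily the rate on the latest date.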
# df_rate.createOrReplaceTempView("stg_currency_rate_temp")
# df_rate_prep = spark.sql(f"""
#     SELECT 
#         FromCurrencyCode,
#         MAX(AverageRate) as LatestRateToUSD_AverageRate,
#         MAX(CurrencyRateDate) as LatestRateToUSD_Date
#     FROM 
#         stg_currency_rate_temp
#     GROUP BY 
#         FromCurrencyCode
# """)
# # Now rename the key column for joining
# df_rate_prep = df_rate_prep.withColumnRenamed("FromCurrencyCode", JOIN_KEY)


# # --- 3. Join the DataFrames ---
# print(f"Joining DataFrames on key: {JOIN_KEY}")

# # Perform a LEFT JOIN, keeping all currency codes even if they have no rate
# df_joined = df_currency_prep.alias("c").join(
#     df_rate_prep.alias("r"),
#     col(f"c.{JOIN_KEY}") == col(f"r.{JOIN_KEY}"),
#     "left"
# ).select(
#     # Select all columns from the left (currency) table and the rate columns
#     col("c.CurrencyAlternateKey"),
#     col("c.CurrencyName"),
#     col("c.ModifiedDate"),
#     col("r.LatestRateToUSD_AverageRate"),
#     col("r.LatestRateToUSD_Date")
# )

# # --- 4. Final Cleaning and Audit Columns ---
# # Add audit and partition columns
# df_final = df_joined \
#     .withColumn("LoadTS", current_timestamp()) \
#     .withColumn("_year", year(to_timestamp(col("ModifiedDate")))) \
#     .withColumn("IsCurrent", lit(True)) # Placeholder value

# # Final Check
# print(f"Final Gold table schema:")
# df_final.printSchema()
# # display(df_final.limit(5)) # Uncomment in Databricks to view data

# # --- 5. Write Data to Gold Layer ---
# print(f"Writing data to Gold layer: {TARGET_PATH}")

# # Overwrite mode for simplicity (as in the previous example)
# df_final.write.format("delta") \
#     .mode("overwrite") \
#     .option("overwriteSchema", "true") \
#     .partitionBy("_year") \
#     .save(TARGET_PATH)

# print("Data integration and loading complete! ✅")

In [0]:
# -------------------------
# Simple DimCurrency loader - tailored to your Silver columns
# -------------------------
from pyspark.sql.functions import col, sha2, concat_ws, current_timestamp, lit
from delta.tables import DeltaTable

# --- 1. Paths & mapping (edit paths if needed) ---
CURRENCY_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/stg_currency/"
TARGET_PATH   = "abfss://project@scrgvkrmade.dfs.core.windows.net/gold/dim/dim_currency/"

# Primary key in the TARGET after mapping (CurrencyCode is aliased to CurrencyAlternateKey)
PK_COL = "CurrencyAlternateKey"

# Manual mapping: (source_column_name, target_column_name)
CURRENCY_MAPPING = [
    ("CurrencyCode", "CurrencyAlternateKey"),
    ("Name", "CurrencyName"),
    ("ModifiedDate", "ModifiedDate")
]

# --- 2. Read source (Delta preferred) ---
# Decide the format up front instead of catching every exception from the Delta
# read: a blanket fallback would also hide auth/network failures, and reading a
# Delta directory as raw Parquet can pick up files the Delta log has retired.
print("Reading source from:", CURRENCY_PATH)
if DeltaTable.isDeltaTable(spark, CURRENCY_PATH):
    df_source = spark.read.format("delta").load(CURRENCY_PATH)
    print("Loaded source as Delta")
else:
    df_source = spark.read.parquet(CURRENCY_PATH)
    print("Loaded source as Parquet")

print("Source columns:", df_source.columns)

# --- 3. Select & rename based on mapping (skip missing) ---
# selected_expr holds the Column expressions; selected_target_cols tracks the
# resulting column names in the same order and is reused by the write/MERGE step.
selected_expr = []
selected_target_cols = []
for src_col, tgt_col in CURRENCY_MAPPING:
    if src_col in df_source.columns:
        selected_expr.append(col(src_col).alias(tgt_col))
        selected_target_cols.append(tgt_col)
    else:
        print("Skipping missing source column:", src_col)

# Carry through whichever of these audit columns the source provides.
for a in ["__ingest_ts","__source_file","__source_path","__batch_id","__row_hash"]:
    if a in df_source.columns and a not in selected_target_cols:
        selected_expr.append(col(a))
        selected_target_cols.append(a)

if not selected_expr:
    raise RuntimeError("No valid mapping columns found. Check CURRENCY_MAPPING.")

df = df_source.select(*selected_expr)
print("Selected columns for processing:", df.columns)
display(df.limit(5))

# --- 4. Business hash and load metadata ---
# A hash over the business columns is always computed as __business_hash; any
# __row_hash carried from the source is left untouched.
hash_cols = [c for c in ("CurrencyAlternateKey","CurrencyName") if c in df.columns]
if not hash_cols:
    # Neither business column was mapped: hash every selected column instead.
    hash_cols = df.columns
df = df.withColumn("__business_hash", sha2(concat_ws("||", *[col(c).cast("string") for c in hash_cols]), 256))

# Add LoadTS
df = df.withColumn("LoadTS", current_timestamp())

print("Prepared df columns:", df.columns)

# Default IsCurrent to a boolean TRUE when the source does not supply it.
# (The earlier `current_timestamp() * 0 + 1` placeholder multiplies a timestamp
# by an integer, which Spark rejects during analysis, and would not have been a
# boolean in any case.)
if "IsCurrent" not in df.columns:
    df = df.withColumn("IsCurrent", lit(True))
# --- 5. Create the target on first run, otherwise MERGE (upsert) ---
# Ask Delta directly whether a table exists at the path. Wrapping a read in
# try/except treats *any* failure (auth, network, ...) as "target missing",
# which would then overwrite an existing dimension.
target_exists = DeltaTable.isDeltaTable(spark, TARGET_PATH)

# Mapped/audit columns carried from the source. __row_hash is handled on its
# own so it can never appear twice in the write or MERGE column lists.
carried_cols = [c for c in selected_target_cols if c in df.columns and c != "__row_hash"]

# Prefer a hash delivered by the source; otherwise use the business hash computed earlier.
hash_src_col = "__row_hash" if "__row_hash" in df.columns else "__business_hash"

# One staged frame feeds both branches: carried columns + LoadTS + a single __row_hash.
staged_df = df.select(
    *[col(c) for c in carried_cols],
    col("LoadTS"),
    col(hash_src_col).alias("__row_hash"),
)
staged_cols = staged_df.columns

if not target_exists:
    print("Target not found -> creating initial DimCurrency at:", TARGET_PATH)
    # Initial layout: NULL surrogate key first, then staged columns, then IsCurrent.
    write_df = staged_df.select(
        lit(None).cast("long").alias("CurrencyKey"),
        *[col(c) for c in staged_cols],
        lit(True).alias("IsCurrent"),
    )
    write_df.write.format("delta").mode("overwrite").save(TARGET_PATH)
    print("Initial DimCurrency created.")
else:
    # MERGE upsert
    print("Target exists -> merging into:", TARGET_PATH)
    # The match key must have survived the mapping step.
    if PK_COL not in staged_cols:
        raise RuntimeError(f"PK '{PK_COL}' not in selected dataframe columns. Make sure CurrencyCode is mapped to {PK_COL}.")

    # Expose the staged rows to SQL.
    tmp_view = "_src_currency_tmp"
    staged_df.createOrReplaceTempView(tmp_view)

    match_cond = f"target.`{PK_COL}` = source.`{PK_COL}`"

    # Every staged column except the match key is refreshed on change
    # (the key is equal on both sides of a match by definition).
    update_sql = ",\n        ".join(
        f"target.`{c}` = source.`{c}`" for c in staged_cols if c != PK_COL
    )

    # New rows get a NULL surrogate key and IsCurrent = true.
    insert_cols_sql = ", ".join(f"`{c}`" for c in ["CurrencyKey"] + staged_cols + ["IsCurrent"])
    insert_vals_sql = ", ".join(["NULL"] + [f"source.`{c}`" for c in staged_cols] + ["true"])

    # `<=>` is NULL-safe equality, so a NULL hash on the target counts as changed.
    merge_sql = f"""
    MERGE INTO delta.`{TARGET_PATH}` AS target
    USING (SELECT * FROM {tmp_view}) AS source
    ON {match_cond}
    WHEN MATCHED AND NOT (target.__row_hash <=> source.__row_hash)
      THEN UPDATE SET
        {update_sql}
    WHEN NOT MATCHED
      THEN INSERT ({insert_cols_sql}) VALUES ({insert_vals_sql})
    """

    spark.sql(merge_sql)
    print("Merge complete ✅")

# --- 6. Quick validation ---
# Best-effort sanity check: re-read the target, report its row count and show a
# small sample. Any failure here is reported but does not fail the cell.
try:
    tgt = spark.read.format("delta").load(TARGET_PATH)
    target_row_count = tgt.count()
    print("Target rows approx:", target_row_count)
    display(tgt.limit(5))
except Exception as validation_error:
    print("Could not read target for validation:", validation_error)

print("Done.")


In [0]:
from pyspark.sql.functions import col, lit, sha2, concat_ws, current_timestamp
from delta.tables import DeltaTable

# --- 1. Define Paths and Mappings ---

# Silver (input) and Gold (output) Delta locations.
CURRENCY_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/stg_currency/"
TARGET_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/gold/dim/dim_currency_1/"

# Business key used to match incoming rows against existing target rows.
PK_COL = "CurrencyAlternateKey"

# Core business columns: (source column in stg_currency, target column in Gold).
# The source name column is "Name", as in the other loaders in this notebook
# that read the same Silver path. Mapping from "CurrencyName" would be silently
# filtered out by the column-existence check in the prepare step.
CORE_MAPPING = [
    ("CurrencyCode", "CurrencyAlternateKey"),  # Source PK becomes Target Alternate Key
    ("Name", "CurrencyName"),
    ("ModifiedDate", "ModifiedDate"),
]

# Audit/lineage columns carried forward from Silver when present.
AUDIT_COLUMNS = [
    "_ingestion_ts",
    "_ingestion_date",
    "_source_file",
    "_source_path",
    "_job_id",
    "_run_id",
    "_batch_id",
    "_year",
    "__ingest_ts",
    "__source_file",
    "__source_path",
    "__batch_id",
    "__row_hash",
]

# Source-side names of the core columns, followed by the audit columns.
ALL_REQUIRED_COLUMNS = [src for src, tgt in CORE_MAPPING] + AUDIT_COLUMNS


# Configure Authentication.
# The account key must never live in the notebook source. It is read from the
# environment instead (e.g. a cluster environment variable backed by a secret
# scope). NOTE(review): the key that was previously committed here should be
# rotated, since it remains in version history.
import os

storage_account_name = "scrgvkrmade"
account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not account_key:
    raise RuntimeError(
        "AZURE_STORAGE_ACCOUNT_KEY is not set; cannot authenticate to "
        f"storage account '{storage_account_name}'."
    )
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    account_key,
)
print("Configured Spark authentication.")

# --- 2. Read and Prepare Source Data (df_source) ---
# `year` is required by the _year fallback below and is not part of this cell's
# import line; without it that branch raises NameError.
from pyspark.sql.functions import year

print(f"\nReading Currency data from: {CURRENCY_PATH}")
df_source = spark.read.format("delta").load(CURRENCY_PATH)

# Core columns are mandatory: the merge key, the update set and the hash all
# reference them. Fail here with a clear message rather than dropping one silently.
missing_core_cols = [src for src, _ in CORE_MAPPING if src not in df_source.columns]
if missing_core_cols:
    raise ValueError(
        f"Source at {CURRENCY_PATH} is missing core column(s): {missing_core_cols}. "
        f"Available columns: {df_source.columns}"
    )

# Rename core columns to their Gold names; audit columns are optional and are
# carried through unchanged only when the source has them.
selected_expr = [col(src).alias(tgt) for src, tgt in CORE_MAPPING]
selected_expr.extend([col(c) for c in AUDIT_COLUMNS if c in df_source.columns])

df_source = df_source.select(*selected_expr)

# The Gold table is partitioned by _year; derive it from the load time when
# the source does not supply it.
if "_year" not in df_source.columns:
    df_source = df_source.withColumn("_year", year(current_timestamp()))


# Change-detection hash over the core business columns.
hash_cols = ["CurrencyAlternateKey", "CurrencyName"]
# Reuse __row_hash from the source when present; otherwise compute it here.
if "__row_hash" not in df_source.columns:
    df_source = df_source.withColumn(
        "__row_hash",
        sha2(concat_ws("||", *[col(c).cast("string") for c in hash_cols]), 256),
    )

# Gold-only columns: load timestamp, current-row flag, NULL surrogate key placeholder.
df_source = (
    df_source
    .withColumn("LoadTS", current_timestamp())
    .withColumn("IsCurrent", lit(True))
    .withColumn("CurrencyKey", lit(None).cast("long"))
)


# --- 3. Check and Create Target Table (if needed) ---
target_exists = DeltaTable.isDeltaTable(spark, TARGET_PATH)

# Columns written to Gold. AUDIT_COLUMNS already contains "_year" and
# "__row_hash", so the combined list is de-duplicated (order preserved);
# selecting a name twice yields duplicate columns and the Delta write fails.
GOLD_COLUMNS = list(dict.fromkeys(
    ["CurrencyKey", "CurrencyAlternateKey", "CurrencyName",
     "LoadTS", "__row_hash", "IsCurrent", "ModifiedDate",
     "_year"] + AUDIT_COLUMNS
))

# Audit columns are optional in the source, so only use what is actually there.
present_gold_cols = [c for c in GOLD_COLUMNS if c in df_source.columns]

if not target_exists:
    print(f"Target table not found. Creating initial DimCurrency table at: {TARGET_PATH}")

    # Restrict the source frame to the Gold columns for the initial write.
    insert_df = df_source.select(*present_gold_cols)

    (
        insert_df.write
        .format("delta")
        .mode("overwrite")
        .partitionBy("_year")
        .save(TARGET_PATH)
    )
    print("Initial DimCurrency table created. Skipping merge.")

else:
    # --- 4. Perform MERGE (Incremental Upsert) ---
    print("\nTarget table exists. Performing MERGE (Incremental Load)...")

    dt_target = DeltaTable.forPath(spark, TARGET_PATH)

    # Rows are paired on the business key.
    join_cond = f"target.{PK_COL} = source.{PK_COL}"

    # A matched row is rewritten only when its hash differs. `<=>` is NULL-safe
    # equality, so a NULL hash on the target counts as changed instead of being
    # skipped (NULL != x evaluates to NULL).
    change_cond = "NOT (target.__row_hash <=> source.__row_hash)"

    # On change, refresh business and audit columns. The surrogate key, the
    # match key and IsCurrent are left as they are on the target. The set is
    # built from the columns the source really has, so an absent optional audit
    # column cannot break the MERGE with an unresolved `source.<col>` reference.
    never_updated = {"CurrencyKey", PK_COL, "IsCurrent"}
    update_set = {
        c: f"source.{c}" for c in present_gold_cols if c not in never_updated
    }

    # New rows take every available Gold column from the source.
    insert_values = {c: f"source.{c}" for c in present_gold_cols}

    # Execute Merge
    (
        dt_target.alias("target")
        .merge(df_source.alias("source"), join_cond)
        .whenMatchedUpdate(condition=change_cond, set=update_set)
        .whenNotMatchedInsert(values=insert_values)
        .execute()
    )

    print("MERGE (Incremental Load) complete. ✅")

print("DimCurrency job finished. All audit columns carried forward.")