In [0]:
import json

from delta.tables import DeltaTable
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id
from pyspark.sql.types import (
    StructType, StructField,
    StringType, DoubleType, LongType
)

In [0]:
# Parameters extraction
# The "Parameters" widget carries a JSON array of objects; only entries whose
# TableName is exactly "Products" are relevant to this notebook.
Parameters = json.loads(dbutils.widgets.get("Parameters"))

# Defaults used when no "Products" entry is present in the widget payload.
ProcessInstanceId = ProcessQueueId = StageId = 0
TableName = ""

# Every matching entry is applied in order, so the last match wins.
for entry in (item for item in Parameters if item.get("TableName") == "Products"):
    ProcessInstanceId = int(entry.get("ProcessInstanceId"))
    ProcessQueueId = int(entry.get("ProcessQueueId"))
    StageId = int(entry.get("StageId"))
    TableName = str(entry.get("TableName"))

In [0]:
# Mark current table as InProgress
# This notebook only handles queue entries for stage 4; anything else is
# rejected before the control table is touched.
EXPECTED_STAGE_ID = 4

if StageId != EXPECTED_STAGE_ID:
    raise Exception(f"Stage Id is not relevant to R2B-transformation for table: {TableName}")

# Flag the matching queue row as running and stamp its start time; the final
# cell derives ProcessDuration from this ProcessStartTime.
spark.sql(f"""
        update control.processqueue
        set ProcessStatus = 'InProgress',
            ProcessStartTime = current_timestamp()
        where StageId = {StageId}
            and ProcessInstanceId = {ProcessInstanceId}
            and ProcessQueueId = {ProcessQueueId}
            and TableName = '{TableName}';
    """)

In [0]:
# =====================================================
# Build / maintain workspace.gold.dim_product as an SCD Type-2 dimension.
#
# Outcome is reported through the module-level `status` flag, which the
# final cell reads to mark the queue entry Succeeded or Failed.
# =====================================================
status = False

BRONZE_PRODUCTS_TABLE = "workspace.bronze.products"
GOLD_DIM_PRODUCT_TABLE = "workspace.gold.dim_product"

# Business attributes whose change produces a new version of a product.
TRACKED_COLUMNS = ["ProductName", "ProductCategory", "ProductPrice"]
# Bookkeeping columns the SCD-2 merge relies on.
SCD_COLUMNS = ["RecordHash", "StartDate", "EndDate", "IsCurrent"]
# Final column order: the five original dimension columns first, then SCD columns.
DIM_COLUMNS = ["ProductKey", "ProductId"] + TRACKED_COLUMNS + SCD_COLUMNS


def _record_hash():
    """Return a SHA-256 column expression over the tracked attributes.

    concat_ws skips NULL inputs, which would make ("a", NULL, "b") and
    (NULL, "a", "b") hash identically; NULLs are therefore replaced by an
    empty string so every attribute keeps its position in the hashed text.
    """
    parts = [
        F.coalesce(col(name).cast(StringType()), F.lit(""))
        for name in TRACKED_COLUMNS
    ]
    return F.sha2(F.concat_ws("||", *parts), 256)


def _as_current_version(df):
    """Stamp every row of `df` as the open (current) version of its product."""
    return (
        df
        .withColumn("StartDate", F.current_timestamp())
        .withColumn("EndDate", F.lit(None).cast("timestamp"))
        .withColumn("IsCurrent", F.lit(True))
    )


def _with_product_keys(df, first_key):
    """Add a ProductKey column with consecutive values starting at `first_key`.

    Rows are numbered in ProductId order so the assignment is deterministic
    for a given input. The un-partitioned window moves all rows to a single
    partition, which is acceptable for a small dimension table.
    """
    ordinal = F.row_number().over(Window.orderBy("ProductId"))
    return df.withColumn(
        "ProductKey",
        (ordinal + F.lit(first_key - 1)).cast(LongType())
    )


def _unknown_product():
    """Return the single 'Unknown' member (ProductId '-1', ProductKey -1)."""
    unknown_schema = StructType([
        StructField("ProductId", StringType(), False),
        StructField("ProductName", StringType(), False),
        StructField("ProductCategory", StringType(), False),
        StructField("ProductPrice", DoubleType(), False),
        StructField("ProductKey", LongType(), False)
    ])
    return (
        spark.createDataFrame(
            [("-1", "Unknown", "Unknown", 0.0, -1)],
            schema=unknown_schema
        )
        .withColumn("RecordHash", _record_hash())
    )


def _dim_supports_scd2():
    """Tell whether the gold table already exists in a usable SCD-2 shape.

    Returns False when the table is missing, lacks ProductKey or any SCD
    column, or holds no current row at all. In those cases there is no
    history worth preserving and the table is rebuilt from scratch.
    """
    if not spark.catalog.tableExists(GOLD_DIM_PRODUCT_TABLE):
        return False
    existing_df = spark.table(GOLD_DIM_PRODUCT_TABLE)
    required = set(SCD_COLUMNS) | {"ProductKey"}
    if not required.issubset(existing_df.columns):
        return False
    return existing_df.where("IsCurrent = true").limit(1).count() > 0


try:
    # -------------------------------------------------
    # Read Bronze Products: clean, standardize, hash
    # -------------------------------------------------
    # Rows without a ProductId are dropped: a NULL business key can never
    # match in the merge and would be re-inserted on every run.
    # NOTE(review): dropDuplicates keeps an arbitrary row per ProductId; if
    # bronze can hold conflicting duplicates, pick a winner explicitly.
    source_df = (
        spark.table(BRONZE_PRODUCTS_TABLE)
        .select(
            col("ProductId").cast(StringType()).alias("ProductId"),
            col("ProductName").cast(StringType()).alias("ProductName"),
            col("ProductCategory").cast(StringType()).alias("ProductCategory"),
            col("ProductPrice").cast(DoubleType()).alias("ProductPrice")
        )
        .where(col("ProductId").isNotNull())
        .dropDuplicates(["ProductId"])
        .withColumn("RecordHash", _record_hash())
    )

    if not _dim_supports_scd2():
        # -------------------------------------------------
        # Initial load: all products + the Unknown member
        # -------------------------------------------------
        initial_df = _as_current_version(
            _with_product_keys(source_df, first_key=1)
            .unionByName(_unknown_product())
        ).select(*DIM_COLUMNS)

        (
            initial_df
            .write
            .format("delta")
            .mode("overwrite")
            # The pre-existing table may have the old 5-column layout.
            .option("overwriteSchema", "true")
            .saveAsTable(GOLD_DIM_PRODUCT_TABLE)
        )
    else:
        # -------------------------------------------------
        # Incremental load: SCD Type-2 MERGE
        # -------------------------------------------------
        target = DeltaTable.forName(spark, GOLD_DIM_PRODUCT_TABLE)
        target_df = target.toDF()
        current_df = target_df.where("IsCurrent = true")

        # New surrogate keys continue after the highest key already issued
        # (the Unknown member's -1 is ignored).
        highest_key = target_df.agg(F.max("ProductKey")).first()[0]
        first_new_key = 1 if highest_key is None or highest_key < 1 else highest_key + 1

        # Source rows with no identical current version: brand-new products
        # plus products whose tracked attributes changed.
        pending_df = source_df.alias("s").join(
            current_df.alias("t"),
            (col("s.ProductId") == col("t.ProductId"))
            & (col("s.RecordHash") == col("t.RecordHash")),
            "left_anti"
        )
        pending_df = _as_current_version(
            _with_product_keys(pending_df, first_new_key)
        )

        # Subset of pending rows that replace an existing current version.
        changed_df = pending_df.join(
            current_df.select("ProductId"), "ProductId", "left_semi"
        )

        # A changed product must do two things in one atomic merge: expire
        # the old row and insert the new one. The copy keyed by ProductId
        # matches (and expires) the old row; the copy with a NULL MergeKey
        # matches nothing and is therefore inserted as the new version.
        staged_df = (
            pending_df.withColumn("MergeKey", col("ProductId"))
            .unionByName(
                changed_df.withColumn("MergeKey", F.lit(None).cast(StringType()))
            )
        )

        (
            target.alias("t")
            .merge(
                staged_df.alias("s"),
                "t.ProductId = s.MergeKey AND t.IsCurrent = true"
            )
            # Expire existing record when data changes
            .whenMatchedUpdate(
                condition="NOT (t.RecordHash <=> s.RecordHash)",
                set={
                    "EndDate": "current_timestamp()",
                    "IsCurrent": "false"
                }
            )
            # Insert new products and new versions of changed products
            .whenNotMatchedInsert(
                values={name: f"s.{name}" for name in DIM_COLUMNS}
            )
            .execute()
        )

    # Only reached when every step above completed.
    status = True

except Exception as e:
    # Failure is recorded (not re-raised) so the final cell can mark the
    # queue entry as Failed before raising the hard failure itself.
    print(f"Error: {e}")
    status = False



In [0]:
# Mark file as Success/Failed
# `status` is set by the transformation cell: True only when every step
# completed, False otherwise.
final_status = "Succeeded" if status else "Failed"

# ProcessDuration is stored in whole minutes: the second difference between
# now and ProcessStartTime is divided by 60 and cast to BIGINT.
spark.sql(f"""
        UPDATE control.processqueue
        SET
            ProcessStatus = '{final_status}',
            ProcessEndTime = current_timestamp(),
            ProcessDuration = CAST(
                (unix_timestamp(current_timestamp()) - unix_timestamp(ProcessStartTime)) / 60
                AS BIGINT
            )
        WHERE
            StageId = {StageId}
            AND ProcessInstanceId = {ProcessInstanceId}
            AND ProcessQueueId = {ProcessQueueId}
            AND TableName = '{TableName}'
            """)

if status:
    print(f"{TableName} Marked as Successful")
else:
    print(f"{TableName} Marked as Failed")
    # Raise after the control table is updated so the notebook run fails.
    raise Exception(f"Hard failure: {TableName} Failure detected")