In [0]:
import json
import traceback

from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id, coalesce, lit


In [0]:
# Parameters extraction
# The "Parameters" widget value is parsed as JSON and treated as a list of
# dict-like entries. The entry whose TableName matches this notebook's table
# supplies the identifiers used by the later control-table updates.
TARGET_TABLE = "Orders"

Parameters = json.loads(dbutils.widgets.get("Parameters"))

# Reset to neutral defaults first so a value left over from an earlier
# interactive run can never leak into this one.
ProcessInstanceId = 0
ProcessQueueId = 0
StageId = 0
TableName = ""

matching_entries = [p for p in Parameters if p.get("TableName") == TARGET_TABLE]
if not matching_entries:
    # Fail here with a precise message instead of letting the zero defaults
    # trigger an unrelated-looking stage error further down.
    raise ValueError(
        f"No entry with TableName '{TARGET_TABLE}' found in the Parameters widget"
    )

# When several entries match, the last one wins.
entry = matching_entries[-1]

required_fields = ("ProcessInstanceId", "ProcessQueueId", "StageId")
missing_fields = [key for key in required_fields if entry.get(key) is None]
if missing_fields:
    # int(None) would raise an opaque TypeError; name the missing fields instead.
    raise ValueError(
        f"Parameters entry for '{TARGET_TABLE}' is missing: {', '.join(missing_fields)}"
    )

# The ids are cast to int, so they are safe to interpolate into SQL later.
ProcessInstanceId = int(entry["ProcessInstanceId"])
ProcessQueueId = int(entry["ProcessQueueId"])
StageId = int(entry["StageId"])
TableName = str(entry["TableName"])

In [0]:
# Mark current table as InProgress
if StageId != 4:
    # Only stage 4 is handled here; any other stage aborts before touching
    # the control table.
    raise Exception(f"Stage Id is not relavent to R2B-transformation for table: {TableName}")
else:
    # Flag the matching queue row as running and stamp its start time.
    in_progress_sql = f"""
        update control.processqueue
        set ProcessStatus = 'InProgress',
            ProcessStartTime = current_timestamp()
        where StageId = {StageId}
            and ProcessInstanceId = {ProcessInstanceId}
            and ProcessQueueId = {ProcessQueueId}
            and TableName = '{TableName}';
    """
    spark.sql(in_progress_sql)

In [0]:
# Build workspace.gold.fact_orders from workspace.bronze.orders:
# left-join the customer, product and date dimensions, substitute -1 for any
# dimension key that found no match, add a surrogate key, and overwrite the
# Delta table. Exceptions are caught rather than propagated; `status` records
# the outcome and is read later to update control.processqueue.
status = False

try:
    # -------------------------------------------------
    # Read source fact data
    # -------------------------------------------------
    orders_df = spark.table("workspace.bronze.orders")

    # -------------------------------------------------
    # Read dimensions
    # -------------------------------------------------
    dim_customer = spark.table("workspace.gold.dim_customer")
    dim_product  = spark.table("workspace.gold.dim_product")
    dim_date     = spark.table("workspace.gold.dim_date")

    # -------------------------------------------------
    # Join with DimCustomer
    # -------------------------------------------------
    # Left join: orders without a matching customer are kept.
    orders_with_customer = (
        orders_df
        .join(
            dim_customer,
            orders_df.CustomerId == dim_customer.CustomerId,
            "left"
        )
    )

    # -------------------------------------------------
    # Join with DimProduct
    # -------------------------------------------------
    orders_with_product = (
        orders_with_customer
        .join(
            dim_product,
            orders_with_customer.OrdersProductId == dim_product.ProductId,
            "left"
        )
    )

    # -------------------------------------------------
    # Join with DimDate
    # -------------------------------------------------
    # NOTE(review): assumes OrderDate and FullDate have comparable types
    # (e.g. both DATE) -- confirm against the table schemas.
    orders_with_date = (
        orders_with_product
        .join(
            dim_date,
            orders_with_product.OrderDate == dim_date.FullDate,
            "left"
        )
    )

    # -------------------------------------------------
    # Handle missing dimension lookups (Unknown keys)
    # -------------------------------------------------
    # A failed left-join lookup leaves the dimension key NULL; coalesce maps
    # it to -1. Cast columns are aliased explicitly so the output names do
    # not depend on Spark's automatic naming of cast expressions.
    fact_orders = (
        orders_with_date
        .select(
            col("OrderId").cast("string").alias("OrderId"),
            coalesce(dim_customer.CustomerKey, lit(-1)).alias("CustomerKey"),
            coalesce(dim_product.ProductKey, lit(-1)).alias("ProductKey"),
            coalesce(dim_date.DateKey, lit(-1)).alias("OrderDateKey"),
            col("OrderPrice").cast("double").alias("OrderPrice")
        )
    )

    # -------------------------------------------------
    # Add Fact surrogate key
    # -------------------------------------------------
    # NOTE(review): monotonically_increasing_id() yields unique, increasing
    # but non-consecutive ids that depend on partitioning. Because the table
    # is overwritten on every run, OrderKey values are regenerated each time.
    fact_orders = fact_orders.withColumn(
        "OrderKey",
        monotonically_increasing_id()
    )

    # -------------------------------------------------
    # Final column order
    # -------------------------------------------------
    fact_orders = fact_orders.select(
        "OrderKey",
        "OrderId",
        "CustomerKey",
        "ProductKey",
        "OrderDateKey",
        "OrderPrice"
    )

    # -------------------------------------------------
    # Write to Gold
    # -------------------------------------------------
    (
        fact_orders
        .write
        .format("delta")
        .mode("overwrite")
        .saveAsTable("workspace.gold.fact_orders")
    )

except Exception as e:
    print(f"Error: {e}")
    # Keep the full stack trace in the cell output; the message alone rarely
    # identifies which read, join or write failed.
    traceback.print_exc()
    status = False
else:
    # Reached only when every step above, including the write, completed.
    status = True


In [0]:
# Mark table as Succeeded/Failed in the control table
# `status` is truthy only when the gold write completed without an exception.
final_status = "Succeeded" if status else "Failed"

# A single UPDATE serves both outcomes; only the status literal differs.
# ProcessDuration is the elapsed time since ProcessStartTime in minutes,
# truncated to a whole number by the BIGINT cast.
spark.sql(f"""
        UPDATE control.processqueue
        SET
            ProcessStatus = '{final_status}',
            ProcessEndTime = current_timestamp(),
            ProcessDuration = CAST(
                (unix_timestamp(current_timestamp()) - unix_timestamp(ProcessStartTime)) / 60
                AS BIGINT
            )
        WHERE
            StageId = {StageId}
            AND ProcessInstanceId = {ProcessInstanceId}
            AND ProcessQueueId = {ProcessQueueId}
            AND TableName = '{TableName}'
            """)

if status:
    print(f"{TableName} Marked as Successful")
else:
    print(f"{TableName} Marked as Failed")
    # Raise after recording the failure so the notebook run itself ends in
    # error rather than appearing to succeed.
    raise Exception(f"Hard failure: {TableName} Failure detected")