In [1]:
#Build dimension model in Golden Layer
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Window
from pyspark.sql.functions import row_number
import traceback

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 3, Finished, Available, Finished)

In [2]:
#Parameterize workspace and lakehouse name
Workspace = "NewRetailCompany"
Lakehouse = "Semantic_Zone"

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 4, Finished, Available, Finished)

In [3]:
sz_rebate_df = spark.read.table("sz_rebate")

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 5, Finished, Available, Finished)

In [4]:
sz_rebate_df.printSchema()

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 6, Finished, Available, Finished)

root
 |-- SalesRepID: string (nullable = true)
 |-- SalesRepName: string (nullable = true)
 |-- SalesRepRegion: string (nullable = true)
 |-- CustomerCountry: string (nullable = true)
 |-- CustomerRegion: string (nullable = true)
 |-- CurrencyCode: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- SaleTransactionID: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- RebateDatetime: timestamp (nullable = true)
 |-- RebateTransactionID: string (nullable = true)
 |-- RebateType: string (nullable = true)
 |-- RebateAmount: decimal(18,2) (nullable = true)
 |-- SaleRegion: string (nullable = true)
 |-- SaleCountry: string (nullable = true)



In [5]:
#Create dim table
# CustomerKey will be surrogate key 
customerDim = (sz_rebate_df
    .select("CustomerID", "CustomerName", "CustomerRegion", "CustomerCountry")
    .dropDuplicates(["CustomerID"])
    .withColumn("CustomerKey", F.monotonically_increasing_id())
    .select("CustomerKey", "CustomerID", "CustomerName", "CustomerRegion", "CustomerCountry")
)

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 7, Finished, Available, Finished)

In [6]:
#Contain Sales representative data
#SalesRepKey is suggorate key
salesRepDim = (sz_rebate_df
    .select("SalesRepID", "SalesRepName", "SalesRepRegion")
    .dropDuplicates(["SalesRepID"])
    .withColumn("SalesRepKey", F.monotonically_increasing_id())
    .select("SalesRepKey", "SalesRepID", "SalesRepName", "SalesRepRegion")
)

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 8, Finished, Available, Finished)

In [7]:
#Currency code
currencyDim = (sz_rebate_df
    .select("CurrencyCode")
    .dropDuplicates()
    .withColumn("CurrencyKey", F.monotonically_increasing_id())
    .select("CurrencyKey", "CurrencyCode")
)

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 9, Finished, Available, Finished)

In [8]:
#Rebate details

rebateTypeDim = (sz_rebate_df
    .select("RebateType")
    .dropDuplicates()
    .withColumn("RebateTypeKey", F.monotonically_increasing_id())
    .select("RebateTypeKey", "RebateType")
)

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 10, Finished, Available, Finished)

In [9]:
#Date dim
dateDim = (sz_rebate_df
    .select(F.col("RebateDatetime").alias("FullDate"))
    .dropna()
    .dropDuplicates()
    .withColumn("DateKey", F.date_format("FullDate", "yyyyMMdd").cast("int"))
    .withColumn("Year", F.year("FullDate"))
    .withColumn("Quarter", F.quarter("FullDate"))
    .withColumn("Month", F.month("FullDate"))
    .withColumn("Week", F.weekofyear("FullDate"))
    .withColumn("DayOfMonth", F.dayofmonth("FullDate"))
    .withColumn("DayOfWeek", F.date_format("FullDate", "E"))  # Monday
    .select("DateKey", "FullDate", "Year", "Quarter", "Month", "Week", "DayOfMonth", "DayOfWeek")
)

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 11, Finished, Available, Finished)

Fact Table - fact_rebate

In [10]:
# Join Rebate transactions fact with dimensions
#fact table is just made up with suggorate keys from all dim tables
try:
        
    fact_rebate_df = (sz_rebate_df.alias("r")
        .join(customerDim.alias("c"), F.col("r.CustomerID") == F.col("c.CustomerID"), "left")
        .join(salesRepDim.alias("s"), F.col("r.SalesRepID") == F.col("s.SalesRepID"), "left")
        .join(dateDim.alias("d"), F.to_date("r.RebateDatetime") == F.col("d.FullDate"), "left")
        .join(rebateTypeDim.alias("reb"), F.col("r.RebateType") == F.col("reb.RebateType"), "left")
        .join(currencyDim.alias("e"), F.col("r.CurrencyCode") == F.col("e.CurrencyCode"), "left")
        .select(
            # Foreign Keys
            F.col("c.CustomerKey"),
            F.col("s.SalesRepKey"),
            F.col("d.DateKey"),
            F.col("reb.RebateTypeKey"),
            F.col("e.CurrencyKey"),
            F.col("r.RebateAmount"),
            F.col("r.SaleTransactionID"),
            F.col("r.RebateTransactionID")
        )
    )

    print("Golden layer FactRebate built successfully!")

except AnalysisException as e:
    print("Spark AnalysisException in Golden layer:", str(e))
    traceback.print_exc()



StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 12, Finished, Available, Finished)

Golden layer FactRebate built successfully!


In [11]:
fact_rebate_df.printSchema()

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 13, Finished, Available, Finished)

root
 |-- CustomerKey: long (nullable = true)
 |-- SalesRepKey: long (nullable = true)
 |-- DateKey: integer (nullable = true)
 |-- RebateTypeKey: long (nullable = true)
 |-- CurrencyKey: long (nullable = true)
 |-- RebateAmount: decimal(18,2) (nullable = true)
 |-- SaleTransactionID: string (nullable = true)
 |-- RebateTransactionID: string (nullable = true)



In [12]:
fact_rebate_df.head(1)

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 14, Finished, Available, Finished)

[Row(CustomerKey=766, SalesRepKey=20, DateKey=None, RebateTypeKey=1, CurrencyKey=2, RebateAmount=Decimal('1.16'), SaleTransactionID='8221046822414', RebateTransactionID='0001062538722')]

Write all tables to Lakehouse

In [13]:
# Write to Golden Layer Fact Table

try:
    customerDim.write \
        .format("delta") \
        .mode("overwrite") \
        .save("abfss://"+Workspace+"@onelake.dfs.fabric.microsoft.com/"+Lakehouse+".Lakehouse/Tables/dim_customer")

    salesRepDim.write \
        .format("delta") \
        .mode("overwrite") \
        .save("abfss://"+Workspace+"@onelake.dfs.fabric.microsoft.com/"+Lakehouse+".Lakehouse/Tables/dim_salesrep")

    dateDim.write \
        .format("delta") \
        .mode("overwrite") \
        .save("abfss://"+Workspace+"@onelake.dfs.fabric.microsoft.com/"+Lakehouse+".Lakehouse/Tables/dim_date")

    rebateTypeDim.write \
        .format("delta") \
        .mode("overwrite") \
        .save("abfss://"+Workspace+"@onelake.dfs.fabric.microsoft.com/"+Lakehouse+".Lakehouse/Tables/dim_rebatetype")

    currencyDim.write \
        .format("delta") \
        .mode("overwrite") \
        .save("abfss://"+Workspace+"@onelake.dfs.fabric.microsoft.com/"+Lakehouse+".Lakehouse/Tables/dim_currency")

    fact_rebate_df.write \
        .format("delta") \
        .mode("overwrite") \
        .save("abfss://"+Workspace+"@onelake.dfs.fabric.microsoft.com/"+Lakehouse+".Lakehouse/Tables/fact_rebate")

    
    print("Golden layer Rebate Dim and Fact tables saved successfully!")


except Exception as e:
    print("Unexpected error occurred:", str(e))
    traceback.print_exc()

StatementMeta(, 01440301-fe48-4bf3-91a9-7a191ab98172, 15, Finished, Available, Finished)

Golden layer Rebate Dim and Fact tables saved successfully!
