In [1]:
#Build dimension model in Golden Layer
#import libraries
import traceback

from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import row_number
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException

StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 3, Finished, Available, Finished)

In [2]:
#Parameterize workspace and lakehouse name
# Both values are concatenated into the OneLake abfss:// paths used when the
# dimension and fact tables are written at the end of this notebook.
Workspace = "NewRetailCompany"
Lakehouse = "Semantic_Zone"

StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 4, Finished, Available, Finished)

In [3]:
# Source table for this notebook: every dimension and the sales fact below are derived from it.
# NOTE(review): `spark` is not created in this notebook — presumably the session is supplied by the runtime.
sz_sales_df = spark.read.table("sz_sales_transaction")

StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 5, Finished, Available, Finished)

In [4]:
# Print the source schema so the columns available to the dimension/fact builds can be inspected.
sz_sales_df.printSchema()

StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 6, Finished, Available, Finished)

root
 |-- SaleTransactionID: string (nullable = true)
 |-- SalesRepID: string (nullable = true)
 |-- SalesRepName: string (nullable = true)
 |-- SalesRepCountry: string (nullable = true)
 |-- SalesRepCountryCode: string (nullable = true)
 |-- SalesRepRegion: string (nullable = true)
 |-- SalesCountryCode: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- CustomerCountry: string (nullable = true)
 |-- CustomerRegion: string (nullable = true)
 |-- CustomerCountryCode: string (nullable = true)
 |-- UnitQuantity: integer (nullable = true)
 |-- SaleAmount: decimal(18,2) (nullable = true)
 |-- SaleDatetime: timestamp (nullable = true)
 |-- CurrencyCode: string (nullable = true)
 |-- Category1: string (nullable = true)
 |-- Category2: string (nullable = true)
 |-- Category3: string (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- SaleRegion: string (nullable = true)
 |-- SaleCountry: string (nullable = true)
 |

In [5]:
#Create dim tables
# Product dimension: one row per distinct combination of the product attributes.
# ProductKey is the surrogate key.
#
# The key is generated with row_number() over a total ordering of the (already
# distinct) attribute columns instead of monotonically_increasing_id().
# monotonically_increasing_id() encodes the partition id and the row position
# inside the partition, so its values can change whenever this lazy DataFrame is
# re-evaluated. productDim is evaluated once for the fact join and once more when
# dim_product is written, so unstable keys could leave fact rows pointing at the
# wrong product. Ordering by every attribute makes the numbering reproducible for
# the same input data.
productDim = (sz_sales_df
    .select(
        "Category1","Category2","Category3","ProductName",
        "Level1Code","Level2Code","Level3Code","SpecificProductCode"
    )
    .dropDuplicates()
    .withColumn(
        "ProductKey",
        # Un-partitioned window => all rows go through a single task.
        # Assumes the dimension is small enough for that — TODO confirm.
        # Cast keeps the key type as long, as it was before.
        F.row_number().over(Window.orderBy(
            "Category1","Category2","Category3","ProductName",
            "Level1Code","Level2Code","Level3Code","SpecificProductCode"
        )).cast("long")
    )
    .select("ProductKey","Category1","Category2","Category3","ProductName",
            "Level1Code","Level2Code","Level3Code","SpecificProductCode")
)

StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 7, Finished, Available, Finished)

In [6]:
# Customer dimension: one row per distinct combination of the customer attributes.
# CustomerKey is the surrogate key.
#
# The key comes from row_number() over a total ordering of the distinct attribute
# columns rather than monotonically_increasing_id(): the latter depends on how the
# data happens to be partitioned, so rebuilding this DataFrame (here or in another
# notebook) could hand out different keys than the ones already stored in the fact.

customerDim = (sz_sales_df
    .select(
        F.col("CustomerID"),
        F.col("CustomerName"),
        F.col("CustomerCountry"),
        F.col("CustomerRegion"),
        F.col("CustomerCountryCode")
    )
    .dropDuplicates()
    .withColumn(
        "CustomerKey",
        # Un-partitioned window => single task; assumes the dimension is small — TODO confirm.
        # Cast keeps the key type as long, matching the previous schema.
        F.row_number().over(Window.orderBy(
            "CustomerID","CustomerName","CustomerCountry","CustomerRegion","CustomerCountryCode"
        )).cast("long")
    )
    .select("CustomerKey","CustomerID","CustomerName","CustomerCountry","CustomerRegion","CustomerCountryCode")
)


StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 8, Finished, Available, Finished)

In [7]:
# Sales representative dimension: one row per distinct combination of the sales rep attributes.
# SalesRepKey is the surrogate key.
#
# row_number() over a total ordering of the distinct attribute columns is used
# instead of monotonically_increasing_id(), whose values depend on partitioning and
# are therefore not reproducible when this lazy DataFrame is evaluated again.

salesRepDim = (sz_sales_df
    .select(
        "SalesRepID","SalesRepName","SalesRepCountry","SalesRepCountryCode","SalesRepRegion"
    )
    .dropDuplicates()
    .withColumn(
        "SalesRepKey",
        # Un-partitioned window => single task; assumes the dimension is small — TODO confirm.
        # Cast keeps the key type as long, matching the previous schema.
        F.row_number().over(Window.orderBy(
            "SalesRepID","SalesRepName","SalesRepCountry","SalesRepCountryCode","SalesRepRegion"
        )).cast("long")
    )
    .select("SalesRepKey","SalesRepID","SalesRepName","SalesRepCountry","SalesRepCountryCode","SalesRepRegion")
)

StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 9, Finished, Available, Finished)

In [8]:
# Date dimension built from the distinct, non-null SaleDatetime values.
# DateKey is the calendar day of FullDate encoded as an integer (yyyyMMdd).
# NOTE(review): rows are de-duplicated per timestamp, so several rows can share
# the same DateKey — confirm this is the intended grain.

dateDim = (sz_sales_df
    .select(F.col("SaleDatetime").alias("FullDate"))
    .dropna()
    .dropDuplicates()
    # Derive all calendar attributes in a single projection.
    .select(
        F.date_format("FullDate","yyyyMMdd").cast("int").alias("DateKey"),
        F.col("FullDate"),
        F.year("FullDate").alias("Year"),
        F.quarter("FullDate").alias("Quarter"),
        F.month("FullDate").alias("Month"),
        F.weekofyear("FullDate").alias("Week"),
        F.dayofmonth("FullDate").alias("DayOfMonth"),
        F.date_format("FullDate","E").alias("DayOfWeek"),  # abbreviated day name
        F.hour("FullDate").alias("Hour"),
    )
)


StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 10, Finished, Available, Finished)

In [9]:
# Currency dimension: one row per distinct CurrencyCode; CurrencyKey is the surrogate key.
# The key is numbered with row_number() ordered by CurrencyCode so that it is
# reproducible for the same data; monotonically_increasing_id() depends on the
# partition layout and may differ between evaluations of this lazy DataFrame.
currencyDim = (sz_sales_df
    .select("CurrencyCode")
    .dropDuplicates()
    # Cast keeps the key type as long, as produced previously.
    .withColumn("CurrencyKey", F.row_number().over(Window.orderBy("CurrencyCode")).cast("long"))
    .select("CurrencyKey", "CurrencyCode")
)

StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 11, Finished, Available, Finished)

Fact table - Sales fact

In [13]:
# Build the sales fact: each source transaction is enriched with the surrogate
# keys of the customer, sales rep, product and date dimensions.
# All joins are left joins, so a transaction with no dimension match is kept
# with a null key.
# NOTE(review): the dimensions are de-duplicated on all their attribute columns
# but joined here on the natural key only; if one ID ever appears with two
# attribute variants, the join would duplicate fact rows — verify against the data.
try:
    fact_sales = (sz_sales_df.alias("s")
        .join(customerDim.alias("c"), F.col("s.CustomerID") == F.col("c.CustomerID"), "left")
        .join(salesRepDim.alias("r"), F.col("s.SalesRepID") == F.col("r.SalesRepID"), "left")
        .join(productDim.alias("p"), F.col("s.SpecificProductCode") == F.col("p.SpecificProductCode"), "left")
        # dateDim.FullDate is the full SaleDatetime timestamp (one row per distinct
        # timestamp), so it must be matched on the timestamp itself. Comparing it
        # with to_date(SaleDatetime) only matches sales stamped exactly at midnight
        # and leaves DateKey null for everything else.
        .join(dateDim.alias("d"), F.col("s.SaleDatetime") == F.col("d.FullDate"), "left")
        # No column of "e" is selected below; the join is kept as-is so the fact schema is unchanged.
        .join(currencyDim.alias("e"), F.col("s.CurrencyCode") == F.col("e.CurrencyCode"), "left")
        .select(
            F.col("s.SaleTransactionID"),
            F.col("c.CustomerKey"),
            F.col("r.SalesRepKey"),
            F.col("p.ProductKey"),
            F.col("d.DateKey"),
            F.col("s.UnitQuantity"),
            F.col("s.SaleAmount"),
            F.col("s.CurrencyCode"),
            F.col("s.SaleRegion"),
            F.col("s.SaleCountry"),
            F.col("s.SaleDatetime"),
            # Hour of the sale, derived in the same projection as the other columns.
            F.hour(F.col("s.SaleDatetime")).alias("Hour"),
        )
    )

    # Only the query plan has been defined at this point; the data is computed when the table is written.
    print("Golden layer FactSales built successfully!")

except AnalysisException as e:
    print("Spark AnalysisException in Golden layer:", str(e))
    traceback.print_exc()
    # Re-raise so the run stops here instead of continuing with a missing or stale fact_sales.
    raise


StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 15, Finished, Available, Finished)

Golden layer FactSales built successfully!


In [14]:
# Print the fact table schema to check the selected key and measure columns before writing.
fact_sales.printSchema()

StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 16, Finished, Available, Finished)

root
 |-- SaleTransactionID: string (nullable = true)
 |-- CustomerKey: long (nullable = true)
 |-- SalesRepKey: long (nullable = true)
 |-- ProductKey: long (nullable = true)
 |-- DateKey: integer (nullable = true)
 |-- UnitQuantity: integer (nullable = true)
 |-- SaleAmount: decimal(18,2) (nullable = true)
 |-- CurrencyCode: string (nullable = true)
 |-- SaleRegion: string (nullable = true)
 |-- SaleCountry: string (nullable = true)
 |-- SaleDatetime: timestamp (nullable = true)
 |-- Hour: integer (nullable = true)



In [15]:
# Write the product dimension and the sales fact to the lakehouse as Delta tables.
# The remaining dimension tables will be saved to the lakehouse in the rebate notebook.

try:
    # Common prefix of both table locations (same string the two writes used before).
    tables_root = "abfss://"+Workspace+"@onelake.dfs.fabric.microsoft.com/"+Lakehouse+".Lakehouse/Tables/"

    # mode("overwrite") replaces the existing table contents on every run.
    productDim.write \
        .format("delta") \
        .mode("overwrite") \
        .save(tables_root+"dim_product")

    fact_sales.write \
        .format("delta") \
        .mode("overwrite") \
        .save(tables_root+"fact_sales")

    print("Golden layer Sales Dim and Fact tables saved successfully!")

except Exception as e:
    print("Unexpected error occurred:", str(e))
    traceback.print_exc()
    # Re-raise so a failed write fails the cell/pipeline instead of being reported as finished.
    raise


StatementMeta(, e21d1249-a31a-46ab-8653-bee809df0cbb, 17, Finished, Available, Finished)

Golden layer Sales Dim and Fact tables saved successfully!
