# 03_model_gold

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format

# Attach to the active Spark session, or start one named "GoldModel" if none exists.
spark = (
    SparkSession.builder
    .appName("GoldModel")
    .getOrCreate()
)

# Load silver data

In [None]:
# Silver-layer Delta table that the gold tables in this notebook are built from.
silver_path = "dbfs:/FileStore/silver/online_retail"
silver_df = spark.read.load(silver_path, format="delta")

# Dim: Products

In [None]:
# Product dimension: one row per distinct (StockCode, Description) pair in silver.
# NOTE(review): uniqueness is on the pair, so a StockCode that appears with more
# than one Description yields more than one row -- confirm StockCode is unique
# upstream before joining on it alone.
product_cols = ["StockCode", "Description"]
dim_product = silver_df.select(*product_cols).distinct()

# Replace the stored gold table on every run.
(
    dim_product.write
    .format("delta")
    .mode("overwrite")
    .save("dbfs:/FileStore/gold/dim_product")
)

# Dim: Customers


In [None]:
# Customer dimension: one row per distinct (CustomerID, Country) pair in silver.
# NOTE(review): uniqueness is on the pair, so a CustomerID that appears with more
# than one Country yields more than one row -- confirm CustomerID is unique
# upstream before joining on it alone.
customer_cols = ["CustomerID", "Country"]
dim_customer = silver_df.select(*customer_cols).distinct()

# Replace the stored gold table on every run.
(
    dim_customer.write
    .format("delta")
    .mode("overwrite")
    .save("dbfs:/FileStore/gold/dim_customer")
)

# Dim: Date


In [None]:
# Date dimension: one row per distinct InvoiceDateOnly value, extended with
# integer columns obtained by formatting that value and casting the text to int.
#   DateKey -> yyyyMMdd, Year -> yyyy, Month -> MM
date_parts = {"DateKey": "yyyyMMdd", "Year": "yyyy", "Month": "MM"}
distinct_days = silver_df.select("InvoiceDateOnly").distinct()
dim_date = distinct_days.select(
    "InvoiceDateOnly",
    *[
        date_format("InvoiceDateOnly", pattern).cast("int").alias(col_name)
        for col_name, pattern in date_parts.items()
    ],
)

# Replace the stored gold table on every run.
(
    dim_date.write
    .format("delta")
    .mode("overwrite")
    .save("dbfs:/FileStore/gold/dim_date")
)

# Fact: Sales

In [None]:

# Sales fact: the silver rows reduced to the measure and key columns, with an
# integer yyyyMMdd DateKey computed from InvoiceDateOnly using the same
# expression as the date dimension.
sales_date_key = date_format("InvoiceDateOnly", "yyyyMMdd").cast("int").alias("DateKey")
fact_sales = silver_df.select(
    "InvoiceNo",
    sales_date_key,
    "StockCode",
    "CustomerID",
    "Quantity",
    "UnitPrice",
    "SalesAmount",
)

# Replace the stored gold table on every run, laid out with one partition per
# distinct DateKey value.
# NOTE(review): daily partitions can mean many small files on a modest dataset;
# confirm this granularity is intended.
(
    fact_sales.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("DateKey")
    .save("dbfs:/FileStore/gold/fact_sales")
)

# Sanity check: read each gold table back from storage and print its row count.
gold_tables = ("dim_product", "dim_customer", "dim_date", "fact_sales")
for tbl in gold_tables:
    path = f"dbfs:/FileStore/gold/{tbl}"
    stored = spark.read.load(path, format="delta")
    cnt = stored.count()
    print(f"{tbl}: {cnt} rows")