In [0]:
%sql
-- Remove any existing copy of the fact table so the notebook can recreate it from scratch.
DROP TABLE IF EXISTS processed_data.fact_sales_performance;

In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *

# Source tables for the sales-performance fact build.
# Each one is read in full; column pruning happens in the later select/join steps.
df_trans = spark.table("raw_data.fact_trans")        # transaction lines
df_avg_cst = spark.table("raw_data.fact_avg_cst")    # cost data
df_date = spark.table("processed_data.dim_date")     # date dimension
df_prdct = spark.table("raw_data.dim_prdct")         # product dimension
df_pos_site = spark.table("raw_data.dim_pos_site")   # store / site dimension

# Narrow the transaction data to the columns used downstream.
# pos_site_id is exposed as store_id; every other column keeps its original name.
df_trans = df_trans.select(
    "order_id",
    "line_id",
    "fscldt_id",
    col("pos_site_id").alias("store_id"),
    "sku_id",
    "price_substate_id",
    "sales_units",
    "sales_dollars",
    "discount_dollars",
)

# Enrich each transaction line with product, cost and calendar attributes.
# All joins are LEFT joins, so a transaction with no match is kept and the
# joined-in columns are NULL for that row.
df_sales = (
    df_trans
    # Product attributes, matched on sku_id
    .join(
        df_prdct.select("sku_id", "cat_id", "cat_label", "dept_id", "dept_label"),
        on="sku_id",
        how="left",
    )
    # Unit landed cost, matched on sku_id + fiscal date
    .join(
        df_avg_cst.select("sku_id", "fscldt_id", "average_unit_landedcost"),
        on=["sku_id", "fscldt_id"],
        how="left",
    )
    # Fiscal week / month / quarter / year ids, matched on fiscal date
    .join(
        df_date.select("fscldt_id", "fsclwk_id", "fsclmth_id", "fsclqrtr_id", "fsclyr_id"),
        on="fscldt_id",
        how="left",
    )
)

# Store attributes: the key is named store_id on the sales side and site_id on the
# site side, so an explicit equality condition is used. site_id is dropped afterwards
# because it carries the same value as store_id.
df_sales = df_sales.join(
    df_pos_site.select("site_id", "site_label", "chnl_label"),
    on=df_sales["store_id"] == df_pos_site["site_id"],
    how="left",
).drop("site_id")

#Calculate Metrics
# net_sales: sales dollars after subtracting discounts.
df_sales = df_sales.withColumn("net_sales", col("sales_dollars") - col("discount_dollars"))

# gross_profit: sales dollars minus the landed cost of the units sold.
# The cost column comes from a LEFT join, so rows without a cost record yield NULL here.
# NOTE(review): this is based on sales_dollars rather than net_sales - confirm that
# discounts are intentionally excluded from the profit calculation.
df_sales = df_sales.withColumn(
    "gross_profit",
    col("sales_dollars") - (col("sales_units") * col("average_unit_landedcost")),
)

# profit_margin: gross_profit / sales_dollars, rounded to 2 decimal places.
# `round` here is pyspark.sql.functions.round (brought in by the wildcard import).
# The division is guarded so a zero-dollar line produces NULL instead of failing:
# with ANSI SQL mode enabled an unguarded divide-by-zero raises an error and aborts
# the job. `when` without `otherwise` returns NULL for unmatched rows, which is the
# same result non-ANSI mode already gives for x / 0.
df_sales = df_sales.withColumn(
    "profit_margin",
    when(
        col("sales_dollars") != 0,
        round(col("gross_profit") / col("sales_dollars"), 2),
    ),
)

# Surrogate key for the fact rows. monotonically_increasing_id() guarantees unique,
# increasing 64-bit values, but they are not consecutive and depend on partitioning.
df_sales = df_sales.withColumn("fact_sales_id", monotonically_increasing_id())

# Persist the result as a managed table, replacing any existing contents.
df_sales.write.saveAsTable("processed_data.fact_sales_performance", mode="overwrite")


In [0]:
%sql
-- Inspect the contents of the sales-performance fact table.
SELECT * FROM processed_data.fact_sales_performance


In [0]:
%sql
-- Inspect the contents of the date dimension.
SELECT * FROM processed_data.dim_date