In [0]:
%sql
-- Remove any previous copy of the holiday sales fact table before it is rebuilt.
-- IF EXISTS keeps this cell from failing when the table has not been created yet
-- (e.g. the first run in a clean workspace).
DROP TABLE IF EXISTS processed_data.fact_holiday_sales


In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *

# Build processed_data.fact_holiday_sales: the rows of fact_trans whose fiscal
# date (fscldt_id) corresponds to a date present in the holiday dimension.

# Load dimension and fact tables
dim_date = spark.read.table("processed_data.dim_date")  # Date dimension
dim_holiday = spark.read.table("processed_data.dim_time_hldy")  # Holiday dimension
fact_trans = spark.read.table("hive_metastore.raw_data.fact_trans")  # Transactions fact table

# Step 1: Map holiday dates onto the date dimension, producing
# (fscldt_id, hldy_id) pairs for every calendar date that is a holiday.
holiday_dates_df = dim_holiday.join(
    dim_date, dim_holiday["date"] == dim_date["date"], "inner"
).select(dim_date["fscldt_id"], dim_holiday["hldy_id"])

# Step 2: Filter transactions down to holiday dates.
# NOTE(review): the inner join emits one output row per matching hldy_id, so a
# date carrying several holidays would repeat each transaction — confirm that
# dim_time_hldy has at most one row per date if order_id must stay unique.
holiday_sales_df = (
    fact_trans
    .join(holiday_dates_df, "fscldt_id", "inner")  # Keep only transactions on holidays
    .select(
        col("order_id"),  # Intended as the primary key of the fact table
        col("fscldt_id"),
        col("hldy_id"),
        col("pos_site_id").alias("location_id"),
        col("sku_id").alias("product_id"),
        col("sales_units").alias("sales_qty"),
        col("sales_dollars").alias("sales_revenue"),
        col("discount_dollars").alias("discount_amount"),
        # Derive net sales from the source columns rather than from the aliases
        # defined in this same select: referencing sibling aliases only resolves
        # on runtimes with lateral-column-alias support and is ambiguous if the
        # input ever gains columns with those names.
        (col("sales_dollars") - col("discount_dollars")).alias("net_sales"),
    )
)

# Step 3: Save as a managed fact table (replaces any existing contents)
holiday_sales_df.write.mode("overwrite").saveAsTable("processed_data.fact_holiday_sales")

# Show sample data
holiday_sales_df.show(10)

In [0]:
%sql
-- Inspect the contents of the holiday sales fact table.
SELECT *
FROM processed_data.fact_holiday_sales

In [0]:
%sql
-- Preview up to 20 rows of the raw transactions fact table.
SELECT *
FROM hive_metastore.raw_data.fact_trans
LIMIT 20