In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
%sql
-- Rebuild the gold catalog from scratch.
-- WARNING: CASCADE also drops everything contained in goldscm, so re-running
-- this cell discards whatever the catalog held before.
DROP CATALOG IF EXISTS goldscm CASCADE;
CREATE CATALOG goldscm;

In [0]:
%sql
-- Make goldscm the current catalog for this session.
USE CATALOG goldscm;

In [0]:
%sql
-- Create the gold schema inside the goldscm catalog.
-- IF NOT EXISTS makes the statement a no-op when the schema is already there.
CREATE SCHEMA IF NOT EXISTS goldscm.gold_schema;

**LOAD THE SILVER DATA**

In [0]:
%python
# Load Customer Data
df_cust = spark.read.format("delta") \
 .option("header", "true") \
 .option("inferSchema", "true") \
 .load('abfss://silver@scmdataset2025.dfs.core.windows.net/customer')
# Load Orders Data
df_orders = spark.read.format("delta") \
 .option("header", "true") \
 .option("inferSchema", "true") \
 .load('abfss://silver@scmdataset2025.dfs.core.windows.net/orders')
# Load Warehouse Data
df_warehouse = spark.read.format("delta") \
 .option("header", "true") \
 .option("inferSchema", "true") \
 .load('abfss://silver@scmdataset2025.dfs.core.windows.net/warehouse')
# Load Logistics Data
df_logistics = spark.read.format("delta") \
 .option("header", "true") \
 .option("inferSchema", "true") \
 .load('abfss://silver@scmdataset2025.dfs.core.windows.net/logistics')
# Load Products Data
df_products = spark.read.format("delta") \
 .option("header", "true") \
 .option("inferSchema", "true") \
 .load('abfss://silver@scmdataset2025.dfs.core.windows.net/product')
# Load Supply Data
df_supply = spark.read.format("delta") \
 .option("header", "true") \
 .option("inferSchema", "true") \
 .load('abfss://silver@scmdataset2025.dfs.core.windows.net/supply')

**Orders vs Total Cost and Shipment Cost**

In [0]:
# Pair every order with its logistics record (join on Order_ID) and keep only
# the two cost columns, sorted by Order_ID ascending (the orderBy default).
df_order_shipment_cost = (
    df_orders
    .join(df_logistics, on="Order_ID")
    .select("Order_ID", "Total_Cost", "Shipment_Cost")
    .orderBy("Order_ID")
)
df_order_shipment_cost.display()

**Order_Date vs Shipment_Date vs Delivery_Date**

In [0]:
# Order timeline: the order date comes from df_orders, the shipment and
# delivery dates from df_logistics, matched on Order_ID.
df_orders_delivery_info = (
    df_orders
    .join(df_logistics, on="Order_ID")
    .select(
        df_orders["Order_ID"],
        df_orders["Order_Date"],
        df_logistics["Shipment_Date"],
        df_logistics["Delivery_Date"],
    )
    .orderBy("Order_ID")
)
df_orders_delivery_info.display()

**Total Orders Revenue Over Time**

In [0]:
# Revenue per order date. `sum` here is pyspark.sql.functions.sum (brought in
# by the star import at the top of the notebook), not the Python builtin.
orders_summary = (
    df_orders
    .groupBy("Order_Date")
    .agg(sum("Total_Cost").alias("Total_Revenue"))
    .orderBy("Order_Date")
)
display(orders_summary)

**Top-Selling Products**

In [0]:
from pyspark.sql.functions import sum, col, round

# Grand total of units ordered across all orders; it is the denominator for
# each product's share. collect() pulls the single aggregate row to the driver.
total_quantity = (
    df_orders
    .agg(sum("Quantity_Ordered").alias("Total_Quantity"))
    .collect()[0]["Total_Quantity"]
)

# The aggregate is None when there is nothing to sum and may also be 0; in
# either case emit a null percentage instead of dividing by it.
if total_quantity:
    share_of_total = round(
        (col("Total_Quantity_Ordered") / total_quantity) * 100, 2
    )
else:
    share_of_total = lit(None).cast("double")

# Units ordered per product, with each product's percentage of the grand
# total, largest sellers first.
top_selling_quantities = (
    df_orders
    .join(df_products, on="Product_ID")
    .groupBy("Product_ID", "Product_Name")
    .agg(sum("Quantity_Ordered").alias("Total_Quantity_Ordered"))
    .withColumn("Percentage_of_Allotment", share_of_total)
    .orderBy("Total_Quantity_Ordered", ascending=False)
)
# Fixed typo: the original called `.dsiaplay()`, which raises AttributeError.
top_selling_quantities.display()