In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
%sql
-- Rebuild the gold catalog from scratch on every run.
-- WARNING: CASCADE also removes everything currently stored inside goldscm.
DROP CATALOG IF EXISTS goldscm CASCADE;
CREATE CATALOG goldscm;

In [0]:
%sql
-- Make goldscm the current catalog for the SQL statements that follow.
USE CATALOG goldscm;

In [0]:
%sql
-- Create the gold_schema schema inside the goldscm catalog; a no-op if it already exists.
CREATE SCHEMA IF NOT EXISTS goldscm.gold_schema;

**LOAD THE SILVER DATA**

In [0]:
%python
# Root of the silver container; each table below lives in a folder under it.
SILVER_ROOT = "abfss://silver@scmdataset2025.dfs.core.windows.net"


def load_silver_table(table_name):
    """Read one silver-layer Delta table into a DataFrame.

    The previous per-table reads passed ``header`` and ``inferSchema``; those
    are CSV reader options and have no effect on a Delta source, whose schema
    comes from the table itself, so they are omitted here.

    Args:
        table_name: Folder name of the table under ``SILVER_ROOT``
            (case matters, e.g. ``"orders"`` vs ``"Customer"``).

    Returns:
        A DataFrame backed by the Delta table at ``SILVER_ROOT/table_name``.
    """
    return spark.read.format("delta").load(f"{SILVER_ROOT}/{table_name}")


# Load Customer Data
df_cust = load_silver_table("Customer")
# Load Orders Data (folder name is lower-case in storage)
df_orders = load_silver_table("orders")
# Load Warehouse Data
df_warehouse = load_silver_table("Warehouse")
# Load Logistics Data
df_logistics = load_silver_table("Logistics")
# Load Products Data
df_products = load_silver_table("Product")
# Load Supply Data
df_supply = load_silver_table("Supply")

**Orders vs Total Cost and Shipment Cost**

In [0]:
# Inner-join orders to logistics on Order_ID and keep only the two cost
# columns, sorted by order for display.
df_order_shipment_cost = (
    df_orders
    .join(df_logistics, on="Order_ID", how="inner")
    .select("Order_ID", "Total_Cost", "Shipment_Cost")
    .orderBy("Order_ID")
)
display(df_order_shipment_cost)

**Order_Date vs Shipment_Date vs Delivery_Date**

In [0]:
# One row per matched order/logistics pair: when it was ordered, shipped and
# delivered. Columns are taken from their owning DataFrame explicitly.
df_orders_delivery_info = (
    df_orders
    .join(df_logistics, on="Order_ID", how="inner")
    .select(
        df_orders["Order_ID"],
        df_orders["Order_Date"],
        df_logistics["Shipment_Date"],
        df_logistics["Delivery_Date"],
    )
    .orderBy("Order_ID")
)
display(df_orders_delivery_info)

**Total Orders Revenue Over Time**

In [0]:
# Revenue per order date, oldest first.
# NOTE: `sum` here is pyspark.sql.functions.sum (from the wildcard import at
# the top of the notebook), not the Python builtin.
orders_summary = (
    df_orders
    .groupBy("Order_Date")
    .agg(sum("Total_Cost").alias("Total_Revenue"))
    .orderBy(col("Order_Date").asc())
)
orders_summary.display()

**Top-Selling Products**

In [0]:
from pyspark.sql.functions import sum, col, round

# Grand total of units across ALL orders; used as the denominator for the
# percentage column below (computed before the join to products).
total_quantity = (
    df_orders
    .agg(sum("Quantity_Ordered").alias("Total_Quantity"))
    .first()["Total_Quantity"]
)

# Units ordered per product, with each product's share of the grand total,
# best sellers first.
top_selling_quantities = (
    df_orders
    .join(df_products, on="Product_ID", how="inner")
    .groupBy("Product_ID", "Product_Name")
    .agg(sum("Quantity_Ordered").alias("Total_Quantity_Ordered"))
    .withColumn(
        "Percentage_of_Allotment",
        round((col("Total_Quantity_Ordered") / total_quantity) * 100, 2),
    )
    .orderBy(col("Total_Quantity_Ordered").desc())
)
display(top_selling_quantities)

**Most Orders By Customer**

In [0]:
# Number of orders placed by each customer (with their location),
# heaviest buyers first.
most_orders_by_customer = (
    df_orders
    .join(df_cust, on="Customer_ID", how="inner")
    .groupBy("Customer_ID", "Customer_Location")
    .agg(count("Order_ID").alias("Total_orders"))
    .orderBy(col("Total_orders").desc())
)
display(most_orders_by_customer)


**Warehouse-wise Top Orders**

In [0]:
# Order count per warehouse (with its location), busiest warehouse first.
warehouse_wise_top_orders = (
    df_orders
    .join(df_warehouse, on="Warehouse_ID", how="inner")
    .groupBy("Warehouse_ID", "Warehouse_Location")
    .agg(count("Order_ID").alias("Total_Orders"))
    .orderBy(col("Total_Orders").desc())
)
display(warehouse_wise_top_orders)

**Customer Location vs Order Volume**

In [0]:
%python
# Units ordered per (customer location, product) pair, largest volume first.
# Customers are joined to their orders, then to the product catalogue to pick
# up Product_Name.
Cust_behavior = (
    df_cust
    .join(df_orders, on="Customer_ID", how="inner")
    .join(df_products, on="Product_ID", how="inner")
    .groupBy("Customer_Location", "Product_Name")
    .agg(sum("Quantity_Ordered").alias("Total_Quantity_Ordered"))
    .orderBy(col("Total_Quantity_Ordered").desc())
)
Cust_behavior.display()

**Customer Order Frequency**

In [0]:
from pyspark.sql.functions import countDistinct
# Order frequency per customer, most frequent first.
# NOTE: this counts DISTINCT Order_Date values, so several orders placed on
# the same date count once.
df_customer_order_frequency = (
    df_orders
    .join(df_cust, on="Customer_ID", how="inner")
    .groupBy("Customer_ID", "Customer_Location")
    .agg(countDistinct("Order_Date").alias("Order_Frequency"))
    .orderBy(col("Order_Frequency").desc())
)
display(df_customer_order_frequency)

**Average Delivery Time by Warehouse**