In [0]:
%run ./00_setup_and_config

In [0]:
# Read the silver-layer Delta table into `df_silver`.
# `spark` and `silver_delta_path` are not defined in this cell; presumably they
# come from the 00_setup_and_config notebook / the runtime — confirm.
try:
    df_silver = spark.read.format("delta").load(silver_delta_path)
except Exception as e:
    # Print the offending path for the cell output, then re-raise the active
    # exception unchanged (bare `raise` keeps the original traceback intact).
    print(f"Error loading from {silver_delta_path}: {e}")
    raise

In [0]:
# Quick look at the loaded data: first five rows, then the column schema.
df_silver.show(n=5)
df_silver.printSchema()
# Last expression of the cell, so the partition count is rendered as its output.
df_silver.rdd.getNumPartitions()

In [0]:
import pyspark.sql.functions as func

### Key Metrics for each Customer
- Total_Spend
- Distinct_Items_Purchased
- Total_Transactions
- First_Purchase_Date
- Last_Purchase_Date

In [0]:
# Collapse the silver rows into one summary row per CustomerID.
# NOTE(review): the alias "DistinctItemsPurchases" is spelled differently from
# the "Distinct_Items_Purchased" bullet in this section's heading — confirm
# which name downstream readers expect before renaming the column.
df_customer_aggregates = (
    df_silver
    .groupBy("CustomerID")
    .agg(
        func.sum("TotalPrice").alias("TotalSpend"),  # summed TotalPrice
        func.countDistinct("StockCode").alias("DistinctItemsPurchases"),  # distinct StockCode values
        func.countDistinct("InvoiceNo").alias("TotalTransactions"),  # distinct InvoiceNo values
        func.min("InvoiceDate").alias("FirstPurchaseDate"),  # earliest InvoiceDate
        func.max("InvoiceDate").alias("LastPurchaseDate"),  # latest InvoiceDate
    )
)

In [0]:
# Preview five aggregated customer rows.
df_customer_aggregates.show(n=5)
# Last expression of the cell: partition count of the aggregated frame.
df_customer_aggregates.rdd.getNumPartitions()

In [0]:
# Persist the per-customer aggregates as a Delta table at `gold_customer_path`.
# mode("overwrite") replaces whatever is already stored at that path, and
# overwriteSchema=true allows the stored schema to be replaced along with it.
# `gold_customer_path` is not defined in this cell — presumably set by the
# setup notebook; confirm.
try:
    (
        df_customer_aggregates.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(gold_customer_path)
    )
except Exception as e:
    # Report the target path, then re-raise the active exception unchanged
    # (bare `raise` keeps the original traceback intact).
    print(f"Error writing to {gold_customer_path}: {e}")
    raise

### Key Metrics for each Product
- Total_Quantity_Sold
- Total_Product_Revenue
- Total_Unique_Orders_Containing_Product

In [0]:
# Collapse the silver rows into one summary row per StockCode.
df_product_aggregates = (
    df_silver
    .groupBy("StockCode")
    .agg(
        func.sum("Quantity").alias("TotalQuantitySold"),  # summed Quantity
        func.sum("TotalPrice").alias("TotalProductRevenue"),  # summed TotalPrice
        func.countDistinct("InvoiceNo").alias("TotalUniqueOrdersContainingProduct"),  # distinct InvoiceNo values
    )
)

In [0]:
# Preview five aggregated product rows.
df_product_aggregates.show(n=5)

In [0]:
# Persist the per-product aggregates as a Delta table at `gold_product_path`.
# mode("overwrite") replaces whatever is already stored at that path, and
# overwriteSchema=true allows the stored schema to be replaced along with it.
# `gold_product_path` is not defined in this cell — presumably set by the
# setup notebook; confirm.
try:
    (
        df_product_aggregates.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(gold_product_path)
    )
except Exception as e:
    # Report the target path, then re-raise the active exception unchanged
    # (bare `raise` keeps the original traceback intact).
    print(f"Error writing to {gold_product_path}: {e}")
    raise

### Daily Revenue & Order Count Per Country

In [0]:
# One row per (Invoice_Date, Country) pair, sorted by the same two keys.
# NOTE(review): this groups on "Invoice_Date", whereas the customer aggregates
# read "InvoiceDate" — presumably a separate date-only column produced by the
# silver layer; confirm it exists in df_silver.
df_daily_sales = (
    df_silver
    .groupBy("Invoice_Date", "Country")
    .agg(
        func.sum("TotalPrice").alias("DailyRevenue"),  # summed TotalPrice
        func.countDistinct("InvoiceNo").alias("DailyOrderCount"),  # distinct InvoiceNo values
    )
    .orderBy("Invoice_Date", "Country")
)

In [0]:
# Preview five rows of the daily per-country aggregates.
df_daily_sales.show(n=5)

In [0]:
# Persist the daily per-country aggregates as a Delta table at
# `gold_daily_sales_path`.
# mode("overwrite") replaces whatever is already stored at that path, and
# overwriteSchema=true allows the stored schema to be replaced along with it.
# `gold_daily_sales_path` is not defined in this cell — presumably set by the
# setup notebook; confirm.
try:
    (
        df_daily_sales.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(gold_daily_sales_path)
    )
except Exception as e:
    # Report the target path, then re-raise the active exception unchanged
    # (bare `raise` keeps the original traceback intact).
    print(f"Error writing to {gold_daily_sales_path}: {e}")
    raise