In [0]:
# Notebook bootstrap: enable module auto-reloading, make the project importable,
# pull in shared config/helpers, and apply the project's Spark settings.
# Autoreload mode 2 re-imports all modules before each cell execution, so edits
# to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# NOTE(review): this import must already resolve *before* PROJECT_PATH is added
# to sys.path below — presumably the notebook's working directory contains the
# `config` package; confirm.
from config.table_config import PROJECT_PATH

# Append the project root once; the membership check keeps re-runs of this cell
# from adding duplicate sys.path entries.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

# Star imports: the names used later in this notebook but not imported
# explicitly (log_info, configure_spark, GoldTransformation, SILVER_ORDERS_TABLE,
# CATALOG_NAME, GOLD_BASE) presumably come from these two packages — verify
# which module exports which.
from config import *
from utils import *
# sum/min/max are aliased with a leading underscore so they do not shadow the
# Python builtins of the same name.
from pyspark.sql.functions import col, to_date, count, sum as _sum, avg, min as _min, max as _max

# `spark` is not defined in this notebook; presumably the runtime (e.g. a
# Databricks cluster) injects the active SparkSession — TODO confirm.
configure_spark(spark)

In [0]:
def sales_summary_transform(source_dfs):
    """
    Aggregate legacy orders into one row per customer, seller, and order date.

    Only rows with source_system == "legacy" are kept. Per the original
    author's note, those are the orders that carry order_total_price, and the
    filter reduces the input from 1.53B to 700M rows.

    Args:
        source_dfs: Container indexed by source table name; the entry for
            SILVER_ORDERS_TABLE must be a Spark DataFrame exposing the columns
            source_system, customer_id, seller_id, order_date and
            order_total_price.

    Returns:
        A lazily evaluated Spark DataFrame with the grouping columns
        customer_key, seller_key, order_date_key and the measures
        order_count, total_revenue, avg_order_value, min_order_value,
        max_order_value. Nothing is executed here; the work happens when the
        caller triggers an action (e.g. a write).
    """
    # Filter to legacy orders only (have order_total_price)
    log_info("Filtering to legacy orders only (with prices)...")
    legacy_orders = source_dfs[SILVER_ORDERS_TABLE].filter(
        col("source_system") == "legacy"
    )

    # No explicit repartition() before the groupBy: the aggregation already
    # hash-shuffles on the grouping keys, so a preceding round-robin
    # repartition only adds a second full shuffle of every filtered row and
    # has no effect on how the aggregated output is partitioned.
    grouping_keys = [
        col("customer_id").alias("customer_key"),
        col("seller_id").alias("seller_key"),
        to_date(col("order_date")).alias("order_date_key"),
    ]

    # count("*") counts every row in the group, whereas sum/avg/min/max skip
    # NULL prices; the measures only line up if legacy rows always have a
    # price — NOTE(review): confirm against the silver table.
    measures = [
        count("*").alias("order_count"),
        _sum("order_total_price").alias("total_revenue"),
        avg("order_total_price").alias("avg_order_value"),
        _min("order_total_price").alias("min_order_value"),
        _max("order_total_price").alias("max_order_value"),
    ]

    log_info("Aggregating by customer, seller, date...")
    fact_sales_summary = legacy_orders.groupBy(*grouping_keys).agg(*measures)

    # Spark transformations are lazy: at this point only the query plan exists,
    # so the log must not claim the aggregation has finished.
    log_info("Aggregation plan defined (executes when an action is triggered)")

    return fact_sales_summary

In [0]:
# Describe the gold-layer job: which silver source feeds it and which table it
# targets. The fact_sales_summary name is used for both display and target.
sales_summary_job = GoldTransformation(
    spark=spark,
    table_display_name="fact_sales_summary",
    table_type="fact",
    source_tables=[SILVER_ORDERS_TABLE],
    target_table=f"{CATALOG_NAME}.gold.fact_sales_summary",
)

# Execute the job with sales_summary_transform as the transformation step.
# The exact meaning of scd_type / write_mode / clustering_cols is defined by
# GoldTransformation.run, which is not visible in this notebook.
sales_summary_job.run(
    catalog_name=CATALOG_NAME,
    gold_schema="gold",
    gold_container=GOLD_BASE,
    transform_logic=sales_summary_transform,
    clustering_cols=["order_date_key"],
    scd_type=1,
    write_mode="overwrite",
    add_metadata=False,  # No bloat: metadata is switched off for this table
)