In [0]:
%load_ext autoreload
# Mode 2: reload all imported modules before each execution, so edits to
# project .py files are picked up without restarting the kernel.
%autoreload 2

import sys
# This import runs before sys.path is extended below, so the `config` package
# must already be importable from the notebook's default path.
from config.table_config import PROJECT_PATH

# Guarded append: re-running this cell does not add a duplicate sys.path entry.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

# NOTE(review): the star imports are presumably what provide SILVER_ORDERS_TABLE,
# CATALOG_NAME, GOLD_BASE, configure_spark and GoldTransformation used in this
# notebook — confirm, and consider importing those names explicitly.
from config import *
from utils import *
from pyspark.sql.functions import col, to_date, coalesce, lit

# NOTE(review): `spark` is not defined in this notebook; presumably the session
# is injected by the notebook runtime — confirm. configure_spark is a project
# helper whose effects are not visible here.
configure_spark(spark)

In [0]:
def orders_fact_transform(source_dfs):
    """
    Project the silver orders DataFrame onto the lean fact_orders column set.

    Only a column projection is applied: there is no filtering, joining or
    de-duplication, so the output has exactly one row per input row. The
    intended fact grain is one row per order.

    Args:
        source_dfs: Mapping of source table name to DataFrame; the entry
            stored under SILVER_ORDERS_TABLE is the input.

    Returns:
        DataFrame with the columns, in order: order_key, customer_key,
        seller_key, order_date_key, payment_method, source_system,
        order_amount.
    """
    silver_orders = source_dfs[SILVER_ORDERS_TABLE]

    # Identifier columns: the source ids are carried through unchanged and
    # only renamed to the *_key names used by the fact table.
    key_columns = [
        col("order_id").alias("order_key"),
        col("customer_id").alias("customer_key"),
        col("seller_id").alias("seller_key"),
        # order_date is converted with to_date and exposed as the date key.
        to_date(col("order_date")).alias("order_date_key"),
    ]

    # Degenerate dimensions: kept on the fact row under their original names.
    degenerate_columns = [
        col("payment_method"),
        col("source_system"),
    ]

    # Measures: a null order_total_price is replaced with 0.0.
    measure_columns = [
        coalesce(col("order_total_price"), lit(0.0)).alias("order_amount"),
    ]

    return silver_orders.select(
        *key_columns,
        *degenerate_columns,
        *measure_columns,
    )

In [0]:
# Run the gold-layer job for fact_orders: the silver orders table is the only
# declared source and orders_fact_transform supplies the column projection.
# NOTE(review): GoldTransformation is defined outside this notebook (presumably
# reached via `from utils import *`); the per-argument notes below restate the
# values passed here, not verified behavior — confirm against its implementation.
GoldTransformation(
    spark=spark,
    table_display_name="fact_orders",
    table_type="fact",
    # The transform looks up its input DataFrame under this same key.
    source_tables=[SILVER_ORDERS_TABLE],
    target_table=f"{CATALOG_NAME}.gold.fact_orders"
).run(
    catalog_name=CATALOG_NAME,
    gold_schema="gold",
    gold_container=GOLD_BASE,
    transform_logic=orders_fact_transform,
    # No partition columns are requested; only clustering columns are passed.
    partition_cols=None,
    # order_date_key is the to_date(order_date) column produced by the transform.
    clustering_cols=["order_date_key"],  # Add clustering
    scd_type=1,
    # NOTE(review): "overwrite" presumably replaces the whole target table on
    # every run — confirm this is intended together with scd_type=1.
    write_mode="overwrite",
    add_metadata=False  # Don't add created_at/updated_at
)