In [0]:
%run "/Workspace/Users/sundarasandeepteja@gmail.com/E-Commerce Analytics Medallion Architecture with GenAI/config/project_config"

In [0]:
# Databricks notebook source
# ======================================
# GOLD LAYER: FACT_TRANSACTIONS
# ======================================

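# NOTE: project_config is loaded by the %run cell above. The source-format
# directive "# MAGIC %run ../config/project_config" appears to be inert here
# (it is an ordinary comment inside this Python cell) - confirm before removing.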

from pyspark.sql import functions as F

# Run banner for the notebook log.
print("ü•á GOLD LAYER: Creating fact_transactions")
print("=" * 60)

# ======================================
# LOAD SOURCE DATA
# ======================================
print("\nüì• Loading source data...")

# Silver transactions: the input rows that the fact table is built from.
silver_txn = spark.table(SILVER_TRANSACTIONS_TABLE)
print(f"  Silver transactions: {silver_txn.count():,}")

# Gold dimensions, each trimmed to a (surrogate key, natural key) lookup pair.
# The table-name constants are not defined in this cell; presumably they come
# from the project_config notebook that is %run beforehand.
dim_products, dim_customers, dim_date = (
    spark.table(table_name).select(surrogate_key, natural_key)
    for table_name, surrogate_key, natural_key in (
        (GOLD_DIM_PRODUCTS_TABLE, "product_key", "product_id"),
        (GOLD_DIM_CUSTOMERS_TABLE, "customer_key", "customer_id"),
        (GOLD_DIM_DATE_TABLE, "date_key", "date"),
    )
)

# Log the size of every lookup. Each count() is a separate Spark action.
for dim_label, dim_lookup in (
    ("dim_products", dim_products),
    ("dim_customers", dim_customers),
    ("dim_date", dim_date),
):
    print(f"  {dim_label}: {dim_lookup.count():,}")

# ======================================
# JOIN WITH DIMENSIONS
# ======================================
print("\nüîó Joining with dimension tables...")

# Each dimension is matched on its natural key. The conditions are built from
# columns bound to their own DataFrame, so the identically named product_id /
# customer_id columns on both sides stay distinguishable after the join.
product_match = silver_txn["product_id"] == dim_products["product_id"]
customer_match = silver_txn["customer_id"] == dim_customers["customer_id"]
date_match = silver_txn["transaction_date"] == dim_date["date"]

# Left joins keep every Silver row; a row with no dimension match simply gets
# a NULL surrogate key instead of being dropped.
fact_transactions = (
    silver_txn
    .join(dim_products, on=product_match, how="left")
    .join(dim_customers, on=customer_match, how="left")
    .join(dim_date, on=date_match, how="left")
)

print("  ‚úÖ Dimension joins complete")

# ======================================
# SELECT FACT COLUMNS
# ======================================
print("\nüìã Selecting fact columns...")

# Derive the hour of day up front so the projection below can select it by name.
fact_transactions = fact_transactions.withColumn(
    "transaction_hour", F.hour("transaction_timestamp")
)

# Column groups, listed in the exact order they appear in the fact table.
key_columns = ["product_key", "customer_key", "date_key"]
amount_columns = [
    "quantity",
    "unit_price",
    "total_amount",
    "discount_percent",
    "discount_amount",
    "final_amount",
    "shipping_cost",
    "gross_profit",
]
descriptor_columns = [
    "payment_method",
    "status",
    "order_size",
    "discount_flag",
    "is_weekend",
]
time_columns = [
    "transaction_timestamp",
    "transaction_date",
    "transaction_year",
    "transaction_month",
    "transaction_hour",
    "day_of_week",
]

# product_id / customer_id exist on both sides of the joins, so they are taken
# explicitly from the Silver frame; everything else is resolved by name.
fact_final = fact_transactions.select(
    *key_columns,
    silver_txn["transaction_id"],
    *amount_columns,
    silver_txn["product_id"],
    silver_txn["customer_id"],
    *descriptor_columns,
    *time_columns,
    "_batch_id",
    F.current_timestamp().alias("_loaded_at"),  # load-time audit stamp
)

print(f"  Fact records: {fact_final.count():,}")

# ======================================
# DATA QUALITY CHECK
# ======================================
print("\nüîç Running data quality checks...")

# Tally NULL surrogate keys (left-join misses) and the total row count in ONE
# aggregation, so the three-way join behind fact_final is evaluated once here
# instead of once per key. F.count() ignores NULLs, so count(when(is_null, 1))
# counts exactly the NULL keys and still yields 0 (not NULL) on an empty frame.
surrogate_key_columns = ("product_key", "customer_key", "date_key")
key_quality = fact_final.agg(
    F.count(F.lit(1)).alias("fact_rows"),
    *[
        F.count(F.when(F.col(key_name).isNull(), 1)).alias(f"null_{key_name}")
        for key_name in surrogate_key_columns
    ],
).first()

null_product_keys = key_quality["null_product_key"]
null_customer_keys = key_quality["null_customer_key"]
null_date_keys = key_quality["null_date_key"]

print(f"  Null product_key: {null_product_keys:,}")
print(f"  Null customer_key: {null_customer_keys:,}")
print(f"  Null date_key: {null_date_keys:,}")

if null_product_keys > 0 or null_customer_keys > 0 or null_date_keys > 0:
    print("  ‚ö†Ô∏è Warning: Found null surrogate keys (orphan records)")
else:
    print("  ‚úÖ All surrogate keys populated")

# Left joins never drop Silver rows, but they DO multiply them when a dimension
# holds more than one row for the same natural key. Reconcile the fact row
# count with the source so such fan-out is visible before the table is written.
fact_row_count = key_quality["fact_rows"]
silver_row_count = silver_txn.count()
if fact_row_count != silver_row_count:
    print(
        f"  WARNING: fact rows ({fact_row_count:,}) != silver rows "
        f"({silver_row_count:,}); a dimension join may have duplicated "
        "transactions (check for duplicate natural keys in the dimensions)"
    )

# ======================================
# WRITE TO GOLD
# ======================================
print("\nüíæ Writing to Gold layer...")

# Full refresh: the Delta table is overwritten on every run, partitioned by
# transaction year and month, with overwriteSchema enabled so the overwrite
# may also replace the existing table schema.
gold_writer = (
    fact_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("transaction_year", "transaction_month")
)
gold_writer.saveAsTable(GOLD_FACT_TRANSACTIONS_TABLE)

print(f"  ‚úÖ Created: {GOLD_FACT_TRANSACTIONS_TABLE}")

# Optimize: run OPTIMIZE on the freshly written table, Z-ordering by the three
# surrogate keys.
optimize_statement = f"""
    OPTIMIZE {GOLD_FACT_TRANSACTIONS_TABLE} 
    ZORDER BY (customer_key, product_key, date_key)
"""
spark.sql(optimize_statement)
print("  ‚úÖ Optimized with ZORDER on (customer_key, product_key, date_key)")

# ======================================
# VERIFY AND SUMMARIZE
# ======================================
def _rounded(aggregate, alias):
    """Round an aggregate column to 2 decimal places and give it a name."""
    return F.round(aggregate, 2).alias(alias)


print("\nüìä Fact Table Summary:")

# Read back what was actually persisted rather than reusing fact_final.
gold_fact = spark.table(GOLD_FACT_TRANSACTIONS_TABLE)
final_count = gold_fact.count()
print(f"  Total transactions: {final_count:,}")

# Revenue-style metrics only consider rows whose status is "Completed".
completed_orders = gold_fact.where(F.col("status") == "Completed")

# Aggregate metrics
print("\nüìä Key Metrics:")
key_metrics = completed_orders.agg(
    F.count("*").alias("total_orders"),
    _rounded(F.sum("final_amount"), "total_revenue"),
    _rounded(F.sum("gross_profit"), "total_profit"),
    _rounded(F.avg("final_amount"), "avg_order_value"),
    F.countDistinct("customer_key").alias("unique_customers"),
    F.countDistinct("product_key").alias("unique_products"),
)
key_metrics.show()

# By year
print("üìä Revenue by Year:")
revenue_by_year = (
    completed_orders
    .groupBy("transaction_year")
    .agg(
        F.count("*").alias("orders"),
        _rounded(F.sum("final_amount"), "revenue"),
    )
    .orderBy("transaction_year")
)
revenue_by_year.show()

# By status (all rows, not just completed ones), largest group first.
print("üìä Transactions by Status:")
transactions_by_status = (
    gold_fact
    .groupBy("status")
    .agg(
        F.count("*").alias("count"),
        _rounded(F.sum("final_amount"), "amount"),
    )
    .orderBy(F.col("count").desc())
)
transactions_by_status.show()

print("\nü•á FACT_TRANSACTIONS COMPLETE!")