In [0]:
from pyspark.sql.functions import col, sum as spark_sum

# Sanity checks for the Bronze, Silver, Gold and logging tables of the sales
# pipeline. Every check only prints its result; nothing here raises when a
# check fails. Relies on the notebook-provided `spark` session.

# 1. Validate Bronze Layer
print("=== Bronze Layer Validation ===")
bronze_df = spark.read.table("workspace.default.bronze_sales_transactions")
bronze_count = bronze_df.count()
print(f"Bronze records count: {bronze_count}")

# 2. Validate Silver Layer
print("=== Silver Layer Validation ===")
silver_df = spark.read.table("workspace.default.silver_sales_transactions")
silver_count = silver_df.count()
print(f"Silver valid records count: {silver_count}")

# Check for nulls or invalid total_amount.
# A plain `!=` evaluates to NULL whenever either operand is NULL, and filter()
# drops rows whose predicate is NULL, so rows with a missing total_amount,
# quantity, unit_price or discount would silently escape the check. Test for
# NULL explicitly so those rows are counted as invalid too.
expected_total = col("quantity") * col("unit_price") - col("discount")
invalid_total_df = silver_df.filter(
    col("total_amount").isNull()
    | expected_total.isNull()
    | (col("total_amount") != expected_total)
)
invalid_count = invalid_total_df.count()
print(f"Silver records with incorrect total_amount: {invalid_count}")

# Quarantine table validation
quarantine_df = spark.read.table("workspace.default.silver_sales_quarantine")
quarantine_count = quarantine_df.count()
print(f"Silver quarantined records count: {quarantine_count}")

# 3. Validate Gold Layer
print("=== Gold Layer Validation ===")
gold_df = spark.read.table("workspace.default.gold_daily_sales")
gold_count = gold_df.count()
print(f"Gold records count: {gold_count}")

# Validate aggregation accuracy: the revenue summed over Gold should equal
# the total_amount summed over Silver.
agg_total = gold_df.agg(
    spark_sum("total_daily_revenue").alias("total_amount_gold")
).collect()[0]["total_amount_gold"]

silver_total = silver_df.agg(
    spark_sum("total_amount").alias("total_amount_silver")
).collect()[0]["total_amount_silver"]

print(f"Total amount in Silver: {silver_total}")
print(f"Total amount in Gold: {agg_total}")

# sum() over an empty (or all-NULL) column comes back as None, which would
# make the subtraction below raise TypeError. Treat a missing total as 0 for
# the comparison only; the printed values above stay untouched.
gold_total_for_compare = 0 if agg_total is None else agg_total
silver_total_for_compare = 0 if silver_total is None else silver_total

if abs(gold_total_for_compare - silver_total_for_compare) < 0.001:
    print("Gold aggregation matches Silver totals")
else:
    print("Discrepancy in Gold aggregation!")

# 4. Validate Logging
print("=== Logging Validation ===")
logs_df = spark.read.table("workspace.default.pipeline_logs")
# truncate=False keeps long column values (e.g. remarks) fully visible.
logs_df.show(truncate=False)

# 5. Optional: Validate incremental load
# Insert a small test record in Bronze, run Silver/Gold transformations, check counts
print("Incremental load validation can be done by adding a test record and re-running pipeline.")


=== Bronze Layer Validation ===
Bronze records count: 1000
=== Silver Layer Validation ===
Silver valid records count: 651
Silver records with incorrect total_amount: 126
Silver quarantined records count: 349
=== Gold Layer Validation ===
Gold records count: 30
Total amount in Silver: 264289
Total amount in Gold: 264289
Gold aggregation matches Silver totals
=== Logging Validation ===
+--------------+-----------------+----------------+---------+--------------------------+--------------------------+-----------------------------------+
|pipeline_layer|records_processed|records_rejected|status   |start_ts                  |end_ts                    |remarks                            |
+--------------+-----------------+----------------+---------+--------------------------+--------------------------+-----------------------------------+
|Silver Layer  |651              |349             |COMPLETED|2025-12-23 01:16:23.318448|2025-12-23 01:16:23.318448|Invalid records moved to quarantine|
|Bro