In [0]:
%run "/Workspace/Users/ruchika.b.mhetre@v4c.ai/vstone_project/vstone_databricks_pipeline/src/notebooks/00_Setup/project_config"

In [0]:
from pyspark.sql.functions import col, count

def test_dimensions():
    # TEST: Geography Uniqueness
    # Ensure geo_id (the join key) is unique
    geo_df = spark.table(f"{catalog_name}.{schema_name}.dim_geography")
    geo_dupes = geo_df.groupBy("geo_id").count().filter("count > 1").count()
    assert geo_dupes == 0, f"Critical: Found {geo_dupes} duplicate geo_ids in dim_geography!"

    # TEST: Car Content Coverage
    # Check if photo_count is correctly calculated (should be >= 0)
    content_df = spark.table(f"{catalog_name}.{schema_name}.dim_car_content")
    invalid_photos = content_df.filter("photo_count < 0").count()
    assert invalid_photos == 0, "Logic Error: photo_count cannot be negative."

    # TEST: Technical Specs Logic
    # Ensure distinct worked and we don't have exact row duplicates
    specs_df = spark.table(f"{catalog_name}.{schema_name}.dim_technical_specs")
    assert specs_df.count() == specs_df.distinct().count(), "Duplicate specs found in dim_technical_specs."

    print("✅ Dimension Tests Passed: Uniqueness and logic verified.")

In [0]:
def test_fact_and_quarantine():
    bronze_count = spark.table(f"{catalog_name}.{schema_name}.bronze_transactions_unified").count()
    fact_count = spark.table(f"{catalog_name}.{schema_name}.fact_transactions").count()
    quarantine_count = spark.table(f"{catalog_name}.{schema_name}.quarantine_transactions").count()

    # 1. Total Conservation Test
    # Fact + Quarantine should equal Bronze (unless you have a filter in both)
    assert bronze_count == (fact_count + quarantine_count), \
        f"Data Loss: {bronze_count} rows in Bronze, but only {fact_count + quarantine_count} accounted for."

    # 2. Fact Integrity: Price Check
    # Ensure no rows in Fact violate the business rule (> 1000.0)
    price_violations = spark.table(f"{catalog_name}.{schema_name}.fact_transactions").filter("cost <= 1000.0").count()
    assert price_violations == 0, "Business Rule Violation: Low cost items found in fact_transactions!"

    # 3. Quarantine Accuracy: Reason Check
    # Ensure all quarantine rows actually have a reason assigned
    missing_reason = spark.table(f"{catalog_name}.{schema_name}.quarantine_transactions").filter("quarantine_reason IS NULL").count()
    assert missing_reason == 0, "Logic Error: Rows quarantined without a reason."

    print(f"✅ Fact/Quarantine Tests Passed: {fact_count} cleaned, {quarantine_count} quarantined.")

In [0]:
# Execute both validation suites in order. Each one raises AssertionError on
# its first failed check, so test_fact_and_quarantine() only runs when
# test_dimensions() has passed.
test_dimensions()
test_fact_and_quarantine()