In [0]:
%run "/Workspace/Users/ruchika.b.mhetre@v4c.ai/vstone_project/vstone_databricks_pipeline/src/notebooks/00_Setup/project_config"

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, expr, lit, current_timestamp, count, avg, round, sum, to_date, to_timestamp

# Test: Star Schema PK/FK Join Logic

In [0]:
def test_gold_star_schema_join():
    """Check that a fact row picks up dimension attributes via the (marka, model) key.

    Builds a one-row mock fact frame and a one-row mock catalog dimension,
    left-joins them on ``marka`` and ``model``, and asserts that the
    dimension columns are present and populated on the joined row.

    Raises:
        AssertionError: if ``body_type`` on the joined row is not ``"Sedan"``
            or ``generation`` is absent from the joined schema.
    """
    join_keys = ["marka", "model"]

    # Mock fact (sales) and dimension (catalog) frames, one row each.
    fact_df = spark.createDataFrame(
        [("TXN101", "CITY_01", "Toyota", "Camry")],
        ["transaction_id", "geo_id"] + join_keys,
    )
    dim_df = spark.createDataFrame(
        [("Toyota", "Camry", "Gen8", "Sedan")],
        join_keys + ["generation", "body_type"],
    )

    # Left join keeps every fact row and enriches it from the dimension;
    # this is intended to mirror the join used when building the Gold layer.
    joined_df = fact_df.join(dim_df, join_keys, "left")

    # The dimension attributes must have been carried onto the fact row.
    joined_rows = joined_df.collect()
    first_row = joined_rows[0]
    assert first_row["body_type"] == "Sedan", "FK Join to Dim_Catalogs failed"
    assert "generation" in joined_df.columns, "Dimension attributes missing from Fact table"
    print("Test 1 Passed: Star Schema PK/FK joins are working correctly.")

# Test: Geographical KPI Mathematics

In [0]:
from pyspark.sql.functions import avg, sum, count

def test_gold_geo_kpi_math():
    """Check the per-city KPI aggregation (count, average, total) on mock sales.

    Two mock listings for a single city (costs 10000.0 and 20000.0) are
    grouped by ``city_name`` and aggregated; the resulting row must report
    2 listings, an average price of 15000.0 and a total value of 30000.0.

    Raises:
        AssertionError: if any of the three aggregated values is wrong.
    """
    # Two listings in one city so the expected aggregates are easy to verify by hand.
    mock_sales = spark.createDataFrame(
        [("TXN1", "Moscow", 10000.0), ("TXN2", "Moscow", 20000.0)],
        ["transaction_id", "city_name", "cost"],
    )

    # Spark SQL aggregate functions are referenced through the ``F`` namespace.
    listing_count = F.count("transaction_id").alias("listing_count")
    avg_listing_price = F.avg("cost").alias("avg_listing_price")
    total_market_value = F.sum("cost").alias("total_market_value")
    kpi_df = mock_sales.groupBy("city_name").agg(
        listing_count, avg_listing_price, total_market_value
    )

    kpi_rows = kpi_df.collect()
    city_kpis = kpi_rows[0]
    assert city_kpis["listing_count"] == 2, "Listing count calculation failed"
    assert city_kpis["avg_listing_price"] == 15000.0, "Average price calculation failed"
    assert city_kpis["total_market_value"] == 30000.0, "Total market value calculation failed"
    print("Test 2 Passed: Geographical KPI calculations are accurate.")

# Test: Visual Content Impact Classification

In [0]:
from pyspark.sql.functions import expr

def test_content_impact_ranges():
    """Check that photo counts are bucketed into 'No Photos' / 'Low' / 'High'.

    Applies the CASE expression to photo counts 0, 3 and 10 and asserts each
    lands in the expected bucket (0 -> 'No Photos', <5 -> 'Low', else 'High').

    Raises:
        AssertionError: if any photo count is assigned the wrong bucket.
    """
    # (photo_count, expected bucket, message shown if the bucket is wrong)
    cases = [
        (0, "No Photos", "Zero photo range logic failed"),
        (3, "Low", "Low photo range logic failed"),
        (10, "High", "High photo range logic failed"),
    ]
    photo_df = spark.createDataFrame([(n,) for n, _, _ in cases], ["photo_count"])

    # Bucketing rule under test, expressed as Spark SQL.
    bucket_sql = "CASE WHEN photo_count = 0 THEN 'No Photos' WHEN photo_count < 5 THEN 'Low' ELSE 'High' END"
    classified_df = photo_df.withColumn("photo_range", expr(bucket_sql))

    # Index the classified rows by photo count for direct lookup.
    label_by_count = {}
    for rec in classified_df.collect():
        label_by_count[rec["photo_count"]] = rec["photo_range"]

    for n, expected_label, failure_msg in cases:
        assert label_by_count[n] == expected_label, failure_msg
    print("Test 3 Passed: Visual Content impact ranges are correctly classified.")

# Test: Full Dataset Integration (Mandate 2)

In [0]:
def test_gold_dataset_completeness():
    # FIX: Use spark.table() instead of dlt.read() for interactive cell testing
    # Ensure you use your actual catalog and schema names
    gold_table_path = "vstone_project.db_project.fact_car_sales"
    
    try:
        actual_cols = spark.table(gold_table_path).columns
        
        # Define required columns from each of the 5 files [Mandate 2]
        required = {
            "cost": "From 1_main.csv",
            "generation": "From catalogs.csv",
            "city_name": "From final_geographic.csv",
            "photo_count": "From 1_photo.csv",
            "description": "From 1_text.csv"
        }
        
        for col_name, source in required.items():
            assert col_name in actual_cols, f"Missing data from {source} in Gold Layer"
            
        print("Test 4 Passed: All 5 dataset files are successfully integrated into the Gold Fact table.")
    except Exception as e:
        print(f"Test 4 Failed: Could not read Gold table. Error: {e}")

# Run the Gold fact-table completeness check right after defining it.
# NOTE: the same test is invoked again by the run-all cell that follows.
test_gold_dataset_completeness()

In [0]:
# Run the full Gold-layer test suite in order (Tests 1-4).
# NOTE: test_gold_dataset_completeness() was already invoked once in the
# cell where it is defined, so it executes twice on a full notebook run.
test_gold_star_schema_join()
test_gold_geo_kpi_math()
test_content_impact_ranges()
test_gold_dataset_completeness()