In [0]:
%run "/Workspace/Users/ruchika.b.mhetre@v4c.ai/vstone_project/vstone_databricks_pipeline/src/notebooks/00_Setup/project_config"

# Test: Transaction Quarantine Logic

In [0]:
def test_transaction_quarantine():
    """Check that the validity rule and its negation split transactions cleanly.

    Builds a small in-memory DataFrame holding one valid record plus one record
    for each way the rule can be broken (missing id, cost not above 1000.0,
    both at once). The rule and its negation are then applied and the test
    asserts that the valid record is kept, every invalid record is quarantined,
    and no record is lost between the two outputs.

    Uses the ``spark`` session available in the notebook's global scope.

    Raises:
        AssertionError: if a record lands in the wrong output or in neither.
    """
    # One record per failure mode, so each half of the rule is exercised on its
    # own and not only in combination with the other half.
    data = [
        ("1", "CityA", "Toyota", "Camry", 2020, 15000.0),  # Valid
        (None, "CityB", "Honda", "Civic", 2019, 15000.0),  # Invalid: missing ID only
        ("2", "CityC", "Lada", "Vesta", 2021, 500.0),      # Invalid: cost not above 1000 only
        (None, "CityD", "Kia", "Rio", 2018, 500.0),        # Invalid: missing ID and low cost
    ]
    columns = ["id", "place", "marka", "model", "year", "cost"]
    df = spark.createDataFrame(data, columns)

    # The rule is written once; the quarantine filter is its exact negation, so
    # the two predicates cannot drift apart.
    valid_rule = "(id IS NOT NULL) AND (cost > 1000.0)"
    # NOTE(review): under Spark SQL NULL semantics a row with a non-null id and
    # a NULL cost makes the rule evaluate to NULL, so it would pass neither
    # filter. Confirm how the pipeline is expected to treat such rows.
    valid_rows = df.filter(valid_rule).collect()
    quarantine_rows = df.filter(f"NOT ({valid_rule})").collect()

    assert len(valid_rows) == 1, "Should have 1 valid record"
    assert valid_rows[0]["id"] == "1", "The valid record must be the one kept"
    assert len(quarantine_rows) == 3, "Should have 3 quarantined records"
    # 'place' is unique per fixture row, so it identifies which rows were quarantined.
    assert {row["place"] for row in quarantine_rows} == {"CityB", "CityC", "CityD"}, \
        "Every rule violation must be quarantined"
    assert len(valid_rows) + len(quarantine_rows) == len(data), \
        "Valid and quarantined records together must cover the whole input"
    print("Test 1 Passed: Quarantine logic is accurate.")

# Test: Geography Deduplication

In [0]:
def test_geography_deduplication():
    """Verify that rows sharing a ``city_prepositional`` value collapse to one row.

    Creates a two-row DataFrame in which both rows are identical, drops
    duplicates on the ``city_prepositional`` column, and asserts that exactly
    one row remains.

    Uses the ``spark`` session available in the notebook's global scope.

    Raises:
        AssertionError: if more or fewer than one row survives deduplication.
    """
    field_names = ["city_prepositional", "city_name", "lat", "lon"]
    moscow_record = ("в Москве", "Moscow", 55.7, 37.6)

    # The same record twice gives two rows sharing one key value.
    geo_df = spark.createDataFrame([moscow_record, moscow_record], field_names)

    # Keep a single row per key.
    deduplicated = geo_df.dropDuplicates(["city_prepositional"])
    remaining = deduplicated.count()

    assert remaining == 1, "Geography must be unique per city_prepositional"
    print("Test 2 Passed: Geography dimension is deduplicated.")

# Test: Catalog Key Standardization (Trimming)

In [0]:
# Import all necessary Spark functions to prevent NameErrors
from pyspark.sql.functions import col, trim, count

def test_catalog_key_standardization():
    """Verify that surrounding whitespace is stripped from the catalog key columns.

    Creates a one-row DataFrame whose ``marka`` and ``model`` values are padded
    with spaces, applies ``trim`` to both columns, and asserts that the
    resulting values carry no padding.

    Uses the notebook-global ``spark`` session and the ``col`` / ``trim``
    functions imported at the top of this cell.

    Raises:
        AssertionError: if either value still has leading or trailing spaces.
    """
    key_columns = ["marka", "model"]

    # A single record with a space on each side of both keys.
    padded_df = spark.createDataFrame([(" Toyota ", " Camry ")], key_columns)

    # Trim every key column while keeping its original name.
    trimmed_keys = [trim(col(name)).alias(name) for name in key_columns]
    trimmed_row = padded_df.select(*trimmed_keys).collect()[0]

    assert trimmed_row["marka"] == "Toyota", "Marka should be trimmed"
    assert trimmed_row["model"] == "Camry", "Model should be trimmed"
    print("Test 3 Passed: Catalog join keys are standardized.")


# Test: Car Content Enrichment

In [0]:
def test_car_content_enrichment():
    """Verify that the photo count per car is attached to its text record.

    Creates one text record and two photo records for the same ``id``, counts
    the photos per ``id``, left-joins that count onto the text record, and
    asserts that the joined row reports two photos.

    Uses the notebook-global ``spark`` session and the ``count`` function
    imported in an earlier cell.

    Raises:
        AssertionError: if the joined ``photo_count`` is not 2.
    """
    # Fixtures: one description and two photos, all for car "101".
    descriptions = spark.createDataFrame([("101", "Excellent car")], ["id", "text"])
    photos = spark.createDataFrame(
        [("101", "url1"), ("101", "url2")],
        ["id", "photo_url"],
    )

    # Number of photo URLs per car id.
    photos_per_car = photos.groupBy("id").agg(
        count("photo_url").alias("photo_count")
    )

    # A left join keeps every description row.
    enriched = descriptions.join(photos_per_car, on="id", how="left")

    enriched_row = enriched.collect()[0]
    assert enriched_row["photo_count"] == 2, "Photo count aggregation failed"
    print("Test 4 Passed: Content enrichment join is working.")

In [0]:
# Run the four tests in order. Each one raises AssertionError on failure,
# which stops the cell before the remaining tests run; on success each prints
# its own "Test N Passed" line.
test_transaction_quarantine()
test_geography_deduplication()
test_catalog_key_standardization()
test_car_content_enrichment()