In [0]:
%run "/Workspace/Users/ruchika.b.mhetre@v4c.ai/vstone_project/vstone_databricks_pipeline/src/notebooks/00_Setup/project_config"

# Logic Unit Test

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType, IntegerType, DoubleType, BooleanType

# Logic unit test for the JSON casting step: builds a two-row in-memory
# DataFrame of string values, applies the casts, and asserts both the resulting
# Spark schema and the collected Python values.

# 1. Simulate JSON data using a simple list of dictionaries.
# Every value is a string on purpose, so the typed columns can only come from
# the casts below. This avoids spark.sparkContext.parallelize.
json_data = [
    {"id": "A1", "year": "2024", "cost": "1500.50", "has_license": "true", "date": "2024-01-01"},
    {"id": "A2", "year": "2023", "cost": "2000", "has_license": "false", "date": "2023-12-31"}
]

# Create the DataFrame directly from the list of dictionaries
raw_df = spark.createDataFrame(json_data)

# 2. Apply your transformation logic
# (Using F.expr for try_cast for ANSI safety: it yields NULL instead of raising.)
# NOTE(review): "date" uses a plain cast, unlike the try_cast columns above —
# confirm this matches the pipeline being tested.
processed_df = raw_df.select(
    F.col("id").cast("string"),
    F.expr("try_cast(year as int)").alias("year"),
    F.expr("try_cast(cost as double)").alias("cost"),
    F.expr("try_cast(has_license as boolean)").alias("has_license"),
    F.col("date").cast("date"),
    F.lit("chunk3_json").alias("source_file"),
    F.current_timestamp().alias("load_timestamp")
)

# 3. Assertions
# 3a. Schema: compare the Spark column types themselves, so the check does not
# depend on what values happen to be in the rows.
expected_types = {
    "year": IntegerType(),
    "cost": DoubleType(),
    "has_license": BooleanType(),
    "date": DateType(),
}
for column_name, expected_type in expected_types.items():
    actual_type = processed_df.schema[column_name].dataType
    assert actual_type == expected_type, (
        f"Column '{column_name}' should be {expected_type}, got {actual_type}"
    )

# 3b. Values: look rows up by id instead of relying on collect() ordering.
sample = processed_df.collect()
rows_by_id = {row["id"]: row for row in sample}
assert set(rows_by_id) == {"A1", "A2"}, "Both input records should survive the transformation"

first_row = rows_by_id["A1"]
second_row = rows_by_id["A2"]

# Note: In Python, Spark's DoubleType maps to 'float'
# and IntegerType maps to 'int'.
assert isinstance(first_row["year"], int), "Year should be converted to Integer"
assert isinstance(first_row["cost"], float), "Cost should be converted to Double/Float"
assert first_row["has_license"] is True, "has_license should be Boolean True"
assert first_row["source_file"] == "chunk3_json", "Source file metadata missing"

# Exact values for both records, including the columns the type checks above
# cannot distinguish from a NULL-producing failed cast.
assert first_row["year"] == 2024, "Year value changed during cast"
assert first_row["cost"] == 1500.5, "Cost value changed during cast"
assert first_row["date"].isoformat() == "2024-01-01", "Date should be converted to a calendar date"
assert second_row["year"] == 2023, "Year value changed during cast"
assert second_row["cost"] == 2000.0, "Integer-looking cost should still become a Double"
assert second_row["has_license"] is False, "has_license should be Boolean False"
assert second_row["date"].isoformat() == "2023-12-31", "Date should be converted to a calendar date"
assert second_row["source_file"] == "chunk3_json", "Source file metadata missing"

print("✅ JSON Logic Test Passed: Serverless compatible and schema validated.")

# Integration & Ingestion Verification

In [0]:
# Integration check against the Bronze table: verifies that rows tagged with
# source_file == "chunk3_json" exist and that none of them have NULLs in the
# "year" / "has_license" columns.
table_full_name = f"{catalog_name}.{schema_name}.bronze_transactions"

# 1. Wait for stream to finish (Safety check)
# In a real notebook, ensure the streaming cell has finished running first.

# Rows from this specific source; reused by both checks below so the
# table/filter definition lives in one place.
json_rows = spark.table(table_full_name).filter(F.col("source_file") == "chunk3_json")

# 2. Assert data exists from this specific source
json_ingest_count = json_rows.count()

# A zero count must fail the cell: with no rows the NULL check below passes
# vacuously and the success message would report 0 records as verified.
assert json_ingest_count > 0, f"Failure: No rows found for source 'chunk3_json' in {table_full_name}"


# 3. Assert no schema corruption
# Auto Loader sometimes puts failed parses into _rescued_data if configured.
# We check if 'has_license' or 'year' contain unexpected NULLs that suggest a hint failure.
null_check = json_rows \
    .filter(F.col("year").isNull() | F.col("has_license").isNull()) \
    .count()

assert null_check == 0, f"Data Quality Error: {null_check} rows have NULL values in hinted columns!"

print(f"✅ Ingestion Verified: {json_ingest_count} records successfully merged into Bronze.")