In [0]:
%run "/Workspace/Users/ruchika.b.mhetre@v4c.ai/vstone_project/vstone_databricks_pipeline/src/notebooks/00_Setup/project_config"

In [0]:
from pyspark.sql import Row
from pyspark.sql.functions import col, expr
from datetime import date

# 1. Build mock rows shaped like a raw XML read, where every field arrives as a string.
#    XML_01 is the well-formed record; XML_02 carries the awkward values
#    (non-numeric year, "0" as a boolean, dotted date format).
mock_xml_raw = spark.createDataFrame([
    Row(id="XML_01", marka="BMW", model="X5", year="2022", cost="60000.50",
        has_license="true", date="2023-01-15T10:00:00Z", power="300",
        probeg="5000", R="255", G="0", B="0"),
    Row(id="XML_02", marka="Audi", model="A6", year="bad_year", cost="55000",
        has_license="0", date="12.12.2022", power="250",
        probeg="12000", R="0", G="255", B="0")
])

# 2. Apply the casting logic under test.
#    try_cast / try_to_date are used so malformed values become NULL instead of failing the job.
#    NOTE(review): this is presumably a copy of the XML ingestion transformation; confirm it
#    stays in sync with the pipeline. marka, model, probeg, G and B exist in the mock but are
#    not projected here, so this test does not exercise them.
df_xml_test = mock_xml_raw.select(
    col("id").cast("string"),
    expr("try_cast(year as int)").alias("year"),
    expr("try_cast(cost as double)").alias("cost"),
    expr("try_cast(has_license as boolean)").alias("has_license"),
    # Try each supported date layout in turn; the first one that parses wins.
    expr("""
        coalesce(
            try_to_date(date, "yyyy-MM-dd'T'HH:mm:ss'Z'"),
            try_to_date(date, 'dd.MM.yyyy'),
            try_to_date(date, 'yyyy-MM-dd')
        )
    """).alias("date"),
    expr("try_cast(power as int)").alias("power"),
    expr("try_cast(R as int)").alias("R")
)

# 3. Assertions
# Sort by id so the positional lookups below do not depend on the order collect() returns rows.
results = sorted(df_xml_test.collect(), key=lambda row: row["id"])
assert len(results) == 2, f"Failed: expected 2 transformed rows, got {len(results)}"
clean_row, messy_row = results  # XML_01 (well-formed), XML_02 (edge cases)

# Test Case: ISO Timestamp to Date
assert clean_row["date"] == date(2023, 1, 15), "Failed: ISO XML date conversion"

# Test Case: Dotted Format to Date
assert messy_row["date"] == date(2022, 12, 12), "Failed: Dotted XML date conversion"

# Test Case: Year casting (valid value kept, invalid value nulled)
assert clean_row["year"] == 2022, "Failed: Numeric year string to int"
assert messy_row["year"] is None, "Failed: Non-numeric year should be Null"

# Test Case: Cost casting (with and without a fractional part)
assert clean_row["cost"] == 60000.5, "Failed: Decimal cost string to double"
assert messy_row["cost"] == 55000.0, "Failed: Integer cost string to double"

# Test Case: Boolean Mapping ('true' and '0' are both valid boolean spellings)
assert clean_row["has_license"] is True, "Failed: 'true' string to Boolean"
assert messy_row["has_license"] is False, "Failed: '0' string to Boolean"

# Test Case: Power casting
assert clean_row["power"] == 300, "Failed: power casting"
assert messy_row["power"] == 250, "Failed: power casting"

# Test Case: RGB Integers
assert clean_row["R"] == 255, "Failed: RGB color casting"
assert messy_row["R"] == 0, "Failed: RGB color casting"

print("✅ XML Logic Transformation Test: PASSED")

In [0]:
# Post-ingestion verification of the bronze table: checks that the XML chunk landed,
# that the new columns exist, that 'power' has the expected type, and that no XML row has a NULL id.
target_table = f"{catalog_name}.{schema_name}.bronze_transactions"

# Resolve the table once and reuse the same DataFrame for every check below.
bronze_df = spark.table(target_table)

# 1. Assert the source exists in the table
# Column access via the DataFrame keeps this cell independent of imports made in other cells.
xml_records = bronze_df.filter(bronze_df["source_file"] == "chunk4_xml")
record_count = xml_records.count()
assert record_count > 0, f"Failure: No records found for 'chunk4_xml' in {target_table}"

# 2. Assert Schema Evolution (new columns should exist)
actual_columns = bronze_df.columns
expected_new_cols = ["power", "probeg", "R", "G", "B"]

# Gather every missing column so a single failure reports the full gap, not just the first one.
missing_cols = [name for name in expected_new_cols if name not in actual_columns]
assert not missing_cols, f"Schema Error: Columns {missing_cols} were not added to the Delta table!"

# 3. Data Quality Check: 'power' must be stored as an int column, not as a string
power_dtype = dict(bronze_df.dtypes)["power"]
assert power_dtype == 'int', f"Type Error: 'power' column is {power_dtype}, expected int"

# 4. Filter Check: ID should never be null
# NOTE(review): presumably the ingestion step filters out NULL ids; confirm against the pipeline.
null_id_count = xml_records.filter(xml_records["id"].isNull()).count()
assert null_id_count == 0, "Logic Error: Found NULL IDs in XML ingestion despite filter."

print(f"✅ XML Ingestion Verification: PASSED ({record_count} records verified)")