In [0]:
%run "/Workspace/Users/ruchika.b.mhetre@v4c.ai/vstone_project/vstone_databricks_pipeline/src/notebooks/00_Setup/project_config"

In [0]:
# Preview the beginning of the first XML part file in the chunk directory.
# Part-file names embed a writer-generated id (e.g. "part-00000-tid-...") that
# changes whenever the chunk is rewritten, so the file is discovered at run time
# instead of being hard-coded.
chunk_dir = f"{volume_path}/chunks/chunk4_xml"
xml_part_names = sorted(
    entry.name
    for entry in dbutils.fs.ls(chunk_dir)
    if entry.name.startswith("part-") and entry.name.endswith(".xml")
)
if not xml_part_names:
    raise FileNotFoundError(f"No XML part files found in {chunk_dir}")

# Sorted order puts the lowest-numbered part file (part-00000-...) first.
target_file = f"{chunk_dir}/{xml_part_names[0]}"
# dbutils.fs.head returns only a bounded prefix of the file, as a string.
raw_xml_data = dbutils.fs.head(target_file)
print(raw_xml_data)

# XML Ingestion and Schema Mapping

In [0]:
from pyspark.sql.functions import lit, current_timestamp, col, expr

# Step 1: load the XML chunk directory, using <transaction> as the row tag.
xml_source_dir = f"{volume_path}/chunks/chunk4_xml"
xml_reader = spark.read.format("xml").option("rowTag", "transaction")
df_raw_xml = xml_reader.load(xml_source_dir)


In [0]:
# Step 2: project the raw XML columns onto the Bronze table layout.
# Text fields are cast to string; numeric/boolean fields go through try_cast so
# the projection keeps the same column names with explicit target types.


def _text(name):
    """Return the named column cast to string."""
    return col(name).cast("string")


def _lenient(name, sql_type):
    """Return `try_cast(<name> as <sql_type>)`, aliased back to the column name."""
    return expr(f"try_cast({name} as {sql_type})").alias(name)


# Three date layouts are attempted in order; coalesce keeps the first non-null result.
_date_sql = """
        coalesce(
            try_to_date(date, "yyyy-MM-dd'T'HH:mm:ss'Z'"),
            try_to_date(date, 'dd.MM.yyyy'),
            try_to_date(date, 'yyyy-MM-dd')
        )
    """

# Order of this list is the column order of the resulting frame.
bronze_columns = [
    _text("id"),
    _text("marka"),
    _text("model"),
    _lenient("year", "int"),
    _lenient("cost", "double"),
    _text("currency"),
    _lenient("has_license", "boolean"),
    _text("place"),
    expr(_date_sql).alias("date"),
    _text("engine"),
    _lenient("power", "int"),
    _text("gear"),
    _lenient("probeg", "long"),
    _text("sWheel"),
    _text("transmission"),
    _lenient("R", "int"),
    _lenient("G", "int"),
    _lenient("B", "int"),
    # Load metadata: constant source label plus the load time.
    lit("chunk4_xml").alias("source_file"),
    current_timestamp().alias("load_timestamp"),
]

# Rows without an id are dropped after projection.
has_id = col("id").isNotNull()
df_xml_final = df_raw_xml.select(*bronze_columns).filter(has_id)

In [0]:
# Pre-write verification: count the transformed rows before attempting the write.
# count() is a Spark action, so this evaluates the read + projection once here.
xml_count = df_xml_final.count()
# The check-mark was stored as mojibake ("âœ…", UTF-8 bytes decoded as cp1252);
# restored to the intended character.
print(f"✅ XML Data Prepared: {xml_count} records ready for ingestion.")

In [0]:
# Destination Delta table, assembled from the catalog/schema names in scope.
target_table = f"{catalog_name}.{schema_name}.bronze_transactions"

if xml_count <= 0:
    # Nothing was prepared upstream, so the table is left untouched.
    print("Skipping write: No records found in the XML source.")
else:
    # Append (not overwrite) so rows already in the table, such as the earlier
    # CSV/JSON loads, are preserved.
    # NOTE(review): re-running this cell appends the same rows again — confirm
    # that is acceptable for this pipeline.
    delta_writer = (
        df_xml_final.write
        .format("delta")
        .mode("append")
        .option("mergeSchema", "true")
    )
    delta_writer.saveAsTable(target_table)
    print(f"Success: {xml_count} records appended to {target_table}")

In [0]:
# Post-write check: count the rows in the Bronze table.
# The table name is built from catalog_name/schema_name (the same values the
# write step uses) instead of a hard-coded catalog and schema, so this check
# always inspects the table that was actually written to.
bronze_table = f"{catalog_name}.{schema_name}.bronze_transactions"
display(spark.sql(f"SELECT COUNT(*) FROM {bronze_table}"))