In [0]:
%run "/Workspace/Users/ruchika.b.mhetre@v4c.ai/vstone_project/vstone_databricks_pipeline/src/notebooks/00_Setup/project_config"

In [0]:
from pyspark.sql.functions import lit, current_timestamp, col, expr

# 2. Re-ingest with multiline support
# Reader settings are collected in one mapping so they can be reviewed at a glance:
#   - header:      first line of the file supplies the column names
#   - multiLine:   a quoted field may span several physical lines
#   - quote/escape both set to '"', so a quote inside a quoted field is escaped by '"'
# NOTE(review): inferSchema makes Spark scan the data to guess types, and every
# column is explicitly re-cast further down — confirm the inference is still wanted.
csv_path = f"{volume_path}/chunks/chunk1_initial"
csv_read_options = {
    "header": "true",
    "inferSchema": "true",
    "multiLine": "true",
    "quote": "\"",
    "escape": "\"",
}
df_csv = spark.read.format("csv").options(**csv_read_options).load(csv_path)

# 3. Robust Date and Type Transformation
# Free-text columns are cast straight to string. Numeric and boolean columns go
# through try_cast so that a value which cannot be converted becomes NULL
# instead of failing the whole load.

# The source mixes several date layouts; coalesce keeps the first pattern that
# parses: ISO timestamp with a literal 'Z', then European dots, then plain ISO date.
chunk1_date_column = expr("""
        coalesce(
            try_to_date(date, "yyyy-MM-dd'T'HH:mm:ss'Z'"),
            try_to_date(date, 'dd.MM.yyyy'),
            try_to_date(date, 'yyyy-MM-dd')
        )
    """).alias("date")

# Lineage columns: which file the rows came from and when they were loaded.
chunk1_source_column = lit("chunk1_initial.csv").alias("source_file")
chunk1_loaded_at_column = current_timestamp().alias("load_timestamp")

# Column order here is the column order of the resulting DataFrame.
chunk1_columns = [
    col("id").cast("string"),
    col("marka").cast("string"),
    col("model").cast("string"),
    expr("try_cast(year as int)").alias("year"),
    expr("try_cast(cost as double)").alias("cost"),
    col("currency").cast("string"),
    expr("try_cast(has_license as boolean)").alias("has_license"),
    col("place").cast("string"),
    chunk1_date_column,
    col("engine").cast("string"),
    chunk1_source_column,
    chunk1_loaded_at_column,
]

df_csv_cleaned = df_csv.select(*chunk1_columns)

# 4. Append to Bronze
# Rows are appended (existing rows are kept) to the Delta table named from the
# catalog/schema supplied by the project configuration; the write is issued
# with Delta's mergeSchema option enabled.
target_table = f"{catalog_name}.{schema_name}.bronze_transactions"

bronze_writer = (
    df_csv_cleaned.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
)
bronze_writer.saveAsTable(target_table)

# NOTE(review): count() runs a separate Spark job over df_csv_cleaned after the
# write has finished, so the source is evaluated again just for this message —
# confirm that cost is acceptable.
ingested_rows = df_csv_cleaned.count()
print(f"‚úÖ Success! Re-ingested {ingested_rows} rows with mixed date formats.")

In [0]:
from pyspark.sql.functions import col

# 2. Source coverage check: every expected source must be present in the bronze
#    table. Uses the 'bronze_main' variable which holds
#    'vstone_project.db_project.bronze_transactions'.
expected_sources = ["chunk1_initial.csv", "chunk3_json", "chunk4_xml"]

# Only the table read is guarded. Previously the whole check sat inside a
# blanket `except Exception`, which also caught the AssertionError raised by a
# failed check: the failure was printed as an "access" error and the notebook
# carried on as if the check had passed.
try:
    sources = spark.table(bronze_main).select("source_file").distinct().collect()
except Exception as e:
    print(f"‚ùå Error accessing table: {e}")
    # Re-raise so a run cannot report success when the table was never read.
    raise

source_list = [row['source_file'] for row in sources]

# 3. Check if every expected source is in our table. All gaps are collected so
#    one failure message names every missing source, not just the first.
missing_sources = [expected for expected in expected_sources if expected not in source_list]
if missing_sources:
    raise AssertionError(
        f"Missing Source Check Failed: {', '.join(missing_sources)} not found!"
    )

print(f"‚úÖ Format Check Passed. Sources found: {source_list}")

In [0]:
from pyspark.sql.functions import col, count, when

# 2. Define critical columns and the null rate (in percent) at which the check fails
critical_columns = ["id", "cost", "marka", "model"]
max_null_percentage = 90

# 3. Perform a single pass aggregation for efficiency
print(f"üîç Analyzing null percentages in {bronze_main}...")

# count() ignores NULLs, so counting a literal that is only produced when the
# column IS NULL gives that column's number of nulls.
null_exprs = [count(when(col(c).isNull(), c)).alias(c) for c in critical_columns]

# The total row count is computed in the same aggregation as the null counts:
# one scan instead of two, and both figures come from the same table snapshot.
total_rows_alias = "__total_rows"
null_counts_row = (
    spark.table(bronze_main)
    .select(count("*").alias(total_rows_alias), *null_exprs)
    .collect()[0]
)
total_count = null_counts_row[total_rows_alias]

# An empty table has no meaningful null percentage; fail with a clear message
# rather than with a ZeroDivisionError from the division below.
if total_count == 0:
    raise AssertionError(
        f"Schema Check Failed: {bronze_main} contains no rows, null percentages cannot be computed."
    )

# 4. Verify results
for column in critical_columns:
    null_count = null_counts_row[column]
    null_percentage = (null_count / total_count) * 100

    # Explicit raise (not `assert`) so the check stays active in every run mode.
    if null_percentage >= max_null_percentage:
        raise AssertionError(
            f"‚ùå Schema Check Failed: Column '{column}' is {null_percentage:.2f}% null!"
        )
    print(f"   ‚úÖ Column '{column}': {null_percentage:.2f}% null (Total nulls: {null_count})")

print(f"\n‚ú® Data Quality Check Passed for {total_count} records.")

In [0]:
%sql
-- Show the column schema plus the extended table metadata of the bronze table.
-- NOTE(review): the table name is hard-coded here, while the Python cells read it
-- from bronze_main / catalog_name + schema_name — confirm they point at the same table.
DESCRIBE TABLE EXTENDED vstone_project.db_project.bronze_transactions;