In [0]:
%run "/Workspace/Users/ruchika.b.mhetre@v4c.ai/vstone_project/vstone_databricks_pipeline/src/notebooks/00_Setup/project_config"

In [0]:
import re
from pyspark.sql import functions as F

# Schema Initialization (DDL)

In [0]:
%sql

-- Create the Bronze landing table that the COPY INTO cell loads.
-- NOTE(review): the catalog and schema are hard-coded here, while the Python
-- cells build the table name from catalog_name / schema_name -- confirm both
-- resolve to the same table.
USE CATALOG vstone_project;
USE SCHEMA db_project;

-- Dropping first makes this cell re-runnable, but every run discards the rows
-- already stored in the table.
DROP TABLE IF EXISTS bronze_transactions;

-- Explicitly define the schema; the ingestion cell casts the source columns to these types.
CREATE TABLE bronze_transactions (
  id STRING,
  marka STRING,
  model STRING,
  year INT,                -- loaded with try_cast, so values that are not valid integers become NULL
  cost DOUBLE,             -- loaded with try_cast, so values that are not valid numbers become NULL
  currency STRING,  
  has_license BOOLEAN, -- loaded with try_cast, so unrecognised values become NULL
  place STRING,
  date DATE,         -- parsed from several text formats by the ingestion cell; NULL when none match
  engine STRING,
  source_file STRING,      -- filled from _metadata.file_name by the ingestion cell
  load_timestamp TIMESTAMP -- filled from current_timestamp() by the ingestion cell
)
USING DELTA;

# Ingestion via COPY INTO

In [0]:
# Load the first CSV chunk into the Bronze table with COPY INTO.
# volume_path, catalog_name and schema_name are expected to come from the
# project_config notebook pulled in by the %run cell at the top.
source_location = f"{volume_path}/chunks/chunk1_initial"
target_table = f"{catalog_name}.{schema_name}.bronze_transactions"

# NOTE(review): with 'inferSchema' = 'true' the CSV reader types each column
# before the casts below run; presumably a numeric-looking id would lose any
# leading zeros before id::string is applied -- confirm against the source data.
copy_result = spark.sql(f"""
COPY INTO {target_table}
FROM (
  SELECT
    id::string,
    marka::string,
    model::string,
    try_cast(year AS int) AS year,
    try_cast(cost AS double) AS cost,
    currency::string,
    try_cast(has_license AS boolean) AS has_license,
    place::string,
    -- Multi-format Date Logic:
    CAST(COALESCE(
      try_to_timestamp(date, 'dd.MM.yyyy'),
      try_to_timestamp(date, "yyyy-MM-dd'T'HH:mm:ss'Z'"),
      try_to_timestamp(date, 'yyyy-MM-dd')
    ) AS DATE) AS date,
    engine::string,
    _metadata.file_name AS source_file,
    current_timestamp() AS load_timestamp
  FROM '{source_location}'
)
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true', 'mergeSchema' = 'true')
COPY_OPTIONS ('mergeSchema' = 'true')
""")

# COPY INTO succeeds even when it loads nothing (no matching files, or files
# that were already ingested), so show the load metrics it returns instead of
# relying on the success message alone.
display(copy_result)

print("Success! Bronze ingestion now handles multiple date formats and malformed years.")

# Data Quality Audit

In [0]:
from pyspark.sql import functions as F

# Read back the Bronze table to audit how well the date column was parsed.
df_bronze = spark.table(f"{catalog_name}.{schema_name}.bronze_transactions")

# Check rows where the date was successfully parsed
display(df_bronze.filter(F.col("date").isNotNull()).limit(10))

# Compute both counts in a single aggregation so the table is scanned once and
# the two numbers come from the same snapshot. count() over a when() without an
# otherwise() only counts the matching rows and yields 0 (not NULL) when the
# table is empty.
audit_counts = df_bronze.agg(
    F.count(F.lit(1)).alias("total_rows"),
    F.count(F.when(F.col("date").isNull(), 1)).alias("failed_dates"),
).first()

total_rows = audit_counts["total_rows"]
# A NULL date means none of the ingestion formats matched, or the source value was missing.
failed_dates = audit_counts["failed_dates"]

print(f"Total Rows: {total_rows}")
print(f"Rows with unparseable dates: {failed_dates}")

# Delta History

In [0]:
%sql
-- List the recorded operations on the Bronze Delta table so the
-- load performed above can be checked in the table history.
DESCRIBE HISTORY vstone_project.db_project.bronze_transactions;