# Bronze Layer Data Ingestion
## Raw Data Ingestion to Delta Lake

This notebook implements the Bronze layer of our medallion architecture:
1. Read data from source systems (JSON files, APIs, databases)
2. Apply minimal transformations (parsing, type conversion)
3. Store raw data in Delta format with metadata
4. Track data lineage and ingestion timestamps

In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import *
import json
from datetime import datetime

# Create (or reuse) the Spark session used by the Bronze ingestion notebook
spark = (
    SparkSession.builder
    .appName("VehicleMaintenance-Bronze")
    .getOrCreate()
)

# Switch the session's current database to vehicle_maintenance
spark.sql("USE vehicle_maintenance")

In [None]:
# Schema applied when reading raw maintenance records.
# Each (name, type) pair becomes a nullable StructField, in the order listed.
maintenance_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("vehicle_id", StringType()),
        ("maintenance_date", TimestampType()),
        ("service_type", StringType()),
        ("mileage", LongType()),
        ("cost", DoubleType()),
        ("technician", StringType()),
        ("notes", StringType()),
        ("parts_used", ArrayType(StringType())),
    ]
])

# Function to ingest raw data from JSON files
def ingest_maintenance_records(source_path):
    """Load raw maintenance JSON and append it to the bronze Delta table.

    Reads ``source_path`` as JSON using ``maintenance_schema``, adds an
    ``ingestion_timestamp`` column populated with ``current_timestamp()``,
    and appends the result to ``vehicle_maintenance.bronze_maintenance``
    in Delta format.

    Args:
        source_path: Path (or glob) handed to the Spark JSON reader.

    Returns:
        The DataFrame that was written, including ``ingestion_timestamp``.
    """
    # Read with the explicit schema rather than letting Spark infer one
    reader = spark.read.schema(maintenance_schema)
    stamped_df = reader.json(source_path).withColumn(
        "ingestion_timestamp", current_timestamp()
    )

    # Append to the bronze table
    writer = stamped_df.write.format("delta").mode("append")
    writer.saveAsTable("vehicle_maintenance.bronze_maintenance")

    return stamped_df

# Example ingestion (replace with your source path)
source_path = "/mnt/vehicle-data/raw/maintenance/*.json"
try:
    df = ingest_maintenance_records(source_path)
except Exception as exc:
    # Report which source failed, then re-raise so the run stops here
    # instead of continuing to later steps with `df` undefined.
    print(f"Bronze ingestion failed for {source_path}: {exc}")
    raise
else:
    # Only reached when ingestion succeeded, so `df` is guaranteed to exist
    print(f"Ingested {df.count()} records to bronze layer")

## Data Quality Checks

Implement basic data quality checks on the ingested data:
1. Schema validation
2. Null checks for required fields
3. Data type validation
4. Record count monitoring
5. Range checks (future maintenance dates, negative mileage or cost)

In [None]:
# Function to perform data quality checks
def check_data_quality(df):
    """Print a data quality report for ``df`` and return its row count.

    The report contains the total row count, a per-column null count for
    ``vehicle_id``, ``maintenance_date``, ``service_type`` and ``mileage``,
    the number of rows whose ``maintenance_date`` is later than the current
    timestamp, and the number of rows with negative ``mileage`` or ``cost``.

    Args:
        df: DataFrame containing the maintenance record columns named above.

    Returns:
        The total number of rows in ``df``.
    """
    required_fields = ["vehicle_id", "maintenance_date", "service_type", "mileage"]

    row_count = df.count()

    # One "<field>_nulls" column per required field, counting rows where it is null
    null_count_exprs = [
        count(when(col(field).isNull(), field)).alias(f"{field}_nulls")
        for field in required_fields
    ]
    null_summary = df.select(null_count_exprs)

    # Maintenance dates that lie in the future
    future_dated = df.where(col("maintenance_date") > current_timestamp()).count()

    # Negative numeric values
    negative_mileage = df.where(col("mileage") < 0).count()
    negative_cost = df.where(col("cost") < 0).count()

    print("Data Quality Report:")
    print(f"Total Records: {row_count}")
    null_summary.show()
    print(f"Future Dates: {future_dated}")
    print(f"Invalid Mileage Records: {negative_mileage}")
    print(f"Invalid Cost Records: {negative_cost}")

    return row_count

# Run quality checks on the DataFrame produced by the ingestion step;
# check_data_quality prints the report and returns the total record count.
total_records = check_data_quality(df)
print(f"\nData quality checks completed for {total_records} records")