# Silver Layer Transformations
## Data Cleaning and Enrichment

This notebook implements the Silver layer transformations:
1. Clean and standardize raw data
2. Apply data quality rules
3. Enrich with additional context
4. Create clean, validated Delta tables

In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import *

# Obtain the Spark session for this notebook (an already-active session is
# reused by getOrCreate, otherwise a new one is started).
spark = (
    SparkSession.builder
    .appName("VehicleMaintenance-Silver")
    .getOrCreate()
)

# Make vehicle_maintenance the current database for subsequent SQL
spark.sql("USE vehicle_maintenance")

In [None]:
# Create silver table if not exists
# The DDL is guarded by IF NOT EXISTS, so re-running this cell does not
# recreate the table. The table is stored in Delta format at the explicit
# LOCATION given below. service_category, labor_cost, parts_cost,
# maintenance_status, next_maintenance_date and processing_timestamp are the
# columns derived by transform_to_silver() in this notebook; the remaining
# columns are presumably carried over from the bronze table (verify against
# the bronze schema).
spark.sql("""
CREATE TABLE IF NOT EXISTS vehicle_maintenance.silver_maintenance (
    vehicle_id STRING,
    maintenance_date DATE,
    service_type STRING,
    service_category STRING,
    mileage LONG,
    cost DOUBLE,
    labor_cost DOUBLE,
    parts_cost DOUBLE,
    technician STRING,
    notes STRING,
    parts_used ARRAY<STRING>,
    maintenance_status STRING,
    next_maintenance_date DATE,
    processing_timestamp TIMESTAMP
)
USING DELTA
LOCATION '/mnt/vehicle-data/delta/silver/maintenance'
""")

# Function to categorize service types
def categorize_service(service_type):
    """Build a column expression mapping a service type to a service category.

    Args:
        service_type: Column holding the raw service type
            (e.g. ``col("service_type")``).

    Returns:
        A Column evaluating to 'Routine', 'Repair' or 'Inspection' for the
        known service types listed below, and to 'Other' for every row that
        matches none of them.
    """
    # Each when() takes (condition, value): the isin(...) call must be closed
    # before the category label, otherwise the label is passed to isin()
    # instead of being used as the result value.
    return (
        when(service_type.isin(['Oil Change', 'Filter Replacement']), 'Routine')
        .when(service_type.isin(['Brake Repair', 'Engine Repair']), 'Repair')
        .when(service_type.isin(['Inspection', 'Diagnostic']), 'Inspection')
        .otherwise('Other')
    )

# Function to transform bronze data to silver
def transform_to_silver():
    """Transform bronze maintenance records and append them to the silver table.

    Reads ``vehicle_maintenance.bronze_maintenance``, derives the silver
    columns (parsed date, service category, estimated labor/parts cost split,
    maintenance status, next maintenance date, processing timestamp) and
    appends the result to ``vehicle_maintenance.silver_maintenance``.

    Returns:
        The transformed (lazy) DataFrame that was written.

    Note:
        The write uses append mode, so every call adds the full bronze
        contents to the silver table again.
    """
    # Read from bronze
    bronze_df = spark.table("vehicle_maintenance.bronze_maintenance")

    # Next service is due six months after the recorded maintenance date.
    # By the time this expression is evaluated, "maintenance_date" has
    # already been replaced by its to_date() value in the chain below.
    next_due = add_months(col("maintenance_date"), 6)

    # Apply transformations. The chain is parenthesized rather than joined
    # with backslashes: a comment after a line-continuation backslash is a
    # syntax error.
    silver_df = (
        bronze_df
        .withColumn("maintenance_date", to_date("maintenance_date"))
        .withColumn("service_category", categorize_service(col("service_type")))
        # Estimated split: 60% of the total cost is attributed to labor
        # and 40% to parts.
        .withColumn("labor_cost", col("cost") * 0.6)
        .withColumn("parts_cost", col("cost") * 0.4)
        .withColumn("maintenance_status",
                    when(current_date() > next_due, "Due")
                    .otherwise("Current"))
        .withColumn("next_maintenance_date", next_due)
        .withColumn("processing_timestamp", current_timestamp())
    )

    # Write to silver table
    (
        silver_df.write.format("delta")
        .mode("append")
        .saveAsTable("vehicle_maintenance.silver_maintenance")
    )

    return silver_df

# Transform data
# A bare `try:` without an except/finally clause does not parse. Failures
# are reported and then re-raised so the notebook stops here instead of
# continuing with `df` undefined.
try:
    df = transform_to_silver()
    print(f"Transformed {df.count()} records to silver layer")
except Exception as exc:
    print(f"Silver layer transformation failed: {exc}")
    raise

## Data Quality Validations

Implement Silver layer data quality checks:
1. Validate date ranges
2. Check cost allocations
3. Verify service categorization
4. Monitor data completeness

In [None]:
# Function to validate silver data quality
def validate_silver_quality(df):
    """Print a data quality report for a silver-layer DataFrame.

    The report covers out-of-range maintenance dates, rows whose labor and
    parts costs do not add up to the total cost, rows categorized as
    "Other", and per-column null counts.

    Args:
        df: DataFrame with at least the maintenance_date, labor_cost,
            parts_cost, cost and service_category columns.

    Returns:
        The total number of rows in ``df``.
    """
    # Dates must lie between 2000-01-01 and today (inclusive)
    maintenance_date = col("maintenance_date")
    earliest_allowed = to_date(lit("2000-01-01"))
    out_of_range = (maintenance_date > current_date()) | (maintenance_date < earliest_allowed)
    invalid_dates = df.filter(out_of_range).count()

    # Labor + parts must match the total cost within a 0.01 tolerance
    allocated_cost = col("labor_cost") + col("parts_cost")
    cost_mismatch = df.filter(abs(allocated_cost - col("cost")) > 0.01).count()

    # Rows that fell through to the catch-all category
    uncategorized = df.filter(col("service_category") == "Other").count()

    # Completeness: row count plus one null counter per column
    total = df.count()
    null_counters = []
    for column_name in df.columns:
        null_marker = when(col(column_name).isNull(), column_name)
        null_counters.append(count(null_marker).alias(f"{column_name}_nulls"))
    null_fields = df.select(null_counters)

    print("Silver Layer Quality Report")
    print(f"Total Records: {total}")
    print(f"Invalid Dates: {invalid_dates}")
    print(f"Cost Calculation Mismatches: {cost_mismatch}")
    print(f"Uncategorized Services: {uncategorized}")
    print("\nNull Value Report:")
    null_fields.show()

    return total

# Run quality validation
# `df` is the DataFrame assigned from transform_to_silver() earlier in this
# notebook; that cell must have run successfully before this one.
total_records = validate_silver_quality(df)
print(f"\nQuality validation completed for {total_records} records")