In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import current_timestamp

# --- Chicago bronze
@dlt.table(
    name="midterm_project.bronze.chicago_bronze",
    comment="Chicago bronze file"
)
def chicago_bronze():
    """Load the raw Chicago TSV export into the bronze table.

    The file is read as a tab-delimited CSV with a header row and
    multi-line field support. Two lineage columns are appended
    (``source_city`` fixed to "Chicago", ``load_dt`` set to the current
    timestamp) and every space in a column name is replaced with an
    underscore.

    Returns:
        DataFrame: the raw rows plus ``source_city`` and ``load_dt``,
        with space-free column names.
    """
    tsv_reader = spark.read.format("csv").options(
        header=True,
        delimiter="\t",
        multiLine=True,
    )
    raw = tsv_reader.load("/Volumes/midterm_project/raw/d_store/Chicago_raw.tsv")

    # Lineage columns: where the rows came from and when they were loaded.
    tagged = raw.withColumn("source_city", lit("Chicago"))
    tagged = tagged.withColumn("load_dt", current_timestamp())

    # Rename positionally so headers containing spaces become "Inspection_ID" etc.
    sanitized_names = [name.replace(" ", "_") for name in tagged.columns]
    return tagged.toDF(*sanitized_names)


In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window

# ============================================
# SILVER LAYER - INSPECTIONS (ONE ROW PER INSPECTION)
# ============================================

@dlt.table(
    name="midterm_project.silver.silver_inspections_chicago",
    comment="Cleaned and validated Chicago food inspection data - CHICAGO ONLY",
    table_properties={
        "quality": "silver",
        "pipelines.autoOptimize.managed": "true",
        "delta.columnMapping.mode": "name"
    }
)
@dlt.expect_all_or_drop({
    "valid_business_name": "BUSINESS_NAME IS NOT NULL",
    "valid_inspection_date": "INSPECTION_DATE IS NOT NULL", 
    "valid_inspection_type": "INSPECTION_TYPE IS NOT NULL",
    "valid_zip_code": "ZIP_CODE IS NOT NULL",
    "valid_zip_format": "CAST(ZIP_CODE AS STRING) RLIKE '^[0-9]+$'",
    "valid_zip_range": "ZIP_CODE BETWEEN 60007 AND 60827",
    "valid_results": "INSPECTION_RESULT IS NOT NULL"
})
def silver_inspections_chicago():
    """Build the silver inspections table: one cleaned row per Chicago inspection.

    Steps:
      1. Read the bronze table, keep rows whose ``Violations`` text is
         non-blank and de-duplicate on (``Inspection_ID``, ``Violations``).
      2. Rename/cast the source columns and derive ``INSPECTION_SCORE``
         and ``VIOLATION_COUNT``.
      3. Normalise the city name and keep only rows that resolve to
         ``CHICAGO``.
      4. Fill missing latitude/longitude with the average for the row's
         ZIP code, falling back to a fixed Chicago-centre coordinate.

    Rows failing any expectation declared on the decorator are dropped.

    Returns:
        DataFrame: the cleaned inspection rows.
    """
    violations = col("Violations")
    results = col("Results")

    # Fixed score per result; any other result (including
    # "Out of Business") has no score and falls through to NULL.
    inspection_score = (
        when(results == "Pass", 90)
        .when(results == "Pass w/ Conditions", 80)
        .when(results == "Fail", 70)
        .when(results == "No Entry", 0)
        .otherwise(lit(None))
        .cast("integer")
    )

    # A row is treated as Chicago when the city is missing, is a
    # repeated-letter / trailing-junk misspelling of CHICAGO, or is one
    # of the known variants the pattern cannot reach.
    # The second pattern rejects values where another word follows
    # (e.g. "CHICAGO HEIGHTS"): those are separate municipalities and
    # must not be folded into Chicago.
    city_raw = col("CITY_RAW")
    is_chicago = (
        city_raw.isNull()
        | (city_raw == "")
        | (
            city_raw.rlike("^C+H+I+C+A+G+O+.*$")
            & ~city_raw.rlike("^C+H+I+C+A+G+O+\\s+[A-Z]")
        )
        | city_raw.isin("312CHICAGO", "CH")
    )

    cleaned = (
        # Fully-qualified name matches the bronze table declaration, so the
        # read does not depend on the pipeline's default catalog/schema.
        spark.read.table("midterm_project.bronze.chicago_bronze")

        # Drop inspections without violations first so the de-duplication
        # below shuffles fewer rows.
        .filter(violations.isNotNull() & (trim(violations) != ""))
        .dropDuplicates(["Inspection_ID", "Violations"])

        # Standardize column names and convert data types
        .select(
            col("Inspection_ID").cast("bigint").alias("INSPECTION_ID"),
            col("DBA_Name").cast("string").alias("BUSINESS_NAME"),
            col("AKA_Name").cast("string").alias("AKA_NAME"),
            # Strip a trailing ".0" before the integer cast
            regexp_replace(col("License_#"), "\\.0$", "").cast("bigint").alias("LICENSE_NUMBER"),
            col("Facility_Type").cast("string").alias("FACILITY_TYPE"),

            # Keep only the "High" / "Medium" / "Low" keyword.
            # NOTE: regexp_extract yields "" (not NULL) when none is present.
            regexp_extract(col("Risk"), "(High|Medium|Low)", 1).alias("RISK_LEVEL"),

            col("Address").cast("string").alias("STREET_ADDRESS"),

            # Temporary upper-cased, trimmed city used for normalisation below
            upper(trim(col("City"))).alias("CITY_RAW"),

            col("State").cast("string").alias("STATE"),

            # Strip a trailing ".0" before the integer cast
            regexp_replace(col("Zip"), "\\.0$", "").cast("int").alias("ZIP_CODE"),

            to_date(col("Inspection_Date"), "yyyy-MM-dd").alias("INSPECTION_DATE"),

            col("Inspection_Type").cast("string").alias("INSPECTION_TYPE"),
            results.cast("string").alias("INSPECTION_RESULT"),
            violations.cast("string").alias("VIOLATIONS_TEXT"),

            # NULL coordinates are filled in after the city filter
            col("Latitude").cast("double").alias("LATITUDE"),
            col("Longitude").cast("double").alias("LONGITUDE"),

            inspection_score.alias("INSPECTION_SCORE"),

            # Violations are "|"-separated; blank text was filtered out above,
            # so no empty-input guard is needed here.
            size(split(violations, "\\|")).cast("integer").alias("VIOLATION_COUNT"),

            # Metadata
            lit("chicago").alias("SOURCE_SYSTEM"),
            current_timestamp().alias("INGESTION_TIMESTAMP")
        )

        # Normalise the city, then keep Chicago rows only. Non-Chicago
        # values keep their raw text and are removed by the filter.
        .withColumn("CITY", when(is_chicago, lit("CHICAGO")).otherwise(city_raw))
        .drop("CITY_RAW")
        .filter(col("CITY") == "CHICAGO")
    )

    # Missing coordinates: use the ZIP-code average, then the Chicago centre.
    by_zip = Window.partitionBy("ZIP_CODE")
    return (
        cleaned
        .withColumn(
            "LATITUDE",
            coalesce(col("LATITUDE"), avg(col("LATITUDE")).over(by_zip), lit(41.8781)),
        )
        .withColumn(
            "LONGITUDE",
            coalesce(col("LONGITUDE"), avg(col("LONGITUDE")).over(by_zip), lit(-87.6298)),
        )
    )


# ============================================
# SILVER LAYER - VIOLATIONS (ONE ROW PER VIOLATION)
# ============================================

@dlt.table(
    name="midterm_project.silver.silver_violations_chicago",
    comment="Normalized Chicago violations - ONE ROW PER VIOLATION per inspection - CHICAGO ONLY",
    table_properties={
        "quality": "silver",
        "pipelines.autoOptimize.managed": "true",
        "delta.columnMapping.mode": "name"
    }
)
@dlt.expect_or_drop("valid_violation_description", "VIOLATION_DESCRIPTION IS NOT NULL AND VIOLATION_DESCRIPTION != ''")
def silver_violations_chicago():
    """Explode each inspection's violation text into one row per violation.

    Reads ``silver_inspections_chicago``, splits ``VIOLATIONS_TEXT`` on
    "|", and for every non-blank piece extracts the leading numeric code
    (``VIOLATION_CODE``) and the text between that code and an optional
    "- Comment(s):" suffix (``VIOLATION_DESCRIPTION``). The result is
    de-duplicated on (``INSPECTION_ID``, ``VIOLATION_CODE``).

    Returns:
        DataFrame: the inspection attributes repeated for each violation,
        plus ``VIOLATION_CODE`` and ``VIOLATION_DESCRIPTION``.
    """
    # Inspection attributes carried through unchanged, in output order.
    inspection_columns = [
        "INSPECTION_ID",
        "BUSINESS_NAME",
        "AKA_NAME",
        "LICENSE_NUMBER",
        "FACILITY_TYPE",
        "RISK_LEVEL",
        "STREET_ADDRESS",
        "CITY",
        "STATE",
        "ZIP_CODE",
        "INSPECTION_DATE",
        "INSPECTION_TYPE",
        "INSPECTION_RESULT",
        "INSPECTION_SCORE",
        "LATITUDE",
        "LONGITUDE",
    ]

    text = col("VIOLATIONS_TEXT")
    inspections = (
        dlt.read("midterm_project.silver.silver_inspections_chicago")
        .filter(text.isNotNull() & (trim(text) != ""))
        .select(*inspection_columns, "VIOLATIONS_TEXT")
    )

    # One row per "|"-separated piece, trimmed, with blank pieces removed.
    piece = col("INDIVIDUAL_VIOLATION")
    per_violation = (
        inspections
        .withColumn("INDIVIDUAL_VIOLATION", explode(split(text, "\\|")))
        .withColumn("INDIVIDUAL_VIOLATION", trim(piece))
        .filter(piece != "")
    )

    # Leading "<digits>." is the code; the description runs up to an
    # optional "- Comment:" / "- Comments:" marker or the end of the piece.
    code = regexp_extract(piece, "^(\\d+)\\.", 1).cast("int")
    description = trim(
        regexp_extract(piece, "^\\d+\\.\\s*(.+?)(?=\\s*-\\s*Comment[s]?:|$)", 1)
    ).cast("string")

    parsed = (
        per_violation
        .withColumn("VIOLATION_CODE", code)
        .withColumn("VIOLATION_DESCRIPTION", description)
    )

    return (
        parsed
        .select(*inspection_columns, "VIOLATION_CODE", "VIOLATION_DESCRIPTION")
        .dropDuplicates(["INSPECTION_ID", "VIOLATION_CODE"])
    )