In [0]:
import sys
sys.path.insert(0, "../utils")
from logger import log_silver_ingestion
from silver_ingestion import run_dimension_pipeline
import pyspark.sql.functions as F
import uuid

In [0]:
# 1. RUN IDENTIFIER
# A fresh random UUID (as a string) tags this execution; it is announced here
# and handed to the pipeline call in the execution cell.
run_id = str(uuid.uuid4())
print(f"Starting Taxi Zones Load. Run ID: {run_id}")

# 2. CONFIGURATION

# Source column name -> target column name.
# NOTE(review): the source column "run_id" becomes "bronze_id"; presumably this
# keeps the upstream load's id apart from this notebook's own run_id — confirm.
column_renames = {"locationid": "location_id", "run_id": "bronze_id"}

# Target type per column. Keys use the post-rename names from column_renames.
column_types = {
    "location_id": "int",
    "borough": "string",
    "zone": "string",
    "service_zone": "string",
    "bronze_id": "string",
}

# Everything the shared dimension loader needs for the taxi-zones dataset.
taxi_zones_config = {
    # Identity & locations (fully qualified table names)
    "dataset_name": "taxi_zones",
    "bronze_table": "nyc_taxi.bronze.taxi_zones",
    "silver_table": "nyc_taxi.silver.dim_taxi_zones",
    "log_table": "nyc_taxi.logs.silver_ingestion_logs",
    # Schema management
    "rename_mapping": column_renames,
    "type_mapping": column_types,
    # Logic control
    "business_key": "location_id",
    # presumably SCD Type 1 (overwrite, no history) — confirm in silver_ingestion
    "scd_type": 1,
    # Columns intended to trigger an update when their values change
    "comparison_columns": ["borough", "zone", "service_zone"],
}


In [0]:
# 3. EXECUTION
# Delegate the load to the shared helper imported from silver_ingestion.
# Per the original author's note it handles reading, cleaning, merging and
# logging; that behaviour lives in ../utils and is not visible in this notebook.
# NOTE(review): `spark` is not defined in this notebook — presumably the session
# object injected by the notebook runtime; confirm before running elsewhere.
pipeline_args = {
    "spark": spark,
    "config": taxi_zones_config,
    "run_id": run_id,
    "log_fn": log_silver_ingestion,
}

run_dimension_pipeline(**pipeline_args)