In [0]:
import sys
sys.path.insert(0, "../utils")
from logger import log_silver_ingestion
from silver_ingestion import run_dimension_pipeline
import pyspark.sql.functions as F
import uuid

In [0]:

# 1. RUN SETUP
# A fresh UUID identifies this execution; it is echoed here and passed to the
# pipeline call in the execution cell.
run_id = str(uuid.uuid4())
print(f"Starting Taxi Licenses Load. Run ID: {run_id}")

# 2. CONFIGURATION (The Contract)
# Everything the dimension pipeline is told about the licenses dataset lives in
# this one mapping, which is handed over as `config`.
licences_config = dict(
    # Identity & locations: dataset label plus the bronze, silver and log tables.
    dataset_name="licenses",
    bronze_table="nyc_taxi.bronze.license_mapping",
    silver_table="nyc_taxi.silver.dim_licenses",
    log_table="nyc_taxi.logs.silver_ingestion_logs",
    # Schema management: column renames (presumably source name -> target name;
    # confirm against run_dimension_pipeline).
    rename_mapping=dict(
        high_volume="high_volume_license_number",
        run_id="bronze_id",
    ),
    # Every listed column is mapped to the "string" type.
    type_mapping={
        column_name: "string"
        for column_name in (
            "high_volume_license_number",
            "license_number",
            "base_name",
            "app_company_affiliation",
            "bronze_id",
        )
    },
    # Logic control: business key and slowly-changing-dimension type.
    business_key="license_number",
    scd_type=2,
    # Columns whose changes should trigger an update.
    # NOTE(review): this list contains the business key itself and leaves out
    # high_volume_license_number -- confirm that this is intended.
    comparison_columns=["license_number", "base_name", "app_company_affiliation"],
)


In [0]:
# 3. EXECUTION
# Hand the Spark session, the licenses config, this run's ID and the logging
# callback to the shared dimension pipeline. According to this notebook's own
# notes, that helper takes care of reading, cleaning, merging and logging.
# `spark` is not defined in this notebook; it is assumed to be supplied by the
# notebook runtime -- TODO confirm.
run_dimension_pipeline(
    spark=spark, config=licences_config,
    run_id=run_id, log_fn=log_silver_ingestion,
)