In [1]:
"""
Configuration and environment setup for Bronze layer ingestion.
"""
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
import logging
import os
import sys


# ============================================================================
# PIPELINE CONFIGURATION
# ============================================================================
@dataclass
class PipelineConfig:
    """Centralized configuration for the Bronze ingestion pipeline."""

    # Pipeline metadata
    pipeline_name: str = "nyc_yellow_taxi_bronze_ingestion"
    pipeline_version: str = "1.3.0"  # Updated for AMD EPYC optimization
    environment: str = field(default_factory=lambda: os.getenv("ENV", "development"))

    # Spark configuration - AMD EPYC 7742 (64 cores) + 128GB RAM
    # Reserve 8 cores for Linux GUI + OS, use 56 for Spark
    app_name: str = "NYCTaxiBronze"
    driver_memory: str = "16g"    # Driver just coordinates
    executor_memory: str = "80g"  # Executor does heavy lifting
    executor_cores: int = 56      # 64 - 8 reserved for OS/GUI
    parallelism: int = 56         # Match available cores
    shuffle_partitions: int = 112 # 2x cores for shuffle operations
    
    # Batch processing - process files in chunks to avoid OOM
    batch_size: int = 20  # Process 20 files at a time

    # Data paths (will be set from settings)
    source_dir: Optional[Path] = None
    target_dir: Optional[Path] = None

    # Processing options
    write_mode: str = "overwrite"  # overwrite, append
    partition_by: List[str] = field(default_factory=lambda: ["year"])
    enable_data_quality_checks: bool = True

    # Schema version detection by year
    # V1: 2009 (vendor_name, Trip_Pickup_DateTime)
    # V2: 2010 (vendor_id, pickup_datetime) 
    # V3: 2011+ (VendorID, tpep_pickup_datetime)
    v1_years: List[str] = field(default_factory=lambda: ["2009"])
    v2_years: List[str] = field(default_factory=lambda: ["2010"])
    # All other years are V3 (new format)

    def __post_init__(self):
        """Validate configuration after initialization."""
        assert self.write_mode in ["overwrite", "append"], (
            f"Invalid write_mode: {self.write_mode}"
        )
        assert self.driver_memory.endswith("g"), (
            "Memory should be specified in gigabytes (e.g., '8g')"
        )


# ============================================================================
# LOGGING SETUP
# ============================================================================
def setup_logging(config: PipelineConfig) -> logging.Logger:
    """Configure logging for the pipeline."""
    logger = logging.getLogger(config.pipeline_name)
    logger.setLevel(logging.INFO)

    # Clear existing handlers
    logger.handlers = []

    # Console handler with formatting
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    return logger


# Initialize configuration and logger
config = PipelineConfig()
logger = setup_logging(config)

logger.info("=" * 60)
logger.info(f"Pipeline: {config.pipeline_name} v{config.pipeline_version}")
logger.info(f"Environment: {config.environment}")
logger.info(f"Started at: {datetime.now().isoformat()}")
logger.info(f"Memory: Driver={config.driver_memory}, Executor={config.executor_memory}")
logger.info(f"CPU: {config.executor_cores}/64 cores (8 reserved for OS/GUI)")
logger.info(f"Batch size: {config.batch_size} files per batch")
logger.info("Schema versions: V1 (2009), V2 (2010), V3 (2011+)")
logger.info("=" * 60)

2025-11-29 15:21:30 | INFO     | Pipeline: nyc_yellow_taxi_bronze_ingestion v1.3.0
2025-11-29 15:21:30 | INFO     | Environment: development
2025-11-29 15:21:30 | INFO     | Started at: 2025-11-29T15:21:30.015174
2025-11-29 15:21:30 | INFO     | Memory: Driver=16g, Executor=80g
2025-11-29 15:21:30 | INFO     | CPU: 56/64 cores (8 reserved for OS/GUI)
2025-11-29 15:21:30 | INFO     | Batch size: 20 files per batch
2025-11-29 15:21:30 | INFO     | Schema versions: V1 (2009), V2 (2010), V3 (2011+)


In [2]:
"""
Spark Session initialization with Delta Lake support.

⚠️ IMPORTANT: You MUST restart the kernel for memory/CPU changes to take effect!
   Kernel → Restart Kernel, then run all cells from the beginning.

Optimized for: AMD EPYC 7742 (64 cores) + 128GB RAM
"""
from delta.pip_utils import configure_spark_with_delta_pip
import pyspark.sql


def create_spark_session(config: PipelineConfig) -> pyspark.sql.SparkSession:
    """
    Create and configure Spark session with Delta Lake support.

    Args:
        config: Pipeline configuration object

    Returns:
        Configured SparkSession
    """
    logger.info("Initializing Spark session...")

    builder = (
        pyspark.sql.SparkSession.builder.appName(config.app_name)
        # Delta Lake extensions
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config(
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        )
        # Memory configuration
        .config("spark.driver.memory", config.driver_memory)
        .config("spark.executor.memory", config.executor_memory)
        .config("spark.driver.maxResultSize", "4g")
        .config("spark.memory.fraction", "0.8")
        .config("spark.memory.storageFraction", "0.2")
        # Off-heap memory for extra headroom
        .config("spark.memory.offHeap.enabled", "true")
        .config("spark.memory.offHeap.size", "16g")
        # CPU/Core configuration - AMD EPYC 7742 optimization
        .config("spark.executor.cores", str(config.executor_cores))
        .config("spark.default.parallelism", str(config.parallelism))
        .config("spark.sql.shuffle.partitions", str(config.shuffle_partitions))
        # Performance tuning
        .config("spark.sql.parquet.mergeSchema", "false")
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
        .config("spark.sql.adaptive.skewJoin.enabled", "true")
        # Delta optimizations
        .config("spark.databricks.delta.optimizeWrite.enabled", "true")
        .config("spark.databricks.delta.autoCompact.enabled", "true")
        # File handling
        .config("spark.sql.files.maxPartitionBytes", "128m")
    )

    spark = configure_spark_with_delta_pip(builder).getOrCreate()

    logger.info(f"Spark version: {spark.version}")
    logger.info(f"Driver memory: {config.driver_memory}")
    logger.info(f"Executor memory: {config.executor_memory}")
    logger.info(f"Executor cores: {config.executor_cores}")
    logger.info(f"Default parallelism: {config.parallelism}")
    logger.info(f"Shuffle partitions: {config.shuffle_partitions}")

    return spark


# Create Spark session
spark = create_spark_session(config)

2025-11-29 15:21:39 | INFO     | Initializing Spark session...


:: loading settings :: url = jar:file:/home/administrator/Desktop/datascience/github/nyc-taxi-eta/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/administrator/.ivy2.5.2/cache
The jars for the packages stored in: /home/administrator/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-61bf43bc-69c7-4a50-b20b-deb7319d3258;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 151ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|               

2025-11-29 15:21:44 | INFO     | Spark version: 4.0.0
2025-11-29 15:21:44 | INFO     | Driver memory: 16g
2025-11-29 15:21:44 | INFO     | Executor memory: 80g
2025-11-29 15:21:44 | INFO     | Executor cores: 56
2025-11-29 15:21:44 | INFO     | Default parallelism: 56
2025-11-29 15:21:44 | INFO     | Shuffle partitions: 112


In [4]:
df = spark.read.format("delta").load("/home/administrator/Desktop/datascience/github/nyc-taxi-eta/data/bronze/yellow_taxi")
df.show()

25/11/29 15:24:00 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+---------+-------------------+-------------------+---------------+-------------+----------------+---------------+-----------------+----------------+---------+------------------+------------+-----------+-----+-------+----------+------------+------------+------------+------------+---------------------+--------------------+-----------+------------------+--------------------+----+
|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|rate_code|store_and_fwd_flag|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|total_amount|pulocationid|dolocationid|improvement_surcharge|congestion_surcharge|airport_fee|cbd_congestion_fee|         source_file|year|
+---------+-------------------+-------------------+---------------+-------------+----------------+---------------+-----------------+----------------+---------+------------------+------------+-----------+-----+-------+----------+----------

In [6]:
# unique vendor ids to list
vendor_ids = df.select("vendor_id").distinct().rdd.flatMap(lambda x: x).collect()
print(vendor_ids)

[Stage 15:>                                                         (0 + 1) / 1]

['2', '1', '3', '5', '4', '6', '7', 'VTS', 'DDS', 'CMT']


                                                                                