In [0]:
%run ../config/pipeline_config

In [0]:
# =============================================================================
# Bronze Layer - Safe Landing Zone
# =============================================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, current_timestamp, current_date,
    lit, count, when
)
from pyspark.sql.types import *

# Obtain the SparkSession: reuses the active session if there is one,
# otherwise builds a new one with default settings.
spark = SparkSession.builder.getOrCreate()

# Progress message for the CSV read stage (the read itself happens after the
# schema definition below).
print("\nStep 1: Reading raw CSV...")

# -----------------------------------------------------------------------------
# 2. BRONZE SCHEMA - All String
# -----------------------------------------------------------------------------

# Column names of the raw CSV, in file order. Every column is landed as a
# nullable string so that no value is rejected or coerced at the Bronze stage.
raw_column_names = [
    "search_id",
    "user_id",
    "timestamp",
    "city",
    "state",
    "city_tier",
    "pickup_lat",
    "pickup_lng",
    "ride_type",
    "status",
    "error_type",
    "device",
    "app_version",
    "session_duration_sec",
    "is_repeat_search",
]

raw_schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in raw_column_names]
)

# -----------------------------------------------------------------------------
# 3. READ RAW CSV
# -----------------------------------------------------------------------------

# Configure the reader once: explicit all-string schema, first line treated as
# a header, and one record per physical line.
csv_reader = spark.read.schema(raw_schema).options(header="true", multiLine="false")
df_raw = csv_reader.csv(RAW_CSV_PATH)

# count() triggers a full scan of the CSV; the number is kept for the
# post-write verification step.
raw_count = df_raw.count()
print(f"Records read from CSV: {raw_count:,}")

# -----------------------------------------------------------------------------
# 4. ADD AUDIT COLUMNS
# -----------------------------------------------------------------------------

print("\nStep 2: Adding audit columns...")

# Audit columns appended to every raw record, in the order they should appear
# in the Bronze table.
audit_columns = {
    "ingestion_timestamp": current_timestamp(),
    "ingestion_date": current_date(),
    "pipeline_version": lit(PIPELINE_VERSION),
    "source_file": lit(RAW_CSV_PATH),
    "pipeline_layer": lit("BRONZE"),
}

df_bronze = df_raw
for audit_name, audit_value in audit_columns.items():
    df_bronze = df_bronze.withColumn(audit_name, audit_value)

df_bronze.printSchema()

# -----------------------------------------------------------------------------
# 5. UNITY CATALOG SETUP (Only if enabled)
# -----------------------------------------------------------------------------

if STORAGE_MODE == "unity":
    print("\nSetting up Unity Catalog...")
    # Order matters: the catalog must exist and be selected before the schema
    # is created inside it.
    unity_setup_statements = (
        f"CREATE CATALOG IF NOT EXISTS {CATALOG_NAME}",
        f"USE CATALOG {CATALOG_NAME}",
        f"CREATE SCHEMA IF NOT EXISTS {SCHEMA_NAME}",
    )
    for statement in unity_setup_statements:
        spark.sql(statement)

# -----------------------------------------------------------------------------
# 6. WRITE TO DELTA TABLE
# -----------------------------------------------------------------------------

# The arrow is written as an escape so the message does not depend on the
# encoding the notebook file is saved or exported with (it previously showed
# up as mojibake).
print(f"\nWriting Bronze table \u2192 {BRONZE_TABLE_FQN}")

# Persist the Bronze DataFrame as a managed Delta table, partitioned by the
# day of ingestion. The save mode comes from the pipeline configuration.
# NOTE(review): "overwriteSchema" is documented for overwrite-mode writes;
# confirm it is intended to be a no-op when WRITE_MODE appends.
(
    df_bronze.write
    .format("delta")
    .mode(WRITE_MODE)
    .partitionBy("ingestion_date")
    .option("overwriteSchema", "true")
    .saveAsTable(BRONZE_TABLE_FQN)
)

print("Bronze table written successfully!")

# -----------------------------------------------------------------------------
# 7. VERIFY WRITE
# -----------------------------------------------------------------------------

# Re-read the table that was just written and check it against the source.
df_verify = spark.table(BRONZE_TABLE_FQN)
bronze_count = df_verify.count()

# Compare only the rows written by the most recent batch with the source
# count. current_timestamp() yields a single value per query, so all rows of
# one write share the same ingestion_timestamp. Comparing the whole table
# (the previous behavior) reports a false mismatch as soon as the table holds
# more than one appended batch; for an overwritten table both counts agree.
# NOTE(review): if a run writes zero rows in append mode, the latest batch is
# the previous run's — the check then reports NO.
latest_batch = df_verify.agg({"ingestion_timestamp": "max"}).toDF(
    "latest_ingestion_timestamp"
)
batch_count = df_verify.join(
    latest_batch,
    col("ingestion_timestamp") == col("latest_ingestion_timestamp"),
    "inner",
).count()

print(f"Records in Bronze table : {bronze_count:,}")
print(f"Records in latest batch : {batch_count:,}")
print(f"Match with source       : {'YES' if batch_count == raw_count else 'NO'}")

# -----------------------------------------------------------------------------
# 8. SAMPLE OUTPUT
# -----------------------------------------------------------------------------

# Show a handful of rows with a few business columns next to the audit
# columns, without truncating cell values.
sample_columns = [
    "search_id",
    "city",
    "status",
    "ingestion_timestamp",
    "pipeline_version",
]
df_verify.select(*sample_columns).show(5, truncate=False)

# Closing banner for the notebook run.
banner_rule = "=" * 60
print(banner_rule)
print("Bronze Layer Completed Successfully")
print(banner_rule)

In [0]:
# Display the (column name, data type) pairs of the Bronze table as stored,
# to confirm the landed schema.
spark.table(BRONZE_TABLE_FQN).dtypes