In [0]:
# -----------------------------------------
# STEP 1: Job Parameters
# -----------------------------------------
# Resolves the optional `start_date` / `end_date` widgets (YYYY-MM-DD) into the
# inclusive date range [start_date, end_date] that STEP 2 iterates over.
#   * start_date blank -> today's date
#   * end_date blank   -> same as start_date (single-day run)

from datetime import datetime, timedelta  # timedelta is used by the STEP 2 loop

DATE_FORMAT = "%Y-%m-%d"


def _parse_date_param(name, raw, default):
    """Convert one widget value into a ``datetime.date``.

    Args:
        name: Widget name; used only to build the error message.
        raw: Widget text, already stripped of surrounding whitespace.
        default: Date returned when ``raw`` is empty.

    Returns:
        The parsed date, or ``default`` when ``raw`` is empty.

    Raises:
        ValueError: If ``raw`` is non-empty and not in YYYY-MM-DD format.
    """
    if not raw:
        return default
    try:
        return datetime.strptime(raw, DATE_FORMAT).date()
    except ValueError as exc:
        # Same exception type strptime raises, but naming the offending widget.
        raise ValueError(
            f"Widget '{name}' must be a date in YYYY-MM-DD format, got {raw!r}"
        ) from exc


dbutils.widgets.text("start_date", "")
dbutils.widgets.text("end_date", "")

# Strip surrounding whitespace so " 2024-01-05 " parses and a value made only
# of spaces is treated as "not provided" instead of failing in strptime.
start_date_str = (dbutils.widgets.get("start_date") or "").strip()
end_date_str = (dbutils.widgets.get("end_date") or "").strip()

# NOTE(review): datetime.now() follows the cluster's clock/timezone, which may
# not match the timezone used to name the daily extract folders — confirm.
start_date = _parse_date_param("start_date", start_date_str, datetime.now().date())
end_date = _parse_date_param("end_date", end_date_str, start_date)

# STEP 2 loops `while d <= end_date`; an inverted range would make it process
# nothing and report success, so fail loudly here instead.
if end_date < start_date:
    raise ValueError(
        f"end_date ({end_date}) must not be earlier than start_date ({start_date})"
    )

print(f"Processing from {start_date} to {end_date}")


In [0]:
# -----------------------------------------
# STEP 2: Bronze Ingestion Logic
# -----------------------------------------
# Walks every calendar day in [start_date, end_date] (both set in STEP 1),
# reads each *.txt file found in that day's volume folder as a headered CSV,
# tags the rows with lineage columns, and appends them to a Delta table whose
# name is derived from the file name.

import glob
import os

from pyspark.sql import functions as F

catalog = "calgary_transit"
schema = "bronze"
table_prefix = "gtfs_"

# Single column expression reused for the `_ingest_ts` column of every file.
ingest_ts = F.current_timestamp()

one_day = timedelta(days=1)

d = start_date
while d <= end_date:
    date_str = d.strftime("%Y-%m-%d")
    extract_dir = f"/Volumes/calgary_transit/bronze/bronze_vol/calgary_gtfs/{date_str}/"
    txt_files = glob.glob(os.path.join(extract_dir, "*.txt"))

    if not txt_files:
        # Nothing to load for this day; the loop below is a no-op on an empty list.
        print(f"⚠️ No files found for {date_str}")

    # Sorted so files are processed in a stable, name-based order.
    for txt_path in sorted(txt_files):
        # "<dir>/stops.txt" -> "stops" -> "<catalog>.<schema>.gtfs_stops"
        stem, _ = os.path.splitext(os.path.basename(txt_path))
        table_name = f"{catalog}.{schema}.{table_prefix}{stem}"

        # NOTE(review): inferSchema lets Spark pick column types per file, so
        # ID-like text columns may be typed as numbers — confirm this is wanted.
        raw_rows = spark.read.options(
            header=True,
            inferSchema=True,
            mode="PERMISSIVE",
        ).csv(txt_path)

        # Lineage columns: logical date, load timestamp, and originating file.
        df = (
            raw_rows
            .withColumn("_ingest_date", F.lit(date_str))
            .withColumn("_ingest_ts", ingest_ts)
            .withColumn("_source_file", F.lit(txt_path))
        )

        # NOTE(review): append mode means re-running the same date writes the
        # same rows again — confirm reruns are handled downstream.
        delta_writer = df.write.format("delta").mode("append")
        delta_writer.option("mergeSchema", "true").saveAsTable(table_name)

        print(f"✅ {date_str}: appended -> {table_name}")

    d += one_day
