Staging Setup (DDL): Vor dem Full Load wurde die Staging-Tabelle stg_yellow_trips per DDL gelöscht und neu erstellt (DROP TABLE IF EXISTS + CREATE TABLE), um einen sauberen Startpunkt sicherzustellen; bereits geladene Daten gehen dabei verloren. Datentypen wurden explizit definiert, um fehlerhafte automatische Typ-Inferenz zu vermeiden. Zusätzlich wurden Indizes auf tpep_pickup_datetime, ein kombinierter Index auf (pulocationid, dolocationid) sowie ein Index auf source_file erstellt, um spätere Zeit-/Geo-Analysen sowie idempotente Ladeprozesse performant zu unterstützen.

In [1]:
## DB SetUp and SetUp Check
## For additional help, check the .env.example file.

import os
from urllib.parse import quote_plus

from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Read the connection settings from the local .env file into the process environment.
load_dotenv()

PG_USER = os.getenv("POSTGRES_USER")
PG_PASS_RAW = os.getenv("POSTGRES_PASS")
PG_HOST = os.getenv("POSTGRES_HOST")
# `or` instead of a getenv default: an empty POSTGRES_PORT= entry also falls back to 5432.
PG_PORT = os.getenv("POSTGRES_PORT") or "5432"
PG_DB   = os.getenv("POSTGRES_DB")
PG_SCHEMA = os.getenv("POSTGRES_SCHEMA", "public")

# Collect every unset/empty setting so that a single error names all of them.
required_settings = {
    "POSTGRES_USER": PG_USER,
    "POSTGRES_PASS": PG_PASS_RAW,
    "POSTGRES_HOST": PG_HOST,
    "POSTGRES_DB": PG_DB,
    "POSTGRES_SCHEMA": PG_SCHEMA,
}
missing = [name for name, value in required_settings.items() if not value]
if missing:
    raise ValueError(f"Missing env vars: {missing}")

# URL-encoded password; kept for any later cell that still assembles a URL string by hand.
PG_PASS = quote_plus(PG_PASS_RAW)

# URL.create() takes the raw values and does the escaping itself, so special
# characters ('@', ':', '/') are safe in the user name as well as in the password.
# int() rejects a non-numeric POSTGRES_PORT right here with a ValueError.
db_url = URL.create(
    "postgresql+psycopg2",
    username=PG_USER,
    password=PG_PASS_RAW,
    host=PG_HOST,
    port=int(PG_PORT),
    database=PG_DB,
)
# `url` stays a plain string (including the password) for backward compatibility.
url = db_url.render_as_string(hide_password=False)
engine = create_engine(db_url, future=True)

with engine.connect() as conn:
    print(conn.execute(text("SELECT current_user, current_database(), current_schema();")).fetchone())
    # current_schema() reports the session's search_path schema, not PG_SCHEMA,
    # so the configured schema is checked explicitly. to_regnamespace() resolves the
    # name the way an unquoted identifier in SQL would and yields NULL if it is unknown.
    schema_found = conn.execute(
        text("SELECT to_regnamespace(:schema) IS NOT NULL"),
        {"schema": PG_SCHEMA},
    ).scalar()

if not schema_found:
    raise ValueError(f"Schema {PG_SCHEMA!r} does not exist in database {PG_DB!r}")

print("Setup OK. Schema:", PG_SCHEMA)


('patrickpaubandt', 'nf_da_onl_13102025', 'public')
Setup OK. Schema: s_patrickpaubandt


In [2]:
## DDL Create raw Staging Table

from sqlalchemy import text

# Fully qualified name of the staging table; PG_SCHEMA comes from the setup cell.
staging_table = f"{PG_SCHEMA}.stg_yellow_trips"

# One script: drop and re-create the staging table, then add its three indexes.
ddl = f"""
DROP TABLE IF EXISTS {staging_table};

CREATE TABLE {staging_table} (
  vendorid                 SMALLINT,
  tpep_pickup_datetime     TIMESTAMP,
  tpep_dropoff_datetime    TIMESTAMP,
  passenger_count          SMALLINT,
  trip_distance            DOUBLE PRECISION,
  ratecodeid               SMALLINT,
  store_and_fwd_flag       TEXT,
  pulocationid             INTEGER,
  dolocationid             INTEGER,
  payment_type             SMALLINT,
  fare_amount              NUMERIC(10,2),
  extra                    NUMERIC(10,2),
  mta_tax                  NUMERIC(10,2),
  tip_amount               NUMERIC(10,2),
  tolls_amount             NUMERIC(10,2),
  improvement_surcharge    NUMERIC(10,2),
  total_amount             NUMERIC(10,2),
  congestion_surcharge     NUMERIC(10,2),
  airport_fee              NUMERIC(10,2),
  cbd_congestion_fee       NUMERIC(10,2),

  -- metadata for reproducibility
  source_file              TEXT NOT NULL,
  ingested_at              TIMESTAMP NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS ix_stg_yellow_pickup_dt
  ON {staging_table} (tpep_pickup_datetime);

CREATE INDEX IF NOT EXISTS ix_stg_yellow_pu_do
  ON {staging_table} (pulocationid, dolocationid);

CREATE INDEX IF NOT EXISTS ix_stg_yellow_source_file
  ON {staging_table} (source_file);
"""

ddl_statement = text(ddl)

# engine.begin() runs the whole script in one transaction:
# committed if it succeeds, rolled back if any statement raises.
with engine.begin() as conn:
    conn.execute(ddl_statement)

print("Created stg_yellow_trips + indexes")


Created stg_yellow_trips + indexes


In [3]:
## Quick Check (optional)

from sqlalchemy import text

# Row count of the freshly created staging table; expected to be 0 right after the DDL cell.
row_count_sql = text(f"SELECT COUNT(*) FROM {PG_SCHEMA}.stg_yellow_trips;")

with engine.connect() as conn:
    n = conn.execute(row_count_sql).scalar()

print("Rows in staging (should be 0):", n)


Rows in staging (should be 0): 0
