Idempotenter Monats-Load: Vor dem Import eines Monats werden vorhandene Zeilen dieser Quelldatei über DELETE WHERE source_file = ... entfernt. Dadurch kann der Load jederzeit wiederholt oder erweitert werden, ohne Duplikate zu erzeugen.

In [3]:
## DB SetUp and SetUp Check
## For additional help, check the .env.example file.

import os
from dotenv import load_dotenv
from urllib.parse import quote_plus
from sqlalchemy import create_engine, text

# Load connection settings (see .env.example) into the process environment.
load_dotenv()

PG_USER = os.getenv("POSTGRES_USER")
PG_PASS_RAW = os.getenv("POSTGRES_PASS")
PG_HOST = os.getenv("POSTGRES_HOST")
PG_PORT = os.getenv("POSTGRES_PORT", "5432")
PG_DB   = os.getenv("POSTGRES_DB")
PG_SCHEMA = os.getenv("POSTGRES_SCHEMA", "public")

# Fail fast with the names of all unset/empty settings instead of a cryptic
# connection error later on.
missing = [k for k,v in {
    "POSTGRES_USER": PG_USER,
    "POSTGRES_PASS": PG_PASS_RAW,
    "POSTGRES_HOST": PG_HOST,
    "POSTGRES_DB": PG_DB,
    "POSTGRES_SCHEMA": PG_SCHEMA
}.items() if not v]
if missing:
    raise ValueError(f"Missing env vars: {missing}")

# Percent-encode BOTH credentials before embedding them in the URL: characters
# such as "@", ":" or "/" in the user name or password would otherwise be read
# as URL delimiters. quote(..., safe="") emits only %XX escapes (a space becomes
# %20, never "+"), so the value decodes correctly regardless of whether the URL
# parser treats "+" as a space.
from urllib.parse import quote

PG_USER_ENC = quote(PG_USER, safe="")
PG_PASS = quote(PG_PASS_RAW, safe="")
url = f"postgresql+psycopg2://{PG_USER_ENC}:{PG_PASS}@{PG_HOST}:{PG_PORT}/{PG_DB}"
engine = create_engine(url, future=True)

# Connectivity check: prints the session's user, database and *default* schema.
# NOTE: current_schema() is the connection's search-path schema and may differ
# from PG_SCHEMA, which is the schema the later cells write to explicitly.
with engine.connect() as conn:
    print(conn.execute(text("SELECT current_user, current_database(), current_schema();")).fetchone())

print("Setup OK. Schema:", PG_SCHEMA)

('patrickpaubandt', 'nf_da_onl_13102025', 'public')
Setup OK. Schema: s_patrickpaubandt


In [2]:
## Configure the monthly download link + idempotent delete

# Base URL that file_url() prefixes to each monthly parquet file name.
BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data"
# Months to process, written as "YYYY-MM". Can be extended later, e.g.
# ["2025-09", "2025-10", ...]. NOTE: the code below only uses MONTHS[0].
MONTHS = ["2025-09"]

def file_name(month: str) -> str:
    """Return the parquet file name for ``month`` (e.g. ``"2025-09"``)."""
    return "yellow_tripdata_{}.parquet".format(month)

def file_url(month: str) -> str:
    """Return the download URL of the parquet file for ``month``."""
    parquet_name = file_name(month)
    return "/".join((BASE_URL, parquet_name))

# Idempotent: remove rows that an earlier run already loaded from this month's file
month = MONTHS[0]
sf = file_name(month)

# The schema name is interpolated as an identifier; the file name is a bound parameter.
delete_stmt = text(f"DELETE FROM {PG_SCHEMA}.stg_yellow_trips WHERE source_file = :sf")

# engine.begin() commits on success and rolls back if the statement raises.
with engine.begin() as tx:
    result = tx.execute(delete_stmt, {"sf": sf})
    deleted = result.rowcount

print(f"Deleted existing rows for {sf}: {deleted}")
print("Will load from:", file_url(month))


Deleted existing rows for yellow_tripdata_2025-09.parquet: 0
Will load from: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-09.parquet


In [None]:
## Download + read parquet files

import os
import requests
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from sqlalchemy import text

# Use the same month as the delete step so the rows removed and the rows
# loaded always belong to the same source file.
month = MONTHS[0]
sf = file_name(month)
url = file_url(month)

# Cache the file locally (machine-specific absolute path; adjust per environment)
DATA_DIR = r"C:\Users\patri\Documents\Data Analytics - neuefische\Capstone-project-NYCTaxi\data"
local_path = os.path.join(DATA_DIR, sf)

if not os.path.exists(local_path):
    # Create the cache folder on first use instead of failing inside open().
    os.makedirs(DATA_DIR, exist_ok=True)

    # Stream into a temporary ".part" file and rename it only after the download
    # finished: an interrupted download must not leave a truncated file at
    # local_path, because the exists() check above would accept it on the next run.
    tmp_path = local_path + ".part"
    print("Downloading to:", local_path)
    with requests.get(url, stream=True, timeout=300) as r:
        r.raise_for_status()
        with open(tmp_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):  # 1MB
                if chunk:
                    f.write(chunk)
    os.replace(tmp_path, local_path)
else:
    print("File already exists:", local_path)

# Open the parquet file; the rows themselves are read in batches further below
pf = pq.ParquetFile(local_path)
print("Rows in file:", pf.metadata.num_rows)
print("Row groups:", pf.num_row_groups)

# Columns written to the staging table, in insert order.
TARGET_COLS = [
    "vendorid","tpep_pickup_datetime","tpep_dropoff_datetime","passenger_count","trip_distance","ratecodeid",
    "store_and_fwd_flag","pulocationid","dolocationid","payment_type",
    "fare_amount","extra","mta_tax","tip_amount","tolls_amount","improvement_surcharge","total_amount",
    "congestion_surcharge","airport_fee","cbd_congestion_fee",
    "source_file"
]

def normalize_df(df: pd.DataFrame, source_file: str) -> pd.DataFrame:
    """Return a copy of ``df`` shaped like the staging table.

    Column names are stripped and lower-cased, a ``source_file`` column holding
    ``source_file`` is added, every column of ``TARGET_COLS`` missing from the
    input is added filled with ``None``, and the result is restricted to
    ``TARGET_COLS`` in that order (other columns are dropped).

    The input frame is left untouched: neither its column labels nor its set of
    columns change.

    Args:
        df: Raw batch as read from the parquet file; column labels must be strings.
        source_file: File name stored in the ``source_file`` column of every row.

    Returns:
        A new DataFrame with exactly the columns in ``TARGET_COLS``.
    """
    # rename() returns a new frame, so the caller's DataFrame is not mutated.
    out = df.rename(columns=lambda c: c.strip().lower())

    # Tag every row with the file it came from (used for the idempotent delete).
    out["source_file"] = source_file

    # Make sure every target column exists so the final selection cannot fail.
    for col in TARGET_COLS:
        if col not in out.columns:
            out[col] = None

    return out[TARGET_COLS]

BATCH_SIZE = 200_000
inserted_total = 0

# Run the delete and ALL batch inserts in one transaction:
#  - if a batch fails (or the guard-rail below raises), everything is rolled
#    back, so the table never holds a partially loaded month;
#  - re-running this cell on its own cannot create duplicates, because rows of
#    this source file are removed in the same transaction that re-inserts them.
with engine.begin() as conn:
    deleted_rows = conn.execute(
        text(f"DELETE FROM {PG_SCHEMA}.stg_yellow_trips WHERE source_file = :sf"),
        {"sf": sf},
    ).rowcount
    print(f"Deleted existing rows for {sf}: {deleted_rows}")

    for i, batch in enumerate(pf.iter_batches(batch_size=BATCH_SIZE)):
        df = batch.to_pandas()
        df = normalize_df(df, sf)

        # Guard-rails: stop if location IDs are basically all NULL in this batch
        # (more than 90% NULL in either column).
        null_rates = df[["pulocationid", "dolocationid"]].isna().mean()
        if (null_rates > 0.9).any():
            raise ValueError(f"STOP: Mapping failed in batch {i}. Null rates: {null_rates.to_dict()}")

        # Insert through the open connection so the rows join the surrounding
        # transaction instead of being committed batch by batch.
        df.to_sql(
            name="stg_yellow_trips",
            con=conn,
            schema=PG_SCHEMA,
            if_exists="append",
            index=False,
            method="multi",
            chunksize=10_000
        )

        inserted_total += len(df)

        # Progress print every batch
        print(f"Batch {i+1}: inserted {len(df):,} rows | total inserted: {inserted_total:,}")

# Reaching this line means the transaction above was committed.
print("DONE. Total inserted:", inserted_total)


Downloading to: yellow_tripdata_2025-09.parquet
Rows in file: 4251015
Row groups: 5
Batch 1: inserted 200,000 rows | total inserted: 200,000
Batch 2: inserted 200,000 rows | total inserted: 400,000
Batch 3: inserted 200,000 rows | total inserted: 600,000
Batch 4: inserted 200,000 rows | total inserted: 800,000
Batch 5: inserted 200,000 rows | total inserted: 1,000,000
Batch 6: inserted 200,000 rows | total inserted: 1,200,000
Batch 7: inserted 200,000 rows | total inserted: 1,400,000
Batch 8: inserted 200,000 rows | total inserted: 1,600,000
Batch 9: inserted 200,000 rows | total inserted: 1,800,000
Batch 10: inserted 200,000 rows | total inserted: 2,000,000
Batch 11: inserted 200,000 rows | total inserted: 2,200,000
Batch 12: inserted 200,000 rows | total inserted: 2,400,000
Batch 13: inserted 200,000 rows | total inserted: 2,600,000
Batch 14: inserted 200,000 rows | total inserted: 2,800,000
Batch 15: inserted 200,000 rows | total inserted: 3,000,000
Batch 16: inserted 200,000 rows |

In [4]:
## Validation of the Import

from sqlalchemy import text

sf = "yellow_tripdata_2025-09.parquet"

# Every query below is restricted to the rows of this one source file.
bind = {"sf": sf}

with engine.connect() as db:
    # 1) Number of rows loaded from the file
    row_count_sql = text(f"""
        SELECT COUNT(*) 
        FROM {PG_SCHEMA}.stg_yellow_trips
        WHERE source_file = :sf
    """)
    n = db.execute(row_count_sql, bind).scalar()

    # 2) Earliest/latest pickup and dropoff timestamps
    date_cov_sql = text(f"""
        SELECT
          MIN(tpep_pickup_datetime) AS min_pickup,
          MAX(tpep_pickup_datetime) AS max_pickup,
          MIN(tpep_dropoff_datetime) AS min_dropoff,
          MAX(tpep_dropoff_datetime) AS max_dropoff
        FROM {PG_SCHEMA}.stg_yellow_trips
        WHERE source_file = :sf
    """)
    date_cov = db.execute(date_cov_sql, bind).fetchone()

    # 3) NULL counts in the critical columns (should be low for location IDs)
    nulls_sql = text(f"""
        SELECT
          SUM(CASE WHEN pulocationid IS NULL THEN 1 ELSE 0 END) AS pu_nulls,
          SUM(CASE WHEN dolocationid IS NULL THEN 1 ELSE 0 END) AS do_nulls,
          SUM(CASE WHEN tpep_pickup_datetime IS NULL THEN 1 ELSE 0 END) AS pickup_nulls,
          SUM(CASE WHEN tpep_dropoff_datetime IS NULL THEN 1 ELSE 0 END) AS dropoff_nulls
        FROM {PG_SCHEMA}.stg_yellow_trips
        WHERE source_file = :sf
    """)
    nulls = db.execute(nulls_sql, bind).fetchone()

    # 4) Row counts per payment type, most frequent first
    payment_sql = text(f"""
        SELECT payment_type, COUNT(*) AS n
        FROM {PG_SCHEMA}.stg_yellow_trips
        WHERE source_file = :sf
        GROUP BY 1
        ORDER BY n DESC
    """)
    payment = db.execute(payment_sql, bind).fetchall()

# Report the four checks in the same order they were run.
for label, value in (
    ("Rows loaded for file:", n),
    ("Date coverage:", date_cov),
    ("Critical nulls:", nulls),
    ("Payment counts:", payment),
):
    print(label, value)


Rows loaded for file: 4251015
Date coverage: (datetime.datetime(2025, 8, 31, 23, 45, 38), datetime.datetime(2025, 10, 1, 0, 0, 11), datetime.datetime(2025, 8, 31, 23, 46, 43), datetime.datetime(2025, 10, 3, 14, 49, 31))
Critical nulls: (0, 0, 0, 0)
Payment counts: [(1, 2676305), (0, 1067195), (2, 372069), (4, 107514), (3, 27932)]
