Test import: Before loading entire months, an import test with 100,000 rows was performed. The aim was to validate column consistency, data types, and the stability of the import process. This allows schema problems and vulnerabilities to be identified early on, before several million rows are loaded.

In [27]:
##Set Up für jedes Notebook

import os
from dotenv import load_dotenv
from urllib.parse import quote_plus
from sqlalchemy import create_engine, text

load_dotenv()

PG_USER = os.getenv("POSTGRES_USER")
PG_PASS = quote_plus(os.getenv("POSTGRES_PASS"))
PG_HOST = os.getenv("POSTGRES_HOST")
PG_PORT = os.getenv("POSTGRES_PORT", "5432")
PG_DB   = os.getenv("POSTGRES_DB")
PG_SCHEMA = os.getenv("POSTGRES_SCHEMA", "public")

missing = [k for k,v in {
    "POSTGRES_USER": PG_USER,
    "POSTGRES_PASS": os.getenv("POSTGRES_PASS"),
    "POSTGRES_HOST": PG_HOST,
    "POSTGRES_DB": PG_DB,
    "POSTGRES_SCHEMA": PG_SCHEMA
}.items() if not v]
if missing:
    raise ValueError(f"Missing env vars: {missing}")

url = f"postgresql+psycopg2://{PG_USER}:{PG_PASS}@{PG_HOST}:{PG_PORT}/{PG_DB}"
engine = create_engine(url, future=True)

# kurzer Ping
with engine.connect() as conn:
    print(conn.execute(text("SELECT current_user, current_database(), current_schema();")).fetchone())

print("Setup OK. Schema:", PG_SCHEMA)


('patrickpaubandt', 'nf_da_onl_13102025', 'public')
Setup OK. Schema: s_patrickpaubandt


In [None]:
## Download + 100k Rows

import io, requests
import pyarrow.parquet as pq        ## to read parquet data
import pyarrow as pa
import pandas as pd

PARQUET_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-09.parquet"     ##Download-link
SOURCE_FILE = "yellow_tripdata_2025-09.parquet"         ##name of the downloaded file

resp = requests.get(PARQUET_URL, timeout=180)       ##downloads data, download-time max 180 seconds
resp.raise_for_status()                             ## When Download not successfull (404/500)

table = pq.read_table(io.BytesIO(resp.content))


table_100k = table.slice(0, 100_000)                ### only take a sample of 100k Rows

print("Rows in file:", table.num_rows)
print("Rows in sample:", table_100k.num_rows)
print("Columns:", table_100k.schema.names)


Rows in file: 4251015
Rows in sample: 100000
Columns: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'Airport_fee', 'cbd_congestion_fee']


In [None]:
## Align columns on staging + add source_file

# Staging columns (excluding ingested_at, which sets the DB default = NOW())
# make sure that the columns-names are equal to the column names in staging-table
target_cols = [
    "vendorid","tpep_pickup_datetime","tpep_dropoff_datetime","passenger_count","trip_distance","ratecodeid",
    "store_and_fwd_flag","pulocationid","dolocationid","payment_type",
    "fare_amount","extra","mta_tax","tip_amount","tolls_amount","improvement_surcharge","total_amount",
    "congestion_surcharge","airport_fee","cbd_congestion_fee",
    "source_file"
]

# If certain fields are missing in the month, add NULL. #
# These columns are missing sometimes we make sure that the columns are still created to ensure a consistent table
for col in ["congestion_surcharge", "airport_fee", "cbd_congestion_fee"]:
    if col not in table_100k.schema.names:
        table_100k = table_100k.append_column(col, pa.nulls(table_100k.num_rows, type=pa.float64()))

# Add source_file: Generates a column with values that indicate which (Parquet) file the data comes from.
table_100k = table_100k.append_column("source_file", pa.array([SOURCE_FILE] * table_100k.num_rows))

# Select only the columns we need
table_100k = table_100k.select([c for c in target_cols if c in table_100k.schema.names])

df = table_100k.to_pandas()

# Ensure that all target_cols exist in the DF: If one is missing, this column is created and displayed with zero values.
for c in target_cols:
    if c not in df.columns:
        df[c] = None

df = df[target_cols]    #Arrange the columns exactly in the order specified by target_cols.

df.head()


Unnamed: 0,vendorid,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecodeid,store_and_fwd_flag,pulocationid,dolocationid,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,cbd_congestion_fee,source_file
0,,2025-09-01 00:19:20,2025-09-01 00:45:17,1,9.92,,N,,,1,...,6.0,0.5,10.73,0.0,1.0,66.13,2.5,,0.75,yellow_tripdata_2025-09.parquet
1,,2025-09-01 00:15:20,2025-09-01 00:26:08,2,6.82,,N,,,1,...,1.0,0.5,5.86,0.0,1.0,35.16,0.0,,0.0,yellow_tripdata_2025-09.parquet
2,,2025-09-01 00:06:07,2025-09-01 00:22:23,1,3.95,,N,,,1,...,1.0,0.5,5.11,0.0,1.0,30.66,2.5,,0.75,yellow_tripdata_2025-09.parquet
3,,2025-09-01 00:49:47,2025-09-01 01:04:49,1,3.14,,N,,,1,...,1.0,0.5,3.52,0.0,1.0,26.97,2.5,,0.75,yellow_tripdata_2025-09.parquet
4,,2025-09-01 00:05:00,2025-09-01 00:15:32,6,2.81,,N,,,1,...,1.0,0.5,4.13,0.0,1.0,24.78,2.5,,0.75,yellow_tripdata_2025-09.parquet


In [29]:
## Spaltennamen normalisieren & korrekt mappen

# 1) Arrow -> pandas (falls df noch nicht erstellt ist)
# df = table_100k.to_pandas()

# 2) Spaltennamen aus Parquet normalisieren
df = table_100k.to_pandas()
df.columns = [c.strip().lower() for c in df.columns]

# 3) TLC-Varianten auf unsere DB-Spalten mappen
df = df.rename(columns={
    "vendorid": "vendorid",          # ok nach lower()
    "ratecodeid": "ratecodeid",      # ok nach lower()
    "pulocationid": "pulocationid",  # ok nach lower()
    "dolocationid": "dolocationid",  # ok nach lower()
    "airport_fee": "airport_fee"     # ok nach lower() aus Airport_fee
})

# 4) source_file ergänzen
SOURCE_FILE = "yellow_tripdata_2025-09.parquet"
df["source_file"] = SOURCE_FILE

# 5) Auf exakt die Staging-Spalten bringen (ohne ingested_at)
target_cols = [
    "vendorid","tpep_pickup_datetime","tpep_dropoff_datetime","passenger_count","trip_distance","ratecodeid",
    "store_and_fwd_flag","pulocationid","dolocationid","payment_type",
    "fare_amount","extra","mta_tax","tip_amount","tolls_amount","improvement_surcharge","total_amount",
    "congestion_surcharge","airport_fee","cbd_congestion_fee",
    "source_file"
]

# Fehlende Spalten (falls TLC mal wieder variiert) ergänzen
for c in target_cols:
    if c not in df.columns:
        df[c] = None

df = df[target_cols]

# 6) Quick sanity: sollten jetzt nicht mehr None sein
print(df[["vendorid","ratecodeid","pulocationid","dolocationid","airport_fee"]].isna().mean())
df.head()


vendorid        0.0
ratecodeid      0.0
pulocationid    0.0
dolocationid    0.0
airport_fee     0.0
dtype: float64


Unnamed: 0,vendorid,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecodeid,store_and_fwd_flag,pulocationid,dolocationid,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,cbd_congestion_fee,source_file
0,2,2025-09-01 00:19:20,2025-09-01 00:45:17,1,9.92,1,N,138,114,1,...,6.0,0.5,10.73,0.0,1.0,66.13,2.5,1.75,0.75,yellow_tripdata_2025-09.parquet
1,2,2025-09-01 00:15:20,2025-09-01 00:26:08,2,6.82,1,N,93,157,1,...,1.0,0.5,5.86,0.0,1.0,35.16,0.0,0.0,0.0,yellow_tripdata_2025-09.parquet
2,2,2025-09-01 00:06:07,2025-09-01 00:22:23,1,3.95,1,N,68,13,1,...,1.0,0.5,5.11,0.0,1.0,30.66,2.5,0.0,0.75,yellow_tripdata_2025-09.parquet
3,2,2025-09-01 00:49:47,2025-09-01 01:04:49,1,3.14,1,N,234,87,1,...,1.0,0.5,3.52,0.0,1.0,26.97,2.5,0.0,0.75,yellow_tripdata_2025-09.parquet
4,2,2025-09-01 00:05:00,2025-09-01 00:15:32,6,2.81,1,N,230,151,1,...,1.0,0.5,4.13,0.0,1.0,24.78,2.5,0.0,0.75,yellow_tripdata_2025-09.parquet


In [38]:


df.to_sql(
    name="stg_yellow_trips",
    con=engine,
    schema=PG_SCHEMA,
    if_exists="append",
    index=False,
    method="multi",
    chunksize=10_000
)

print("Inserted rows:", len(df))


Inserted rows: 100000


In [35]:
from sqlalchemy import text

with engine.connect() as conn:
    total = conn.execute(text(f"SELECT COUNT(*) FROM {PG_SCHEMA}.stg_yellow_trips;")).scalar()
    by_file = conn.execute(text(f"""
        SELECT source_file, COUNT(*) 
        FROM {PG_SCHEMA}.stg_yellow_trips
        GROUP BY 1
        ORDER BY 1;
    """)).fetchall()

print("Total rows in staging:", total)
print("By file:", by_file)


Total rows in staging: 100000
By file: [('yellow_tripdata_2025-09.parquet', 100000)]


In [36]:
from sqlalchemy import text

with engine.begin() as conn:
    conn.execute(text(f"TRUNCATE TABLE {PG_SCHEMA}.stg_yellow_trips;"))

print("Staging truncated.")


Staging truncated.
