First, the taxi zone data is extracted, since this table is far less extensive than the monthly data on taxi journeys.

For geographic analyses (pickup/dropoff, boroughs, airports), the official taxi zone lookup table from the NYC TLC is loaded and stored as the dimension table `dim_taxi_zone`.

This separation (dimension vs. fact table Trips) simplifies subsequent joins, reduces redundancy and enables consistent geographical analyses.

In [None]:
## Set up engine and schema

import os, io, requests
import pandas as pd
from dotenv import load_dotenv
from urllib.parse import quote_plus
from sqlalchemy import create_engine, text

# Load variables from a .env file (if one is present) into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is not set. Failing here gives a clear
            message instead of a TypeError from quote_plus(None) or a URL
            that silently contains the literal text "None".
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value


PG_USER = _require_env("POSTGRES_USER")
# Percent-encode the password so characters such as '@', ':' or '/' cannot break the URL.
PG_PASS = quote_plus(_require_env("POSTGRES_PASS"))
PG_HOST = _require_env("POSTGRES_HOST")
PG_PORT = os.getenv("POSTGRES_PORT", "5432")
PG_DB   = _require_env("POSTGRES_DB")
PG_SCHEMA = os.getenv("POSTGRES_SCHEMA", "public")

url = f"postgresql+psycopg2://{PG_USER}:{PG_PASS}@{PG_HOST}:{PG_PORT}/{PG_DB}"  # SQLAlchemy DB URL
engine = create_engine(url, future=True)

# create_engine() does not open a connection by itself; run a trivial query so
# that the "Connected." message below is only printed once the settings work.
with engine.connect() as conn:
    conn.execute(text("SELECT 1"))

print("Connected. Target schema:", PG_SCHEMA)

Connected. Target schema: s_patrickpaubandt


In [None]:
## Load the taxi zone lookup CSV and write it to the DB as dim_taxi_zone

zone_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"

# Normalise the headers in one pass: trim surrounding whitespace, then lower-case.
zones_raw = pd.read_csv(zone_url).rename(columns=lambda col: col.strip().lower())

zones = (
    zones_raw[["locationid", "borough", "zone", "service_zone"]]
      .drop_duplicates()
)

# if_exists="replace" drops the existing table, so validate the download before
# writing: the dimension must be non-empty and keyed uniquely by locationid,
# otherwise later joins against it would lose or duplicate rows.
if zones.empty:
    raise ValueError("Taxi zone lookup is empty; refusing to replace dim_taxi_zone")
if not zones["locationid"].is_unique:
    raise ValueError("Duplicate locationid values in taxi zone lookup; expected one row per zone")

zones.to_sql("dim_taxi_zone", engine, schema=PG_SCHEMA, if_exists="replace", index=False)

print("Loaded rows:", len(zones))
zones.head()



Loaded rows: 265


Unnamed: 0,locationid,borough,zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [4]:
## DB check: read dim_taxi_zone back from the database (row count + first rows)

# A schema name cannot be sent as a bound parameter, so quote it through the
# dialect's identifier preparer rather than pasting the raw environment value
# into the SQL text. This applies the same quoting rules SQLAlchemy uses when
# it emits statements itself (e.g. for mixed-case or reserved names).
qualified_table = f"{engine.dialect.identifier_preparer.quote_schema(PG_SCHEMA)}.dim_taxi_zone"

with engine.connect() as conn:
    n = conn.execute(text(f"SELECT COUNT(*) FROM {qualified_table};")).scalar()
    sample = conn.execute(
        text(f"SELECT * FROM {qualified_table} ORDER BY locationid LIMIT 5;")
    ).fetchall()

print("Rows in DB:", n)
print("Sample:", sample)


Rows in DB: 265
Sample: [(1, 'EWR', 'Newark Airport', 'EWR'), (2, 'Queens', 'Jamaica Bay', 'Boro Zone'), (3, 'Bronx', 'Allerton/Pelham Gardens', 'Boro Zone'), (4, 'Manhattan', 'Alphabet City', 'Yellow Zone'), (5, 'Staten Island', 'Arden Heights', 'Boro Zone')]
