Staging table for trip data: Before importing the TLC trip data, a staging table with explicit data types was created. This prevents automatic type inference (e.g., by pandas' DataFrame.to_sql) from producing incorrect column types (e.g., TEXT instead of TIMESTAMP, or imprecise numeric types). The staging table stores the raw data as close to the original as possible and adds metadata columns (source_file, ingested_at) for reproducibility and debugging. Indexes on the pickup timestamp and on the pickup/dropoff zone IDs support the subsequent time-based and geographic analyses.

Why we are doing this now (and not importing immediately)

We want to control which data types end up in the DB (dates, monetary amounts, IDs).

pandas' DataFrame.to_sql() would otherwise “guess” the column types from the DataFrame's dtypes → with millions of rows this often causes problems (e.g., TEXT instead of TIMESTAMP).

Staging is “raw-ish”; we store the data close to the original so that cleaning remains traceable later.

In [3]:
import os
from dotenv import load_dotenv
from urllib.parse import quote_plus
from sqlalchemy import create_engine, text

# Load connection settings from a local .env file into the process environment.
load_dotenv()

# Fail fast with a readable message when a required setting is absent.
# Otherwise quote_plus(None) raises an opaque TypeError, and a missing
# user/host/database would be formatted into the URL as the literal "None".
_required_vars = ("POSTGRES_USER", "POSTGRES_PASS", "POSTGRES_HOST", "POSTGRES_DB")
_missing_vars = [name for name in _required_vars if os.getenv(name) is None]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_vars)
    )

PG_USER = os.getenv("POSTGRES_USER")
# URL-encode the password so characters such as '@', ':' or '/' cannot
# break the connection URL.
PG_PASS = quote_plus(os.getenv("POSTGRES_PASS"))
PG_HOST = os.getenv("POSTGRES_HOST")
PG_PORT = os.getenv("POSTGRES_PORT", "5432")
PG_DB   = os.getenv("POSTGRES_DB")
PG_SCHEMA = os.getenv("POSTGRES_SCHEMA", "public")

# The user name is encoded for the same reason as the password; PG_USER itself
# stays unencoded so it can be reused as a plain value elsewhere.
url = f"postgresql+psycopg2://{quote_plus(PG_USER)}:{PG_PASS}@{PG_HOST}:{PG_PORT}/{PG_DB}"
engine = create_engine(url, future=True)

# Only the schema name is printed; the URL contains the password.
print("Setup OK:", PG_SCHEMA)


Setup OK: s_patrickpaubandt


In [None]:
from sqlalchemy import text

# A triple-quoted f-string spans several lines and substitutes placeholders
# such as {PG_SCHEMA} when the string is built.
#
# The script drops and recreates the staging table, so re-running this cell
# discards any rows that were already loaded.
#
# Why indexes? For frequent filters and joins the database can locate the
# matching rows through the index instead of scanning every row of the table.
ddl = f"""                
DROP TABLE IF EXISTS {PG_SCHEMA}.stg_yellow_trips;

CREATE TABLE {PG_SCHEMA}.stg_yellow_trips (
  vendorid                 SMALLINT,
  tpep_pickup_datetime     TIMESTAMP,
  tpep_dropoff_datetime    TIMESTAMP,
  passenger_count          SMALLINT,
  trip_distance            DOUBLE PRECISION,
  ratecodeid               SMALLINT,
  store_and_fwd_flag       TEXT,
  pulocationid             INTEGER,
  dolocationid             INTEGER,
  payment_type             SMALLINT,
  fare_amount              NUMERIC(10,2),
  extra                    NUMERIC(10,2),
  mta_tax                  NUMERIC(10,2),
  tip_amount               NUMERIC(10,2),
  tolls_amount             NUMERIC(10,2),
  improvement_surcharge    NUMERIC(10,2),
  total_amount             NUMERIC(10,2),
  congestion_surcharge     NUMERIC(10,2),
  airport_fee              NUMERIC(10,2),
  cbd_congestion_fee       NUMERIC(10,2),

  -- Metadaten
  source_file              TEXT NOT NULL,
  ingested_at              TIMESTAMP NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS ix_stg_yellow_pickup_dt
  ON {PG_SCHEMA}.stg_yellow_trips (tpep_pickup_datetime);

CREATE INDEX IF NOT EXISTS ix_stg_yellow_pu_do
  ON {PG_SCHEMA}.stg_yellow_trips (pulocationid, dolocationid);
"""

# Run the whole script inside a single transaction block.
ddl_statement = text(ddl)
with engine.begin() as connection:
    connection.execute(ddl_statement)

print("Created table stg_yellow_trips + indexes")


In [None]:
# Print the rendered DDL so the substituted schema name can be inspected.
print(ddl)

In [15]:
# Read the column names and types back from the catalog to check that the
# staging table was created with the intended explicit types.
#
# The SQL is a plain (non-f) string: every value is supplied as a bind
# parameter, so nothing is interpolated into the statement on the Python side.
columns_query = text("""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_schema = :schema AND table_name = :table
    ORDER BY ordinal_position;
""")

with engine.connect() as conn:
    cols = conn.execute(
        columns_query,
        {"schema": PG_SCHEMA, "table": "stg_yellow_trips"},
    ).fetchall()

# Notebook display: the first eight (column_name, data_type) rows and the
# total number of columns.
cols[:8], len(cols)


([('vendorid', 'smallint'),
  ('tpep_pickup_datetime', 'timestamp without time zone'),
  ('tpep_dropoff_datetime', 'timestamp without time zone'),
  ('passenger_count', 'smallint'),
  ('trip_distance', 'double precision'),
  ('ratecodeid', 'smallint'),
  ('store_and_fwd_flag', 'text'),
  ('pulocationid', 'integer')],
 22)