### Database Setup

In [10]:
## DB SetUp and SetUp Check
## For additional help, check the .env.example file.

import os
from dotenv import load_dotenv
from urllib.parse import quote_plus
from sqlalchemy import create_engine, text

# Pull connection settings from a local .env file into the process environment.
load_dotenv()

PG_USER = os.getenv("POSTGRES_USER")
PG_PASS_RAW = os.getenv("POSTGRES_PASS")
PG_HOST = os.getenv("POSTGRES_HOST")
# `or` (instead of a getenv default) also covers a variable that is present
# but blank, e.g. "POSTGRES_PORT=" copied unchanged from a template.
PG_PORT = os.getenv("POSTGRES_PORT") or "5432"
PG_DB   = os.getenv("POSTGRES_DB")
PG_SCHEMA = os.getenv("POSTGRES_SCHEMA", "public")

# Fail early with a single message that lists every unset or blank setting.
required = {
    "POSTGRES_USER": PG_USER,
    "POSTGRES_PASS": PG_PASS_RAW,
    "POSTGRES_HOST": PG_HOST,
    "POSTGRES_DB": PG_DB,
    "POSTGRES_SCHEMA": PG_SCHEMA,
}
missing = [name for name, value in required.items() if not value]
if missing:
    raise ValueError(f"Missing env vars: {missing}")

# Both credentials are percent-encoded: an unescaped '@', ':' or '/' in either
# one would be read as a URL delimiter when SQLAlchemy parses the string.
PG_PASS = quote_plus(PG_PASS_RAW)
url = f"postgresql+psycopg2://{quote_plus(PG_USER)}:{PG_PASS}@{PG_HOST}:{PG_PORT}/{PG_DB}"
engine = create_engine(url, future=True)

with engine.connect() as conn:
    # Connectivity check: who we are, which database, and the default schema.
    print(conn.execute(text("SELECT current_user, current_database(), current_schema();")).fetchone())

    # current_schema() reports the search-path default, not PG_SCHEMA, so the
    # configured schema is verified separately. The view cells interpolate
    # PG_SCHEMA unquoted, which Postgres folds to lower case; compare likewise.
    schema_exists = conn.execute(
        text("SELECT EXISTS (SELECT 1 FROM pg_catalog.pg_namespace WHERE nspname = lower(:schema))"),
        {"schema": PG_SCHEMA},
    ).scalar()

if not schema_exists:
    raise ValueError(
        f"Schema {PG_SCHEMA!r} not found in database {PG_DB!r}; check POSTGRES_SCHEMA"
    )

print("Setup OK. Schema:", PG_SCHEMA)

('patrickpaubandt', 'nf_da_onl_13102025', 'public')
Setup OK. Schema: s_patrickpaubandt


### Clean View / Rules
This cell creates (or replaces) the Postgres view `vw_yellow_clean_tip` from `stg_yellow_trips` and enriches it with pickup/drop-off zone info by joining `dim_taxi_zone` twice (once for PU, once for DO). It adds derived fields for time, duration, pre-tip totals, and tipping KPIs (e.g., `pretip_total`, `is_tipped`, `tip_rate_fare`, `tip_rate_pretip`). It also filters out invalid trips using plausibility rules (non-null timestamps with drop-off after pickup, a duration of at most 6 hours, positive fare and distance, non-negative tips, positive pre-tip total) and enforces surcharge checks (`improvement_surcharge = 1.00`, `mta_tax` in 0.00/0.50).

In [11]:
from sqlalchemy import text

# DDL for the cleaned trip view. The target schema comes from PG_SCHEMA and is
# substituted by the f-string when this cell runs.
#
# Reading guide for the statement below:
#   * dim_taxi_zone is LEFT JOINed twice (pickup and drop-off), so a trip whose
#     location id has no match is kept with NULL zone fields.
#   * pickup_dow uses Postgres EXTRACT(DOW ...): 0 = Sunday ... 6 = Saturday.
#   * The WHERE clause already requires fare_amount > 0 and a positive pre-tip
#     total, so the CASE guards on the two tip rates never produce NULL for
#     rows that make it into the view.
#   * NOTE(review): all payment types are kept. The TLC data dictionary says
#     tip_amount covers card tips only -- confirm that is_tipped and the tip
#     rates are meant to include cash trips.
CLEAN_SQL = f"""
CREATE OR REPLACE VIEW {PG_SCHEMA}.vw_yellow_clean_tip AS
SELECT
    t.vendorid,
    t.tpep_pickup_datetime,
    t.tpep_dropoff_datetime,
    t.passenger_count,
    t.trip_distance,
    t.ratecodeid,
    t.pulocationid,
    t.dolocationid,
    t.payment_type,
    t.fare_amount,
    t.extra,
    t.mta_tax,
    t.tip_amount,
    t.tolls_amount,
    t.improvement_surcharge,
    t.total_amount,
    t.congestion_surcharge,
    t.airport_fee,
    t.cbd_congestion_fee,
    t.source_file,
    t.ingested_at,

    -- time features
    EXTRACT(HOUR FROM t.tpep_pickup_datetime)::int AS pickup_hour,
    EXTRACT(DOW  FROM t.tpep_pickup_datetime)::int AS pickup_dow,
    DATE_TRUNC('day', t.tpep_pickup_datetime)::date AS pickup_date,
    DATE_TRUNC('month', t.tpep_pickup_datetime)::date AS pickup_month,

    -- duration (min)
    EXTRACT(EPOCH FROM (t.tpep_dropoff_datetime - t.tpep_pickup_datetime)) / 60.0 AS duration_min,

    -- totals
    (t.total_amount - t.tip_amount) AS pretip_total,
    ((t.total_amount - t.tip_amount) - t.fare_amount) AS extras_pretip,

    -- tip KPIs
    (t.tip_amount > 0)::int AS is_tipped,
    CASE WHEN t.fare_amount > 0 THEN (t.tip_amount / t.fare_amount) END AS tip_rate_fare,
    CASE WHEN (t.total_amount - t.tip_amount) > 0 THEN (t.tip_amount / (t.total_amount - t.tip_amount)) END AS tip_rate_pretip,

    -- geo joins
    pu.borough AS pu_borough,
    pu.zone    AS pu_zone,
    pu.service_zone AS pu_service_zone,
    do_.borough AS do_borough,
    do_.zone    AS do_zone,
    do_.service_zone AS do_service_zone

FROM {PG_SCHEMA}.stg_yellow_trips t
LEFT JOIN {PG_SCHEMA}.dim_taxi_zone pu ON t.pulocationid = pu.locationid
LEFT JOIN {PG_SCHEMA}.dim_taxi_zone do_ ON t.dolocationid = do_.locationid
WHERE
    -- plausibility rules
    t.tpep_pickup_datetime IS NOT NULL
    AND t.tpep_dropoff_datetime IS NOT NULL
    AND t.tpep_dropoff_datetime > t.tpep_pickup_datetime
    AND (t.tpep_dropoff_datetime - t.tpep_pickup_datetime) <= INTERVAL '6 hours'
    AND t.fare_amount > 0
    AND t.trip_distance > 0
    AND t.tip_amount >= 0
    AND (t.total_amount - t.tip_amount) > 0

    -- surcharge sanity (as agreed)
    AND t.improvement_surcharge = 1.00
    AND t.mta_tax IN (0.00, 0.50);
"""

# Run the DDL inside a transaction block; engine.begin() commits when the
# block exits without an exception.
clean_view_ddl = text(CLEAN_SQL)
with engine.begin() as ddl_conn:
    ddl_conn.execute(clean_view_ddl)

print("Created/updated vw_yellow_clean_tip")


Created/updated vw_yellow_clean_tip


### Tableau View
For Tableau, a dedicated view is provided that includes all required raw and feature fields (time features, geo labels, amount columns including fees, pretip_total, extras_pretip, and two tip rates: one based on fare and one based on the pre-tip total amount).

This keeps KPI definitions centrally consistent in SQL, while Tableau can focus on visualization and interactivity.

In [12]:
from sqlalchemy import text

# DDL for the Tableau-facing view: a projection of vw_yellow_clean_tip that
# adds a weekday label, percentage versions of both tip rates, and an airport
# flag. The schema name is substituted from PG_SCHEMA by the f-string.
#
# Reading guide for the statement below:
#   * pickup_dow_name maps 0..6 to 'Sun'..'Sat'.
#   * is_airport_trip is 1 when the pickup or drop-off zone name contains
#     'JFK' or 'LaGuardia' (case-insensitive), else 0.
#     NOTE(review): zones of any other airport (e.g. Newark) are not flagged --
#     confirm this is intended.
#   * NOTE(review): the section comment reads "payment + geo", yet no payment
#     column (such as payment_type) is selected. If one is added later, it
#     presumably has to go at the end of the column list for
#     CREATE OR REPLACE VIEW to accept it -- verify against the Postgres docs.
TABLEAU_SQL = f"""
CREATE OR REPLACE VIEW {PG_SCHEMA}.vw_tableau_tip AS
SELECT
    pickup_date,
    pickup_month,
    pickup_hour,
    pickup_dow,
    CASE pickup_dow
      WHEN 0 THEN 'Sun'
      WHEN 1 THEN 'Mon'
      WHEN 2 THEN 'Tue'
      WHEN 3 THEN 'Wed'
      WHEN 4 THEN 'Thu'
      WHEN 5 THEN 'Fri'
      WHEN 6 THEN 'Sat'
    END AS pickup_dow_name,
    

    -- core trip measures
    trip_distance,
    duration_min,

    -- amounts
    fare_amount,
    extra,
    mta_tax,
    improvement_surcharge,
    congestion_surcharge,
    airport_fee,
    cbd_congestion_fee,
    tolls_amount,
    pretip_total,
    extras_pretip,
    total_amount,

    -- tip measures
    tip_amount,
    tip_rate_fare,
    (tip_rate_fare * 100.0) AS tip_rate_fare_pct,
    tip_rate_pretip,
    (tip_rate_pretip * 100.0) AS tip_rate_pretip_pct,
    is_tipped,

    -- payment + geo
    pulocationid,
    pu_borough,
    pu_zone,
    dolocationid,
    do_borough,
    do_zone,

    CASE
      WHEN pu_zone ILIKE '%JFK%' OR do_zone ILIKE '%JFK%' THEN 1
      WHEN pu_zone ILIKE '%LaGuardia%' OR do_zone ILIKE '%LaGuardia%' THEN 1
      ELSE 0
    END AS is_airport_trip,

    source_file
FROM {PG_SCHEMA}.vw_yellow_clean_tip;
"""

# Run the DDL inside a transaction block; engine.begin() commits when the
# block exits without an exception.
tableau_view_ddl = text(TABLEAU_SQL)
with engine.begin() as ddl_conn:
    ddl_conn.execute(tableau_view_ddl)

print("Created/updated vw_tableau_tip")


Created/updated vw_tableau_tip


### Quick check (row count and pickup-date coverage of the clean view)

In [13]:
from sqlalchemy import text

# Sanity check on the clean view: how many rows it returns and the earliest /
# latest pickup timestamp it contains. Both statements are prepared first so
# the connection block only executes them.
row_count_stmt = text(f"SELECT COUNT(*) FROM {PG_SCHEMA}.vw_yellow_clean_tip;")
coverage_stmt = text(f"""
        SELECT MIN(tpep_pickup_datetime), MAX(tpep_pickup_datetime)
        FROM {PG_SCHEMA}.vw_yellow_clean_tip;
    """)

with engine.connect() as check_conn:
    # scalar() -> the single COUNT(*) value; fetchone() -> the (min, max) row.
    n_clean = check_conn.execute(row_count_stmt).scalar()
    cov = check_conn.execute(coverage_stmt).fetchone()

print("Rows in clean view:", n_clean)
print("Pickup coverage (clean):", cov)


Rows in clean view: 6795596
Pickup coverage (clean): (datetime.datetime(2025, 7, 1, 0, 0, 6), datetime.datetime(2025, 9, 30, 23, 59, 59))
