Before we extract further data from additional months, we would like to perform a small test analysis to gain a better understanding of the data. Tableau is very well suited for this purpose.

For the Tableau view we add the following derived fields:
pickup_dow_name (Mon/Tue/… instead of 0–6)
is_airport_trip (JFK/LGA/EWR yes/no)
tip_rate_pct (scaled as a percentage)
optional: fare_bucket, distance_bucket

In [5]:
## Setup for every notebook

import os
from dotenv import load_dotenv
from urllib.parse import quote_plus
from sqlalchemy import create_engine, text

# Load connection settings from a local .env file into the process environment.
load_dotenv()

# Read the raw settings first. Nothing is transformed until validation has
# passed, so a missing variable is reported by name instead of crashing inside
# quote_plus() with an opaque TypeError.
PG_USER = os.getenv("POSTGRES_USER")
_PG_PASS_RAW = os.getenv("POSTGRES_PASS")
PG_HOST = os.getenv("POSTGRES_HOST")
PG_PORT = os.getenv("POSTGRES_PORT", "5432")
PG_DB   = os.getenv("POSTGRES_DB")
PG_SCHEMA = os.getenv("POSTGRES_SCHEMA", "public")

# Unset and empty values are both treated as missing.
missing = [k for k, v in {
    "POSTGRES_USER": PG_USER,
    "POSTGRES_PASS": _PG_PASS_RAW,
    "POSTGRES_HOST": PG_HOST,
    "POSTGRES_DB": PG_DB,
    "POSTGRES_SCHEMA": PG_SCHEMA,
}.items() if not v]
if missing:
    raise ValueError(f"Missing env vars: {missing}")

# Percent-encode the credentials so characters such as '@', ':' or '/' cannot
# break the structure of the connection URL.
PG_PASS = quote_plus(_PG_PASS_RAW)

url = f"postgresql+psycopg2://{quote_plus(PG_USER)}:{PG_PASS}@{PG_HOST}:{PG_PORT}/{PG_DB}"
engine = create_engine(url, future=True)

# Quick ping. Note: current_schema() reports the session's schema, which can
# differ from PG_SCHEMA because no search_path is set here; the queries in this
# notebook therefore qualify object names with PG_SCHEMA explicitly.
with engine.connect() as conn:
    print(conn.execute(text("SELECT current_user, current_database(), current_schema();")).fetchone())

print("Setup OK. Schema:", PG_SCHEMA)


('patrickpaubandt', 'nf_da_onl_13102025', 'public')
Setup OK. Schema: s_patrickpaubandt


In [None]:
from sqlalchemy import text

# DDL for the Tableau-facing view. It selects from vw_yellow_clean_tip and adds
# three derived columns on top of the pass-through fields:
#   * pickup_dow_name  - weekday label for pickup_dow (0 -> 'Sun' ... 6 -> 'Sat')
#   * tip_rate_pct     - tip_rate multiplied by 100
#   * is_airport_trip  - 1 if the pickup or dropoff matches JFK / LaGuardia / EWR, else 0
# The schema name is interpolated from PG_SCHEMA so the view lands next to its source.
sql = f"""
CREATE OR REPLACE VIEW {PG_SCHEMA}.vw_tableau_tip AS
SELECT
    -- IDs & Time
    pickup_date,
    pickup_month,
    pickup_hour,
    pickup_dow,
    CASE pickup_dow
      WHEN 0 THEN 'Sun'
      WHEN 1 THEN 'Mon'
      WHEN 2 THEN 'Tue'
      WHEN 3 THEN 'Wed'
      WHEN 4 THEN 'Thu'
      WHEN 5 THEN 'Fri'
      WHEN 6 THEN 'Sat'
    END AS pickup_dow_name,

    -- Trip measures
    trip_distance,
    duration_min,
    passenger_count,
    fare_amount,
    total_amount,

    -- Tip measures
    tip_amount,
    tip_rate,
    (tip_rate * 100.0) AS tip_rate_pct,
    is_tipped,

    -- Payment
    payment_type,
    payment_group,

    -- Geography (Pickup/Dropoff)
    pulocationid,
    pu_borough,
    pu_zone,
    pu_service_zone,

    dolocationid,
    do_borough,
    do_zone,
    do_service_zone,

    -- Airport flag (using zone name)
    CASE
      WHEN pu_zone ILIKE '%JFK%' OR do_zone ILIKE '%JFK%' THEN 1
      WHEN pu_zone ILIKE '%LaGuardia%' OR do_zone ILIKE '%LaGuardia%' THEN 1
      WHEN pu_borough = 'EWR' OR do_borough = 'EWR' THEN 1
      ELSE 0
    END AS is_airport_trip,

    -- Source
    source_file
FROM {PG_SCHEMA}.vw_yellow_clean_tip;
"""

# engine.begin() wraps the DDL in a transaction that is committed when the
# block exits without an error.
with engine.begin() as conn:
    create_view_stmt = text(sql)
    conn.execute(create_view_stmt)

print(f"Created view: {PG_SCHEMA}.vw_tableau_tip")


Created view: s_patrickpaubandt.vw_tableau_tip


Tableau view: A dedicated view (vw_tableau_tip) was created for visualisation in Tableau. It contains all fields relevant to the dashboard and exposes codes as meaningful categories (e.g. weekday names, payment groups, airport flag). This reduces the number of calculations needed in Tableau, improves performance, and ensures that key figures stay consistent with their definitions in the SQL layer.

In [7]:
from sqlalchemy import text

# Data-quality check on the staging table: total row count plus the number of
# NULLs in each of the four ID columns.
# COUNT(*) FILTER (...) is used instead of SUM(CASE ...) so that an empty table
# yields 0 for every NULL counter rather than NULL/None.
with engine.connect() as conn:
    res = conn.execute(text(f"""
        SELECT
          COUNT(*) AS n,
          COUNT(*) FILTER (WHERE vendorid IS NULL) AS vendorid_nulls,
          COUNT(*) FILTER (WHERE ratecodeid IS NULL) AS ratecodeid_nulls,
          COUNT(*) FILTER (WHERE pulocationid IS NULL) AS pulocationid_nulls,
          COUNT(*) FILTER (WHERE dolocationid IS NULL) AS dolocationid_nulls
        FROM {PG_SCHEMA}.stg_yellow_trips;
    """)).fetchone()

# Row layout: (n, vendorid_nulls, ratecodeid_nulls, pulocationid_nulls, dolocationid_nulls)
print(res)


(100000, 0, 0, 0, 0)


In [8]:
# Peek at five staging rows to eyeball the ID, payment and amount columns.
# There is no ORDER BY, so which five rows come back is up to the database.
_sample_stmt = text(f"""
      SELECT vendorid, ratecodeid, pulocationid, dolocationid, payment_type, fare_amount, tip_amount
      FROM {PG_SCHEMA}.stg_yellow_trips
      LIMIT 5;
    """)

with engine.connect() as conn:
    _sample_result = conn.execute(_sample_stmt)
    sample = _sample_result.fetchall()

# Last expression of the cell, so the notebook renders the rows.
sample


[(2, 1, 138, 114, 1, Decimal('42.90'), Decimal('10.73')),
 (2, 1, 93, 157, 1, Decimal('26.80'), Decimal('5.86')),
 (2, 1, 68, 13, 1, Decimal('19.80'), Decimal('5.11')),
 (2, 1, 234, 87, 1, Decimal('17.70'), Decimal('3.52')),
 (2, 1, 230, 151, 1, Decimal('14.90'), Decimal('4.13'))]

The problem with NULL values in the above-mentioned columns (e.g. vendorid) is solved: all four NULL counts are 0 across the 100,000 staged rows.