In [None]:
# build_db.py
import io
import zipfile
from pathlib import Path

import duckdb
import pandas as pd
from pyproj import Transformer
# Locate the "data" directory that sits next to this file. `__file__` exists when
# the code runs as build_db.py but is undefined inside a Jupyter kernel, so fall
# back to "<current working directory>/data" there instead of raising NameError.
try:
    _DATA_DIR = Path(__file__).with_name("data")
except NameError:
    _DATA_DIR = Path.cwd() / "data"

GTFS_ZIP = str(_DATA_DIR / "gtfs_m.zip")      # input: zipped GTFS feed
DB_PATH = str(_DATA_DIR / "mta_gtfs.duckdb")  # output: DuckDB database file

# Alternative: keep both files directly in the working directory.
# GTFS_ZIP = "gtfs_m.zip"
# DB_PATH  = "mta_gtfs.duckdb"

## Load and set up data

### Load GTFS files

In [10]:
def read_csv(z, name, **kw):
    """Read the archive member `name` of the open ZipFile `z` into a DataFrame.

    An optional ``encoding`` keyword (default "utf-8") is consumed here to decode
    the member; every other keyword is forwarded unchanged to ``pd.read_csv``.
    """
    with z.open(name) as raw:
        encoding = kw.pop("encoding", "utf-8")
        text_stream = io.TextIOWrapper(raw, encoding=encoding)
        frame = pd.read_csv(text_stream, **kw)
    return frame

def to_sec(hms):
    """Convert a clock string to seconds since the start of the service day.

    Accepts "HH:MM:SS" and "HH:MM" (seconds default to 0), the same inputs the
    later `to_sec` redefinition in this notebook accepts, so both definitions
    agree. Hours are not capped at 23: times like "24:15:00" are supported and
    return values of 86400 or more.

    Raises:
        ValueError: if the string does not have two or three ":"-separated
            integer fields.
    """
    parts = hms.split(":")
    if len(parts) not in (2, 3):
        raise ValueError(f"expected 'HH:MM' or 'HH:MM:SS', got {hms!r}")
    h, m = int(parts[0]), int(parts[1])
    s = int(parts[2]) if len(parts) == 3 else 0
    return h*3600 + m*60 + s

# Read every GTFS table we need straight out of the zip archive.
with zipfile.ZipFile(GTFS_ZIP) as z:
    routes = read_csv(z, "routes.txt")
    trips  = read_csv(z, "trips.txt")
    stops  = read_csv(z, "stops.txt")
    stimes = read_csv(z, "stop_times.txt")
    cal    = read_csv(z, "calendar.txt")
    # calendar_dates.txt may be missing from the archive; an empty frame marks "absent"
    caldates = read_csv(z, "calendar_dates.txt") if "calendar_dates.txt" in z.namelist() else pd.DataFrame()

# times → seconds. Blank arrival_time cells are read by pandas as NaN; na_action="ignore"
# propagates them as NaN instead of handing a float to to_sec (which calls .split on it).
stimes["arrival_sec"] = stimes["arrival_time"].map(to_sec, na_action="ignore")

# base stop level only (location_type 0 or blank). When the column is absent every row is kept;
# the previous `stops.get("location_type", 0).fillna(0)` raised AttributeError in that case
# because the default was a plain int.
# stops = stops.rename(columns={"stop_lat":"lat","stop_lon":"lon"})
if "location_type" in stops.columns:
    is_base_stop = stops["location_type"].fillna(0).eq(0)
else:
    is_base_stop = pd.Series(True, index=stops.index)
# .copy() so the column assignments below write to an independent frame, not a slice of the raw one
stops = stops.loc[is_base_stop, ["stop_id","stop_name","stop_lat","stop_lon","parent_station"]].copy()

# project to EPSG:2263 (feet-based CRS for NYC); always_xy=True means (lon, lat) argument order
tf = Transformer.from_crs("EPSG:4326", "EPSG:2263", always_xy=True)
x, y = tf.transform(stops["stop_lon"].values, stops["stop_lat"].values)
stops["x2263"] = x
stops["y2263"] = y

### Create Database

In [15]:
# Open (or create) the database file and load the spatial extension.
con = duckdb.connect(DB_PATH)
con.execute("INSTALL spatial; LOAD spatial;")  # lets us use ST_* if desired

# Expose each pandas frame to DuckDB under a *_df name so SQL can select from it.
frames_to_register = {
    "routes_df": routes,
    "trips_df": trips,
    "stops_df": stops,
    "stimes_df": stimes,
    "cal_df": cal,
}
# calendar_dates is optional: only register it when the feed actually supplied rows
if not caldates.empty:
    frames_to_register["caldates_df"] = caldates

for view_name, frame in frames_to_register.items():
    con.register(view_name, frame)


# Base/source tables (persisted) -> raw tables from GTFS data
con.execute("CREATE OR REPLACE TABLE routes_base AS SELECT * FROM routes_df")
# Only the trip columns used downstream are kept.
con.execute("""
            CREATE OR REPLACE TABLE trips_base AS
            SELECT trip_id, route_id, direction_id, trip_headsign, service_id
            FROM trips_df
""")
con.execute("""
            CREATE OR REPLACE TABLE stop_times_base AS
            SELECT trip_id, stop_id, stop_sequence, arrival_sec  -- arrival_sec already added in pandas
            FROM stimes_df
""")
# start_date / end_date are stored as text rather than whatever type pandas inferred.
con.execute("""
            CREATE OR REPLACE TABLE calendar_base AS
            SELECT CAST(start_date AS VARCHAR) AS start_date,
                CAST(end_date   AS VARCHAR) AS end_date,
                service_id, monday,tuesday,wednesday,thursday,friday,saturday,sunday
            FROM cal_df
""")
if not caldates.empty:
    con.execute("""
    CREATE OR REPLACE TABLE calendar_dates_base AS
    SELECT service_id, CAST(date AS VARCHAR) AS date, exception_type
    FROM caldates_df
    """)
else:
    # DB_PATH is a persistent file: without this, rebuilding from a feed that has no
    # calendar_dates would leave the previous feed's exceptions in the database.
    con.execute("DROP TABLE IF EXISTS calendar_dates_base")

# Dim tables (persisted) -> reference tables (cleaned/descriptive lookup tables)
# dim_routes is a straight copy of routes_base (all route columns kept).
con.execute("CREATE OR REPLACE TABLE dim_routes AS SELECT * FROM routes_base")
# dim_stops is built from the registered stops_df frame, so it carries the
# x2263/y2263 projected coordinates that were computed in pandas.
con.execute("""
CREATE OR REPLACE TABLE dim_stops  AS
SELECT stop_id, stop_name, stop_lat, stop_lon, parent_station, x2263, y2263
FROM stops_df
""")

# Fact table (persisted) -> big event table, every stop arrival (to be queried from dim tables)
# Inner join: stop_times rows whose trip_id has no match in trips_base are dropped.
con.execute("""
CREATE OR REPLACE TABLE fact_stop_events AS
SELECT
  t.route_id,
  t.direction_id,
  t.service_id,
  st.stop_id,
  st.stop_sequence,
  st.arrival_sec,
  t.trip_id
FROM stop_times_base st
JOIN trips_base      t ON st.trip_id = t.trip_id
""")

# Convenience copy of trips with just what you need (persisted)
# Selects the same five columns that trips_base already holds.
con.execute("""
CREATE OR REPLACE TABLE dim_trips AS
SELECT trip_id, route_id, direction_id, trip_headsign, service_id
FROM trips_base
""")

# helper VIEWS for day types
# Each view lists the service_ids whose calendar.txt flags match a day type.
# A service is "weekday" if ANY of Monday..Friday is flagged, so one service_id
# can appear in more than one view. Only calendar_base is consulted here;
# calendar_dates_base exceptions are not applied.
con.execute("""
CREATE OR REPLACE VIEW svcs_weekday AS
SELECT DISTINCT service_id
FROM calendar_base
WHERE monday=1 OR tuesday=1 OR wednesday=1 OR thursday=1 OR friday=1
""")
con.execute("""
CREATE OR REPLACE VIEW svcs_saturday AS
SELECT DISTINCT service_id
FROM calendar_base
WHERE saturday=1
""")
con.execute("""
CREATE OR REPLACE VIEW svcs_sunday AS
SELECT DISTINCT service_id
FROM calendar_base
WHERE sunday=1
""")


# Report where the database was written.
print(f"✅ Built {DB_PATH}")

✅ Built mta_gtfs.duckdb


## Explore DuckDB

In [46]:
# Close the connection to the database file; `con` is rebound to a new in-memory connection further down.
con.close()

In [None]:
# con = duckdb.connect(DB_PATH, read_only=True)

# # If you want spatial functions (ST_*):
# con.execute("INSTALL spatial; LOAD spatial;")

In [3]:
import os
from pathlib import Path
import duckdb
import pandas as pd

# Base directory of the Parquet export, relative to the working directory. The
# forward-slash (POSIX) form is used because the value is embedded in SQL as
# '<PARQ_BASE>/<table>/*.parquet'.
PARQ_BASE = Path(r"parquet").as_posix()

# Rebinds `con`: from here on queries run against this in-memory connection.
con = duckdb.connect()  # in-memory; we're just querying Parquet

In [4]:
def attach_parquet_views(base: str):
    """Create (or replace) one view per exported table on the module-level `con`.

    Each view selects everything from the Parquet files under
    ``<base>/<view name>/*.parquet``. `base` is interpolated into the SQL text
    as-is, so it must be a trusted path.
    """
    view_names = ("dim_stops", "dim_trips", "dim_routes", "calendar_base", "fact_stop_events")
    for view_name in view_names:
        con.execute(
            f"CREATE OR REPLACE VIEW {view_name} AS "
            f"SELECT * FROM read_parquet('{base}/{view_name}/*.parquet')"
        )

# Create the Parquet-backed views on the in-memory connection.
attach_parquet_views(PARQ_BASE)

# quick sanity: row counts
# One row per view (column t = view name, n = row count); fails if any view is missing.
con.execute("""
SELECT 'dim_stops' t, COUNT(*) n FROM dim_stops
UNION ALL SELECT 'dim_trips', COUNT(*) FROM dim_trips
UNION ALL SELECT 'dim_routes', COUNT(*) FROM dim_routes
UNION ALL SELECT 'calendar_base', COUNT(*) FROM calendar_base
UNION ALL SELECT 'fact_stop_events', COUNT(*) FROM fact_stop_events
""").fetchdf()


Unnamed: 0,t,n
0,dim_stops,14456
1,dim_trips,230451
2,dim_routes,1532
3,calendar_base,136
4,fact_stop_events,7447673


In [5]:
# Distinct feed_id values found in dim_routes, sorted. `feeds` is reused further
# down to build the selected_feeds argument of the radius query.
feeds = con.execute("SELECT DISTINCT feed_id FROM dim_routes ORDER BY feed_id").fetchdf()
feeds

Unnamed: 0,feed_id
0,mta-nyct-bus-bronx
1,mta-nyct-bus-brooklyn
2,mta-nyct-bus-busco
3,mta-nyct-bus-manhattan
4,mta-nyct-bus-queens
5,mta-nyct-bus-si


In [4]:
# Top 25 (feed_id, route_id) pairs ranked by their number of rows in fact_stop_events.
df_top_routes = con.execute("""
SELECT feed_id, route_id, COUNT(*) AS stop_events
FROM fact_stop_events
GROUP BY 1,2
ORDER BY stop_events DESC
LIMIT 25
""").fetchdf()
df_top_routes

Unnamed: 0,feed_id,route_id,stop_events
0,mta-nyct-bus-brooklyn,B6,135232
1,mta-nyct-bus-brooklyn,B15,97827
2,mta-nyct-bus-si,S78,80098
3,mta-nyct-bus-brooklyn,B82,77476
4,mta-nyct-bus-manhattan,M101,76687
5,mta-nyct-bus-brooklyn,B41,75908
6,mta-nyct-bus-brooklyn,B46,74993
7,mta-nyct-bus-brooklyn,B44,72989
8,mta-nyct-bus-brooklyn,B8,71967
9,mta-nyct-bus-manhattan,M15,71490


In [5]:
# Number of distinct stop_ids per feed in dim_stops.
df_stops = con.execute("""
SELECT feed_id, COUNT(DISTINCT stop_id) AS n_stops
FROM dim_stops
GROUP BY 1
ORDER BY 1
""").fetchdf()
df_stops

Unnamed: 0,feed_id,n_stops
0,mta-nyct-bus-bronx,1893
1,mta-nyct-bus-brooklyn,4600
2,mta-nyct-bus-busco,2758
3,mta-nyct-bus-manhattan,1826
4,mta-nyct-bus-queens,1414
5,mta-nyct-bus-si,1965


In [6]:
# List everything visible on this connection (here: the Parquet-backed views),
# with each object's column names and column types.
con.execute("SHOW ALL TABLES").fetchdf()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,memory,main,calendar_base,"[service_id, monday, tuesday, wednesday, thurs...","[VARCHAR, BIGINT, BIGINT, BIGINT, BIGINT, BIGI...",False
1,memory,main,dim_routes,"[route_id, agency_id, route_short_name, route_...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False
2,memory,main,dim_stops,"[stop_id, stop_name, stop_desc, lat, lon, loca...","[BIGINT, VARCHAR, VARCHAR, DOUBLE, DOUBLE, BIG...",False
3,memory,main,dim_trips,"[trip_id, route_id, direction_id, service_id, ...","[VARCHAR, VARCHAR, BIGINT, VARCHAR, VARCHAR, V...",False
4,memory,main,fact_stop_events,"[route_id, direction_id, service_id, stop_id, ...","[VARCHAR, BIGINT, VARCHAR, BIGINT, BIGINT, BIG...",False


In [7]:
# Peek at the first five rows of dim_stops to check its columns and values.
con.execute("SELECT * FROM dim_stops LIMIT 5").fetchdf()

Unnamed: 0,stop_id,stop_name,stop_desc,lat,lon,location_type,parent_station,zone_id,feed_id,x2263,y2263
0,100014,BEDFORD PK BLVD/GRAND CONCOURSE,,40.872562,-73.888156,0,,,mta-nyct-bus-bronx,1015182.0,257195.345044
1,100017,PAUL AV/W 205 ST,,40.876836,-73.88971,0,,,mta-nyct-bus-bronx,1014750.0,258751.986309
2,100018,PAUL AV/WEST MOSHOLU PKWY SOUTH,,40.880392,-73.886081,0,,,mta-nyct-bus-bronx,1015752.0,260048.862093
3,100019,GRAND CONCOURSE/E 138 ST,,40.813496,-73.929489,0,,,mta-nyct-bus-bronx,1003768.0,235663.490524
4,100020,GRAND CONCOURSE/E 144 ST,,40.816812,-73.928001,0,,,mta-nyct-bus-bronx,1004179.0,236871.962862


In [36]:
# Peek at the first five rows of dim_routes.
con.execute("SELECT * FROM dim_routes LIMIT 5").fetchdf()

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color,route_text_color,feed_id
0,B1,MTA NYCT,B1,Bay Ridge - Manhattan Beach,via 86th St / Ocean Pkwy,3,00AEEF,FFFFFF,mta-nyct-bus-bronx
1,B11,MTA NYCT,B11,Sunset Park - Midwood,via 49th & 50th St / Avenue J,3,006CB7,FFFFFF,mta-nyct-bus-bronx
2,B12,MTA NYCT,B12,Lefferts Gardens - East New York,via Clarkson Av / Empire Blvd / East New York Av,3,6CBE45,FFFFFF,mta-nyct-bus-bronx
3,B13,MTA NYCT,B13,Spring Creek - Wyckoff Hospital,via Crescent St / Jamaica Av / Wyckoff Av,3,FAA61A,FFFFFF,mta-nyct-bus-bronx
4,B14,MTA NYCT,B14,Spring Creek - Crown Heights,via Sutter Av / Pitkin Av,3,00AEEF,FFFFFF,mta-nyct-bus-bronx


In [38]:
# Peek at the first five rows of calendar_base (day-of-week flags per service_id).
con.execute("SELECT * FROM calendar_base LIMIT 5").fetchdf()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,feed_id
0,GH_D5-Sunday,0,0,0,0,0,0,1,20250629,20260101,mta-nyct-bus-bronx
1,GH_D5-Weekday-SDon,1,1,1,1,1,0,0,20250630,20260102,mta-nyct-bus-bronx
2,GH_D5-Weekday-SDon-BM,1,1,1,1,0,0,1,20250629,20260101,mta-nyct-bus-bronx
3,GH_D5-Saturday,0,0,0,0,0,1,0,20250705,20260103,mta-nyct-bus-bronx
4,GH_O5-Weekday,1,1,1,1,1,0,0,20251013,20251231,mta-nyct-bus-bronx


In [39]:
# Column-level schema of everything in the 'main' schema, in declaration order per table.
con.execute("""
SELECT table_schema, table_name, column_name, data_type
FROM information_schema.columns
WHERE table_schema='main'
ORDER BY table_name, ordinal_position
""").fetchdf()

Unnamed: 0,table_schema,table_name,column_name,data_type
0,main,calendar_base,service_id,VARCHAR
1,main,calendar_base,monday,BIGINT
2,main,calendar_base,tuesday,BIGINT
3,main,calendar_base,wednesday,BIGINT
4,main,calendar_base,thursday,BIGINT
5,main,calendar_base,friday,BIGINT
6,main,calendar_base,saturday,BIGINT
7,main,calendar_base,sunday,BIGINT
8,main,calendar_base,start_date,BIGINT
9,main,calendar_base,end_date,BIGINT


### Query Testing

In [10]:
from pyproj import Transformer

def to_sec(hms: str) -> int:
    """Return the number of seconds represented by an "HH:MM" or "HH:MM:SS" string.

    A missing seconds field counts as 0; any fields after the third are ignored.
    Raises ValueError when fewer than two fields are present or a field is not
    an integer.
    """
    hours, minutes, *tail = hms.split(":")
    seconds = int(tail[0]) if tail else 0
    whole_minutes = int(hours) * 60 + int(minutes)
    return whole_minutes * 60 + seconds

def buses_by_stop_route_dir_within_radius(
    lon: float,
    lat: float,
    start_time: str,       # "HH:MM" or "HH:MM:SS"
    end_time: str,         # "HH:MM" or "HH:MM:SS"
    day_type: str,         # "Weekday" | "Saturday" | "Sunday"
    radius_ft: int = 250,
    selected_feeds: list[str] | None = None, 
    con: duckdb.DuckDBPyConnection | None = None,
) -> pd.DataFrame:
    """
    Count scheduled bus arrivals at stops near a point, per route and direction.

    Returns one row per (feed_id, route_id, direction_id, stop_id) for stops whose
    projected EPSG:2263 position is within `radius_ft` of (lon, lat), with the stop
    name + lat/lon and the number of fact_stop_events rows in the inclusive time
    window. Handles midnight-spanning windows (e.g., 23:30–00:30): when end_time is
    earlier than start_time, arrivals at or after the start OR at or before the end
    are counted.

    Parameters:
        lon, lat: query point in EPSG:4326 (longitude first).
        start_time, end_time: window bounds, parsed with `to_sec`.
        day_type: compared literally against 'Weekday', 'Saturday' and 'Sunday';
            any other value matches no service and yields an empty frame.
        radius_ft: search radius; truncated to an integer before use.
        selected_feeds: feed_ids to restrict to; None or empty means all feeds.
        con: DuckDB connection to run on. When None, a temporary in-memory
            connection is opened and closed (the query reads the Parquet files
            under PARQ_BASE directly, so no pre-existing views are required).

    NOTE(review): arrival_sec is compared as stored. Values of 86400 or more
    (times past 24:00) are not wrapped back into 0..86399 — confirm that this
    is the intended treatment for late-night trips.
    """

    # project the query point to EPSG:2263 (NY state plane feet)
    x0, y0 = Transformer.from_crs("EPSG:4326", "EPSG:2263", always_xy=True).transform(lon, lat)
    s, e = to_sec(start_time), to_sec(end_time)

    # define placeholders for feeds selection 
    sel = list(selected_feeds or [])
    if sel:
        values = ",".join(["(?)"] * len(sel))           # -> "(?),(?),(?)"
        chosen_cte = f"chosen_feeds(feed_id) AS (VALUES {values}),"
        feed_pred = "feed_id IN (SELECT feed_id FROM chosen_feeds)"
    else:
        chosen_cte = ""                                  # no CTE
        feed_pred = "TRUE"                               # no filter = all feeds

    sql = f"""
    WITH
    {chosen_cte}
    dim_stops AS (SELECT * FROM read_parquet('{PARQ_BASE}/dim_stops/*.parquet')),
    dim_trips  AS (SELECT * FROM read_parquet('{PARQ_BASE}/dim_trips/*.parquet')),
    dim_routes AS (SELECT * FROM read_parquet('{PARQ_BASE}/dim_routes/*.parquet')),
    calendar_base AS (SELECT * FROM read_parquet('{PARQ_BASE}/calendar_base/*.parquet')),
    fact_stop_events AS (SELECT * FROM read_parquet('{PARQ_BASE}/fact_stop_events/*.parquet')),
    svcs AS (
      SELECT DISTINCT feed_id, service_id
      FROM calendar_base
      WHERE {feed_pred}
      AND (
        (? = 'Weekday'  AND (monday=1 OR tuesday=1 OR wednesday=1 OR thursday=1 OR friday=1))
        OR (? = 'Saturday' AND saturday=1)
        OR (? = 'Sunday'   AND sunday=1)
        )
    ),
    win AS (SELECT ?::INTEGER AS s, ?::INTEGER AS e),
    near_stops AS (
      SELECT feed_id, stop_id, stop_name, lat, lon
      FROM dim_stops
      WHERE {feed_pred}
      AND ((x2263 - ?)*(x2263 - ?) + (y2263 - ?)*(y2263 - ?)) <= ?*?
    )
    SELECT
      r.feed_id,
      r.route_id,
      t.direction_id,
      s.stop_id,
      s.stop_name,
      s.lat  AS stop_lat,
      s.lon  AS stop_lon,
      COUNT(*) AS buses_scheduled
    FROM fact_stop_events f
    JOIN dim_trips  t ON f.feed_id = t.feed_id AND f.trip_id = t.trip_id
    JOIN dim_routes r ON t.feed_id = r.feed_id AND t.route_id = r.route_id
    JOIN svcs       v ON f.feed_id = v.feed_id AND f.service_id = v.service_id
    JOIN near_stops s ON f.feed_id = s.feed_id AND f.stop_id   = s.stop_id
    CROSS JOIN win
    WHERE
      (
        (SELECT e FROM win) >= (SELECT s FROM win)
        AND f.arrival_sec BETWEEN (SELECT s FROM win) AND (SELECT e FROM win)
      )
      OR
      (
        (SELECT e FROM win) < (SELECT s FROM win)   -- midnight wrap
        AND (f.arrival_sec >= (SELECT s FROM win) OR f.arrival_sec <= (SELECT e FROM win))
      )
    GROUP BY r.feed_id, r.route_id, t.direction_id, s.stop_id, s.stop_name, s.lat, s.lon
    ORDER BY s.stop_name, r.feed_id, r.route_id, t.direction_id;
    """

    # Positional parameters, in the order the `?` placeholders appear in `sql`.
    # Only the feed values are conditional; the rest are always required, so they
    # must be appended whether or not a feed selection was given.
    params = list(sel)                                   # feeds for chosen_feeds CTE (once)
    params += [day_type, day_type, day_type]             # 3 day-type placeholders
    params += [s, e]                                     # window
    params += [x0, x0, y0, y0, int(radius_ft), int(radius_ft)]  # spatial

    # Honour the documented con=None default instead of failing on None.execute.
    owns_con = con is None
    if owns_con:
        con = duckdb.connect()
    try:
        df = con.execute(sql, params).fetchdf()
    finally:
        if owns_con:
            con.close()

    return df

In [11]:
# --- inputs you can tweak ---
lon, lat = -73.968297, 40.759026      # intersection point
radius_ft = 250
day_type = "Weekday"                  # "Weekday" | "Saturday" | "Sunday"
selected_feeds = feeds["feed_id"].tolist()  # all feeds; or list specific ids, e.g. ['mta-nyct-bus-manhattan']

# Scheduled buses per stop/route/direction within radius_ft of the point, 07:30–08:30.
buses_by_stop_route_dir_within_radius(lon, lat, "07:30", "08:30", day_type, radius_ft, selected_feeds, con)

Unnamed: 0,feed_id,route_id,direction_id,stop_id,stop_name,stop_lat,stop_lon,buses_scheduled
0,mta-nyct-bus-busco,QM31,1,404169,3 AV/E 55 ST,40.759026,-73.968297,4
1,mta-nyct-bus-busco,QM32,1,405403,3 AV/E 55 ST,40.758873,-73.968456,2
2,mta-nyct-bus-busco,QM34,1,405403,3 AV/E 55 ST,40.758873,-73.968456,5
3,mta-nyct-bus-busco,QM35,1,404169,3 AV/E 55 ST,40.759026,-73.968297,7
4,mta-nyct-bus-busco,QM36,1,404169,3 AV/E 55 ST,40.759026,-73.968297,5
5,mta-nyct-bus-busco,QM40,1,405403,3 AV/E 55 ST,40.758873,-73.968456,3
6,mta-nyct-bus-busco,QM42,1,405403,3 AV/E 55 ST,40.758873,-73.968456,3
7,mta-nyct-bus-busco,QM44,1,404169,3 AV/E 55 ST,40.759026,-73.968297,3


In [22]:
# Example intersection (lon, lat)
lon, lat = -73.998522, 40.745306

# Project the point to EPSG:2263 so it can be compared with the x2263/y2263 columns.
from pyproj import Transformer
tf = Transformer.from_crs("EPSG:4326", "EPSG:2263", always_xy=True)
x0, y0 = tf.transform(lon, lat)

# Stops whose squared planar distance to the point is at most 250*250
# (250 in the units of the x2263/y2263 columns).
nearby = con.execute("""
SELECT stop_id, stop_name, lat, lon
FROM dim_stops
WHERE (x2263 - ?)*(x2263 - ?) + (y2263 - ?)*(y2263 - ?) <= ?*?
""", [x0, x0, y0, y0, 250, 250]).fetchdf()
nearby

Unnamed: 0,stop_id,stop_name,lat,lon
0,402126,W 23 ST/8 AV,40.745059,-73.998141
1,402154,W 23 ST/8 AV,40.745547,-73.998888
2,405623,8 AV/W 23 ST,40.744919,-73.998656


In [None]:
# Load the whole calendar_base view into pandas for ad-hoc inspection.
service_df = con.execute("""
            SELECT * 
            FROM calendar_base""").fetchdf()
# Services whose service_id text contains 'Weekday' (a name match, not the day-of-week flags).
service_df[service_df['service_id'].str.contains('Weekday')]

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,feed_id
1,GH_D5-Weekday-SDon,1,1,1,1,1,0,0,20250630,20260102,mta-nyct-bus-bronx
2,GH_D5-Weekday-SDon-BM,1,1,1,1,0,0,1,20250629,20260101,mta-nyct-bus-bronx
4,GH_O5-Weekday,1,1,1,1,1,0,0,20251013,20251231,mta-nyct-bus-bronx
5,GH_O5-Weekday-BM,1,1,1,1,0,0,1,20251012,20251230,mta-nyct-bus-bronx
7,KB_D5-Weekday-SDon,1,1,1,1,1,0,0,20250630,20260102,mta-nyct-bus-bronx
...,...,...,...,...,...,...,...,...,...,...,...
127,CA_S5-Weekday,1,1,1,1,1,0,0,20251128,20251128,mta-nyct-bus-si
129,CH_D5-Weekday-SDon,1,1,1,1,1,0,0,20250630,20260102,mta-nyct-bus-si
131,CH_S5-Weekday,1,1,1,1,1,0,0,20251128,20251128,mta-nyct-bus-si
133,YU_D5-Weekday-SDon,1,1,1,1,1,0,0,20250630,20260102,mta-nyct-bus-si


In [28]:
# Display all services ordered by service_id (service_df itself is not modified).
service_df.sort_values('service_id')

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,feed_id
53,BPPD5-BP_D5-Saturday-21,0,0,0,0,0,1,0,20250906,20260103,mta-nyct-bus-busco
51,BPPD5-BP_D5-Sunday-21,0,0,0,0,0,0,1,20250831,20260101,mta-nyct-bus-busco
52,BPPD5-BP_D5-Weekday-22-SDon,1,1,1,1,1,0,0,20250902,20260102,mta-nyct-bus-busco
54,BPPD5-BP_S5-Weekday-32,1,1,1,1,1,0,0,20251128,20251128,mta-nyct-bus-busco
126,CA_D5-Saturday,0,0,0,0,0,1,0,20250705,20260103,mta-nyct-bus-si
...,...,...,...,...,...,...,...,...,...,...,...
82,YOPD5-YO_S5-Weekday-45,1,1,1,1,1,0,0,20251128,20251128,mta-nyct-bus-busco
134,YU_D5-Saturday,0,0,0,0,0,1,0,20250705,20260103,mta-nyct-bus-si
132,YU_D5-Sunday,0,0,0,0,0,0,1,20250629,20260101,mta-nyct-bus-si
133,YU_D5-Weekday-SDon,1,1,1,1,1,0,0,20250630,20260102,mta-nyct-bus-si


In [122]:

# Split service_id into "<prefix>-<Weekday|Saturday|Sunday>[-<suffix>]".
# The prefix match is lazy, so it ends at the first "-<day type>" occurrence;
# ids that do not fit the pattern get NaN in all three new columns.
pat = r'^(.*?)-(Weekday|Saturday|Sunday)(?:-(.*))?$'
service_df[['prefix', 'day_of_week', 'suffix']] = service_df['service_id'].str.extract(pat)

# Weekday-named services whose prefix is shared by more than one weekday service
# (keep=False marks every member of a duplicated group, not just the repeats).
weekday = service_df[service_df['day_of_week'] == 'Weekday']
weekday[weekday.duplicated(subset=['prefix'],keep=False)]

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,feed_id,prefix,day_of_week,suffix
1,GH_D5-Weekday-SDon,1,1,1,1,1,0,0,20250630,20260102,mta-nyct-bus-bronx,GH_D5,Weekday,SDon
2,GH_D5-Weekday-SDon-BM,1,1,1,1,0,0,1,20250629,20260101,mta-nyct-bus-bronx,GH_D5,Weekday,SDon-BM
4,GH_O5-Weekday,1,1,1,1,1,0,0,20251013,20251231,mta-nyct-bus-bronx,GH_O5,Weekday,
5,GH_O5-Weekday-BM,1,1,1,1,0,0,1,20251012,20251230,mta-nyct-bus-bronx,GH_O5,Weekday,BM
16,EN_D5-Weekday,1,1,1,1,1,0,0,20250902,20260102,mta-nyct-bus-brooklyn,EN_D5,Weekday,
17,EN_D5-Weekday-SDon,1,1,1,1,1,0,0,20250904,20251223,mta-nyct-bus-brooklyn,EN_D5,Weekday,SDon
21,FB_D5-Weekday,1,1,1,1,1,0,0,20250902,20260102,mta-nyct-bus-brooklyn,FB_D5,Weekday,
22,FB_D5-Weekday-SDon,1,1,1,1,1,0,0,20250904,20251223,mta-nyct-bus-brooklyn,FB_D5,Weekday,SDon
26,FP_D5-Weekday,1,1,1,1,1,0,0,20250902,20260102,mta-nyct-bus-brooklyn,FP_D5,Weekday,
27,FP_D5-Weekday-SDon,1,1,1,1,1,0,0,20250904,20251223,mta-nyct-bus-brooklyn,FP_D5,Weekday,SDon


In [124]:
# NOTE(review): `comb` is not defined in any visible cell. The columns in the output
# match fact_stop_events, so it is presumably that table loaded into pandas — confirm
# and add the defining cell so this runs on a fresh kernel.
# Rows whose service_id contains 'SDon-BM'.
comb[comb['service_id'].str.contains('SDon-BM')]#.route_id.unique()

Unnamed: 0,route_id,direction_id,service_id,stop_id,stop_sequence,arrival_sec,trip_id,feed_id
978050,BX41,1,GH_D5-Weekday-SDon-BM,102793,1,86100,GH_D5-Weekday-SDon-BM-143500_BX41_901,mta-nyct-bus-bronx
978051,BX41,1,GH_D5-Weekday-SDon-BM,102795,2,86204,GH_D5-Weekday-SDon-BM-143500_BX41_901,mta-nyct-bus-bronx
978052,BX41,1,GH_D5-Weekday-SDon-BM,102796,3,86338,GH_D5-Weekday-SDon-BM-143500_BX41_901,mta-nyct-bus-bronx
978053,BX41,1,GH_D5-Weekday-SDon-BM,102797,4,86392,GH_D5-Weekday-SDon-BM-143500_BX41_901,mta-nyct-bus-bronx
978054,BX41,1,GH_D5-Weekday-SDon-BM,102798,5,86449,GH_D5-Weekday-SDon-BM-143500_BX41_901,mta-nyct-bus-bronx
...,...,...,...,...,...,...,...,...
6613505,Q2,1,QV_D5-Weekday-SDon-BM,500143,16,86159,QV_D5-Weekday-SDon-BM-142500_Q2_51,mta-nyct-bus-queens
6613506,Q2,1,QV_D5-Weekday-SDon-BM,500145,17,86220,QV_D5-Weekday-SDon-BM-142500_Q2_51,mta-nyct-bus-queens
6613507,Q2,1,QV_D5-Weekday-SDon-BM,700704,18,86387,QV_D5-Weekday-SDon-BM-142500_Q2_51,mta-nyct-bus-queens
6613508,Q2,1,QV_D5-Weekday-SDon-BM,505186,19,86520,QV_D5-Weekday-SDon-BM-142500_Q2_51,mta-nyct-bus-queens


In [132]:
# NOTE(review): `comb` is not defined in any visible cell; presumably a pandas copy
# of fact_stop_events — confirm.
# All events of route Q2 at stop 505316, across every service_id.
comb[(comb['route_id'] == 'Q2') & (comb['stop_id'] == 505316)]#.service_id.unique()

Unnamed: 0,route_id,direction_id,service_id,stop_id,stop_sequence,arrival_sec,trip_id,feed_id
5965655,Q2,1,QV_O5-Weekday-BM,505316,1,85500,QV_O5-Weekday-BM-142500_Q2_51,mta-nyct-bus-queens
5965695,Q2,1,QV_O5-Weekday,505316,1,2700,QV_O5-Weekday-004500_Q2_51,mta-nyct-bus-queens
5965735,Q2,1,QV_O5-Weekday,505316,1,6300,QV_O5-Weekday-010500_Q2_51,mta-nyct-bus-queens
5965775,Q2,1,QV_O5-Weekday,505316,1,9900,QV_O5-Weekday-016500_Q2_51,mta-nyct-bus-queens
5965815,Q2,1,QV_O5-Weekday,505316,1,13500,QV_O5-Weekday-022500_MISC_104,mta-nyct-bus-queens
...,...,...,...,...,...,...,...,...
6617090,Q2,1,QV_D5-Weekday-SDon,505316,1,22380,QV_D5-Weekday-SDon-037300_Q2_51,mta-nyct-bus-queens
6617110,Q2,1,QV_D5-Weekday-SDon,505316,1,17700,QV_D5-Weekday-SDon-029500_Q2_51,mta-nyct-bus-queens
6617130,Q2,1,QV_D5-Weekday-SDon,505316,1,33780,QV_D5-Weekday-SDon-056300_MISC_191,mta-nyct-bus-queens
6617190,Q2,1,QV_D5-Weekday-SDon,505316,1,26640,QV_D5-Weekday-SDon-044400_MISC_261,mta-nyct-bus-queens


In [None]:
# select all stops at a stop_id within a timeframe
# Positional parameters, in order: start_sec, end_sec, day_type (three times), stop_id.
# Every join matches on feed_id as well as the id column, consistent with
# buses_by_stop_route_dir_within_radius: the tables are a union of several feeds,
# so an id is only guaranteed to identify a row within its own feed.
all_stops_at_stop = """WITH win AS (
  SELECT ?::INTEGER AS s, ?::INTEGER AS e      -- start_sec, end_sec
),
svcs AS (
  SELECT DISTINCT feed_id, service_id
  FROM calendar_base
  WHERE
    (? = 'Weekday'  AND (monday=1 OR tuesday=1 OR wednesday=1 OR thursday=1 OR friday=1))
    OR (? = 'Saturday' AND saturday=1)
    OR (? = 'Sunday'   AND sunday=1)
)
SELECT
  r.route_id,
  t.direction_id,
  t.trip_id,
  f.service_id,
  f.stop_id,
  f.stop_sequence,
  f.arrival_sec,
  (TIME '00:00:00' + f.arrival_sec * INTERVAL 1 SECOND) AS arrival_time,
  t.trip_headsign
FROM fact_stop_events f
JOIN dim_trips  t ON f.feed_id = t.feed_id AND f.trip_id = t.trip_id
JOIN dim_routes r ON t.route_id = r.route_id AND t.feed_id = r.feed_id -- Note: need to match on feed_id for NYC, because each routes file has all routes (regardless of boroughs)
JOIN svcs       v ON v.feed_id = f.feed_id AND v.service_id = f.service_id
CROSS JOIN win
WHERE f.stop_id = ?
  AND (
        -- normal window
        ((SELECT e FROM win) >= (SELECT s FROM win)
         AND f.arrival_sec BETWEEN (SELECT s FROM win) AND (SELECT e FROM win))
        -- midnight wrap window
        OR
        ((SELECT e FROM win) < (SELECT s FROM win)
         AND (f.arrival_sec >= (SELECT s FROM win) OR f.arrival_sec <= (SELECT e FROM win)))
      )
ORDER BY f.arrival_sec, r.route_id, t.direction_id, t.trip_id;"""


In [137]:
# Window bounds in seconds after midnight, plus the day type and stop to query.
start_sec = 17*3600 + 30*60   # 17:30
end_sec   = 18*3600 + 30*60   # 18:30
day_type  = "Weekday"
stop_id   = "405480"

# Parameter order must match the placeholders in all_stops_at_stop:
# start, end, day type (three times), stop id.
df = con.execute(all_stops_at_stop, [start_sec, end_sec, day_type, day_type, day_type, stop_id]).fetchdf()
df.sort_values(by=['route_id','arrival_time'])

# df.drop_duplicates('trip_id')
# df[df['route_id'] == 'M22']

Unnamed: 0,route_id,direction_id,trip_id,service_id,stop_id,stop_sequence,arrival_sec,arrival_time,trip_headsign
1,M102,0,OH_D5-Weekday-SDon-101800_M101_141,OH_D5-Weekday-SDon,405480,19,63079,17:31:19,HARLEM 147 ST via 3 AV via LENOX AV
2,M102,0,OH_O5-Weekday-102200_M101_74,OH_O5-Weekday,405480,19,63319,17:35:19,HARLEM 147 ST via 3 AV via LENOX AV
3,M102,0,OH_D5-Weekday-SDon-102700_M101_68,OH_D5-Weekday-SDon,405480,19,63619,17:40:19,HARLEM 147 ST via 3 AV via LENOX AV
6,M102,0,OH_O5-Weekday-103400_M101_61,OH_O5-Weekday,405480,19,64039,17:47:19,HARLEM 147 ST via 3 AV via LENOX AV
7,M102,0,OH_D5-Weekday-SDon-103600_M101_125,OH_D5-Weekday-SDon,405480,19,64159,17:49:19,HARLEM 147 ST via 3 AV via LENOX AV
9,M102,0,OH_D5-Weekday-SDon-104500_M101_143,OH_D5-Weekday-SDon,405480,19,64699,17:58:19,HARLEM 147 ST via 3 AV via LENOX AV
10,M102,0,OH_O5-Weekday-104600_M101_62,OH_O5-Weekday,405480,19,64759,17:59:19,HARLEM 147 ST via 3 AV via LENOX AV
13,M102,0,OH_D5-Weekday-SDon-105500_M101_148,OH_D5-Weekday-SDon,405480,19,65299,18:08:19,HARLEM 147 ST via 3 AV via LENOX AV
14,M102,0,OH_O5-Weekday-105800_M101_112,OH_O5-Weekday,405480,19,65479,18:11:19,HARLEM 147 ST via 3 AV via LENOX AV
16,M102,0,OH_D5-Weekday-SDon-106500_M101_102,OH_D5-Weekday-SDon,405480,19,65899,18:18:19,HARLEM 147 ST via 3 AV via LENOX AV


In [94]:
# The result of this first expression is discarded: only the last expression of a
# cell is displayed and drop_duplicates returns a new frame.
df.drop_duplicates(subset=['route_id','direction_id','stop_id','arrival_time'])#.groupby(['route_id','direction_id','stop_id','service_id']).count()

# Rows of `df` belonging to one specific service_id, ordered by route.
# NOTE(review): the saved output shows stop 404169 and a morning window, so `df` held
# the result of a different query when this cell was last run — confirm before relying on it.
df[df['service_id']=='CPPD5-CP_D5-Weekday-10-SDon'].sort_values('route_id')

Unnamed: 0,route_id,direction_id,trip_id,service_id,stop_id,stop_sequence,arrival_sec,arrival_time,trip_headsign
6,QM31,1,44039047-CPPD5-CP_D5-Weekday-10-SDon,CPPD5-CP_D5-Weekday-10-SDon,404169,14,28500,07:55:00,EAST MIDTOWN 55 ST via 3 AV
10,QM31,1,44038986-CPPD5-CP_D5-Weekday-10-SDon,CPPD5-CP_D5-Weekday-10-SDon,404169,14,29400,08:10:00,EAST MIDTOWN 55 ST via 3 AV
16,QM31,1,44038987-CPPD5-CP_D5-Weekday-10-SDon,CPPD5-CP_D5-Weekday-10-SDon,404169,14,30300,08:25:00,EAST MIDTOWN 55 ST via 3 AV
0,QM35,1,44039017-CPPD5-CP_D5-Weekday-10-SDon,CPPD5-CP_D5-Weekday-10-SDon,404169,32,27300,07:35:00,EAST MIDTOWN 55 ST via 3 AV
3,QM35,1,44039057-CPPD5-CP_D5-Weekday-10-SDon,CPPD5-CP_D5-Weekday-10-SDon,404169,32,28200,07:50:00,EAST MIDTOWN 55 ST via 3 AV
8,QM35,1,44039056-CPPD5-CP_D5-Weekday-10-SDon,CPPD5-CP_D5-Weekday-10-SDon,404169,32,29100,08:05:00,EAST MIDTOWN 55 ST via 3 AV
11,QM35,1,44039009-CPPD5-CP_D5-Weekday-10-SDon,CPPD5-CP_D5-Weekday-10-SDon,404169,32,29580,08:13:00,EAST MIDTOWN 55 ST via 3 AV
13,QM35,1,44039034-CPPD5-CP_D5-Weekday-10-SDon,CPPD5-CP_D5-Weekday-10-SDon,404169,32,30000,08:20:00,EAST MIDTOWN 55 ST via 3 AV
18,QM35,1,44039033-CPPD5-CP_D5-Weekday-10-SDon,CPPD5-CP_D5-Weekday-10-SDon,404169,32,30480,08:28:00,EAST MIDTOWN 55 ST via 3 AV
4,QM36,1,44037944-CPPD5-CP_D5-Weekday-10-SDon,CPPD5-CP_D5-Weekday-10-SDon,404169,17,28260,07:51:00,EAST MIDTOWN 55 ST via 3 AV


In [None]:
# note each route is present regardless of borough in each of the nyct gtfs datasets
# (the output shows the same M21 row once per NYCT feed_id).
# This rebinds `routes`, which the build section used for the raw routes.txt frame.
routes = con.execute("""SELECT *
FROM dim_routes
""").fetch_df()
routes[routes['route_id'] == 'M21']

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color,route_text_color,feed_id
124,M21,MTA NYCT,M21,Lower East Side - West Village,via Houston St Crosstown,3,FAA61A,FFFFFF,mta-nyct-bus-bronx
412,M21,MTA NYCT,M21,Lower East Side - West Village,via Houston St Crosstown,3,FAA61A,FFFFFF,mta-nyct-bus-brooklyn
792,M21,MTA NYCT,M21,Lower East Side - West Village,via Houston St Crosstown,3,FAA61A,FFFFFF,mta-nyct-bus-manhattan
1080,M21,MTA NYCT,M21,Lower East Side - West Village,via Houston St Crosstown,3,FAA61A,FFFFFF,mta-nyct-bus-queens
1368,M21,MTA NYCT,M21,Lower East Side - West Village,via Houston St Crosstown,3,FAA61A,FFFFFF,mta-nyct-bus-si
