In [None]:
# Cell 1 – Imports and configuration

import os
import time
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd
import requests

# Show floats with thousands separators and three decimals in rendered frames.
pd.set_option("display.float_format", lambda x: f"{x:,.3f}")


# The NTA API key is a secret, so it is read from the environment instead of
# being stored in the notebook (set NTA_API_KEY before starting the kernel).
# NOTE(review): a key that was previously committed in this cell should be
# treated as leaked and rotated with the provider.
API_KEY = os.environ.get("NTA_API_KEY", "")
if not API_KEY:
    print(
        "WARNING: environment variable NTA_API_KEY is not set; "
        "requests to the NTA API will be rejected."
    )

# NTA TripUpdates endpoint (GTFS-Realtime v2)
TRIP_UPDATES_URL = "https://api.nationaltransport.ie/gtfsr/v2/TripUpdates?format=json"

# Where to store data (adjust the path to your project folder)
RAW_BUS_PATH = Path(
    r"../dataset/bus_trip_updates_raw_member3.csv"
)
# NOTE(review): this file name says "member34" while the raw file says
# "member3" - confirm which one downstream consumers expect before renaming.
DAILY_BUS_PATH = Path(
    r"../dataset/bus_daily_cleaned_member34.csv"
)

print("Config loaded.")


Config loaded.


In [2]:
# Cell 2 – Fetch one snapshot of TripUpdates (realtime delays)

def fetch_trip_updates(api_key: str) -> pd.DataFrame:
    """
    Call the NTA GTFS-Realtime TripUpdates API and return a DataFrame.

    One row = one stop_time_update of a trip, carrying the trip identifiers,
    the stop id and the arrival/departure delay values found in the payload.

    Parameters
    ----------
    api_key : str
        Value sent in the ``x-api-key`` request header.

    Returns
    -------
    pd.DataFrame
        Columns, always in this order (also when the feed has no updates):
        snapshot_time_utc, route_id, trip_id, start_date_raw, stop_id,
        arrival_delay_sec, departure_delay_sec.

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status()`` on a non-success response.
    """
    # Fixed schema so an empty snapshot still yields the expected columns.
    columns = [
        "snapshot_time_utc",
        "route_id",
        "trip_id",
        "start_date_raw",
        "stop_id",
        "arrival_delay_sec",
        "departure_delay_sec",
    ]

    headers = {"x-api-key": api_key}
    resp = requests.get(TRIP_UPDATES_URL, headers=headers, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # Naive UTC timestamp (same value/format datetime.utcnow() produced, without
    # relying on that deprecated call); shared by every row of this snapshot.
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)

    rows = []

    # The JSON structure is: { "entity": [ { "trip_update": {...} }, ... ] }
    # Keys are looked up in both snake_case and camelCase spellings.
    for entity in data.get("entity") or []:
        tu = entity.get("trip_update") or entity.get("tripUpdate")
        if not tu:
            continue

        # "or {}" also covers a key that is present but null.
        trip = tu.get("trip") or {}
        route_id = trip.get("route_id") or trip.get("routeId")
        trip_id = trip.get("trip_id") or trip.get("tripId")
        start_date_raw = trip.get("start_date") or trip.get("startDate")  # e.g. '20251205'

        # Each stop_time_update has arrival/departure delays + stop_id
        stu_list = tu.get("stop_time_update") or tu.get("stopTimeUpdate") or []
        for stu in stu_list:
            arr = stu.get("arrival") or {}
            dep = stu.get("departure") or {}

            rows.append(
                {
                    "snapshot_time_utc": utc_now,          # when we pulled the data
                    "route_id": route_id,
                    "trip_id": trip_id,
                    "start_date_raw": start_date_raw,
                    "stop_id": stu.get("stop_id") or stu.get("stopId"),
                    # .get without "or" so a delay of 0 is kept as 0
                    "arrival_delay_sec": arr.get("delay"),
                    "departure_delay_sec": dep.get("delay"),
                }
            )

    df = pd.DataFrame(rows, columns=columns)
    print(f"Fetched {len(df)} records from TripUpdates.")
    return df


# Smoke test: pull a single snapshot, then eyeball its first rows and schema
test_df = fetch_trip_updates(api_key=API_KEY)
display(test_df.head(5))
print("Columns:", test_df.columns.tolist())


Fetched 16935 records from TripUpdates.


Unnamed: 0,snapshot_time_utc,route_id,trip_id,start_date_raw,stop_id,arrival_delay_sec,departure_delay_sec
0,2025-12-12 20:10:26.905853,5241_117988,5241_178696,20251212,8460B5550401,3534.0,
1,2025-12-12 20:10:26.905853,5241_117988,5241_179251,20251212,8530B158221,2876.0,2892.0
2,2025-12-12 20:10:26.905853,5241_117988,5241_179251,20251212,8530B1558401,2623.0,2923.0
3,2025-12-12 20:10:26.905853,5241_117988,5241_179251,20251212,8530B158211,2918.0,2918.0
4,2025-12-12 20:10:26.905853,5241_117988,5241_179251,20251212,8530B1581501,2738.0,2738.0


Columns: ['snapshot_time_utc', 'route_id', 'trip_id', 'start_date_raw', 'stop_id', 'arrival_delay_sec', 'departure_delay_sec']


In [3]:
# Cell 3 – Helper to append snapshot to CSV file

def append_snapshot_to_csv(df: pd.DataFrame, path: Path) -> None:
    """
    Append the rows of ``df`` to the CSV file at ``path``.

    The parent directory is created when needed. The header row is written
    only when the file does not exist yet or exists but is empty (0 bytes),
    so the file always starts with exactly one header line.

    Parameters
    ----------
    df : pd.DataFrame
        Snapshot rows to write (the index is not written).
    path : Path
        Target CSV file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    # An existing zero-byte file still needs a header; checking only for
    # existence would leave it headerless.
    write_header = (not path.exists()) or path.stat().st_size == 0

    # Append mode creates the file when it is missing.
    df.to_csv(path, mode="a", header=write_header, index=False)
    print(f"Appended {len(df)} rows to {path.name}")


# Manual one-off run: fetch a single snapshot and store it in the raw CSV
snapshot = fetch_trip_updates(api_key=API_KEY)
append_snapshot_to_csv(df=snapshot, path=RAW_BUS_PATH)


Fetched 16935 records from TripUpdates.
Appended 16935 rows to bus_trip_updates_raw_member3.csv


In [None]:
# Cell 4 – Collect multiple snapshots over time (OPTIONAL)

NUM_SNAPSHOTS = 20        # total number of API calls to make
SLEEP_SECONDS = 300       # pause between two calls (300 sec = 5 minutes)

for snapshot_no in range(1, NUM_SNAPSHOTS + 1):
    print(f"\n=== Snapshot {snapshot_no}/{NUM_SNAPSHOTS} ===")

    # Best effort: a failed call is reported and the loop moves on to the next one.
    try:
        snap_df = fetch_trip_updates(API_KEY)
        if snap_df.empty:
            print("No records in this snapshot.")
        else:
            append_snapshot_to_csv(snap_df, RAW_BUS_PATH)
    except Exception as exc:
        print("Error while fetching:", exc)

    # No pause is needed after the final snapshot.
    is_last = snapshot_no == NUM_SNAPSHOTS
    if not is_last:
        print(f"Sleeping for {SLEEP_SECONDS} seconds...")
        time.sleep(SLEEP_SECONDS)

print("\nFinished collecting raw bus delay data.")



=== Snapshot 1/20 ===
Fetched 16935 records from TripUpdates.
Appended 16935 rows to bus_trip_updates_raw_member3.csv
Sleeping for 300 seconds...


In [None]:
# Cell 5 – Load raw TripUpdates file

# Read every stored snapshot; the snapshot timestamp column is parsed as datetime.
bus_raw = pd.read_csv(
    RAW_BUS_PATH,
    low_memory=False,
    parse_dates=["snapshot_time_utc"],
)
print("Raw bus file shape:", bus_raw.shape)
display(bus_raw.head(5))


In [None]:
# Cell 6 – Create date and delay_min

# Delays outside +/- this many minutes are dropped as unrealistic.
MAX_ABS_DELAY_MIN = 120

# Work on a copy so re-running this cell always starts from the raw frame.
df_bus = bus_raw.copy()

# Fallback date: the calendar day of the snapshot timestamp.
snapshot_date = df_bus["snapshot_time_utc"].dt.date

# Choose date: prefer trip start date, otherwise snapshot date
if "start_date_raw" in df_bus.columns:
    # The YYYYMMDD value may arrive as int, float (when some rows are empty)
    # or str depending on how the CSV column was typed; normalise it to plain
    # text and strip a trailing ".0" so every variant parses the same way.
    start_text = (
        df_bus["start_date_raw"]
        .astype("string")
        .str.replace(r"\.0$", "", regex=True)
    )
    df_bus["start_date_raw"] = pd.to_datetime(
        start_text, format="%Y%m%d", errors="coerce"
    )
    df_bus["date"] = df_bus["start_date_raw"].dt.date
    df_bus.loc[df_bus["date"].isna(), "date"] = snapshot_date
else:
    # No trip start date available at all: use the snapshot date for every row.
    df_bus["date"] = snapshot_date

# Combine arrival & departure delay into a single delay_sec: the row-wise mean
# of whichever delay columns exist (missing values are skipped by mean()).
delay_cols = [
    col
    for col in ("arrival_delay_sec", "departure_delay_sec")
    if col in df_bus.columns
]
if not delay_cols:
    raise ValueError("No delay columns found!")
df_bus["delay_sec"] = df_bus[delay_cols].mean(axis=1)

# Convert to minutes
df_bus["delay_min"] = df_bus["delay_sec"] / 60.0

# Keep only rows with a delay between -MAX_ABS_DELAY_MIN and +MAX_ABS_DELAY_MIN
# (inclusive). Rows without any delay value are dropped as well, but they are
# reported separately so they are not mistaken for outliers.
has_delay = df_bus["delay_min"].notna()
in_range = df_bus["delay_min"].between(-MAX_ABS_DELAY_MIN, MAX_ABS_DELAY_MIN)
print(f"Removed {int((~has_delay).sum())} rows without any delay value.")
print(f"Removed {int((has_delay & ~in_range).sum())} rows with unrealistic delay values.")
df_bus = df_bus[in_range].copy()

display(df_bus[["date", "route_id", "stop_id", "delay_min"]].head())


In [None]:
# Cell 7 – Daily aggregation for Member 3 dataset

def _p95(values: pd.Series) -> float:
    """Return the 95th percentile of the given delay values."""
    return values.quantile(0.95)


# Output column -> (source column, aggregation), one entry per daily metric.
# Note: "count" counts rows with a delay value per date; each row is one
# stop_time_update record, not one distinct trip.
daily_metrics = {
    "bus_delay_mean_min": ("delay_min", "mean"),
    "bus_delay_median_min": ("delay_min", "median"),
    "bus_delay_p95_min": ("delay_min", _p95),
    "bus_trips_count": ("delay_min", "count"),
}

per_day = df_bus.groupby("date").agg(**daily_metrics)
bus_daily = per_day.reset_index().sort_values("date")

print("Daily bus delay dataset shape:", bus_daily.shape)
display(bus_daily.head(15))


In [None]:
# Cell 8 – Save integration-ready Member 3 file

# Persist the per-day summary (without the index) for the integration step.
bus_daily.to_csv(path_or_buf=DAILY_BUS_PATH, index=False)
print("Saved daily bus delay dataset to:", DAILY_BUS_PATH)
