# -----------------------------------------------------------------------------
# Combines the logic of notebooks 03, 04 and 05 from the original NYC‑Taxi repo
# into **one single runnable notebook / script** for the Citi Bike project.
#
# Flow:
#   1.  Load the consolidated raw rides parquet (output of 02… script)
#   2.  Aggregate to hourly counts per station & back‑fill the full grid
#   3.  Engineer features & targets for demand‑forecasting
#   4.  Display intermediate artefacts so they can be eyeballed interactively
#   5.  Persist the final feature table to parquet once the sanity plots have been
#       rendered (optional — controlled by the `write_parquet` flag of the driver)
# -----------------------------------------------------------------------------

In [28]:

from __future__ import annotations

import logging
from pathlib import Path
from typing import Iterable, List

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pandas.tseries.holiday import USFederalHolidayCalendar

# Configure root logging for the session: INFO level, timestamp + right-aligned level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)8s  %(message)s")
# Module-level logger shared by the pipeline steps defined in this file.
logger = logging.getLogger(__name__)

# CONFIG


In [29]:
# Locate the project root from the current working directory: if there is no
# ./data folder here (e.g. the session was started inside notebooks/), fall
# back to the parent directory.
CWD = Path.cwd()
PROJECT_ROOT = CWD if (CWD / "data").exists() else CWD.parent

# Calendar year processed by this run. The processed-data folder and the raw
# parquet file name are derived from it so the three can never drift apart.
# NOTE(review): assumes the upstream consolidation step uses the same
# "processed/<year>/citibike_<year>_all.parquet" layout for other years — confirm.
YEAR = 2023

DATA_DIR = PROJECT_ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed" / str(YEAR)

RAW_PARQUET = PROCESSED_DIR / f"citibike_{YEAR}_all.parquet"
FEATURE_PARQUET = PROCESSED_DIR / "citibike_hourly_features.parquet"

logger.info("Working directory: %s", CWD)
logger.info("Resolved data directory: %s", DATA_DIR)

2025-05-10 01:56:06,185     INFO  Working directory: /Users/vaibhavranshoor/Downloads/Citibike_rides/notebooks
2025-05-10 01:56:06,189     INFO  Resolved data directory: /Users/vaibhavranshoor/Downloads/Citibike_rides/data


# STEP 1 — LOAD RAW CONSOLIDATED PARQUET


In [30]:
def load_raw() -> pd.DataFrame:
    """Read the consolidated raw-rides parquet into memory.

    Returns:
        The frame produced by ``pd.read_parquet`` for ``RAW_PARQUET``.

    Raises:
        FileNotFoundError: if ``RAW_PARQUET`` is not present on disk.
    """
    if RAW_PARQUET.exists():
        logger.info("Loading consolidated parquet %s …", RAW_PARQUET)
        raw_frame = pd.read_parquet(RAW_PARQUET)
        return raw_frame

    # Fail early with the resolved path so a wrong working directory is obvious.
    raise FileNotFoundError(f"Raw parquet not found: {RAW_PARQUET}")


# STEP 2 — AGGREGATE TO HOURLY COUNTS


In [31]:

def aggregate_hourly(df: pd.DataFrame) -> pd.DataFrame:
    """Count rides per (start station, start hour).

    Args:
        df: raw rides. Must contain ``start_station_id`` and either a
            ``start_hour`` column or a datetime ``started_at`` column from
            which the hour bucket is derived.

    Returns:
        A frame with columns ``start_station_id``, ``start_hour`` and
        ``rides`` holding one row per observed station/hour combination.

    The input frame is left untouched: when ``start_hour`` has to be derived
    it is kept as a standalone grouping key instead of being written back
    into ``df``.
    """
    if "start_hour" in df.columns:
        hour_key = df["start_hour"]
    else:
        logger.info("Deriving start_hour …")
        # Lower-case "h" is the current hourly alias; upper-case "H" is deprecated.
        hour_key = df["started_at"].dt.floor("h").rename("start_hour")

    df_hourly = (
        df.groupby(["start_station_id", hour_key], observed=True)
          .size()
          .rename("rides")
          .reset_index()
    )
    logger.info("Hourly table shape: %s", df_hourly.shape)
    return df_hourly


def expand_grid(df_hourly: pd.DataFrame) -> pd.DataFrame:
    """Back-fill the hourly table to a dense station x hour grid for ``YEAR``.

    Args:
        df_hourly: frame with ``start_station_id``, ``start_hour`` and
            ``rides`` columns, one row per observed station/hour.

    Returns:
        A frame with one row for every (station, hour of ``YEAR``) pair;
        combinations absent from ``df_hourly`` get ``rides == 0``.

    Rows of ``df_hourly`` whose hour lies outside ``YEAR`` are not part of the
    target grid and are therefore discarded by the reindex; a warning is
    logged when that removes any rides.
    """
    stations: Iterable[str] = df_hourly["start_station_id"].unique()
    # Lower-case "h" is the current hourly alias; upper-case "H" is deprecated.
    all_hours = pd.date_range(f"{YEAR}-01-01 00:00", f"{YEAR}-12-31 23:00", freq="h")

    idx = pd.MultiIndex.from_product([stations, all_hours], names=["start_station_id", "start_hour"])
    df_full = (
        df_hourly.set_index(["start_station_id", "start_hour"])
        .reindex(idx, fill_value=0)
        .reset_index()
    )

    # Comparing totals is a cheap way to detect rows the reindex silently dropped.
    lost_rides = df_hourly["rides"].sum() - df_full["rides"].sum()
    if lost_rides > 0:
        logger.warning("%s rides fall outside the %s hourly grid and were dropped", lost_rides, YEAR)

    logger.info("Full grid shape: %s", df_full.shape)
    return df_full


# STEP 3 — FEATURE ENGINEERING


In [32]:

# US federal holiday calendar used to flag holiday rows.
HOL_CAL = USFederalHolidayCalendar()
# Holiday dates between 1 Jan and 31 Dec of YEAR; tz_localize(None) leaves the
# index timezone-naive (presumably to match a naive start_hour column — confirm).
HOLIDAYS = HOL_CAL.holidays(start=f"{YEAR}-01-01", end=f"{YEAR}-12-31").tz_localize(None)
# Full circle in radians, shared by the sin/cos cyclic encodings.
PI2 = 2 * np.pi


def add_datetime_features(df: pd.DataFrame) -> pd.DataFrame:
    """Attach calendar features derived from ``start_hour``.

    Columns are written into ``df`` itself (the same object is returned):
    ``hour``, ``dow``, ``doy``, their sin/cos cyclic encodings for hour and
    day of week, plus int8 ``is_weekend`` and ``is_holiday`` flags.
    """
    stamps = df["start_hour"]

    # Plain calendar components.
    df["hour"] = stamps.dt.hour
    df["dow"] = stamps.dt.dayofweek
    df["doy"] = stamps.dt.dayofyear

    # Cyclic encodings so that e.g. hour 23 and hour 0 end up close together.
    for name, period in (("hour", 24), ("dow", 7)):
        df[f"sin_{name}"] = np.sin(PI2 * df[name] / period)
        df[f"cos_{name}"] = np.cos(PI2 * df[name] / period)

    # dayofweek is Monday=0, so 5 and 6 are Saturday and Sunday.
    weekend_mask = df["dow"].isin([5, 6])
    holiday_mask = stamps.dt.normalize().isin(HOLIDAYS)
    df["is_weekend"] = weekend_mask.astype("int8")
    df["is_holiday"] = holiday_mask.astype("int8")
    return df


def add_lag_features(df: pd.DataFrame, lags: Iterable[int] = (1, 24, 168, 672)) -> pd.DataFrame:
    """Add per-station lagged ride counts.

    Args:
        df: frame with ``start_station_id``, ``start_hour`` and ``rides``.
        lags: row offsets to shift by within each station; with an hourly
            grid the defaults are 1 hour, 1 day, 1 week and 4 weeks.

    Returns:
        A copy of ``df`` sorted by station and hour with one float32
        ``lag_<n>`` column per requested lag. The first ``n`` rows of each
        station are NaN. The input frame is not modified.
    """
    # sort_values returns a new frame, so the caller's object stays untouched.
    df = df.sort_values(["start_station_id", "start_hour"])
    # Build the grouping once; it does not depend on the lag being computed.
    rides_by_station = df.groupby("start_station_id")["rides"]
    for lag in lags:
        df[f"lag_{lag}"] = rides_by_station.shift(lag).astype("float32")
    return df


def add_rolling_features(df: pd.DataFrame, windows: Iterable[int] = (24, 168)) -> pd.DataFrame:
    """Add per-station trailing rolling means of ``rides``.

    Args:
        df: frame with ``start_station_id``, ``start_hour`` and ``rides``.
            Its index must be unique so results can be aligned back to rows.
        windows: window lengths in rows (hours on an hourly grid).

    Returns:
        ``df`` itself, with one float32 ``rollmean_<n>`` column per window.
        Each window ends at (and includes) the current row and uses
        ``min_periods=1``, so early rows average over fewer observations.

    The windows are evaluated in chronological order within each station
    regardless of the incoming row order; the row order of ``df`` is kept.
    """
    # Sort only the columns that are needed, then rely on index alignment to
    # write the results back, so unsorted input cannot corrupt the windows.
    ordered = df[["start_station_id", "start_hour", "rides"]].sort_values(
        ["start_station_id", "start_hour"]
    )
    rides_by_station = ordered.groupby("start_station_id")["rides"]
    for win in windows:
        rolled = (
            rides_by_station.rolling(win, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)  # drop the station level, keep row labels
        )
        df[f"rollmean_{win}"] = rolled.astype("float32")
    return df


def add_target(df: pd.DataFrame, horizon: int = 1) -> pd.DataFrame:
    """Add the forecasting target: ``rides`` observed ``horizon`` rows ahead.

    Args:
        df: frame with ``start_station_id``, ``start_hour`` and ``rides``.
            Its index must be unique so results can be aligned back to rows.
        horizon: number of rows (hours on an hourly grid) to look ahead
            within each station.

    Returns:
        ``df`` itself, with a float32 ``target_t_plus_<horizon>`` column.
        The last ``horizon`` rows of every station are NaN because no
        future value exists for them.

    The look-ahead is evaluated in chronological order within each station
    regardless of the incoming row order; the row order of ``df`` is kept.
    """
    # Shift on a chronologically sorted view and let index alignment put each
    # value back on its original row.
    ordered = df[["start_station_id", "start_hour", "rides"]].sort_values(
        ["start_station_id", "start_hour"]
    )
    future_rides = ordered.groupby("start_station_id")["rides"].shift(-horizon)
    df[f"target_t_plus_{horizon}"] = future_rides.astype("float32")
    return df


# STEP 4 — VISUALISATION HELPERS

In [37]:

def plot_sample(df: pd.DataFrame, n_stations: int = 3):
    """Show an hourly ride-count line chart for a random sample of stations.

    Args:
        df: frame with ``start_station_id``, ``start_hour`` and ``rides``.
        n_stations: how many stations to draw; capped at the number of
            distinct stations in ``df``. Sampling uses a fixed seed (42).
    """
    station_col = df["start_station_id"]
    n_pick = min(n_stations, station_col.nunique())
    sample_ids = station_col.drop_duplicates().sample(n=n_pick, random_state=42)

    # Keep only the sampled stations and order them in time for the line plot.
    selected = df[station_col.isin(sample_ids)].sort_values("start_hour")

    axis_labels = {"start_hour": "Hour", "rides": "Ride count", "start_station_id": "Station"}
    fig = px.line(
        selected,
        x="start_hour",
        y="rides",
        color="start_station_id",
        title="Hourly Citi Bike Rides (sample stations)",
        labels=axis_labels,
        height=600,
    )
    fig.show()


def plot_hourly_distribution(df: pd.DataFrame):
    """Show a bar chart of the mean ``rides`` value for each ``hour`` of day.

    The mean is taken over every row of ``df`` sharing the same ``hour``.
    """
    mean_by_hour = df.groupby("hour")["rides"].mean()
    hourly_avg = mean_by_hour.reset_index()

    fig = px.bar(
        hourly_avg,
        x="hour",
        y="rides",
        title="Average rides by hour of day",
        labels={"rides": "Avg rides"},
    )
    fig.show()


def plot_heatmap_hour_dow(df: pd.DataFrame):
    """Show a heatmap of mean ``rides`` per (day of week, hour) cell.

    Rows are days of the week (Mon..Sun), columns are hours of the day.
    """
    day_names = {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}

    # Mean rides per (dow, hour), reshaped to an hour x dow table.
    mean_rides = df.groupby(["dow", "hour"])["rides"].mean()
    pivot = mean_rides.unstack(level=0).rename(columns=day_names)

    # Transpose so that days run down the y-axis and hours along the x-axis.
    by_day = pivot.T
    fig = px.imshow(
        by_day,
        aspect="auto",
        title="Avg rides — heatmap hour × day‑of‑week",
        labels={"x": "Hour", "y": "Day of week"},
    )
    fig.update_yaxes(autorange="reversed")
    fig.show()



def plot_feature_timeseries(df: pd.DataFrame, station_id: str | None = None, features: List[str] | None = None, horizon: int = 168):
    """Draw selected feature columns as line traces over time.

    Args:
        df: engineered feature dataframe.
        station_id: restrict the plot to this station; when None every
            feature is averaged across stations per ``start_hour``.
        features: columns to draw; when None a default set of ride count,
            two lags and two rolling means is used.
        horizon: how many of the latest hourly rows to keep in the plot.
    """
    if features is None:
        features = ["rides", "lag_1", "lag_24", "rollmean_24", "rollmean_168"]

    if station_id is None:
        # Network view: one row per hour holding the mean of each feature.
        title_prefix = "Network average — "
        df_plot = df.groupby("start_hour")[features].mean().reset_index()
    else:
        title_prefix = f"Station {station_id} — "
        df_plot = df[df["start_station_id"] == station_id].copy()

    # Keep only the most recent `horizon` rows so the chart stays readable.
    recent = df_plot.sort_values("start_hour").tail(horizon)

    fig = go.Figure()
    for feature_name in features:
        trace = go.Scatter(
            x=recent["start_hour"],
            y=recent[feature_name],
            mode="lines",
            name=feature_name,
        )
        fig.add_trace(trace)

    fig.update_layout(
        title=f"{title_prefix}Feature time‑series (last {horizon} h)",
        xaxis_title="Hour",
        yaxis_title="Value",
        height=600,
    )
    fig.show()


# STEP 5 — PIPELINE DRIVER


In [34]:

def run_pipeline(write_parquet: bool = True) -> pd.DataFrame:
    """Run the full raw→features pipeline and return the engineered DataFrame.

    Steps: load the raw parquet, aggregate to hourly counts, expand to the
    dense station x hour grid, add datetime / lag / rolling features and the
    target, render the sanity plots and, optionally, persist the result.

    Args:
        write_parquet: when True the feature table is written to
            ``FEATURE_PARQUET`` and read back for a quick visual check.

    Returns:
        The engineered feature frame.
    """
    import builtins

    # `display` only exists inside IPython/Jupyter sessions; fall back to
    # print so the pipeline also runs as a plain script.
    show = globals().get("display") or getattr(builtins, "display", print)

    # 1. Load raw
    df_raw = load_raw()
    show(df_raw.head())

    # 2. Aggregate → full grid
    df_hourly = aggregate_hourly(df_raw)
    df_full = expand_grid(df_hourly)
    show(df_full.head())

    # 3. Feature engineering (work on a copy so df_full keeps only the grid columns)
    df_feat = add_datetime_features(df_full.copy())
    df_feat = add_lag_features(df_feat)
    df_feat = add_rolling_features(df_feat)
    df_feat = add_target(df_feat)
    show(df_feat.head())

    # 4. Visual sanity checks
    plot_sample(df_feat)
    plot_hourly_distribution(df_feat)
    plot_heatmap_hour_dow(df_feat)

    # 5. Persist parquet (optional)
    if write_parquet:
        FEATURE_PARQUET.parent.mkdir(parents=True, exist_ok=True)
        df_feat.to_parquet(FEATURE_PARQUET, engine="pyarrow", compression="snappy")
        logger.info("Feature parquet written: %s (rows=%d, cols=%d)", FEATURE_PARQUET, len(df_feat), df_feat.shape[1])

        # Reload and eyeball the head and tail of what was actually written.
        df_check = pd.read_parquet(FEATURE_PARQUET)
        show(df_check.head())
        show(df_check.tail())
        logger.info("Parquet reload sanity-check passed (rows=%d)", len(df_check))

    return df_feat


# Run the pipeline and display the engineered feature table.

In [35]:
# Execute the whole pipeline; write_parquet=True also persists the feature table.
df_features = run_pipeline(write_parquet=True)
# Bare expression so the notebook renders the resulting frame as the cell output.
df_features

2025-05-10 01:57:36,227     INFO  Loading consolidated parquet /Users/vaibhavranshoor/Downloads/Citibike_rides/data/processed/2023/citibike_2023_all.parquet …


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,off_grid,start_hour
0,DC1CB984821DFFF7,classic_bike,2023-01-07 15:36:53.430,2023-01-07 15:39:45.406,Vesey St & Church St,5216.06,Albany St & Greenwich St,5145.02,40.712219,-74.010475,40.709267,-74.013245,member,False,2023-01-07 15:00:00
1,C00CA02971557F16,classic_bike,2023-01-04 19:23:01.234,2023-01-04 19:34:57.151,Lispenard St & Broadway,5391.06,St Marks Pl & 1 Ave,5626.13,40.719391,-74.002472,40.727791,-73.985649,member,False,2023-01-04 19:00:00
2,C753AE5EBD8458F9,classic_bike,2023-01-20 09:22:19.894,2023-01-20 10:23:24.255,3 Ave & Schermerhorn St,4437.01,State St & Smith St,4522.07,40.686832,-73.979675,40.689171,-73.988602,member,False,2023-01-20 09:00:00
3,E4415A543C1972A7,classic_bike,2023-01-24 10:38:01.135,2023-01-24 10:41:40.749,E 5 St & Ave A,5626.06,E 1 St & 1 Ave,5593.01,40.724789,-73.984299,40.723354,-73.988647,member,False,2023-01-24 10:00:00
4,BD52A87B215877C7,electric_bike,2023-01-13 10:17:38.192,2023-01-13 10:33:59.099,W 54 St & 11 Ave,6955.05,Washington St & Gansevoort St,6039.06,40.768291,-73.992561,40.739323,-74.008118,member,False,2023-01-13 10:00:00


2025-05-10 01:58:25,516     INFO  Hourly table shape: (8295504, 3)
2025-05-10 01:58:28,379     INFO  Full grid shape: (19797600, 3)


Unnamed: 0,start_station_id,start_hour,rides
0,2733.03,2023-01-01 00:00:00,1
1,2733.03,2023-01-01 01:00:00,0
2,2733.03,2023-01-01 02:00:00,0
3,2733.03,2023-01-01 03:00:00,0
4,2733.03,2023-01-01 04:00:00,0


Unnamed: 0,start_station_id,start_hour,rides,hour,dow,doy,sin_hour,cos_hour,sin_dow,cos_dow,is_weekend,is_holiday,lag_1,lag_24,lag_168,lag_672,rollmean_24,rollmean_168,target_t_plus_1
0,2733.03,2023-01-01 00:00:00,1,0,6,1,0.0,1.0,-0.781831,0.62349,1,0,,,,,1.0,1.0,0.0
1,2733.03,2023-01-01 01:00:00,0,1,6,1,0.258819,0.965926,-0.781831,0.62349,1,0,1.0,,,,0.5,0.5,0.0
2,2733.03,2023-01-01 02:00:00,0,2,6,1,0.5,0.866025,-0.781831,0.62349,1,0,0.0,,,,0.333333,0.333333,0.0
3,2733.03,2023-01-01 03:00:00,0,3,6,1,0.707107,0.707107,-0.781831,0.62349,1,0,0.0,,,,0.25,0.25,0.0
4,2733.03,2023-01-01 04:00:00,0,4,6,1,0.866025,0.5,-0.781831,0.62349,1,0,0.0,,,,0.2,0.2,0.0


2025-05-10 01:58:48,605     INFO  Feature parquet written: /Users/vaibhavranshoor/Downloads/Citibike_rides/data/processed/2023/citibike_hourly_features.parquet (rows=19797600, cols=19)


Unnamed: 0,start_station_id,start_hour,rides,hour,dow,doy,sin_hour,cos_hour,sin_dow,cos_dow,is_weekend,is_holiday,lag_1,lag_24,lag_168,lag_672,rollmean_24,rollmean_168,target_t_plus_1
0,2733.03,2023-01-01 00:00:00,1,0,6,1,0.0,1.0,-0.781831,0.62349,1,0,,,,,1.0,1.0,0.0
1,2733.03,2023-01-01 01:00:00,0,1,6,1,0.258819,0.965926,-0.781831,0.62349,1,0,1.0,,,,0.5,0.5,0.0
2,2733.03,2023-01-01 02:00:00,0,2,6,1,0.5,0.866025,-0.781831,0.62349,1,0,0.0,,,,0.333333,0.333333,0.0
3,2733.03,2023-01-01 03:00:00,0,3,6,1,0.707107,0.707107,-0.781831,0.62349,1,0,0.0,,,,0.25,0.25,0.0
4,2733.03,2023-01-01 04:00:00,0,4,6,1,0.866025,0.5,-0.781831,0.62349,1,0,0.0,,,,0.2,0.2,0.0


Unnamed: 0,start_station_id,start_hour,rides,hour,dow,doy,sin_hour,cos_hour,sin_dow,cos_dow,is_weekend,is_holiday,lag_1,lag_24,lag_168,lag_672,rollmean_24,rollmean_168,target_t_plus_1
19797595,SYS039,2023-12-31 19:00:00,0,19,6,365,-0.965926,0.258819,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19797596,SYS039,2023-12-31 20:00:00,0,20,6,365,-0.866025,0.5,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19797597,SYS039,2023-12-31 21:00:00,0,21,6,365,-0.707107,0.707107,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19797598,SYS039,2023-12-31 22:00:00,0,22,6,365,-0.5,0.866025,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19797599,SYS039,2023-12-31 23:00:00,0,23,6,365,-0.258819,0.965926,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.0,0.0,


2025-05-10 01:58:52,980     INFO  Parquet reload sanity-check passed (rows=19797600)


Unnamed: 0,start_station_id,start_hour,rides,hour,dow,doy,sin_hour,cos_hour,sin_dow,cos_dow,is_weekend,is_holiday,lag_1,lag_24,lag_168,lag_672,rollmean_24,rollmean_168,target_t_plus_1
0,2733.03,2023-01-01 00:00:00,1,0,6,1,0.000000,1.000000,-0.781831,0.62349,1,0,,,,,1.000000,1.000000,0.0
1,2733.03,2023-01-01 01:00:00,0,1,6,1,0.258819,0.965926,-0.781831,0.62349,1,0,1.0,,,,0.500000,0.500000,0.0
2,2733.03,2023-01-01 02:00:00,0,2,6,1,0.500000,0.866025,-0.781831,0.62349,1,0,0.0,,,,0.333333,0.333333,0.0
3,2733.03,2023-01-01 03:00:00,0,3,6,1,0.707107,0.707107,-0.781831,0.62349,1,0,0.0,,,,0.250000,0.250000,0.0
4,2733.03,2023-01-01 04:00:00,0,4,6,1,0.866025,0.500000,-0.781831,0.62349,1,0,0.0,,,,0.200000,0.200000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19797595,SYS039,2023-12-31 19:00:00,0,19,6,365,-0.965926,0.258819,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
19797596,SYS039,2023-12-31 20:00:00,0,20,6,365,-0.866025,0.500000,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
19797597,SYS039,2023-12-31 21:00:00,0,21,6,365,-0.707107,0.707107,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
19797598,SYS039,2023-12-31 22:00:00,0,22,6,365,-0.500000,0.866025,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0


In [36]:
# Re-render the engineered feature frame without re-running the pipeline.
df_features

Unnamed: 0,start_station_id,start_hour,rides,hour,dow,doy,sin_hour,cos_hour,sin_dow,cos_dow,is_weekend,is_holiday,lag_1,lag_24,lag_168,lag_672,rollmean_24,rollmean_168,target_t_plus_1
0,2733.03,2023-01-01 00:00:00,1,0,6,1,0.000000,1.000000,-0.781831,0.62349,1,0,,,,,1.000000,1.000000,0.0
1,2733.03,2023-01-01 01:00:00,0,1,6,1,0.258819,0.965926,-0.781831,0.62349,1,0,1.0,,,,0.500000,0.500000,0.0
2,2733.03,2023-01-01 02:00:00,0,2,6,1,0.500000,0.866025,-0.781831,0.62349,1,0,0.0,,,,0.333333,0.333333,0.0
3,2733.03,2023-01-01 03:00:00,0,3,6,1,0.707107,0.707107,-0.781831,0.62349,1,0,0.0,,,,0.250000,0.250000,0.0
4,2733.03,2023-01-01 04:00:00,0,4,6,1,0.866025,0.500000,-0.781831,0.62349,1,0,0.0,,,,0.200000,0.200000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19797595,SYS039,2023-12-31 19:00:00,0,19,6,365,-0.965926,0.258819,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
19797596,SYS039,2023-12-31 20:00:00,0,20,6,365,-0.866025,0.500000,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
19797597,SYS039,2023-12-31 21:00:00,0,21,6,365,-0.707107,0.707107,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
19797598,SYS039,2023-12-31 22:00:00,0,22,6,365,-0.500000,0.866025,-0.781831,0.62349,1,0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0


In [40]:
# No station_id given, so the default feature set is averaged across all stations.
plot_feature_timeseries(df_features)