<a href="https://colab.research.google.com/github/Rakabi007/Rideshare-Demand-Forecasting-to-Reduce-Driver-Idle-Time-and-Slash-Rider-Wait-Time/blob/main/Rideshare_Demand_Forecasting_to_Reduce_Driver_Idle_Time_and_Slash_Rider_Wait_Time.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Rideshare Demand Forecasting & Dispatch Optimization (Prototype)
# ---------------------------------------------------------------
# This notebook creates a small, self-contained simulation to:
# 1) Generate synthetic rideshare demand by city zones in 15-min buckets.
# 2) Train a simple forecasting model (features: time-of-day, day-of-week, zone).
# 3) Run a minute-by-minute dispatch simulator for a few hours.
# 4) Optimize driver-request assignment each minute (Hungarian algorithm).
# 5) Report key KPIs: rider wait time, driver idle time, utilization.

In [1]:
import numpy as np
import pandas as pd
from datetime import timedelta
import random
from scipy.optimize import linear_sum_assignment
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# -------------------------------------------------------------------
# 0) ASSUME: hist_df_feat already exists from your earlier code
#    Columns: timestamp, zone, dow, hour, bucket_in_hour, x, y,
#             sin_daily, cos_daily, count
# -------------------------------------------------------------------

FREQ_MIN  = 15
RANDOM_SEED = 42
BUCKET_MIN  = FREQ_MIN        # use same bucket size as training data
HOURS_SIM   = 4               # forecast / simulation horizon (tweak)
RNG_SEED  = 7
DAYS      = 5                 # number of weekdays to simulate
FREQ_MIN  = 15                # bucket size in minutes (e.g., 5, 15, 30, 60)
OPEN_HR   = 8                 # business day start hour (inclusive)
CLOSE_HR  = 18                # business day end hour (exclusive)
N_ZONES_X = 3                 # grid width (x)
N_ZONES_Y = 2                 # grid height (y)
START     = pd.Timestamp("2025-01-06")  # a Monday

rng = np.random.default_rng(RNG_SEED)

# -------------------------------
# Build the time index (weekdays, business hours only)
# -------------------------------
# Generate business days (Mon–Fri)
days = pd.bdate_range(START, periods=DAYS, freq="C")  # commercial days ~ weekdays
# Intra-day time buckets
intra = pd.date_range(f"{OPEN_HR:02d}:00", f"{CLOSE_HR:02d}:00",
                      freq=f"{FREQ_MIN}min", inclusive="left").time
# Cartesian product: days × intra-day times
timestamps = pd.to_datetime(
    [pd.Timestamp(d.date()).replace(hour=t.hour, minute=t.minute) for d in days for t in intra]
)

# -------------------------------
# Zones and (x,y) grid coords
# -------------------------------
zones = [f"Z{i:02d}" for i in range(1, N_ZONES_X * N_ZONES_Y + 1)]
xy_map = {z: (i % N_ZONES_X, i // N_ZONES_X) for i, z in enumerate(zones)}

# Build full panel: all timestamps × all zones
grid = pd.MultiIndex.from_product([timestamps, zones], names=["timestamp", "zone"])
df = pd.DataFrame(index=grid).reset_index()

# -------------------------------
# Calendar features
# -------------------------------
df["dow"]  = df["timestamp"].dt.dayofweek            # 0=Mon..6=Sun
df["hour"] = df["timestamp"].dt.hour
# bucket index within the hour: 0..(60/FREQ_MIN - 1)
df["bucket_in_hour"] = (df["timestamp"].dt.minute // FREQ_MIN).astype(int)

# (x,y) for zones
df[["x","y"]] = df["zone"].map(lambda z: xy_map[z]).apply(pd.Series)

# Daily seasonality (sin/cos over 24h)
buckets_per_day = int(24 * 60 // FREQ_MIN)
t_frac = ((df["hour"]*60 + df["timestamp"].dt.minute) / (24*60)) * 2*np.pi
df["sin_daily"] = np.sin(t_frac)
df["cos_daily"] = np.cos(t_frac)


# NOTE:
# sin_daily = y-coordinate
# cos_daily = x-coordinate

# hour runs from 0 → 23, but:
# 23:45 and 00:00 are 15 minutes apart,
# yet numerically 23 and 0 are far apart.


In [2]:
# -------------------------------
# Build a realistic demand rate (λ) for Poisson counts
#   Components:
#   - Base level per zone
#   - Daily shape (peaks late morning & mid-afternoon)
#   - Day-of-week effect (Mon a bit higher)
#   - Random zone multipliers
#   - Mild trend/noise
# -------------------------------
# Zone baseline (some zones busier)
zone_base = {z: 1.0 + 0.25*(i % N_ZONES_X) + 0.15*(i // N_ZONES_X)
             for i, z in enumerate(zones)}

# Time-of-day shape: two humps (e.g., ~10–11am and ~3–4pm)
hour = df["hour"].values
tod_curve = (
    0.6
    + 0.8 * np.exp(-((hour - 10)/2.2)**2)     # late morning bump
    + 0.6 * np.exp(-((hour - 15)/2.5)**2)     # mid-afternoon bump
)

# Day-of-week multiplier: Mon highest, Fri lower
dow_mult = df["dow"].map({0:1.15, 1:1.05, 2:1.00, 3:0.98, 4:0.90, 5:0.85, 6:0.80}).astype(float)

# Zone multiplier array
z_mult = df["zone"].map(zone_base).astype(float).values

# Combine into lambda; add mild random noise and floor > 0
lam = (2.0 * z_mult * tod_curve * dow_mult.values) * (1.0 + rng.normal(0, 0.05, len(df)))
lam = np.clip(lam, 0.15, None)

# -------------------------------
# Sample counts from Poisson(λ)
# -------------------------------
df["count"] = rng.poisson(lam)


hist_df_feat = df[[
    "timestamp","zone","dow","hour","bucket_in_hour",
    "x","y","sin_daily","cos_daily","count"
]].copy()

# Quick check
display(hist_df_feat.head(), hist_df_feat.tail(), hist_df_feat.describe(include="all"))
print("Rows:", len(hist_df_feat), "| Zones:", len(zones),
      "| Buckets/day:", len(intra), "| Days:", len(days))


np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# ---------------- 1) Train/validation split ----------------
cutoff = hist_df_feat["timestamp"].max() - timedelta(hours=2)
train_df = hist_df_feat[hist_df_feat["timestamp"] <= cutoff].copy()
val_df   = hist_df_feat[hist_df_feat["timestamp"] > cutoff].copy()

feature_cols_num = ["x", "y", "bucket_in_hour", "sin_daily", "cos_daily"]
feature_cols_cat = ["zone", "dow", "hour"]

pre = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), feature_cols_cat),
        ("pass", "passthrough", feature_cols_num),
    ]
)

Unnamed: 0,timestamp,zone,dow,hour,bucket_in_hour,x,y,sin_daily,cos_daily,count
0,2025-01-06 08:00:00,Z01,0,8,0,0,0,0.866025,-0.5,3
1,2025-01-06 08:00:00,Z02,0,8,0,1,0,0.866025,-0.5,4
2,2025-01-06 08:00:00,Z03,0,8,0,2,0,0.866025,-0.5,2
3,2025-01-06 08:00:00,Z04,0,8,0,0,1,0.866025,-0.5,4
4,2025-01-06 08:00:00,Z05,0,8,0,1,1,0.866025,-0.5,1


Unnamed: 0,timestamp,zone,dow,hour,bucket_in_hour,x,y,sin_daily,cos_daily,count
1195,2025-01-10 17:45:00,Z02,4,17,3,1,0,-0.997859,-0.065403,1
1196,2025-01-10 17:45:00,Z03,4,17,3,2,0,-0.997859,-0.065403,4
1197,2025-01-10 17:45:00,Z04,4,17,3,0,1,-0.997859,-0.065403,2
1198,2025-01-10 17:45:00,Z05,4,17,3,1,1,-0.997859,-0.065403,1
1199,2025-01-10 17:45:00,Z06,4,17,3,2,1,-0.997859,-0.065403,2


Unnamed: 0,timestamp,zone,dow,hour,bucket_in_hour,x,y,sin_daily,cos_daily,count
count,1200,1200,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0
unique,,6,,,,,,,,
top,,Z01,,,,,,,,
freq,,200,,,,,,,,
mean,2025-01-08 12:52:30,,2.0,12.5,1.5,1.0,0.5,-0.167592,-0.718765,3.074167
min,2025-01-06 08:00:00,,0.0,8.0,0.0,0.0,0.0,-0.997859,-1.0,0.0
25%,2025-01-07 10:26:15,,1.0,10.0,0.75,0.0,0.0,-0.762218,-0.94693,2.0
50%,2025-01-08 12:52:30,,2.0,12.5,1.5,1.0,0.5,-0.226955,-0.793353,3.0
75%,2025-01-09 15:18:45,,3.0,15.0,2.25,2.0,1.0,0.397585,-0.55557,4.0
max,2025-01-10 17:45:00,,4.0,17.0,3.0,2.0,1.0,0.866025,-0.065403,10.0


Rows: 1200 | Zones: 6 | Buckets/day: 40 | Days: 5


In [3]:
reg = RandomForestRegressor(
    n_estimators=200,
    random_state=RANDOM_SEED,
    n_jobs=-1
)

pipe = Pipeline([("pre", pre), ("rf", reg)])

X_train = train_df[feature_cols_cat + feature_cols_num]
y_train = train_df["count"]
X_val   = val_df[feature_cols_cat + feature_cols_num]
y_val   = val_df["count"]

pipe.fit(X_train, y_train)
val_pred = pipe.predict(X_val)
mae = mean_absolute_error(y_val, val_pred)
print(f"Validation MAE: {mae:.3f}")

# ---------------- 2) Future forecast window ----------------
def generate_future_panel(forecast_start, hours, bucket_min):
    fs = pd.to_datetime(forecast_start).floor(f"{bucket_min}min")
    times = pd.date_range(
        fs, fs + timedelta(hours=hours),
        freq=f"{bucket_min}min", inclusive="left"
    )
    grid = pd.MultiIndex.from_product([times, zones], names=["timestamp", "zone"])
    return pd.DataFrame(index=grid).reset_index()

def add_time_features(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.copy()
    df["dow"]  = df["timestamp"].dt.dayofweek
    df["hour"] = df["timestamp"].dt.hour
    df["bucket_in_hour"] = (df["timestamp"].dt.minute // BUCKET_MIN).astype(int)
    df[["x","y"]] = df["zone"].map(lambda z: xy_map[z]).apply(pd.Series)
    t = ((df["hour"]*60 + df["timestamp"].dt.minute) / (24*60)) * 2*np.pi
    df["sin_daily"] = np.sin(t)
    df["cos_daily"] = np.cos(t)
    return df

forecast_start = START
forecast_df = generate_future_panel(forecast_start, HOURS_SIM, BUCKET_MIN)
forecast_feat = add_time_features(forecast_df)
X_fore = forecast_feat[feature_cols_cat + feature_cols_num]
forecast_df["pred"] = np.maximum(0, pipe.predict(X_fore)).round().astype(int)

# ---------------- 3) Expand to minute-level requests ----------------
rng = np.random.default_rng(123)

def poisson(lmbda):
    return int(rng.poisson(lmbda))

def expand_to_minute_requests(bucket_df, bucket_min):
    requests = []  # list of (arrival_time, zone)
    for _, r in bucket_df.iterrows():
        ts = r["timestamp"]
        zone = r["zone"]
        lam_per_min = (r["pred"] / bucket_min) if r["pred"] > 0 else 0.0
        for m in range(bucket_min):
            minute_ts = ts + timedelta(minutes=m)
            k = poisson(lam_per_min)
            for _ in range(k):
                requests.append((minute_ts, zone))
    return sorted(requests, key=lambda x: x[0])

req_stream = expand_to_minute_requests(forecast_df, BUCKET_MIN)

# ---------------- 4) Dispatch simulator (one function, 2 strategies) ----------------
GRID_W, GRID_H = N_ZONES_X, N_ZONES_Y
SPEED_KMH = 24.0
SPEED_M_PER_MIN = SPEED_KMH * 1000.0 / 60.0
CELL_M = 700.0

PICKUP_SERVICE_TIME_MIN  = (1.0, 3.0)
DROPOFF_SERVICE_TIME_MIN = (1.0, 3.0)
N_DRIVERS = 30

def zone_xy(z):
    return xy_map[z]

def distance_between_zones(z1, z2):
    (x1, y1), (x2, y2) = zone_xy(z1), zone_xy(z2)
    manhattan = abs(x1 - x2) + abs(y1 - y2)
    return manhattan * CELL_M

def travel_minutes(z_origin, z_dest):
    dist_m = distance_between_zones(z_origin, z_dest)
    return dist_m / SPEED_M_PER_MIN

def random_neighbor_zone(z: str) -> str:
    x, y = zone_xy(z)
    candidates = [(x,y), (x+1,y), (x-1,y), (x,y+1), (x,y-1)]
    random.shuffle(candidates)
    for nx, ny in candidates:
        if 0 <= nx < GRID_W and 0 <= ny < GRID_H:
            return f"Z{(ny*GRID_W + nx + 1):02d}"
    return z

class Driver:
    def __init__(self, idx, zone, now):
        self.idx = idx
        self.zone = zone
        self.status = "idle"
        self.free_at = now
        self.last_dropoff = now - timedelta(minutes=random.randint(10, 60))

def init_drivers(now):
    # demand-weighted initial placement
    hist_counts = hist_df_feat.groupby("zone")["count"].sum()
    w = hist_counts.reindex(zones).fillna(1.0).values
    w = w / w.sum()
    drivers = []
    for i in range(N_DRIVERS):
        z = np.random.choice(zones, p=w)
        drivers.append(Driver(i, z, now))
    return drivers

def run_dispatch(req_stream, strategy="optimal"):
    drivers = init_drivers(forecast_start)
    pending = []
    assignments_log = []
    rider_waits = []
    driver_idles = []

    current_time = forecast_start
    end_time = forecast_start + timedelta(hours=HOURS_SIM)
    req_idx = 0

    def finish_trips(now):
        for d in drivers:
            if d.free_at <= now and d.status != "idle":
                d.status = "idle"
                d.last_dropoff = now
                d.zone = random_neighbor_zone(d.zone)

    while current_time <= end_time:
        # enqueue new requests
        while req_idx < len(req_stream) and req_stream[req_idx][0] <= current_time:
            pending.append(req_stream[req_idx])
            req_idx += 1

        finish_trips(current_time)

        idle_idxs = [i for i, d in enumerate(drivers) if d.free_at <= current_time and d.status == "idle"]

        if pending and idle_idxs:
            K = min(len(pending), len(idle_idxs))
            req_subset = pending[:K]

            if strategy == "optimal":
                cost = np.zeros((K, K))
                for i in range(K):
                    _, r_zone = req_subset[i]
                    for j in range(K):
                        d = drivers[idle_idxs[j]]
                        cost[i, j] = travel_minutes(d.zone, r_zone)
                row_ind, col_ind = linear_sum_assignment(cost)
                pairs = sorted(zip(row_ind, col_ind), key=lambda x: x[0], reverse=True)

            elif strategy == "random":
                perm = np.random.permutation(K)
                pairs = [(i, perm[i]) for i in range(K)]
                pairs = sorted(pairs, key=lambda x: x[0], reverse=True)

            else:
                raise ValueError("strategy must be 'optimal' or 'random'")

            for r_i, c_j in pairs:
                r_ts, r_zone = req_subset[r_i]
                d = drivers[idle_idxs[c_j]]

                eta_pick = travel_minutes(d.zone, r_zone)
                pickup_service = random.uniform(*PICKUP_SERVICE_TIME_MIN)
                drop_service   = random.uniform(*DROPOFF_SERVICE_TIME_MIN)
                total_trip = eta_pick + pickup_service + drop_service

                d.status = "on_trip"
                d.zone = random_neighbor_zone(r_zone)
                d.free_at = current_time + timedelta(minutes=total_trip)

                rider_wait = max(0.0, (current_time + timedelta(minutes=eta_pick) - r_ts).total_seconds()/60.0)
                driver_idle = max(0.0, (current_time - d.last_dropoff).total_seconds()/60.0)

                rider_waits.append(rider_wait)
                driver_idles.append(driver_idle)
                assignments_log.append({
                    "time_assigned": current_time,
                    "req_time": r_ts,
                    "req_zone": r_zone,
                    "driver": d.idx,
                    "eta_to_pick": eta_pick,
                    "rider_wait": rider_wait,
                    "driver_idle_before_assign": driver_idle,
                    "finish_time": d.free_at
                })

                del pending[r_i]

        current_time += timedelta(minutes=1)

    return (
        np.array(rider_waits),
        np.array(driver_idles),
        pd.DataFrame(assignments_log)
    )

# Run both strategies
rider_wait_opt, driver_idle_opt, assign_opt = run_dispatch(req_stream, "optimal")
rider_wait_rnd, driver_idle_rnd, assign_rnd = run_dispatch(req_stream, "random")

print(rider_wait_opt)
print(rider_wait_rnd)


Validation MAE: 1.553
[3.5  3.5  1.75 1.75 3.5  3.5  1.75 5.25 1.75 5.25 3.5  1.75 0.   1.75
 1.75 1.75 0.   0.   0.   1.75 0.   0.   1.75 3.5  3.5  5.25 0.   1.75
 3.5  0.   0.   1.75 3.5  5.25 1.75 0.   0.   1.75 0.   0.   1.75 1.75
 5.25 3.5  0.   1.75 0.   0.   0.   0.   1.75 1.75 1.75 1.75 0.   1.75
 0.   0.   0.   1.75 5.25 0.   1.75 3.5  1.75 1.75 3.5  1.75 1.75 3.5
 3.5  0.   0.   0.   1.75 3.5  1.75 0.   0.   1.75 3.5  1.75 0.   1.75
 0.   0.   1.75 1.75 0.   0.   1.75 0.   1.75 3.5  1.75 1.75 0.   1.75
 0.   1.75 3.5  1.75 3.5  5.25 0.   1.75 0.   3.5  3.5  1.75 1.75 1.75
 1.75 1.75 0.   1.75 1.75 1.75 1.75 0.   1.75 0.   1.75 3.5  0.   1.75
 3.5  3.5  1.75 3.5  1.75 5.25 1.75 3.5  1.75 0.   1.75 3.5  3.5  3.5
 0.   1.75 3.5  3.5  3.5  3.5  0.   0.   0.   5.25 3.5  0.   3.5  1.75
 1.75 0.   1.75 3.5  1.75 3.5  0.   1.75 5.25 3.5  1.75 0.   0.   1.75
 0.   1.75 3.5  1.75 1.75 3.5  0.   1.75 1.75 0.   0.   3.5  5.25 3.5
 0.   1.75 1.75 1.75 1.75 0.   1.75 1.75 3.5  1.75 1.75 0.

In [4]:
# ---------------- 5) Charts & savings summary ----------------
# Histograms for optimal strategy
fig_wait = px.histogram(
    x=rider_wait_opt,
    nbins=30,
    labels={"x": "Minutes"},
    title="Rider Wait Time (Optimal Dispatch)"
)
fig_wait.update_layout(width=1100, height=500, bargap=0.05)
fig_wait.show()

fig_idle = px.histogram(
    x=driver_idle_opt,
    nbins=30,
    labels={"x": "Minutes"},
    title="Driver Idle Time Before Assignment (Optimal Dispatch)"
)
fig_idle.update_layout(width=1100, height=500, bargap=0.05)
fig_idle.show()

# Mean ETA comparison
mean_eta_opt = float(assign_opt["eta_to_pick"].mean())
mean_eta_rnd = float(assign_rnd["eta_to_pick"].mean())

fig_eta = go.Figure()
fig_eta.add_trace(go.Bar(
    x=["Optimal", "Random"],
    y=[mean_eta_opt, mean_eta_rnd],
    text=[f"{mean_eta_opt:.2f} min", f"{mean_eta_rnd:.2f} min"],
    textposition="auto",
    name="Mean ETA to Pickup"
))
fig_eta.update_layout(
    title="Mean ETA to Pickup (Optimal vs Random Assignment)",
    xaxis_title="Dispatch Strategy",
    yaxis_title="Minutes",
    template="plotly_white",
    width=1100, height=500
)
fig_eta.show()

# Multi-panel: baseline vs optimal, time & $ savings
BASELINE_ETA_MIN = mean_eta_rnd
VALUE_PER_MIN    = 0.50  # $ per rider-minute

assign_opt = assign_opt.copy()
assign_opt["time_assigned"] = pd.to_datetime(assign_opt["time_assigned"])
assign_opt["saved_min"] = np.clip(BASELINE_ETA_MIN - assign_opt["eta_to_pick"], 0, None)
assign_opt["saved_$"]   = assign_opt["saved_min"] * VALUE_PER_MIN

hourly = (
    assign_opt
    .groupby(assign_opt["time_assigned"].dt.floor("H"))["saved_$"]
    .sum()
    .sort_index()
)
hours = hourly.index
cumulative_savings = hourly.cumsum().values

total_wait_real   = assign_opt["eta_to_pick"].sum()
total_wait_base   = len(assign_opt) * BASELINE_ETA_MIN
saved_time_total  = max(0.0, total_wait_base - total_wait_real)
waited_time_total = total_wait_real

fig_all = make_subplots(
    rows=1, cols=3,
    subplot_titles=[
        "Mean ETA to Pickup",
        "Wait-Time Composition",
        "Cumulative $ Savings"
    ],
    specs=[[{"type":"bar"}, {"type":"pie"}, {"type":"scatter"}]]
)

fig_all.add_trace(
    go.Bar(
        x=["Optimal", "Baseline"],
        y=[mean_eta_opt, BASELINE_ETA_MIN],
        text=[f"{mean_eta_opt:.1f} min", f"{BASELINE_ETA_MIN:.1f} min"],
        textposition="auto",
        marker_color=["#27AE60","#E74C3C"],
        name="ETA (min)"
    ),
    row=1, col=1
)

fig_all.add_trace(
    go.Pie(
        labels=["Saved Time", "Waited Time"],
        values=[saved_time_total, waited_time_total],
        hole=0.4, textinfo="label+percent",
        marker_colors=["#2ECC71","#95A5A6"],
        sort=False
    ),
    row=1, col=2
)

fig_all.add_trace(
    go.Scatter(
        x=hours, y=cumulative_savings,
        mode="lines+markers",
        line=dict(width=3),
        name="Cumulative $ Savings"
    ),
    row=1, col=3
)

fig_all.update_layout(
    title="Rideshare Dispatch Optimization — Time and Cost Savings",
    template="plotly_white",
    showlegend=False,
    width=1100, height=500
)

fig_all.show()



'H' is deprecated and will be removed in a future version, please use 'h' instead.

