In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

pd.set_option("display.max_columns", 200)
pd.set_option("display.float_format", lambda x: f"{x:,.3f}")


In [3]:
RAW_PATH = r"../dataset/bus_trip_updates_raw_member3.csv"

df_raw = pd.read_csv(RAW_PATH, low_memory=False)
print("Raw shape:", df_raw.shape)
display(df_raw.head())
print("\nColumns:", list(df_raw.columns))


Raw shape: (206623, 7)


Unnamed: 0,snapshot_time_utc,route_id,trip_id,start_date_raw,stop_id,arrival_delay_sec,departure_delay_sec
0,56:44.6,5158_116122,5158_65525,20251205,8530B1522701,2467.0,2467.0
1,56:44.6,5158_116122,5158_65525,20251205,8530B158221,3453.0,3453.0
2,56:44.6,5158_116119,5158_43335,20251205,8490B141751,1408.0,1408.0
3,56:44.6,5158_116119,5158_43335,20251205,8490B141651,1343.0,1343.0
4,56:44.6,5158_116119,5158_43335,20251205,8490B5550501,1283.0,



Columns: ['snapshot_time_utc', 'route_id', 'trip_id', 'start_date_raw', 'stop_id', 'arrival_delay_sec', 'departure_delay_sec']


In [4]:
df = df_raw.copy()

# Convert snapshot_time_utc
if "snapshot_time_utc" in df.columns:
    df["snapshot_time_utc"] = pd.to_datetime(df["snapshot_time_utc"], errors="coerce")

# Convert start_date_raw (YYYYMMDD)
if "start_date_raw" in df.columns:
    df["start_date_raw"] = df["start_date_raw"].astype(str)
    df["start_date"] = pd.to_datetime(df["start_date_raw"], format="%Y%m%d", errors="coerce")

# Ensure delays are numeric
for col in ["arrival_delay_sec", "departure_delay_sec"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

print("After type fixing:", df.shape)
print(df.dtypes)


After type fixing: (206623, 8)
snapshot_time_utc      datetime64[ns]
route_id                       object
trip_id                        object
start_date_raw                 object
stop_id                        object
arrival_delay_sec             float64
departure_delay_sec           float64
start_date             datetime64[ns]
dtype: object


  df["snapshot_time_utc"] = pd.to_datetime(df["snapshot_time_utc"], errors="coerce")


In [5]:
# Target = best available delay signal per row
df["delay_sec"] = df[["arrival_delay_sec", "departure_delay_sec"]].max(axis=1, skipna=True)

# Remove rows where we have no delay at all
df = df.dropna(subset=["delay_sec"]).copy()

# Clean weird values:

df["delay_sec"] = df["delay_sec"].clip(lower=-900, upper=7200)  

df["delay_min"] = df["delay_sec"] / 60.0

print("After target creation:", df.shape)
display(df[["delay_sec", "delay_min"]].describe())


After target creation: (193311, 10)


Unnamed: 0,delay_sec,delay_min
count,193311.0,193311.0
mean,366.34,6.106
std,728.266,12.138
min,-900.0,-15.0
25%,0.0,0.0
50%,76.0,1.267
75%,477.0,7.95
max,7200.0,120.0


In [6]:

if df["snapshot_time_utc"].notna().any():
    dt_col = "snapshot_time_utc"
else:
    dt_col = "start_date"

df["hour"] = df[dt_col].dt.hour
df["dayofweek"] = df[dt_col].dt.dayofweek  # Mon=0
df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)

# Cast IDs to string for encoding
for col in ["route_id", "stop_id", "trip_id"]:
    if col in df.columns:
        df[col] = df[col].astype(str)

# Remove rows with missing essential features
df = df.dropna(subset=["hour", "dayofweek", "route_id", "stop_id"]).copy()

print("After feature engineering:", df.shape)
display(df.head())


After feature engineering: (109512, 13)


Unnamed: 0,snapshot_time_utc,route_id,trip_id,start_date_raw,stop_id,arrival_delay_sec,departure_delay_sec,start_date,delay_sec,delay_min,hour,dayofweek,is_weekend
20884,2026-02-02 01:53:54,5158_116122,5158_65525,20251205,8530B1522701,2467.0,2467.0,2025-12-05,2467.0,41.117,1.0,0.0,0
20885,2026-02-02 01:53:54,5158_116122,5158_65525,20251205,8530B158221,3753.0,3753.0,2025-12-05,3753.0,62.55,1.0,0.0,0
20886,2026-02-02 01:53:54,5158_116119,5158_43335,20251205,8490B141751,1408.0,1408.0,2025-12-05,1408.0,23.467,1.0,0.0,0
20887,2026-02-02 01:53:54,5158_116119,5158_43335,20251205,8490B141651,1343.0,1343.0,2025-12-05,1343.0,22.383,1.0,0.0,0
20888,2026-02-02 01:53:54,5158_116119,5158_43335,20251205,8490B5550501,1283.0,,2025-12-05,1283.0,21.383,1.0,0.0,0


In [7]:

df["date"] = pd.to_datetime(df[dt_col].dt.date)

group_cols = ["date", "route_id", "stop_id", "hour", "dayofweek", "is_weekend"]

df_agg = (
    df.groupby(group_cols, as_index=False)
      .agg(
          delay_min_mean=("delay_min", "mean"),
          delay_min_median=("delay_min", "median"),
          delay_min_p95=("delay_min", lambda x: np.percentile(x, 95)),
          n_updates=("delay_min", "size")
      )
)

print("Aggregated shape:", df_agg.shape)
display(df_agg.head())


Aggregated shape: (29084, 10)


Unnamed: 0,date,route_id,stop_id,hour,dayofweek,is_weekend,delay_min_mean,delay_min_median,delay_min_p95,n_updates
0,2025-12-12,5146_116052,8220B1351001,20.0,4.0,0,60.369,51.5,79.867,9
1,2025-12-12,5146_116052,8220B1351201,20.0,4.0,0,31.976,32.567,66.217,16
2,2025-12-12,5146_116052,8220B1351401,20.0,4.0,0,61.657,72.967,72.967,5
3,2025-12-12,5146_116052,8220B1354001,20.0,4.0,0,51.667,50.75,75.263,5
4,2025-12-12,5146_116052,8220DB000316,20.0,4.0,0,41.232,51.567,64.417,10


In [8]:

CLEAN_PATH = r"../cleaned dataset/bus_daily_cleaned_member3.csv"

df_agg.to_csv(CLEAN_PATH, index=False)

print(" Cleaned Member 3 dataset saved to:")
print(CLEAN_PATH)
print("Saved shape:", df_agg.shape)


 Cleaned Member 3 dataset saved to:
../cleaned dataset/bus_daily_cleaned_member3.csv
Saved shape: (29084, 10)


In [9]:
df_model = df_agg.sort_values("date").reset_index(drop=True)

target_col = "delay_min_mean"
feature_cols = ["route_id", "stop_id", "hour", "dayofweek", "is_weekend", "n_updates"]

X = df_model[feature_cols].copy()
y = df_model[target_col].copy()

# Time-aware split
n = len(df_model)
split_idx = int(n * 0.8)

X_train = X.iloc[:split_idx]
y_train = y.iloc[:split_idx]
X_test  = X.iloc[split_idx:]
y_test  = y.iloc[split_idx:]

print("Train size:", X_train.shape[0])
print("Test size :", X_test.shape[0])
print("Train date range:", df_model["date"].iloc[0], "to", df_model["date"].iloc[split_idx - 1])
print("Test date range :", df_model["date"].iloc[split_idx], "to", df_model["date"].iloc[-1])


Train size: 23267
Test size : 5817
Train date range: 2025-12-12 00:00:00 to 2026-02-02 00:00:00
Test date range : 2026-02-02 00:00:00 to 2026-02-02 00:00:00


In [10]:
cat_features = ["route_id", "stop_id"]
num_features = ["hour", "dayofweek", "is_weekend", "n_updates"]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median"))
        ]), num_features),
    ]
)


In [11]:
ridge_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", Ridge(alpha=1.0, random_state=42))
])

ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("\n==============================")
print("MEMBER 3 – MODEL 1: RIDGE REGRESSION")
print("==============================")
print(f"RMSE : {rmse_ridge:,.3f}")
print(f"MAE  : {mae_ridge:,.3f}")
print(f"R²   : {r2_ridge:,.3f}")




MEMBER 3 – MODEL 1: RIDGE REGRESSION
RMSE : 8.043
MAE  : 5.155
R²   : 0.048


In [12]:
rf_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
        max_depth=None
    ))
])

rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\n==============================")
print("MEMBER 3 – MODEL 2: RANDOM FOREST")
print("==============================")
print(f"RMSE : {rmse_rf:,.3f}")
print(f"MAE  : {mae_rf:,.3f}")
print(f"R²   : {r2_rf:,.3f}")



MEMBER 3 – MODEL 2: RANDOM FOREST
RMSE : 8.474
MAE  : 4.568
R²   : -0.057


In [13]:
print("\nSUMMARY – MEMBER 3 MODELS")
print("-------------------------")
print(f"Ridge Regression -> RMSE: {rmse_ridge:,.3f}, R²: {r2_ridge:,.3f}")
print(f"Random Forest    -> RMSE: {rmse_rf:,.3f}, R²: {r2_rf:,.3f}")



SUMMARY – MEMBER 3 MODELS
-------------------------
Ridge Regression -> RMSE: 8.043, R²: 0.048
Random Forest    -> RMSE: 8.474, R²: -0.057
