In [None]:
from google.colab import files
uploaded = files.upload()

Saving tamil_nadu_bus_sample_enriched_processed.csv to tamil_nadu_bus_sample_enriched_processed.csv


In [None]:
!pip install catboost xgboost lightgbm category_encoders joblib -q

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import StackingRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import joblib


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m85.7/85.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
df = pd.read_csv("tamil_nadu_bus_sample_enriched_processed.csv")

# Ensure numeric time columns
for c in ['scheduled_hour','scheduled_minute','scheduled_second','day_of_week']:
    df[c] = pd.to_numeric(df[c], errors='coerce')

# Standardize traffic/weather/route
for c in ['traffic_level','weather','route_id']:
    df[c] = df[c].astype(str).str.strip().str.lower()

# --- Handle missing values ---

# Numeric columns: fill NaN with median
for c in ['scheduled_hour','scheduled_minute','scheduled_second','day_of_week','delay_minutes']:
    df[c] = df[c].fillna(df[c].median())

# Categorical columns: fill NaN with 'unknown'
for c in ['traffic_level','weather','route_id']:
    df[c] = df[c].fillna('unknown')


In [None]:
# Ordinal mapping for traffic
traffic_map = {'low':1, 'medium':2, 'high':3}
df['traffic_level_num'] = df['traffic_level'].map(traffic_map)

# One-hot for weather
weather_dum = pd.get_dummies(df['weather'], prefix='wx', drop_first=True)
df = pd.concat([df, weather_dum], axis=1)

In [None]:
# --- Existing features ---
df['minute_of_day'] = df['scheduled_hour']*60 + df['scheduled_minute'] + df['scheduled_second']/60.0
df['time_sin'] = np.sin(2*np.pi*df['minute_of_day']/1440)
df['time_cos'] = np.cos(2*np.pi*df['minute_of_day']/1440)
df['dow_sin'] = np.sin(2*np.pi*df['day_of_week']/7)
df['dow_cos'] = np.cos(2*np.pi*df['day_of_week']/7)
df['is_peak'] = df['scheduled_hour'].isin(list(range(7,11)) + list(range(17,21))).astype(int)
df['time_bin_15'] = (df['minute_of_day']//15).astype(int)

# --- NEW Feature 1: Weekend flag ---
df['is_weekend'] = df['day_of_week'].isin([5,6]).astype(int)

# --- NEW Feature 2: Rush hour intensity ---
df['rush_intensity'] = np.where(df['is_peak']==1, df['traffic_level_num']*2, df['traffic_level_num'])

# --- NEW Feature 3: Route-level delay statistics ---
route_delay_mean = df.groupby('route_id')['delay_minutes'].transform('mean')
route_delay_q25  = df.groupby('route_id')['delay_minutes'].transform(lambda x: x.quantile(0.25))
route_delay_q75  = df.groupby('route_id')['delay_minutes'].transform(lambda x: x.quantile(0.75))

df['route_delay_mean'] = route_delay_mean
df['route_delay_q25']  = route_delay_q25
df['route_delay_q75']  = route_delay_q75


In [None]:
df['route_freq'] = df['route_id'].map(df['route_id'].value_counts(normalize=True))


In [None]:
def oof_target_encode(train_df, test_df, group_cols, target='delay_minutes'):
    te_tr = pd.Series(index=train_df.index, dtype=float)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for tr_idx, val_idx in kf.split(train_df):
        tr, val = train_df.iloc[tr_idx], train_df.iloc[val_idx]
        means = tr.groupby(group_cols)[target].mean()
        te_tr.iloc[val_idx] = val.set_index(group_cols).index.map(means)
    te_tr = te_tr.fillna(train_df[target].mean())
    # test mapping
    full_means = train_df.groupby(group_cols)[target].mean()
    te_te = test_df.set_index(group_cols).index.map(full_means).fillna(train_df[target].mean())
    return te_tr, te_te


In [None]:
df = df.sort_values(['route_id','day_of_week','minute_of_day'])
df['delay_lag1'] = df.groupby('route_id')['delay_minutes'].shift(1)
df['delay_rolling3'] = df.groupby('route_id')['delay_minutes'].rolling(3, min_periods=1).mean().reset_index(0,drop=True)

# Fill NaNs with global mean
df['delay_lag1'] = df['delay_lag1'].fillna(df['delay_minutes'].mean())
df['delay_rolling3'] = df['delay_rolling3'].fillna(df['delay_minutes'].mean())


In [None]:
# --- Extra Feature Engineering ---

# Month & Quarter
df['month'] = pd.to_datetime(df['scheduled_hour'], errors='coerce').dt.month.fillna(0).astype(int)
df['quarter'] = ((df['month']-1)//3 + 1).astype(int)
df['is_monsoon'] = df['month'].isin([6,7,8,9]).astype(int)

# Route-level std and rolling 5
df['route_delay_std'] = df.groupby('route_id')['delay_minutes'].transform('std').fillna(0)
df['delay_rolling5'] = df.groupby('route_id')['delay_minutes'].rolling(5, min_periods=1).mean().reset_index(0,drop=True)
df['delay_rolling5'] = df['delay_rolling5'].fillna(df['delay_minutes'].mean())

# Traffic √ó Weather interaction
df['traffic_weather'] = df['traffic_level'] + "_" + df['weather']
df['is_rain_peak'] = ((df['weather']=='rainy') & (df['is_peak']==1)).astype(int)

# Trip order within route/day
df['trip_order'] = df.groupby(['route_id','day_of_week']).cumcount()+1


In [None]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Attach OOF route features
te_route_tr, te_route_te = oof_target_encode(train_df, test_df, ['route_id'])
train_df['te_route'] = te_route_tr
test_df['te_route'] = te_route_te


In [None]:
# ============================
# Update feature list
# ============================
feature_cols = [
    'minute_of_day','time_sin','time_cos','dow_sin','dow_cos',
    'is_peak','is_weekend','rush_intensity',
    'time_bin_15','traffic_level_num','route_freq',
    'delay_lag1','delay_rolling3','te_route',
    'route_delay_mean','route_delay_q25','route_delay_q75',

    # --- new features ---
    'month','quarter','is_monsoon',
    'route_delay_std','delay_rolling5',
    'traffic_weather','is_rain_peak',
    'trip_order',

    # categorical
    'route_id','traffic_level','weather'
]

cat_features = ['route_id','traffic_level','weather','traffic_weather']

# Split train/test
X_train = train_df[feature_cols].copy()
X_test  = test_df[feature_cols].copy()
y_train = train_df['delay_minutes']
y_test  = test_df['delay_minutes']

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Store results
results = []

def evaluate_model(model, X_train, y_train, X_test, y_test, name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"\nüìä {name} Metrics:")
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R¬≤: {r2:.3f}")
    results.append({"Model": name, "MAE": mae, "RMSE": rmse, "R¬≤": r2})



In [None]:
# --- Train CatBoost separately ---
cat_model = CatBoostRegressor(
    iterations=2000, learning_rate=0.03, depth=8,
    l2_leaf_reg=3, bagging_temperature=0.5,
    eval_metric='R2', random_seed=42,
    early_stopping_rounds=100, verbose=200
)

cat_model.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_test, y_test))

0:	learn: 0.0448834	test: 0.0451072	best: 0.0451072 (0)	total: 257ms	remaining: 8m 34s
200:	learn: 0.8500556	test: 0.8546422	best: 0.8546422 (200)	total: 29.6s	remaining: 4m 24s
400:	learn: 0.8607221	test: 0.8641177	best: 0.8641177 (400)	total: 52.8s	remaining: 3m 30s
600:	learn: 0.8651269	test: 0.8674535	best: 0.8674535 (600)	total: 1m 16s	remaining: 2m 56s
800:	learn: 0.8683759	test: 0.8691930	best: 0.8691930 (800)	total: 1m 40s	remaining: 2m 29s
1000:	learn: 0.8709755	test: 0.8701149	best: 0.8701149 (1000)	total: 2m 4s	remaining: 2m 3s
1200:	learn: 0.8734683	test: 0.8708660	best: 0.8708675 (1198)	total: 2m 29s	remaining: 1m 39s
1400:	learn: 0.8758898	test: 0.8715802	best: 0.8715807 (1389)	total: 2m 54s	remaining: 1m 14s
1600:	learn: 0.8779984	test: 0.8719953	best: 0.8719953 (1600)	total: 3m 18s	remaining: 49.5s
1800:	learn: 0.8799518	test: 0.8723228	best: 0.8723228 (1800)	total: 3m 43s	remaining: 24.6s
1999:	learn: 0.8818115	test: 0.8725326	best: 0.8725395 (1992)	total: 4m 8s	remain

<catboost.core.CatBoostRegressor at 0x7e74885bd220>

In [None]:

# --- Evaluate CatBoost without re-fitting ---
y_pred = cat_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\nüìä CatBoost Metrics:")
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R¬≤: {r2:.3f}")

results.append({"Model": "CatBoost", "MAE": mae, "RMSE": rmse, "R¬≤": r2})


üìä CatBoost Metrics:
MAE: 2.50, RMSE: 3.53, R¬≤: 0.873


In [None]:
X_train_xgb, X_test_xgb = X_train.copy(), X_test.copy()
for col in X_train_xgb.select_dtypes(include="object").columns:
    X_train_xgb[col] = X_train_xgb[col].astype("category").cat.codes
    X_test_xgb[col] = X_test_xgb[col].astype("category").cat.codes

xgb_model = xgb.XGBRegressor(
    n_estimators=1000, max_depth=8, learning_rate=0.05, random_state=42, n_jobs=-1
)
evaluate_model(xgb_model, X_train_xgb, y_train, X_test_xgb, y_test, "XGBoost")


üìä XGBoost Metrics:
MAE: 2.49, RMSE: 3.53, R¬≤: 0.873


In [None]:
# =============================
# 2. LightGBM
# =============================
X_train_lgb, X_test_lgb = X_train.copy(), X_test.copy()
for col in X_train_lgb.select_dtypes(include="object").columns:
    X_train_lgb[col] = X_train_lgb[col].astype("category")
    X_test_lgb[col] = X_test_lgb[col].astype("category")

lgb_model = lgb.LGBMRegressor(
    n_estimators=1000, learning_rate=0.05, max_depth=8, random_state=42
)
evaluate_model(lgb_model, X_train_lgb, y_train, X_test_lgb, y_test, "LightGBM")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009654 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1756
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 21
[LightGBM] [Info] Start training from score 19.119925

üìä LightGBM Metrics:
MAE: 2.57, RMSE: 3.57, R¬≤: 0.870


In [None]:

# =============================
# 3. Random Forest
# =============================
X_train_rf, X_test_rf = X_train.copy(), X_test.copy()
for col in X_train_rf.select_dtypes(include="object").columns:
    X_train_rf[col] = X_train_rf[col].astype("category").cat.codes
    X_test_rf[col] = X_test_rf[col].astype("category").cat.codes

rf_model = RandomForestRegressor(n_estimators=300, max_depth=12, random_state=42, n_jobs=-1)
evaluate_model(rf_model, X_train_rf, y_train, X_test_rf, y_test, "RandomForest")



üìä RandomForest Metrics:
MAE: 2.46, RMSE: 3.56, R¬≤: 0.870


In [None]:
# =============================
# 4. Extra Trees
# =============================
et_model = ExtraTreesRegressor(n_estimators=300, max_depth=12, random_state=42, n_jobs=-1)
evaluate_model(et_model, X_train_rf, y_train, X_test_rf, y_test, "ExtraTrees")



üìä ExtraTrees Metrics:
MAE: 2.71, RMSE: 3.69, R¬≤: 0.861


In [None]:
from sklearn.impute import SimpleImputer

# Reuse RandomForest-encoded data
X_train_lin, X_test_lin = X_train_rf.copy(), X_test_rf.copy()

# Drop all-NaN columns (safe, since they carry no info)
nan_cols = X_train_lin.columns[X_train_lin.isna().all()]
print("Dropping all-NaN columns:", nan_cols.tolist())
X_train_lin = X_train_lin.drop(columns=nan_cols)
X_test_lin = X_test_lin.drop(columns=nan_cols)

# Impute remaining missing values
imputer = SimpleImputer(strategy="mean")
X_train_lin = pd.DataFrame(imputer.fit_transform(X_train_lin),
                           columns=X_train_lin.columns, index=X_train_lin.index)
X_test_lin = pd.DataFrame(imputer.transform(X_test_lin),
                          columns=X_test_lin.columns, index=X_test_lin.index)


Dropping all-NaN columns: ['dow_sin', 'dow_cos', 'trip_order']


In [None]:
X_train_lin = X_train_lin.fillna(0)
X_test_lin = X_test_lin.fillna(0)


In [None]:
# =============================
# 5. Linear Regression
# =============================

lin_model = LinearRegression()
evaluate_model(lin_model, X_train_lin, y_train, X_test_lin, y_test, "LinearRegression")


üìä LinearRegression Metrics:
MAE: 3.59, RMSE: 4.60, R¬≤: 0.784


In [None]:
# =============================
# 6. Ridge Regression
# =============================
ridge_model = Ridge(alpha=1.0, random_state=42)
evaluate_model(ridge_model, X_train_lin, y_train, X_test_lin, y_test, "RidgeRegression")


üìä RidgeRegression Metrics:
MAE: 3.59, RMSE: 4.60, R¬≤: 0.784


In [None]:
# =============================
# Collect Results into Table
# =============================
import pandas as pd
df_results = pd.DataFrame(results)
print("\nüìä Final Model Comparison:")
print(df_results)

from tabulate import tabulate
print(tabulate(df_results, headers="keys", tablefmt="psql"))


üìä Final Model Comparison:
              Model       MAE      RMSE        R¬≤
0          CatBoost  2.503189  3.528264  0.872540
1           XGBoost  2.487626  3.525962  0.872706
2          LightGBM  2.565481  3.568860  0.869590
3      RandomForest  2.457253  3.557350  0.870429
4        ExtraTrees  2.707056  3.687285  0.860791
5  LinearRegression  3.586393  4.598224  0.783512
6   RidgeRegression  3.586132  4.597952  0.783538
+----+------------------+---------+---------+----------+
|    | Model            |     MAE |    RMSE |       R¬≤ |
|----+------------------+---------+---------+----------|
|  0 | CatBoost         | 2.50319 | 3.52826 | 0.87254  |
|  1 | XGBoost          | 2.48763 | 3.52596 | 0.872706 |
|  2 | LightGBM         | 2.56548 | 3.56886 | 0.86959  |
|  3 | RandomForest     | 2.45725 | 3.55735 | 0.870429 |
|  4 | ExtraTrees       | 2.70706 | 3.68729 | 0.860791 |
|  5 | LinearRegression | 3.58639 | 4.59822 | 0.783512 |
|  6 | RidgeRegression  | 3.58613 | 4.59795 | 0.783538 