In [None]:
import joblib
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import pandas as pd

from bike_demand_forecasting.metrics import smape, bias
from bike_demand_forecasting.utils import get_paths

In [2]:
# Resolve the project directories
paths = get_paths()
WORK_DIR, DATA_DIR = paths["WORK_DIR"], paths["DATA_DIR"]
DATA_PROCESSED_DIR = DATA_DIR / "processed"
# Directory the serialized models are loaded from
model_dir = WORK_DIR / "models"

# Hold-out test features: one CSV per feature set (baseline / feature-engineered)
X_test_baseline = pd.read_csv(DATA_PROCESSED_DIR / "X_test_baseline.csv")
X_test_feat = pd.read_csv(DATA_PROCESSED_DIR / "X_test_feat_eng.csv")

# Hold-out test target, compared against the predictions of both models
y_test = pd.read_csv(DATA_PROCESSED_DIR / "y_test_baseline_fe.csv")

In [3]:
# Final fitted estimators, used for the hold-out test predictions
final_model_files = ("rfg_baseline.joblib", "rfg_feature_eng.joblib")
rfg_baseline, rfg_fe = (joblib.load(model_dir / fname) for fname in final_model_files)

# GridSearchCV objects, kept to read the CV metrics and the best hyperparameters
grid_search_files = ("rfg_baseline_grid.joblib", "rfg_feature_eng_grid.joblib")
grid_rfg_baseline, grid_rfg_fe = (joblib.load(model_dir / fname) for fname in grid_search_files)

## **Metrics** (MAE, MAPE, sMAPE, bias)

### Baseline model

In [4]:
# Cross-validation metrics of the best baseline configuration selected by GridSearchCV.
# Error-metric scores are stored negated (neg_* convention), so the sign is flipped back to get positive errors.
# These are CV results on the dev set, not hold-out test results.
# NOTE(review): bias is a signed quantity; flipping its sign is only right if its scorer was
# registered with the same negation as the error metrics - confirm against the training code.
baseline_cv = grid_rfg_baseline.cv_results_
baseline_best = grid_rfg_baseline.best_index_

cv_mae = -baseline_cv["mean_test_mae"][baseline_best]
cv_mape = -baseline_cv["mean_test_mape"][baseline_best]
cv_smape = -baseline_cv["mean_test_smape"][baseline_best]
cv_bias = -baseline_cv["mean_test_bias"][baseline_best]

# One "label value" line per metric, followed by the selected hyperparameters
for label, value in (
    ("CV MAE:", cv_mae),
    ("CV MAPE:", cv_mape),
    ("CV sMAPE:", cv_smape),
    ("CV bias:", cv_bias),
    ("Best params:", grid_rfg_baseline.best_params_),
):
    print(label, value)

CV MAE: 190.98086694040956
CV MAPE: 0.2578374708478391
CV sMAPE: 0.23074337185643498
CV bias: -39.681806827447545
Best params: {'reg__max_depth': None, 'reg__min_samples_leaf': 3, 'reg__n_estimators': 200}


In [5]:
# Hold-out test metrics for the baseline model
y_test_pred = rfg_baseline.predict(X_test_baseline)

# y_test is read from CSV as a DataFrame (expected to hold a single target column).
# Collapse it to a 1D Series so the target has the same shape as the 1D prediction array
# in every metric call, including the project-specific smape / bias helpers.
y_test_1d = y_test.squeeze("columns")

mae_test = mean_absolute_error(y_test_1d, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test_1d, y_test_pred)
smape_test = smape(y_test_1d, y_test_pred)
bias_test = bias(y_test_1d, y_test_pred)

print("TEST MAE:", mae_test)
print("TEST MAPE:", mape_test)
print("TEST sMAPE:", smape_test)
print("TEST bias:", bias_test)

TEST MAE: 152.03435090168801
TEST MAPE: 0.44895096764549863
TEST sMAPE: 0.2962417451169806
TEST bias: 119.61546469756003


### Feature-engineered model

In [6]:
# Cross-validation metrics of the best feature-engineered configuration selected by GridSearchCV.
# Error-metric scores are stored negated (neg_* convention), so the sign is flipped back to get positive errors.
# These are CV results on the dev set, not hold-out test results.
# NOTE(review): bias is a signed quantity; flipping its sign is only right if its scorer was
# registered with the same negation as the error metrics - confirm against the training code.
fe_cv = grid_rfg_fe.cv_results_
fe_best = grid_rfg_fe.best_index_

cv_mae = -fe_cv["mean_test_mae"][fe_best]
cv_mape = -fe_cv["mean_test_mape"][fe_best]
cv_smape = -fe_cv["mean_test_smape"][fe_best]
cv_bias = -fe_cv["mean_test_bias"][fe_best]

# One "label value" line per metric, followed by the selected hyperparameters
for label, value in (
    ("CV MAE:", cv_mae),
    ("CV MAPE:", cv_mape),
    ("CV sMAPE:", cv_smape),
    ("CV bias:", cv_bias),
    ("Best params:", grid_rfg_fe.best_params_),
):
    print(label, value)

CV MAE: 155.49654182340632
CV MAPE: 0.25256636863259596
CV sMAPE: 0.20000110064348356
CV bias: -0.39151191572275723
Best params: {'reg__max_depth': None, 'reg__min_samples_leaf': 3, 'reg__n_estimators': 400}


In [7]:
# Hold-out test metrics for the feature-engineered model
y_test_pred = rfg_fe.predict(X_test_feat)

# y_test is read from CSV as a DataFrame (expected to hold a single target column).
# Collapse it to a 1D Series so the target has the same shape as the 1D prediction array
# in every metric call, including the project-specific smape / bias helpers.
y_test_1d = y_test.squeeze("columns")

mae_test = mean_absolute_error(y_test_1d, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test_1d, y_test_pred)
smape_test = smape(y_test_1d, y_test_pred)
bias_test = bias(y_test_1d, y_test_pred)

# Labels follow the same "TEST <metric>:" format as the baseline test cell
print("TEST MAE:", mae_test)
print("TEST MAPE:", mape_test)
print("TEST sMAPE:", smape_test)
print("TEST bias:", bias_test)

TEST MAE: 105.94719655982622
TEST MAPE: 0.27288410838956956
TEST sMAPE: 0.20741491622707803
Test bias 18.12371005815411


## **Model Performance by Time Segment on the Feature-Engineered Test Set (Peak Hours and Holidays)**

### Holidays

In [8]:
# y_test is loaded from CSV as a DataFrame -> convert it to a positionally indexed 1D Series
y_true = y_test.squeeze("columns").reset_index(drop=True)

# Predict explicitly with the feature-engineered model instead of reusing `y_test_pred`:
# that name is overwritten by each model's test cell, so its content depends on which
# cell ran last. The predictions share y_true's index so boolean masks select the same rows.
y_pred = pd.Series(rfg_fe.predict(X_test_feat), index=y_true.index)

# Mask holidays (True where is_holiday == 1), re-indexed positionally like y_true / y_pred
mask_holiday = X_test_feat["is_holiday"].astype(int).eq(1).reset_index(drop=True)

# Target subset holidays
y_true_holiday = y_true[mask_holiday]
y_pred_holiday = y_pred[mask_holiday]

# Compute metrics on the holiday rows only
mae_holiday = mean_absolute_error(y_true_holiday, y_pred_holiday)
mape_holiday = mean_absolute_percentage_error(y_true_holiday, y_pred_holiday)
smape_holiday = smape(y_true_holiday, y_pred_holiday)
bias_holiday = bias(y_true_holiday, y_pred_holiday)

print("Nb row holiday:", int(mask_holiday.sum()))
print("MAE holiday:", mae_holiday)
print("MAPE holiday:", mape_holiday)
print("sMAPE holiday:", smape_holiday)
print("Bias holiday:", bias_holiday)

Nb row holiday: 120
MAE holiday: 223.71604805931068
MAPE holiday: 0.9272508659333816
sMAPE holiday: 0.5234810921078273
Bias holiday: 198.70437758327594


### Peak hours

In [9]:
# Hours of the day counted as peak
peak_hours = [7, 8, 9, 17, 18, 19]

# Boolean selector (positional index) flagging the rows whose hour is a peak hour
is_peak = X_test_feat["hour"].isin(peak_hours)
mask_peak = is_peak.reset_index(drop=True)

# Restrict targets and predictions to the peak rows
y_true_peak, y_pred_peak = y_true[mask_peak], y_pred[mask_peak]

# Segment metrics
mae_peak = mean_absolute_error(y_true_peak, y_pred_peak)
mape_peak = mean_absolute_percentage_error(y_true_peak, y_pred_peak)
smape_peak = smape(y_true_peak, y_pred_peak)
bias_peak = bias(y_true_peak, y_pred_peak)

# Row count first, then one line per metric
for label, value in (
    ("Nb row peak:", int(mask_peak.sum())),
    ("MAE peak:", mae_peak),
    ("MAPE peak:", mape_peak),
    ("sMAPE peak:", smape_peak),
    ("Bias peak:", bias_peak),
):
    print(label, value)

Nb row peak: 868
MAE peak: 176.0487695295178
MAPE peak: 0.2659749660805995
sMAPE peak: 0.20055297434552097
Bias peak: 24.49269271357507


### Off-peak hours

In [10]:
# Off-peak rows are the complement of the peak-hour rows (positional index)
in_peak_hours = X_test_feat["hour"].isin(peak_hours)
mask_offpeak = ~in_peak_hours.reset_index(drop=True)

# Restrict targets and predictions to the off-peak rows
y_true_offpeak, y_pred_offpeak = y_true[mask_offpeak], y_pred[mask_offpeak]

# Segment metrics
mae_offpeak = mean_absolute_error(y_true_offpeak, y_pred_offpeak)
mape_offpeak = mean_absolute_percentage_error(y_true_offpeak, y_pred_offpeak)
smape_offpeak = smape(y_true_offpeak, y_pred_offpeak)
bias_offpeak = bias(y_true_offpeak, y_pred_offpeak)

# Row count first, then one line per metric
for label, value in (
    ("Nb row off-peak:", int(mask_offpeak.sum())),
    ("MAE off-peak:", mae_offpeak),
    ("MAPE off-peak:", mape_offpeak),
    ("sMAPE off-peak:", smape_offpeak),
    ("Bias off-peak:", bias_offpeak),
):
    print(label, value)

Nb row off-peak: 2603
MAE off-peak: 82.57102854688259
MAPE off-peak: 0.27518804059248386
sMAPE off-peak: 0.2097031089098254
Bias off-peak: 15.999900244513928


### Plot metrics

In [11]:
import plotly.express as px

# Metrics shown as facets, in display order (also the facet column order, 1-based)
metric_names = ["MAE", "MAPE", "sMAPE", "Bias"]

# One row per temporal segment, one column per metric
df_plot = pd.DataFrame([
    {"segment": "Off-peak", "MAE": mae_offpeak, "MAPE": mape_offpeak, "sMAPE": smape_offpeak, "Bias": bias_offpeak},
    {"segment": "Peak",     "MAE": mae_peak,    "MAPE": mape_peak,    "sMAPE": smape_peak,    "Bias": bias_peak},
    {"segment": "Holiday",  "MAE": mae_holiday, "MAPE": mape_holiday, "sMAPE": smape_holiday, "Bias": bias_holiday},
])

# Long format: one (segment, metric, value) row per bar
df_long = df_plot.melt(
    id_vars="segment",
    value_vars=metric_names,
    var_name="metric",
    value_name="value"
)

fig = px.bar(
    df_long,
    x="segment",
    y="value",
    color="segment",
    facet_col="metric",
    barmode="group",
    text_auto=".3f",
    # Pin both orders so facet column i always corresponds to metric_names[i - 1]
    category_orders={"segment": ["Off-peak", "Peak", "Holiday"], "metric": metric_names}
)

fig.update_layout(
    title="Metrics comparison by temporal segments",
    showlegend=False,
    height=450
)
# Facet titles come as "metric=<name>"; keep only the metric name
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# Each metric has its own scale: unlink the y axes and show tick labels on every facet,
# otherwise only the first facet is labelled while the others use a different range
fig.update_yaxes(matches=None, showticklabels=True)

# A log axis cannot display zero or negative values, and Bias is signed:
# use log only for facets whose values are all strictly positive, linear otherwise
for col_idx, metric_name in enumerate(metric_names, start=1):
    if df_plot[metric_name].gt(0).all():
        fig.update_yaxes(type="log", col=col_idx)

fig.show()