In [2]:
import pandas as pd

df = pd.read_csv("hour.csv")

target = "cnt"

DROP_COLS = [
    "instant", "dteday",
    "casual", "registered"
]

df = df.drop(columns=DROP_COLS)

X = df.drop(columns=[target])
y = df[target]


In [3]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=12,
    random_state=42,
    n_jobs=-1
)


In [4]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

subag = BaggingRegressor(
    estimator=DecisionTreeRegressor(max_depth=10),
    n_estimators=150,
    max_samples=0.6,
    random_state=42,
    n_jobs=-1
)


In [5]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)


In [7]:
from sklearn.model_selection import KFold, cross_validate

from sklearn.metrics import make_scorer, root_mean_squared_error, mean_absolute_error



import numpy as np

rmse = make_scorer(root_mean_squared_error)
mae = make_scorer(mean_absolute_error)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

models = {
    "RandomForest": rf,
    "Subagging": subag,
    "Boosting": gbr
}

results = []

for name, model in models.items():
    scores = cross_validate(
        model,
        X, y,
        cv=kf,
        scoring={"rmse": rmse, "mae": mae}
    )

    results.append({
        "Model": name,
        "RMSE_mean": scores["test_rmse"].mean(),
        "RMSE_std": scores["test_rmse"].std(),
        "MAE_mean": scores["test_mae"].mean(),
        "MAE_std": scores["test_mae"].std()
    })

cv_df = pd.DataFrame(results)
cv_df.to_csv("cv_regression_results.csv", index=False)

print(cv_df)


          Model  RMSE_mean  RMSE_std   MAE_mean   MAE_std
0  RandomForest  45.266008  1.473496  27.500673  0.655411
1     Subagging  50.815499  1.538520  32.030913  0.636701
2      Boosting  71.446796  1.525714  48.560694  0.918406


In [8]:
best_model = rf   # replace if another wins

best_model.fit(X, y)

preds = best_model.predict(X)

final = pd.DataFrame({
    "ActualCnt": y,
    "PredictedCnt": preds
})

final.to_csv("final_predictions.csv", index=False)


In [9]:
importances = best_model.feature_importances_

fi = pd.DataFrame({
    "Feature": X.columns,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(fi.head(8))


       Feature  Importance
3           hr    0.631579
8         temp    0.121383
1           yr    0.082619
6   workingday    0.057898
0       season    0.021772
10         hum    0.020835
7   weathersit    0.017418
9        atemp    0.016401


Which model generalized best?

Random Forest achieved lowest RMSE with smallest variance.

Why?

Bias–Variance intuition:

Subagging:
*   reduces variance
*   still high bias (weak trees)

Boosting:
*  reduces bias
*  sensitive to noise
*  can overfit hourly spikes

Random Forest:
✅ variance reduction
✅ feature randomness
✅ robust to noise
→ best balance