Load and Prepare Data




In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_validate
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Read the hourly records from "hour.csv" (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="hour.csv")

# Preview the top five rows as this cell's rich output.
df.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [3]:
# Remove leakage columns: in the previewed rows `cnt` equals `casual + registered`,
# so keeping either column would hand the target to the model.
# `errors="ignore"` makes this cell idempotent - because `df` is reassigned here,
# a second run would otherwise raise KeyError on the already-removed columns.
df = df.drop(columns=["casual", "registered"], errors="ignore")

# Define features and target.
# X keeps every remaining column; which of them actually reach the model is
# decided by the column lists passed to the preprocessor.
X = df.drop(columns=["cnt"])
y = df["cnt"]

In [4]:
# Columns that are one-hot encoded by the preprocessor.
categorical_features = [
    "season",
    "weathersit",
    "weekday",
    "mnth",
    "hr",
    "workingday",
]

# Columns that are passed through to the models unchanged.
numerical_features = [
    "temp",
    "atemp",
    "hum",
    "windspeed",
]

# NOTE(review): columns of X that appear in neither list (e.g. "yr", "holiday",
# "instant", "dteday") are not forwarded by the ColumnTransformer, which is built
# without a `remainder` argument - confirm that excluding them is intentional.

In [5]:
# Shared preprocessing step for every model pipeline:
#   "cat" - one-hot encodes the categorical columns; categories not seen during
#           fit are ignored at transform time instead of raising.
#   "num" - forwards the numerical columns untouched.
# `remainder="drop"` is the library default, spelled out here so it is obvious
# that any column not named in the two lists is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features),
    ],
    remainder="drop",
)

Train and Evaluate Ensemble Models (Random Forest, Subagging, Gradient Boosting)


In [6]:
# Bagging (Random Forest): bootstrap-aggregated trees with per-split feature
# randomness; depth is capped at 15 and all cores are used.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

# Subagging (subsample aggregating): each tree is trained on a 60% subsample
# drawn WITHOUT replacement. BaggingRegressor defaults to bootstrap=True, which
# would draw the 60% with replacement (i.e. plain bagging on smaller bootstrap
# samples), so bootstrap=False is required for this model to match its label.
subagging = BaggingRegressor(
    estimator=DecisionTreeRegressor(max_depth=15),
    n_estimators=200,
    max_samples=0.6,
    bootstrap=False,
    random_state=42,
    n_jobs=-1
)

# Boosting: 300 shallow (depth-3) trees fitted sequentially with a 0.05 learning rate.
gbr = GradientBoostingRegressor(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=3,
    random_state=42
)

# Display name -> unfitted estimator; iterated by the cross-validation loop.
models = {
    "RandomForest": rf,
    "Subagging": subagging,
    "GradientBoosting": gbr
}

In [7]:
# Five shuffled folds; the fixed seed gives every model the same splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Short key -> scorer name. Both scorers are the "neg_" variants, so the
# reported values are negated errors and must be sign-flipped before display.
scoring = dict(
    rmse="neg_root_mean_squared_error",
    mae="neg_mean_absolute_error",
)

In [None]:
# Cross-validate each candidate behind the shared preprocessing step and collect
# one summary record (mean and std of RMSE / MAE across folds) per model.
results = []

for model_label, estimator in models.items():
    candidate = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", estimator),
    ])

    fold_scores = cross_validate(
        candidate,
        X,
        y,
        scoring=scoring,
        cv=kf,
        return_train_score=False,
    )

    # Per-fold test scores are negated errors: flip the sign for the means.
    # The standard deviation is unaffected by the sign, so it is used as-is.
    rmse_per_fold = fold_scores["test_rmse"]
    mae_per_fold = fold_scores["test_mae"]

    summary_row = {
        "Model": model_label,
        "RMSE_mean": -rmse_per_fold.mean(),
        "RMSE_std": rmse_per_fold.std(),
        "MAE_mean": -mae_per_fold.mean(),
        "MAE_std": mae_per_fold.std(),
    }
    results.append(summary_row)

In [None]:
# Tabulate the per-model summaries (one row per entry in `results`) and show the table.
cv_results_df = pd.DataFrame(data=results)
cv_results_df


--- Fold 1/5 ---


TypeError: BaggingRegressor.__init__() got an unexpected keyword argument 'base_estimator'

In [None]:
# Persist the cross-validation summary next to the notebook, without the row index.
cv_results_df.to_csv(
    path_or_buf="cv_regression_results.csv",
    index=False,
)


--- Fold 1/5 ---
Fold 1 - Subagging RMSE: 51.3627
Fold 1 - Subagging MAE: 32.8826

--- Fold 2/5 ---
Fold 2 - Subagging RMSE: 52.1460
Fold 2 - Subagging MAE: 33.0014

--- Fold 3/5 ---
Fold 3 - Subagging RMSE: 55.7211
Fold 3 - Subagging MAE: 35.5980

--- Fold 4/5 ---
Fold 4 - Subagging RMSE: 52.8293
Fold 4 - Subagging MAE: 34.5218

--- Fold 5/5 ---
Fold 5 - Subagging RMSE: 55.8172
Fold 5 - Subagging MAE: 35.5056

--- Overall Subagging Performance ---
Mean RMSE across all folds: 53.5753 (+/- 1.8507)
Mean MAE across all folds: 34.3019 (+/- 1.1733)


In [None]:
# Final model: the random forest behind the shared preprocessing step,
# refitted on the complete dataset.
# NOTE(review): the choice of `rf` is hard-coded here rather than read from
# `cv_results_df` - confirm it matches the cross-validation ranking.
best_model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", rf),
])

# Last expression of the cell, so the fitted pipeline is displayed.
best_model.fit(X, y)

GradientBoostingRegressor imported.
rmse_scores_boosting and mae_scores_boosting lists initialized.


In [None]:
# Predict on the same rows that `best_model` was fitted on.
# NOTE(review): these are in-sample predictions, so the actual-vs-predicted
# agreement will look better than the cross-validated errors - confirm this is
# the intended deliverable.
predictions = best_model.predict(X)

# Actual and predicted counts side by side, aligned on the index of `y`.
prediction_columns = {
    "ActualCnt": y,
    "PredictedCnt": predictions,
}
final_predictions = pd.DataFrame(data=prediction_columns)

final_predictions.head(n=5)


--- Fold 1/5 ---
Fold 1 - Boosting RMSE: 78.9652
Fold 1 - Boosting MAE: 57.0295

--- Fold 2/5 ---
Fold 2 - Boosting RMSE: 81.4311
Fold 2 - Boosting MAE: 58.5549

--- Fold 3/5 ---
Fold 3 - Boosting RMSE: 81.0773
Fold 3 - Boosting MAE: 58.3590

--- Fold 4/5 ---
Fold 4 - Boosting RMSE: 81.3589
Fold 4 - Boosting MAE: 58.9697

--- Fold 5/5 ---
Fold 5 - Boosting RMSE: 83.0107
Fold 5 - Boosting MAE: 59.8353

--- Overall Boosting Performance ---
Mean RMSE across all folds: 81.1686 (+/- 1.2932)
Mean MAE across all folds: 58.5497 (+/- 0.9140)


In [None]:
# Persist the actual-vs-predicted table next to the notebook, without the row index.
final_predictions.to_csv(
    path_or_buf="final_predictions.csv",
    index=False,
)

In [None]:
# Pull the fitted pieces back out of the final pipeline.
fitted_steps = best_model.named_steps
rf_model = fitted_steps["model"]

# Output column names of the preprocessor, in the order the forest saw them.
feature_names = fitted_steps["preprocess"].get_feature_names_out()

# Pair each transformed feature with its importance, most important first.
# One-hot encoded columns appear as separate rows, one per encoded category.
feature_importance_df = pd.DataFrame(
    data={
        "Feature": feature_names,
        "Importance": rf_model.feature_importances_,
    }
)
feature_importance_df = feature_importance_df.sort_values(
    by="Importance",
    ascending=False,
)

# Keep the eight highest-ranked features and display them.
top_8_features = feature_importance_df.head(n=8)
top_8_features

In [None]:
# Written conclusion for the model comparison.
# NOTE(review): this text is static - it is not derived from `cv_results_df`,
# so re-check it against the table whenever the models or data change.
# The mis-encoded dash in "bias–variance" (previously rendered as mojibake) is restored.
print("""
Random Forest generalized best with lowest RMSE and MAE due to effective variance reduction
using bootstrap sampling and feature randomness. Subagging reduced overfitting but increased
bias due to smaller training subsets. Boosting performed well but required careful tuning and
was sensitive to noise. Overall, Random Forest provided the best bias–variance tradeoff.
""")