In [17]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [18]:
data = pd.read_csv("hour.csv")
data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [19]:
drop_cols = ['instant', 'dteday', 'casual', 'registered']
data.drop(columns=drop_cols, inplace=True)

X = data.drop(columns=['cnt'])
y = data['cnt']

print("Feature shape:", X.shape)


Feature shape: (17379, 12)


In [20]:
def evaluate_cv(model, X, y, k=5):
    rmse_scores = []
    mae_scores = []

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        rmse_scores.append(np.sqrt(mean_squared_error(y_test, preds)))
        mae_scores.append(mean_absolute_error(y_test, preds))

    return (
        np.mean(rmse_scores), np.std(rmse_scores),
        np.mean(mae_scores), np.std(mae_scores)
    )

In [21]:
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

rf_rmse_m, rf_rmse_s, rf_mae_m, rf_mae_s = evaluate_cv(rf, X, y)

rf_results = ["Random Forest", rf_rmse_m, rf_rmse_s, rf_mae_m, rf_mae_s]
rf_results

['Random Forest',
 np.float64(42.88841218854876),
 np.float64(1.4080750824149615),
 np.float64(25.64802909606911),
 np.float64(0.5874694175623849)]

**Hyperparameters used:**

1. n_estimators: reduces variance

2. max_depth: controls overfitting

In [22]:
subag = BaggingRegressor(
    estimator=DecisionTreeRegressor(max_depth=10),
    n_estimators=200,
    max_samples=0.6,
    random_state=42,
    n_jobs=-1
)

sub_rmse_m, sub_rmse_s, sub_mae_m, sub_mae_s = evaluate_cv(subag, X, y)

sub_results = ["Subagging", sub_rmse_m, sub_rmse_s, sub_mae_m, sub_mae_s]
sub_results


['Subagging',
 np.float64(50.76308616775524),
 np.float64(1.4622087231271663),
 np.float64(32.01622995063064),
 np.float64(0.5958361186931656)]

**Hyperparameters used:**

1. max_samples < 1.0: enables subagging

2. n_estimators: ensemble stability

In [23]:
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    random_state=42
)

gb_rmse_m, gb_rmse_s, gb_mae_m, gb_mae_s = evaluate_cv(gbr, X, y)

gb_results = ["Gradient Boosting", gb_rmse_m, gb_rmse_s, gb_mae_m, gb_mae_s]
gb_results

['Gradient Boosting',
 np.float64(46.650064159613166),
 np.float64(1.418010344784103),
 np.float64(29.900669550115026),
 np.float64(1.124475732380915)]

**Hyperparameters used:**

1. learning_rate: controls contribution of each tree

2. n_estimators: improves model strength gradually

In [24]:
results_df = pd.DataFrame(
    [rf_results, sub_results, gb_results],
    columns=["Model", "RMSE_mean", "RMSE_std", "MAE_mean", "MAE_std"]
)

results_df

Unnamed: 0,Model,RMSE_mean,RMSE_std,MAE_mean,MAE_std
0,Random Forest,42.888412,1.408075,25.648029,0.587469
1,Subagging,50.763086,1.462209,32.01623,0.595836
2,Gradient Boosting,46.650064,1.41801,29.90067,1.124476


In [25]:
results_df.to_csv("cv_regression_results.csv", index=False)

In [26]:
best_model = gbr
best_model.fit(X, y)

predictions = best_model.predict(X)

final_df = pd.DataFrame({
    "ActualCnt": y,
    "PredictedCnt": predictions
})

final_df.head()

Unnamed: 0,ActualCnt,PredictedCnt
0,16,32.148312
1,40,25.639654
2,32,19.921313
3,13,1.758806
4,1,-5.199294


In [27]:
final_df.to_csv("final_predictions.csv", index=False)

In [28]:
feature_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": best_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

feature_importance.head(8)

Unnamed: 0,Feature,Importance
3,hr,0.593073
6,workingday,0.103551
8,temp,0.095405
1,yr,0.08752
9,atemp,0.043234
0,season,0.024957
10,hum,0.015882
7,weathersit,0.014643
