### imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import BaggingRegressor , GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge

### Part A: Data Preprocessing and Baseline

#### 1. Data Loading and Feature Engineering

In [2]:
# Load the hourly dataset from disk and preview its first rows.
df_hour = pd.read_csv('dataset/hour.csv')
df_hour.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [3]:
# Report the frame's dimensions, then its per-column dtype / non-null summary.
n_rows, n_cols = df_hour.shape
print(f"Number of datapoints: {n_rows} with features: {n_cols}")
df_hour.info()

Number of datapoints: 17379 with features: 17
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [4]:
# Remove the columns that are not used as predictors.
# NOTE(review): in the previewed rows `casual + registered == cnt`, so keeping
# them would presumably leak the target — confirm against the dataset docs.
#
# The drop is done by reassignment with errors="ignore" instead of
# `inplace=True`, so re-running this cell after the columns are already gone
# is a no-op rather than a KeyError.
df_hour = df_hour.drop(
    columns=['dteday', 'instant', 'casual', 'registered'],
    errors='ignore',
)
df_hour.describe()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,189.463088
std,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,181.387599
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,1.0
25%,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,40.0
50%,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,142.0
75%,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,281.0
max,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,977.0


In [5]:
# One-hot encode every column except the continuous measurements and the target.
# Each remaining column is cast to `category` so get_dummies expands it, and
# drop_first=True omits the first level of each column.
non_categorical_cols = ['temp','atemp','hum','windspeed','cnt']
df_cat = df_hour.drop(columns=non_categorical_cols)

df_processed = pd.get_dummies(df_cat.astype('category'), drop_first=True)
df_processed = df_processed.astype('int')

# Side-by-side join: original columns first, dummy columns appended after.
df = pd.concat([df_hour, df_processed], axis=1)

In [6]:
# Discard the raw categorical columns now that their dummy-encoded versions
# are present in `df`. Reassignment with errors="ignore" (rather than
# `inplace=True`) keeps this cell safe to re-run: a second execution finds the
# columns already gone and does nothing instead of raising KeyError.
df = df.drop(
    columns=['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit'],
    errors='ignore',
)

In [7]:
# Preview the encoded frame (five rows, which is also the pandas default).
df.head(n=5)

Unnamed: 0,temp,atemp,hum,windspeed,cnt,season_2,season_3,season_4,yr_1,mnth_2,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_1,weathersit_2,weathersit_3,weathersit_4
0,0.24,0.2879,0.81,0.0,16,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0.22,0.2727,0.8,0.0,40,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0.22,0.2727,0.8,0.0,32,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0.24,0.2879,0.75,0.0,13,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0.24,0.2879,0.75,0.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


#### 2. Train/Test Split

In [8]:
# Separate the target (`cnt`) from the predictors, then hold out 20% of the
# rows as a test set. random_state pins the shuffle so the split is repeatable.
y = df['cnt']
X = df.drop(columns=['cnt'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### 3. Baseline Model 

In [9]:
# Baseline 1: a single regression tree capped at depth 6.
dtr = DecisionTreeRegressor(max_depth=6, random_state=42)
dtr.fit(X_train, y_train)

# Baseline 2: ordinary least squares on the same training split.
# This fit is the cell's last expression, so its repr is the cell output.
lr = LinearRegression()
lr.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [10]:
# Score both baselines on the held-out test split.
dtr_rmse = root_mean_squared_error(y_test, dtr.predict(X_test))
lr_rmse = root_mean_squared_error(y_test, lr.predict(X_test))

print(f"Decision Tree Regressor rmse is {dtr_rmse}")
print(f"Linear Regression rmse is {lr_rmse}")

Decision Tree Regressor rmse is 118.45551730357617
Linear Regression rmse is 100.44594623557185


    As seen above, the Linear Regression RMSE is lower than that of the Decision Tree Regressor,
    so we use Linear Regression as the baseline model.

### Part B: Ensemble Techniques for Bias and Variance Reduction

#### 1. Bagging (Variance Reduction)

In [11]:
# Cross-validated search over the number of bagged trees. The base learner is
# the depth-6 tree `dtr`; candidates are ranked by (negated) RMSE.
bagging_param_grid = {"n_estimators": [50, 100, 200, 250]}
gs = GridSearchCV(
    estimator=BaggingRegressor(estimator=dtr, random_state=42),
    param_grid=bagging_param_grid,
    scoring="neg_root_mean_squared_error",
    verbose=2,
    n_jobs=-1,  # use all available cores
)
gs.fit(X_train, y_train)
print(f"Best n_estimators is {gs.best_params_}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best n_estimators is {'n_estimators': 200}


In [12]:
# Refit bagging on the full training split with n_estimators=200 (the value
# the grid search above printed as best) and score it on the test split.
br = BaggingRegressor(
    estimator=dtr,
    n_estimators=200,
    random_state=42,
)
br.fit(X_train, y_train)

br_rmse = root_mean_squared_error(y_test, br.predict(X_test))
print(f"Bagging Regressor rmse is {br_rmse}")

Bagging Regressor rmse is 112.27253260601442


    The Bagging Regressor gave an RMSE of 112.27, which is lower than the Decision Tree baseline (118.46).

    This confirms that bagging effectively reduced variance, leading to better generalization.

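    However, the Linear Regression model still performs best so far (RMSE 100.45). This suggests either that the relationship is largely linear in the one-hot encoded features, or that the shallow (max_depth=6) base trees are limited by bias rather than variance — either of which limits how much variance reduction helps.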

#### 2. Boosting (Bias Reduction)

In [13]:
# Cross-validated search over the number of boosting stages; all other
# GradientBoostingRegressor settings stay at their defaults.
boosting_param_grid = {"n_estimators": [50, 100, 200, 250]}
gs = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=boosting_param_grid,
    scoring="neg_root_mean_squared_error",
    verbose=2,
    n_jobs=-1,  # use all available cores
)
gs.fit(X_train, y_train)
print(f"Best n_estimators is {gs.best_params_}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best n_estimators is {'n_estimators': 250}


In [14]:
# Refit gradient boosting on the full training split with n_estimators=250
# (the value the grid search above printed as best) and score it on the test split.
gbr = GradientBoostingRegressor(
    n_estimators=250,
    random_state=42,
)
gbr.fit(X_train, y_train)

gbr_rmse = root_mean_squared_error(y_test, gbr.predict(X_test))
print(f"Gradient Boosting Regressor rmse is {gbr_rmse}")

Gradient Boosting Regressor rmse is 61.23895078896386


    The Gradient Boosting Regressor, optimized with 250 estimators, achieved a test RMSE of 61.24, which is significantly lower than both the single Decision Tree (118.46) and the Bagging ensemble (112.27).
    This substantial improvement confirms the hypothesis that boosting effectively reduces bias by sequentially correcting errors from previous weak learners.
    Unlike bagging, which primarily targets variance reduction, boosting focuses on improving model fit and capturing complex patterns, resulting in a markedly better generalization performance on the test set.

### Part C: Stacking for Optimal Performance

#### 1. Stacking Implementation

In [15]:
# Level-0 learners for the stack: a default KNN regressor plus the bagging and
# gradient-boosting models configured earlier in the notebook.
knn_base = ('KNN', KNeighborsRegressor())
bagging_base = ('BR', br)
boosting_base = ('GBR', gbr)
estimators = [knn_base, bagging_base, boosting_base]

# Level-1 (meta) learner: Ridge at its default alpha.
# NOTE: `sr` is rebuilt with alpha=10 in a later cell before it is fitted.
sr = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(random_state=42),
)

In [16]:
# Tune the regularisation strength of the Ridge meta-learner.
# `final_estimator__alpha` routes the candidate values to the stack's
# final_estimator. Scoring uses negated RMSE so this search ranks candidates
# on the same metric as the bagging/boosting searches and the RMSE figures
# reported throughout the notebook (previously this one search used MSE).
gs = GridSearchCV(
    estimator=StackingRegressor(
        estimators=estimators,
        final_estimator=Ridge(random_state=42),
    ),
    param_grid={"final_estimator__alpha": [0.1, 1, 5, 10]},
    scoring="neg_root_mean_squared_error",
    verbose=2,
    n_jobs=-1,  # use all available cores
)
gs.fit(X_train, y_train)
print(f"Best alpha value is {gs.best_params_}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best alpha value is {'final_estimator__alpha': 10}


In [17]:
# Final stacked model: same base learners, Ridge meta-learner with alpha=10
# (the value the grid search above printed as best).
meta_learner = Ridge(alpha=10, random_state=42)
sr = StackingRegressor(estimators=estimators, final_estimator=meta_learner)

In [18]:
# Train the stacked ensemble on the training split; the fitted estimator's
# repr is shown as the cell output.
sr.fit(X_train, y_train)

0,1,2
,estimators,"[('KNN', ...), ('BR', ...), ...]"
,final_estimator,Ridge(alpha=1...ndom_state=42)
,cv,
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,250
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0

0,1,2
,alpha,10
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [19]:
# Score the fitted stack on the held-out test split.
stacking_rmse = root_mean_squared_error(y_test, sr.predict(X_test))
print(f"Stacking Regressor rmse is {stacking_rmse}")


Stacking Regressor rmse is 58.29235827342292


### Part D: Final Analysis 

#### 1. Comparative Table

In [20]:
# Re-score every fitted model on the held-out test split.
rmse_dt = root_mean_squared_error(y_test, dtr.predict(X_test))
rmse_lr = root_mean_squared_error(y_test, lr.predict(X_test))
rmse_br = root_mean_squared_error(y_test, br.predict(X_test))
rmse_gbr = root_mean_squared_error(y_test, gbr.predict(X_test))
rmse_sr = root_mean_squared_error(y_test, sr.predict(X_test))

# One (label, score) pair per model; the row order here only matters until the sort.
model_scores = [
    ("LinearRegression", rmse_lr),
    ("DecisionTreeRegressor", rmse_dt),
    ("Bagging Regressor", rmse_br),
    ("Gradient Boosting Regressor", rmse_gbr),
    ("Stacking Regressor", rmse_sr),
]

# Best (lowest RMSE) model first, with a fresh 0..n-1 index.
rmse_table = (
    pd.DataFrame(model_scores, columns=["Model", "RMSE"])
    .sort_values(by="RMSE")
    .reset_index(drop=True)
)

print("\nComparative RMSE Table:")
print(rmse_table)



Comparative RMSE Table:
                         Model        RMSE
0           Stacking Regressor   58.292358
1  Gradient Boosting Regressor   61.238951
2             LinearRegression  100.445946
3            Bagging Regressor  112.272533
4        DecisionTreeRegressor  118.455517


#### 2. Conclusion

    The Stacking Regressor achieved the lowest RMSE of 58.29, outperforming all other models, including the single-model baselines and individual ensemble techniques.

    The baseline Linear Regression (RMSE = 100.45) performed reasonably well but was limited by its high bias, failing to capture complex nonlinear relationships.
    The Decision Tree showed high variance (RMSE = 118.46), which was reduced through Bagging (RMSE = 112.27).
    Gradient Boosting further improved performance (RMSE = 61.24) by sequentially reducing bias, producing a strong yet focused model.

    Finally, the Stacking Regressor leveraged model diversity by combining KNN, Bagging, and Gradient Boosting predictions through a Ridge meta-learner.
    This hierarchical combination allowed the model to balance both bias and variance optimally, capturing linear and nonlinear trends across different data regions.

    Hence, stacking outperformed all individual and ensemble models by effectively exploiting complementary learning behaviors — achieving the best generalization performance and confirming the benefits of ensemble diversity and bias–variance trade-off optimization.