In [22]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

In [23]:
# LOAD DATASET
# The CSV is read with the Windows-1252 codec (column names such as
# 'Temperature(°C)' contain non-ASCII characters).
dataset_path = '../dataset/SeoulBikeData.csv'
data = pd.read_csv(dataset_path, encoding='Windows-1252')
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


In [24]:
# REMOVE COLUMNS THAT ARE NOT USED AS MODEL INPUTS
# Note: `data` is rebound here, so this cell raises KeyError if re-run
# without reloading the dataset first.
unused_columns = ['Date', 'Functioning Day', 'Dew point temperature(°C)']
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday
0,254,0,-5.2,37,2.2,2000,0.0,0.0,0.0,Winter,No Holiday
1,204,1,-5.5,38,0.8,2000,0.0,0.0,0.0,Winter,No Holiday
2,173,2,-6.0,39,1.0,2000,0.0,0.0,0.0,Winter,No Holiday
3,107,3,-6.2,40,0.9,2000,0.0,0.0,0.0,Winter,No Holiday
4,78,4,-6.0,36,2.3,2000,0.0,0.0,0.0,Winter,No Holiday


In [25]:
# ONE-HOT ENCODE THE CATEGORICAL FEATURES
# drop_first=True omits the first level of each categorical column, so a
# column with k levels becomes k-1 indicator columns.
categorical_columns = ['Holiday', 'Seasons']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
data.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Holiday_No Holiday,Seasons_Spring,Seasons_Summer,Seasons_Winter
0,254,0,-5.2,37,2.2,2000,0.0,0.0,0.0,True,False,False,True
1,204,1,-5.5,38,0.8,2000,0.0,0.0,0.0,True,False,False,True
2,173,2,-6.0,39,1.0,2000,0.0,0.0,0.0,True,False,False,True
3,107,3,-6.2,40,0.9,2000,0.0,0.0,0.0,True,False,False,True
4,78,4,-6.0,36,2.3,2000,0.0,0.0,0.0,True,False,False,True


In [26]:
# SCALING

# Target is the hourly rental count; every other column is a feature
y = data['Rented Bike Count']
X = data.drop(columns='Rented Bike Count')

# Standardise each feature column (the scaler is fitted on all rows of X)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# fit_transform returns a plain array; wrap it in a DataFrame so the
# original column names are kept
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)



In [27]:
# Split the data into training and testing sets.
#
# The split is done on the raw (unscaled) features `X`, and the scaler is then
# fitted on the training rows only. Fitting StandardScaler on the full dataset
# before splitting lets the test rows influence the mean/std used for
# preprocessing (data leakage), so the held-out set is no longer truly unseen.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-fit the scaler on the training portion and apply the same transform to
# both parts; the original row index is kept so X and y stay aligned.
scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)

# Verify the shape of the splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7008, 12), (1752, 12), (7008,), (1752,))

In [28]:
# Baseline Random Forest regressor (fixed seed for reproducible folds/trees)
regressor = RandomForestRegressor(random_state=42)

# Define scorers.
# greater_is_better=False makes the scorer return the *negated* MSE, so the
# sign has to be flipped back before taking the square root below.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# Run 5-fold cross-validation ONCE and collect both metrics from the same
# fitted models, instead of repeating the whole CV loop for each metric.
cv_results = cross_validate(
    regressor,
    X_train,
    y_train,
    cv=5,
    scoring={'mse': mse_scorer, 'r2': r2_scorer},
)

# Per-fold scores: negated MSE -> RMSE, and R-squared
cv_mse_scores = cv_results['test_mse']
cv_rmse_scores = np.sqrt(-cv_mse_scores)
cv_r2_scores = cv_results['test_r2']

# Display the cross-validation scores
print("Cross-validation RMSE scores:", cv_rmse_scores)
print("Mean CV RMSE:", cv_rmse_scores.mean())
print("Cross-validation R-squared scores:", cv_r2_scores)
print("Mean CV R-squared:", cv_r2_scores.mean())

Cross-validation RMSE scores: [301.49033972 296.03622011 311.15818315 288.4872638  320.895376  ]
Mean CV RMSE: 303.61347655924635
Cross-validation R-squared scores: [0.77476171 0.78634087 0.77687475 0.80280514 0.74770579]
Mean CV R-squared: 0.7776976512381643


In [29]:
# Hyper-parameter grid explored exhaustively for the Random Forest
# (3 * 4 * 3 * 3 = 108 candidate combinations)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 5-fold CV on all cores; candidates are ranked by
# negated MSE (higher is better)
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Report the winning hyper-parameters
print("Best parameters found:", grid_search.best_params_)

# best_score_ is the negated mean CV MSE of the best candidate; flip the sign
# and take the square root to express it as an RMSE
best_neg_mse = grid_search.best_score_
best_rmse = (-best_neg_mse)**0.5
print("Best CV RMSE:", best_rmse)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.7s
[CV] END m

In [18]:
# NOTE(review): this search evaluates 100 candidates x 5 folds = 500 fits with
# forests of up to 2000 trees; the recorded run of this cell was stopped with
# a KeyboardInterrupt. Consider a smaller budget before re-running.

# Candidate values sampled by the randomized search
tree_counts = list(map(int, np.linspace(start=100, stop=2000, num=10)))
param_dist = dict(
    n_estimators=tree_counts,
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Randomized search: 100 sampled candidates, 5-fold CV, all cores, seeded
random_search = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=param_dist,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Report the winning hyper-parameters
print("Best parameters found:", random_search.best_params_)

# best_score_ is the negated mean CV MSE; convert it to an RMSE
best_neg_mse = random_search.best_score_
best_rmse = (-best_neg_mse)**0.5
print("Best CV RMSE:", best_rmse)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=733; total time=   9.6s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=733; total time=   9.7s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=733; total time=   9.7s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=733; total time=   9.8s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=733; total time=   9.8s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=311; total time=   7.6s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=311; total time=   7.8s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=311; total time=   7.9



[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=1788; total time=  43.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=1788; total time=  51.5s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=311; total time=   6.7s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=1788; total time=  52.2s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=311; total time=   6.5s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=1788; total time=  43.5s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=311; total time=   6.5s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=311; total time=   6.6s
[CV] END bootstrap=False, max_depth=None, mi

KeyboardInterrupt: 

In [30]:
# Importance scores of the refitted best estimator from the grid search
importances = grid_search.best_estimator_.feature_importances_

# One row per feature (labelled with the training column names), ordered from
# most to least important
feature_importances = pd.DataFrame(
    {'Importance': importances},
    index=X_train.columns,
).sort_values('Importance', ascending=False)

# Show the ten most important features
feature_importances.head(10)


Unnamed: 0,Importance
Temperature(°C),0.347598
Hour,0.290285
Solar Radiation (MJ/m2),0.110727
Humidity(%),0.09434
Rainfall(mm),0.041292
Wind speed (m/s),0.036412
Visibility (10m),0.033813
Seasons_Winter,0.021895
Seasons_Spring,0.011042
Holiday_No Holiday,0.006526


In [36]:
# Evaluate the best grid-search model on the held-out test set
y_pred = grid_search.best_estimator_.predict(X_test)

# RMSE on the test set, computed explicitly as sqrt(MSE).
# The `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is avoided here to keep the cell
# working across versions.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test Set RMSE:", test_rmse)

# Calculate R-squared on the test set
r2 = r2_score(y_test, y_pred)
print("Test Set R-squared:", r2)


Test Set RMSE: 300.7477896658328
Test Set R-squared: 0.7829111970712697


