In [12]:
import pandas as pd

In [None]:
import warnings
warnings.filterwarnings("ignore", message="use_inf_as_na option is deprecated")

In [13]:
# IMPORT DATASET
data = pd.read_csv('../dataset/SeoulBikeData.csv', encoding='Windows-1252')
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


In [14]:
data = data.drop(columns=['Date', 'Functioning Day', 'Dew point temperature(°C)'])
data.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday
0,254,0,-5.2,37,2.2,2000,0.0,0.0,0.0,Winter,No Holiday
1,204,1,-5.5,38,0.8,2000,0.0,0.0,0.0,Winter,No Holiday
2,173,2,-6.0,39,1.0,2000,0.0,0.0,0.0,Winter,No Holiday
3,107,3,-6.2,40,0.9,2000,0.0,0.0,0.0,Winter,No Holiday
4,78,4,-6.0,36,2.3,2000,0.0,0.0,0.0,Winter,No Holiday


In [15]:
# One-hot encoding for categorical variables
data = pd.get_dummies(data, columns=['Holiday', 'Seasons'], drop_first=True)
data.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Holiday_No Holiday,Seasons_Spring,Seasons_Summer,Seasons_Winter
0,254,0,-5.2,37,2.2,2000,0.0,0.0,0.0,True,False,False,True
1,204,1,-5.5,38,0.8,2000,0.0,0.0,0.0,True,False,False,True
2,173,2,-6.0,39,1.0,2000,0.0,0.0,0.0,True,False,False,True
3,107,3,-6.2,40,0.9,2000,0.0,0.0,0.0,True,False,False,True
4,78,4,-6.0,36,2.3,2000,0.0,0.0,0.0,True,False,False,True


In [16]:
from sklearn.preprocessing import StandardScaler # type: ignore

X = data.drop('Rented Bike Count', axis=1)  # Replace with the actual target column name if different
y = data['Rented Bike Count']

scaler = StandardScaler()

# Scale the features
X_scaled = scaler.fit_transform(X)

# Convert the scaled array back to a DataFrame for easier handling
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

In [17]:
from sklearn.model_selection import train_test_split # type: ignore

X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7008, 12), (1752, 12), (7008,), (1752,))

In [7]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m981.2 kB/s[0m eta [36m0:00:00[0m00:01[0m:01[0m0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.1
Note: you may need to restart the kernel to use updated packages.


In [18]:
from xgboost import XGBRegressor # type: ignore
xgb_regressor = XGBRegressor()
xgb_regressor.fit(X_train, y_train)
y_pred = xgb_regressor.predict(X_test)

In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # type: ignore

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Test Set RMSE:", rmse)

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print("Test Set MAE:", mae)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print("Test Set R-squared:", r2)

Test Set RMSE: 301.89233968334855
Test Set MAE: 183.31541202455486
Test Set R-squared: 0.7812557116542022




In [20]:
from sklearn.model_selection import cross_val_score # type: ignore

# Initialize a Random Forest regressor
regressor = XGBRegressor(random_state=42)

# Perform K-Fold Cross Validation
cv_scores = cross_val_score(regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE to positive RMSE for better interpretability
cv_rmse_scores = (-cv_scores)**0.5

# Display the cross-validation scores
print("Cross-validation RMSE scores:", cv_rmse_scores)
print("Mean CV RMSE:", cv_rmse_scores.mean())

Cross-validation RMSE scores: [305.15947898 305.87702296 313.45321455 296.52259821 320.78480429]
Mean CV RMSE: 308.3594237983013


In [21]:
from sklearn.model_selection import GridSearchCV # type: ignore
from sklearn.metrics import make_scorer, mean_squared_error # type: ignore

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage
    'max_depth': [3, 6, 10],  # Maximum depth of trees
    'min_child_weight': [1, 5, 10],  # Minimum sum of instance weight (hessian) needed in a child
    'subsample': [0.8, 1.0],  # Fraction of samples used for fitting each base learner
    'colsample_bytree': [0.8, 1.0]  # Fraction of features used for fitting each base learner
}

# Initialize the XGBoost regressor
xgb_regressor = XGBRegressor()

# Initialize Grid Search
grid_search = GridSearchCV(
    estimator=xgb_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring=make_scorer(mean_squared_error, greater_is_better=False)
)

# Fit Grid Search
grid_search.fit(X_train, y_train)

# Best parameters found
print("Best parameters found:", grid_search.best_params_)

# Best score achieved with the best parameters
best_rmse = (-grid_search.best_score_)**0.5
print("Best CV RMSE:", best_rmse)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time

In [22]:
# Predict on the test set using the best model
y_pred_best = grid_search.best_estimator_.predict(X_test)

# Calculate RMSE on the test set
test_rmse = mean_squared_error(y_test, y_pred_best, squared=False)
print("Test Set RMSE with Best Model:", test_rmse)

# Calculate MAE on the test set
test_mae = mean_absolute_error(y_test, y_pred_best)
print("Test Set MAE with Best Model:", test_mae)

# Calculate R-squared on the test set
test_r2 = r2_score(y_test, y_pred_best)
print("Test Set R-squared with Best Model:", test_r2)

Test Set RMSE with Best Model: 295.94542169021133
Test Set MAE with Best Model: 171.50213915871703
Test Set R-squared with Best Model: 0.7897888311649974


