In [1]:
import pandas as pd

In [2]:
# IMPORT DATASET
data = pd.read_csv('../dataset/SeoulBikeData.csv', encoding='Windows-1252')
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


In [3]:
data = data.drop(columns=['Date', 'Functioning Day', 'Dew point temperature(°C)'])
data.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday
0,254,0,-5.2,37,2.2,2000,0.0,0.0,0.0,Winter,No Holiday
1,204,1,-5.5,38,0.8,2000,0.0,0.0,0.0,Winter,No Holiday
2,173,2,-6.0,39,1.0,2000,0.0,0.0,0.0,Winter,No Holiday
3,107,3,-6.2,40,0.9,2000,0.0,0.0,0.0,Winter,No Holiday
4,78,4,-6.0,36,2.3,2000,0.0,0.0,0.0,Winter,No Holiday


In [4]:
# One-hot encoding for categorical variables
data = pd.get_dummies(data, columns=['Holiday', 'Seasons'], drop_first=True)
data.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Holiday_No Holiday,Seasons_Spring,Seasons_Summer,Seasons_Winter
0,254,0,-5.2,37,2.2,2000,0.0,0.0,0.0,True,False,False,True
1,204,1,-5.5,38,0.8,2000,0.0,0.0,0.0,True,False,False,True
2,173,2,-6.0,39,1.0,2000,0.0,0.0,0.0,True,False,False,True
3,107,3,-6.2,40,0.9,2000,0.0,0.0,0.0,True,False,False,True
4,78,4,-6.0,36,2.3,2000,0.0,0.0,0.0,True,False,False,True


In [5]:
from sklearn.preprocessing import StandardScaler

# Assume 'Rented Bike Count' is the target column
X = data.drop('Rented Bike Count', axis=1)  # Replace with the actual target column name if different
y = data['Rented Bike Count']

# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the features
X_scaled = scaler.fit_transform(X)

# Convert the scaled array back to a DataFrame for easier handling
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)


In [6]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)

# Verify the shape of the splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((7008, 12), (1752, 12), (7008,), (1752,))

In [7]:
from sklearn.svm import SVR

# Initialize the SVR model
svr = SVR()

# Fit the model to the training data
svr.fit(X_train, y_train)

# Predict on the test set
y_pred = svr.predict(X_test)

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Test Set RMSE:", rmse)

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print("Test Set MAE:", mae)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print("Test Set R-squared:", r2)

Test Set RMSE: 537.6175235088756
Test Set MAE: 363.3013185417755
Test Set R-squared: 0.3062877438339504




In [9]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for SVR
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'epsilon': [0.1, 0.2, 0.5, 1],  # Epsilon in the epsilon-SVR model
    'kernel': ['linear', 'rbf', 'poly']  # Kernel type
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=SVR(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit Grid Search
grid_search.fit(X_train, y_train)

# Best parameters found
print("Best parameters found:", grid_search.best_params_)

# Best score achieved with the best parameters
best_rmse = (-grid_search.best_score_)**0.5
print("Best CV RMSE:", best_rmse)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ..................C=0.1, epsilon=0.1, kernel=linear; total time=   0.6s
[CV] END ..................C=0.1, epsilon=0.1, kernel=linear; total time=   0.7s
[CV] END ..................C=0.1, epsilon=0.1, kernel=linear; total time=   0.7s
[CV] END ..................C=0.1, epsilon=0.1, kernel=linear; total time=   0.7s
[CV] END ..................C=0.1, epsilon=0.1, kernel=linear; total time=   0.7s
[CV] END .....................C=0.1, epsilon=0.1, kernel=rbf; total time=   1.2s
[CV] END .....................C=0.1, epsilon=0.1, kernel=rbf; total time=   1.2s
[CV] END .....................C=0.1, epsilon=0.1, kernel=rbf; total time=   1.2s
[CV] END ....................C=0.1, epsilon=0.1, kernel=poly; total time=   0.8s
[CV] END ....................C=0.1, epsilon=0.1, kernel=poly; total time=   0.9s
[CV] END ....................C=0.1, epsilon=0.1, kernel=poly; total time=   0.8s
[CV] END .....................C=0.1, epsilon=0.

In [12]:
# Predict on the test set using the best model
y_pred_best = grid_search.best_estimator_.predict(X_test)

# Calculate RMSE on the test set
test_rmse = mean_squared_error(y_test, y_pred_best, squared=False)
print("Test Set RMSE with Best Model:", test_rmse)

# Calculate MAE on the test set
test_mae = mean_absolute_error(y_test, y_pred_best)
print("Test Set MAE with Best Model:", test_mae)

# Calculate R-squared on the test set
test_r2 = r2_score(y_test, y_pred_best)
print("Test Set R-squared with Best Model:", test_r2)


Test Set RMSE with Best Model: 397.4052887703451
Test Set MAE with Best Model: 244.73584763355223
Test Set R-squared with Best Model: 0.6209471027237286


