In [11]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IMPORT DATASET
data = pd.read_csv('../dataset/SeoulBikeData.csv', encoding='Windows-1252')
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


In [12]:
# REMOVE "FUNCTIONING DAY" FEATURE SINCE NOT NEEDED
data = data.drop(columns=['Functioning Day'])
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday


In [13]:
# CONVERT DATE COLUMN TO THE CORRECT FORMAT
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y', dayfirst=True)

# CHECK THE RESULT
print(data['Date'].head())

0   2017-12-01
1   2017-12-01
2   2017-12-01
3   2017-12-01
4   2017-12-01
Name: Date, dtype: datetime64[ns]


In [14]:
#ENCODING
categorical_cols = data.select_dtypes(include=['object']).columns
print(categorical_cols)

Index(['Seasons', 'Holiday'], dtype='object')


In [15]:
# Perform one-hot encoding on categorical features
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons_Spring,Seasons_Summer,Seasons_Winter,Holiday_No Holiday
0,2017-12-01,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,False,False,True,True
1,2017-12-01,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,False,False,True,True
2,2017-12-01,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,False,False,True,True
3,2017-12-01,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,False,False,True,True
4,2017-12-01,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,False,False,True,True


In [16]:
#HANDLING OUTLIERS
def remove_outliers_iqr(data, col):
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]
    return data

# Apply the function to relevant numerical columns
numerical_cols = data.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    data = remove_outliers_iqr(data, col)

In [18]:
data.describe()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm)
count,6850,6850.0,6850.0,6850.0,6850.0,6850.0,6850.0,6850.0,6850.0,6850.0,6850.0
mean,2018-06-04 06:06:37.313868544,702.885255,11.257372,12.390336,58.028321,1.596263,1465.680292,3.735066,0.39347,0.0,0.0
min,2017-12-01 00:00:00,0.0,0.0,-15.1,0.0,0.0,27.0,-25.6,0.0,0.0,0.0
25%,2018-03-02 00:00:00,221.0,5.0,3.6,44.0,0.9,990.0,-5.2,0.0,0.0,0.0
50%,2018-06-05 00:00:00,518.0,10.0,12.5,58.0,1.4,1724.0,4.4,0.0,0.0,0.0
75%,2018-09-09 00:00:00,1056.0,18.0,21.6,73.0,2.2,2000.0,14.2,0.62,0.0,0.0
max,2018-11-30 00:00:00,2375.0,23.0,39.0,98.0,4.3,2000.0,26.1,2.25,0.0,0.0
std,,603.288264,7.262623,11.432486,18.687018,0.929482,590.245334,12.848216,0.623899,0.0,0.0


In [21]:
#FEATURE IMPORTANCE MEASURES 
from sklearn.ensemble import RandomForestRegressor

# Step 1: Drop the target variable and define the features
X = data.drop(columns=['Rented Bike Count'])
y = data['Rented Bike Count']

# Step 2: Check and handle datetime features (if necessary)
# Convert datetime features to numerical values if present
date_cols = X.select_dtypes(include=['datetime64']).columns
if not date_cols.empty:
    for col in date_cols:
        X[col + '_Year'] = X[col].dt.year
        X[col + '_Month'] = X[col].dt.month
        X[col + '_Day'] = X[col].dt.day
        X[col + '_Hour'] = X[col].dt.hour
    # Drop original datetime columns
    X = X.drop(columns=date_cols)

# Step 3: Initialize and fit a RandomForestRegressor model
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

# Step 4: Calculate and display feature importances
importances = model.feature_importances_
feature_importances = pd.Series(importances, index=X.columns).sort_values(ascending=False)

# Display the feature importances
print(feature_importances)


Temperature(°C)              0.327681
Hour                         0.326086
Solar Radiation (MJ/m2)      0.076153
Date_Day                     0.046765
Humidity(%)                  0.042885
Dew point temperature(°C)    0.041783
Date_Month                   0.035966
Seasons_Winter               0.033374
Visibility (10m)             0.030736
Wind speed (m/s)             0.025108
Holiday_No Holiday           0.007171
Seasons_Summer               0.003558
Seasons_Spring               0.002482
Date_Year                    0.000252
Snowfall (cm)                0.000000
Rainfall(mm)                 0.000000
Date_Hour                    0.000000
dtype: float64


In [22]:
#FEATURE SCALING
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the features
X_scaled = scaler.fit_transform(X)

In [26]:
#TRAIN AND TEST SPLITS
from sklearn.model_selection import train_test_split

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Display the shapes of the splits
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

Training set: (5480, 17), Test set: (1370, 17)
