In [1]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# LOAD DATASET
# The encoding is passed explicitly; the column headers contain a degree sign (°),
# which is presumably why the default decoding is not used — TODO confirm.
DATASET_PATH = '../dataset/SeoulBikeData.csv'
data = pd.read_csv(DATASET_PATH, encoding='Windows-1252')
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


## REMOVE THE "FUNCTIONING DAY" AND "DEW POINT TEMPERATURE" FEATURES SINCE THEY ARE NOT NEEDED

In [2]:
# Drop the two columns that this analysis does not use.
# NOTE(review): only the 'Functioning Day' flag is removed here; this cell does not
# filter any rows on it — confirm that is intended.
unused_features = ['Functioning Day', 'Dew point temperature(°C)']
data = data.drop(unused_features, axis=1)
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday
0,01/12/2017,254,0,-5.2,37,2.2,2000,0.0,0.0,0.0,Winter,No Holiday
1,01/12/2017,204,1,-5.5,38,0.8,2000,0.0,0.0,0.0,Winter,No Holiday
2,01/12/2017,173,2,-6.0,39,1.0,2000,0.0,0.0,0.0,Winter,No Holiday
3,01/12/2017,107,3,-6.2,40,0.9,2000,0.0,0.0,0.0,Winter,No Holiday
4,01/12/2017,78,4,-6.0,36,2.3,2000,0.0,0.0,0.0,Winter,No Holiday


## CONVERT DATE COLUMN TO THE CORRECT FORMAT

In [3]:
# Parse the day/month/year strings in 'Date' into real datetime values.
date_format = '%d/%m/%Y'
data['Date'] = pd.to_datetime(data['Date'], format=date_format, dayfirst=True)

# Quick check that the column now has a datetime dtype.
print(data['Date'].head())

0   2017-12-01
1   2017-12-01
2   2017-12-01
3   2017-12-01
4   2017-12-01
Name: Date, dtype: datetime64[ns]


In [4]:
# SCALING
from sklearn.preprocessing import StandardScaler

# Group the columns of `data` by dtype; only the numeric group is standardised below.
datetime_cols = data.select_dtypes(include=['datetime64']).columns
categorical_cols = data.select_dtypes(include=['object', 'bool']).columns
numerical_cols = data.select_dtypes(include=['number']).columns

# NOTE(review): `numerical_cols` is every numeric column, so the target
# 'Rented Bike Count' and 'Hour' are standardised as well, and the scaler is
# fitted on the whole frame (the train/test split happens later) — confirm
# that this is intended.
scaler = StandardScaler()
numeric_block = data[numerical_cols]
data[numerical_cols] = scaler.fit_transform(numeric_block)

In [5]:
# Show each column's dtype and non-null count after the scaling step.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Date                     8760 non-null   datetime64[ns]
 1   Rented Bike Count        8760 non-null   float64       
 2   Hour                     8760 non-null   float64       
 3   Temperature(°C)          8760 non-null   float64       
 4   Humidity(%)              8760 non-null   float64       
 5   Wind speed (m/s)         8760 non-null   float64       
 6   Visibility (10m)         8760 non-null   float64       
 7   Solar Radiation (MJ/m2)  8760 non-null   float64       
 8   Rainfall(mm)             8760 non-null   float64       
 9   Snowfall (cm)            8760 non-null   float64       
 10  Seasons                  8760 non-null   object        
 11  Holiday                  8760 non-null   object        
dtypes: datetime64[ns](1), float64(9), object(2)

## We will now start the Encoding process for the categorical features.

In [6]:
# Collect the columns that are still stored as strings (object dtype);
# these are the ones that need to be encoded.
categorical_cols = data.select_dtypes(include='object').columns
print(categorical_cols)

Index(['Seasons', 'Holiday'], dtype='object')


In [7]:
# One-hot encode both categorical features in a single call.
# drop_first=True drops the first level of each feature, so a feature with
# k levels yields k-1 indicator columns. Listing 'Holiday' before 'Seasons'
# keeps the indicator columns in the same order as encoding them one by one.
data = pd.get_dummies(data, columns=['Holiday', 'Seasons'], drop_first=True)
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Holiday_No Holiday,Seasons_Spring,Seasons_Summer,Seasons_Winter
0,2017-12-01,-0.69865,-1.661325,-1.513957,-1.042483,0.458476,0.925871,-0.655132,-0.1318,-0.171891,True,False,False,True
1,2017-12-01,-0.776175,-1.516862,-1.539074,-0.99337,-0.892561,0.925871,-0.655132,-0.1318,-0.171891,True,False,False,True
2,2017-12-01,-0.82424,-1.372399,-1.580936,-0.944257,-0.699556,0.925871,-0.655132,-0.1318,-0.171891,True,False,False,True
3,2017-12-01,-0.926571,-1.227936,-1.59768,-0.895144,-0.796059,0.925871,-0.655132,-0.1318,-0.171891,True,False,False,True
4,2017-12-01,-0.971535,-1.083473,-1.580936,-1.091596,0.554978,0.925871,-0.655132,-0.1318,-0.171891,True,False,False,True


## HANDLING OUTLIERS

In [8]:
def remove_outliers_iqr(data, col, k=1.5):
    """Return the rows of ``data`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive), where
    Q1 and Q3 are the 25th and 75th percentiles of ``data[col]`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    col : str
        Name of the numeric column used to compute the fences.
    k : float, default 1.5
        Fence multiplier. The default reproduces the conventional 1.5 * IQR
        rule; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` whose ``col`` value is within the fences. The
        original index labels are kept (the index is not reset).

    Notes
    -----
    * Rows where ``col`` is NaN are dropped, because they are not within any
      interval.
    * When Q1 equals Q3 (IQR is 0), both fences collapse onto that single
      value, so every row with a different value in ``col`` is removed.
    """
    q1, q3 = data[col].quantile([0.25, 0.75])
    spread = q3 - q1
    # `between` is inclusive on both ends by default, matching >= / <=.
    within_fences = data[col].between(q1 - k * spread, q3 + k * spread)
    return data[within_fences]

# Filter outliers column by column over every numeric column of `data`.
# The filtering is sequential: each column's quartiles are computed on the
# frame that the previous columns have already filtered.
# NOTE(review): this selection also covers the target 'Rented Bike Count' —
# confirm that trimming the target is intended.
numerical_cols = data.select_dtypes(include='number').columns
for col in numerical_cols:
    data = remove_outliers_iqr(data, col)

In [9]:
# Summary statistics of the frame after the outlier filtering step.
data.describe()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm)
count,6882,6882.0,6882.0,6882.0,6882.0,6882.0,6882.0,6882.0,6882.0,6882.0
mean,2018-06-04 01:16:47.497820160,-0.001614,-0.033124,-0.040962,-0.01396,-0.117147,0.047292,-0.194401,-0.1317999,-0.1718911
min,2017-12-01 00:00:00,-1.092473,-1.661325,-2.342815,-2.85966,-1.664583,-2.317786,-0.655132,-0.1317999,-0.1718911
25%,2018-03-02 00:00:00,-0.748266,-0.93901,-0.777194,-0.698693,-0.796059,-0.734592,-0.655132,-0.1317999,-0.1718911
50%,2018-06-04 00:00:00,-0.286222,-0.216695,-0.032059,-0.011112,-0.313546,0.472121,-0.655132,-0.1317999,-0.1718911
75%,2018-09-09 00:00:00,0.547937,0.93901,0.721448,0.725581,0.458476,0.925871,0.081604,-0.1317999,-0.1718911
max,2018-11-30 00:00:00,2.589922,1.661325,2.186601,1.953404,2.581534,0.925871,1.957978,-0.1317999,-0.1718911
std,,0.93467,1.047377,0.957071,0.918868,0.903696,0.970422,0.727627,1.274073e-14,1.962462e-14


## Train and Test split

In [11]:
from sklearn.model_selection import train_test_split # type: ignore

# Separate the target from the predictors.
# NOTE(review): X keeps every remaining column, including the datetime 'Date'
# column — confirm the downstream model can consume it.
target_col = 'Rented Bike Count'
y = data[target_col]
X = data.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
