
## [Prof. Pedram Jahangiry](https://huntsman.usu.edu/directory/jahangiry-pedram) 
[GitHub account](https://github.com/PJalgotrader) 

[YouTube Channel](https://www.youtube.com/channel/UCNDElcuuyX-2pSatVBDpJJQ/playlists) 

In [1]:
# Numerical, data-frame and plotting libraries used by the cells in this notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()  # apply seaborn's default theme so plain matplotlib plots pick it up too
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any deprecation or data-conversion warnings that later cells may raise.
warnings.filterwarnings('ignore')

In [2]:
# Seed handed to train_test_split as `random_state` so every split in this notebook is reproducible.
rand_state = 1000

---------------

In [3]:
# Read the bike-share data and discard the 'dteday' column in one chained expression,
# so `df` is bound exactly once and never mutated in place.
df = pd.read_csv("bikeshare.csv").drop(columns='dteday')

In [4]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,notbizday,weathersit,temp,hum,windspeed,cnt
0,1,0,1,0,0,6,1,1,-1.334609,0.947345,-1.553844,16
1,1,0,1,1,0,6,1,1,-1.438475,0.895513,-1.553844,40
2,1,0,1,2,0,6,1,1,-1.438475,0.895513,-1.553844,32
3,1,0,1,3,0,6,1,1,-1.334609,0.636351,-1.553844,13
4,1,0,1,4,0,6,1,1,-1.334609,0.636351,-1.553844,1


In [5]:
# Independent copy of `df`; the categorical dtype conversion is applied to this copy.
df_cat = df.copy()

In [6]:
# Columns that hold discrete codes rather than continuous measurements.
categorical = [
    'season', 'yr', 'mnth', 'hr',
    'holiday', 'weekday', 'notbizday', 'weathersit',
]

# Re-type all of them as pandas categoricals in a single astype call.
df_cat = df_cat.astype(dict.fromkeys(categorical, "category"))

In [7]:
# Preview the first five rows after the dtype conversion.
df_cat.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,notbizday,weathersit,temp,hum,windspeed,cnt
0,1,0,1,0,0,6,1,1,-1.334609,0.947345,-1.553844,16
1,1,0,1,1,0,6,1,1,-1.438475,0.895513,-1.553844,40
2,1,0,1,2,0,6,1,1,-1.438475,0.895513,-1.553844,32
3,1,0,1,3,0,6,1,1,-1.334609,0.636351,-1.553844,13
4,1,0,1,4,0,6,1,1,-1.334609,0.636351,-1.553844,1


In [8]:
# One-hot encode every categorical column, dropping the first level of each to avoid
# perfectly collinear indicator columns.
# `dtype=np.uint8` pins the indicator dtype: pandas >= 2.0 defaults to bool, which would
# display True/False instead of the 0/1 values this notebook shows. uint8 was the default
# before pandas 2.0, so older environments behave exactly as before.
df_dum = pd.get_dummies(df_cat, drop_first=True, dtype=np.uint8)

In [9]:
# Preview the first five rows of the dummy-encoded frame.
df_dum.head()

Unnamed: 0,temp,hum,windspeed,cnt,season_2,season_3,season_4,yr_1,mnth_2,mnth_3,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,notbizday_1,weathersit_2,weathersit_3,weathersit_4
0,-1.334609,0.947345,-1.553844,16,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,-1.438475,0.895513,-1.553844,40,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
2,-1.438475,0.895513,-1.553844,32,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
3,-1.334609,0.636351,-1.553844,13,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,-1.334609,0.636351,-1.553844,1,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0


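Let's keep track of six versions of the data (here "standardization" means `StandardScaler` and "normalization" means `MinMaxScaler`):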
1. categoricals
2. categoricals with standardization
3. categoricals with normalization
4. dummy
5. dummy with standardization
6. dummy with normalization

In [10]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [11]:
# 1. categoricals
y = df_cat['cnt']
X = df_cat.drop('cnt', axis=1) # becareful inplace= False

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rand_state)

In [12]:
# 2. categoricals with standardization
# Single StandardScaler instance; later cells refit it several times (features, then target).
stn = StandardScaler()

In [13]:
# Fit the scaler on the training features only, then reuse that fit for the test features.
X_train_stn = stn.fit_transform(X_train)
X_test_stn = stn.transform(X_test)
# The same `stn` object is refit here on the training target (reshaped to one column),
# so after this line it holds the target's statistics instead of the features'.
# Statement order therefore matters: the feature transforms must run before this refit.
y_train_stn = stn.fit_transform(np.array(y_train).reshape(-1,1))
y_test_stn = stn.transform(np.array(y_test).reshape(-1,1))

In [14]:
# Wrap the standardized training features in a DataFrame to preview the first three rows.
pd.DataFrame(X_train_stn).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.449324,-0.996144,-0.738859,-0.51551,-0.174129,1.001403,-0.679162,2.47257,-0.813233,1.255764,-0.214048
1,-1.352801,1.003871,1.58488,-1.383875,-0.174129,0.502465,-0.679162,0.905475,-1.33113,1.255764,2.094346
2,-1.352801,-0.996144,-1.610262,-1.09442,-0.174129,0.502465,-0.679162,-0.661619,-1.227551,-0.345836,-0.214048


In [15]:
# 3. Normalization (MinMaxScaler)
# Single MinMaxScaler instance; later cells refit it several times (features, then target).
norm = MinMaxScaler()

In [16]:
# 3. raw with normalization
X_train_norm = norm.fit_transform(X_train)
X_test_norm  = norm.transform(X_test)
y_train_norm = norm.fit_transform(np.array(y_train).reshape(-1,1))
y_test_norm  = norm.transform(np.array(y_test).reshape(-1,1))

In [17]:
# Wrap the min-max scaled training features in a DataFrame to preview the first three rows.
pd.DataFrame(X_train_norm).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.333333,0.0,0.272727,0.347826,0.0,0.833333,0.0,0.666667,0.326531,0.87,0.193018
1,0.0,1.0,1.0,0.086957,0.0,0.666667,0.0,0.333333,0.22449,0.87,0.52639
2,0.0,0.0,0.0,0.173913,0.0,0.666667,0.0,0.0,0.244898,0.56,0.193018


In [18]:
# 4. dummy (this means that the continuos variables are unchanged)
from sklearn.model_selection import train_test_split

# 4. dummy: the categorical columns are one-hot encoded, the continuous ones are untouched
X_dum = df_dum.drop(columns='cnt')  # drop returns a new frame, so df_dum keeps its 'cnt' column
y_dum = df_dum['cnt']

# Same test fraction and seed as the split on the categorical version.
X_train_dum, X_test_dum, y_train_dum, y_test_dum = train_test_split(
    X_dum, y_dum, test_size=0.3, random_state=rand_state
)

In [19]:
# Preview the first three rows of the dummy-encoded training features.
X_train_dum.head(3)

Unnamed: 0,temp,hum,windspeed,season_2,season_3,season_4,yr_1,mnth_2,mnth_3,mnth_4,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,notbizday_1,weathersit_2,weathersit_3,weathersit_4
2243,-0.81528,1.258339,-0.211685,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0
17261,-1.334609,1.258339,2.10644,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
435,-1.230743,-0.348463,-0.211685,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [20]:
# 5. dummy with standardization (everything is standardized)
X_train_dum_stn = stn.fit_transform(X_train_dum)
X_test_dum_stn  = stn.transform(X_test_dum)
y_train_dum_stn = stn.fit_transform(np.array(y_train_dum).reshape(-1,1))
y_test_dum_stn  = stn.transform(np.array(y_test_dum).reshape(-1,1))

In [21]:
# Wrap the standardized dummy features in a DataFrame to preview the first three rows.
pd.DataFrame(X_train_dum_stn).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
0,-0.813233,1.255764,-0.214048,1.718956,-0.592129,-0.564661,-0.996144,-0.291545,-0.303009,3.301989,...,-0.411961,-0.404973,-0.404423,-0.408542,2.436304,-0.406622,-0.679162,-0.591495,3.359744,-0.009067
1,-1.33113,1.255764,2.094346,-0.581748,-0.592129,-0.564661,1.003871,-0.291545,-0.303009,-0.302848,...,-0.411961,-0.404973,-0.404423,2.447729,-0.410458,-0.406622,-0.679162,1.69063,-0.297642,-0.009067
2,-1.227551,-0.345836,-0.214048,-0.581748,-0.592129,-0.564661,-0.996144,-0.291545,-0.303009,-0.302848,...,-0.411961,-0.404973,-0.404423,2.447729,-0.410458,-0.406622,-0.679162,-0.591495,-0.297642,-0.009067


In [22]:
# 6. dummy with normalization (everything is normalized)
X_train_dum_norm = norm.fit_transform(X_train_dum)
X_test_dum_norm  = norm.transform(X_test_dum)
y_train_dum_norm = norm.fit_transform(np.array(y_train_dum).reshape(-1,1))
y_test_dum_norm  = norm.transform(np.array(y_test_dum).reshape(-1,1))

In [23]:
# Wrap the min-max scaled dummy features in a DataFrame to preview the first three rows.
pd.DataFrame(X_train_dum_norm).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
0,0.326531,0.87,0.193018,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.22449,0.87,0.52639,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.244898,0.56,0.193018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


---

##  SVM Regression 

In [24]:
from sklearn.svm import SVR

In [25]:
# 1. SVR with default hyperparameters on the unscaled categorical features and raw target.
SVR_regression = SVR()
SVR_regression.fit(X_train, y_train)

SVR()

In [26]:
# 2. SVR with default hyperparameters on the standardized features and standardized target.
SVR_regression_stn = SVR()
SVR_regression_stn.fit(X_train_stn, y_train_stn)

SVR()

In [27]:
# 3. SVR with default hyperparameters on the min-max scaled features and min-max scaled target.
SVR_regression_norm = SVR()
SVR_regression_norm.fit(X_train_norm, y_train_norm)

SVR()

In [28]:
# 4. SVR with default hyperparameters on the dummy-encoded features and raw target.
SVR_regression_dum = SVR()
SVR_regression_dum.fit(X_train_dum, y_train_dum)

SVR()

In [29]:
# 5. SVR with default hyperparameters on the standardized dummy features and standardized target.
SVR_regression_dum_stn = SVR()
SVR_regression_dum_stn.fit(X_train_dum_stn, y_train_dum_stn)

SVR()

In [30]:
# 6. SVR with default hyperparameters on the min-max scaled dummy features and min-max scaled target.
SVR_regression_dum_norm = SVR()
SVR_regression_dum_norm.fit(X_train_dum_norm, y_train_dum_norm)

SVR()

In [31]:
# Test-set predictions: each fitted regressor is paired with the test features that were
# prepared the same way as its training features, and evaluated in this order.
(
    y_hat,
    y_hat_stn,
    y_hat_norm,
    y_hat_dum,
    y_hat_dum_stn,
    y_hat_dum_norm,
) = (
    fitted.predict(features)
    for fitted, features in (
        (SVR_regression, X_test),
        (SVR_regression_stn, X_test_stn),
        (SVR_regression_norm, X_test_norm),
        (SVR_regression_dum, X_test_dum),
        (SVR_regression_dum_stn, X_test_dum_stn),
        (SVR_regression_dum_norm, X_test_dum_norm),
    )
)

In [32]:
# Print the test-set score of each regression variant (for scikit-learn regressors,
# `.score` returns R^2). Each model is scored against the target prepared the same way
# as the one it was trained on.
# `np.str` was only an alias of the builtin `str`; it was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError. The builtin produces the same text.
print('Raw data: ' + str(SVR_regression.score(X_test, y_test)))
print('Raw +stn: ' + str(SVR_regression_stn.score(X_test_stn, y_test_stn)))
print('Raw +nrm: ' + str(SVR_regression_norm.score(X_test_norm, y_test_norm)))
print('dummy   : ' + str(SVR_regression_dum.score(X_test_dum, y_test_dum)))
print('dum +stn: ' + str(SVR_regression_dum_stn.score(X_test_dum_stn, y_test_dum_stn)))
print('dum +nrm: ' + str(SVR_regression_dum_norm.score(X_test_dum_norm, y_test_dum_norm)))


Raw data: 0.3824409110511513
Raw +stn: 0.6173062695511451
Raw +nrm: 0.6405952814948637
dummy   : 0.35658207520733265
dum +stn: 0.9312607394252405
dum +nrm: 0.8847050416534323


---
---

## SVM Classification

In [33]:
# Binary label: 1 where the count is strictly greater than 500, otherwise 0.
overload = np.where(df_cat['cnt'].gt(500), 1, 0)

In [34]:
# Cross-tabulating the label with itself puts every observation on the diagonal, so the
# diagonal cells give each class's share of all rows (scaled to percent by the *100).
pd.crosstab(overload,overload, normalize='all')*100

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,92.640543,0.0
1,0.0,7.359457


The classes are imbalanced: only about 7.4% of the observations are labeled as overload (`cnt` > 500), while about 92.6% are not.

In [35]:
from sklearn.model_selection import train_test_split

# Classification setup: same feature frame as the regression section, with the binary
# overload flag as the target. X, y, X_train and X_test are rebound by this cell.
X = df_cat.drop(columns='cnt')  # drop returns a new frame, so df_cat keeps its 'cnt' column
y = overload

# Same test fraction and seed as the regression splits.
X_train, X_test, y_train_classification, y_test_classification = train_test_split(
    X, y, test_size=0.3, random_state=rand_state
)

---

In [36]:
from sklearn.svm import SVC

In [37]:
# 1. SVC with default hyperparameters on the unscaled categorical features.

SVC_classifier = SVC()
SVC_classifier.fit(X_train, y_train_classification)

SVC()

In [38]:
# 2. SVC with default hyperparameters on the standardized features.
# NOTE(review): X_train_stn was built from the regression split; pairing it with
# y_train_classification assumes both splits selected the same rows — confirm.

SVC_classifier_stn = SVC()
SVC_classifier_stn.fit(X_train_stn, y_train_classification)

SVC()

In [39]:
# 3. SVC with default hyperparameters on the min-max scaled features.
# NOTE(review): X_train_norm comes from the regression split; assumes the same rows as
# the classification split — confirm.

SVC_classifier_norm = SVC()
SVC_classifier_norm.fit(X_train_norm, y_train_classification)

SVC()

In [40]:
# 4. SVC with default hyperparameters on the dummy-encoded features.
# NOTE(review): X_train_dum comes from the dummy regression split; assumes the same rows
# as the classification split — confirm.

SVC_classifier_dum = SVC()
SVC_classifier_dum.fit(X_train_dum, y_train_classification)

SVC()

In [41]:
# 5. SVC with default hyperparameters on the standardized dummy features.
# NOTE(review): X_train_dum_stn comes from the dummy regression split; assumes the same
# rows as the classification split — confirm.

SVC_classifier_dum_stn = SVC()
SVC_classifier_dum_stn.fit(X_train_dum_stn, y_train_classification)

SVC()

In [42]:
# 6. SVC with default hyperparameters on the min-max scaled dummy features.
# NOTE(review): X_train_dum_norm comes from the dummy regression split; assumes the same
# rows as the classification split — confirm.

SVC_classifier_dum_norm = SVC()
SVC_classifier_dum_norm.fit(X_train_dum_norm, y_train_classification)

SVC()

In [43]:
# Test-set class predictions: each fitted classifier is paired with the test features
# prepared the same way as its training features. These y_hat* names were used for the
# regression predictions earlier in the notebook and are rebound here.
(
    y_hat,
    y_hat_stn,
    y_hat_norm,
    y_hat_dum,
    y_hat_dum_stn,
    y_hat_dum_norm,
) = (
    fitted.predict(features)
    for fitted, features in (
        (SVC_classifier, X_test),
        (SVC_classifier_stn, X_test_stn),
        (SVC_classifier_norm, X_test_norm),
        (SVC_classifier_dum, X_test_dum),
        (SVC_classifier_dum_stn, X_test_dum_stn),
        (SVC_classifier_dum_norm, X_test_dum_norm),
    )
)

In [44]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# Print the test-set F1 score of each classification variant against the held-out labels.
# `np.str` was only an alias of the builtin `str`; it was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError. The builtin produces the same text.
print('Raw data: ' + str(f1_score(y_test_classification,y_hat)))
print('Raw +stn: ' + str(f1_score(y_test_classification,y_hat_stn)))
print('Raw +nrm: ' + str(f1_score(y_test_classification,y_hat_norm)))
print('dummy   : ' + str(f1_score(y_test_classification,y_hat_dum)))
print('dum +stn: ' + str(f1_score(y_test_classification,y_hat_dum_stn)))
print('dum +nrm: ' + str(f1_score(y_test_classification,y_hat_dum_norm)))


Raw data: 0.0
Raw +stn: 0.17040358744394618
Raw +nrm: 0.0
dummy   : 0.7763347763347763
dum +stn: 0.7790368271954675
dum +nrm: 0.7763347763347763
