### Support Vector Regression

In [46]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

In [47]:
# Load the 'tips' example dataset through seaborn.
df = sns.load_dataset('tips')

In [48]:
# Preview the first five rows to see the available columns.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [49]:
# Show the category frequencies of each categorical column.
# dropna=False makes value_counts() report missing values as their own row
# (by default they are silently excluded), so the absence of a NaN row in the
# printed output is what actually confirms these columns have no null values.
for col in ['sex', 'smoker', 'day', 'time']:
    print(df[col].value_counts(dropna=False))

sex
Male      157
Female     87
Name: count, dtype: int64
smoker
No     151
Yes     93
Name: count, dtype: int64
day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64
time
Dinner    176
Lunch      68
Name: count, dtype: int64


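We split the data into train and test sets *before* encoding so that the encoders are fitted on the training data only. If the preprocessing were fitted on the full dataset, information about the test set would leak into the model (this is known as data leakage).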

In [50]:
# Features: the two numeric columns plus the four categorical ones.
X = df[
    [
        'total_bill',
        'tip',
        'sex',
        'smoker',
        'day',
        'time',
    ]
]

# Target: the party size, kept as a one-column DataFrame.
y = df[['size']]

In [51]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=34,
)

In [52]:
# Feature encoding, part 1: label encoders for the two-valued columns
# ('sex', 'smoker', 'time'). They are only created here and fitted later.
from sklearn.preprocessing import LabelEncoder

# One independent encoder per column, so each keeps its own fitted classes.
le1, le2, le3 = LabelEncoder(), LabelEncoder(), LabelEncoder()

In [53]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can hide real problems. Kept unchanged because other cells may rely on it;
# consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

# Fit one label encoder per two-valued column, using the training rows only.
# assign() builds a new frame instead of writing columns into the frame
# returned by train_test_split, which avoids the chained-assignment
# (SettingWithCopyWarning) pitfall on pandas versions that still emit it.
X_train = X_train.assign(
    sex=le1.fit_transform(X_train['sex']),
    smoker=le2.fit_transform(X_train['smoker']),
    time=le3.fit_transform(X_train['time']),
)

In [54]:
# Inspect the label-encoded training features; 'day' is still a text column here.
X_train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time
82,10.07,1.83,0,0,Thur,1
8,15.04,1.96,1,0,Sun,0
150,14.07,2.5,1,0,Sun,0
127,14.52,2.0,0,0,Thur,1
60,20.29,3.21,1,1,Sat,0


In [55]:
# Apply the encoders that were fitted on the training data to X_test.
# Only transform() is used here (never fit), so nothing is learned from the
# test rows. assign() returns a new frame rather than writing columns into
# the frame returned by train_test_split (avoids chained assignment).
X_test = X_test.assign(
    sex=le1.transform(X_test['sex']),
    smoker=le2.transform(X_test['smoker']),
    time=le3.transform(X_test['time']),
)
X_test.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time
3,23.68,3.31,1,0,Sun,0
53,9.94,1.56,1,0,Sun,0
63,18.29,3.76,1,1,Sat,0
28,21.7,4.3,1,0,Sat,0
202,13.0,2.0,0,1,Thur,1


In [56]:
# One-hot encoding of "day"

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    # Select the column by name instead of by position: the frames passed to
    # this transformer are still DataFrames, and a name cannot drift out of
    # sync with the column order ('day' sits at positional index 4).
    # drop='first' omits one dummy column per encoded feature.
    transformers=[('onehot', OneHotEncoder(drop='first'), ['day'])],
    # Every column that is not encoded is passed through unchanged; the
    # one-hot columns come first in the output, followed by the rest.
    remainder="passthrough"
)

In [57]:
# Fit the transformer on the training rows, then reuse the fitted
# transformer on the test rows without refitting.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

# Peek at the first five transformed test rows.
X_test[:5]

array([[ 0.  ,  1.  ,  0.  , 23.68,  3.31,  1.  ,  0.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  ,  9.94,  1.56,  1.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , 18.29,  3.76,  1.  ,  1.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , 21.7 ,  4.3 ,  1.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  1.  , 13.  ,  2.  ,  0.  ,  1.  ,  1.  ]])

In [58]:
# SVR: baseline model with default hyper-parameters
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error

svr = SVR()

# y_train is a one-column DataFrame, but fit() expects a 1-D target.
# Flatten it explicitly instead of relying on scikit-learn's implicit
# conversion (which emits a DataConversionWarning).
# NOTE(review): the features are not scaled; SVR is sensitive to feature
# scale, so adding a scaler is worth evaluating separately.
svr.fit(X_train, y_train.to_numpy().ravel())
y_pred = svr.predict(X_test)

# R^2 first, then mean absolute error, both on the held-out test set.
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))


0.3572745519405601
0.4975524962162039


In [59]:
# First five predictions of the baseline model.
y_pred[:5]

array([2.5348277 , 1.9889988 , 2.10695289, 2.36747522, 1.91882307])

In [60]:
# True targets for the same five rows, transposed for side-by-side reading.
y_test.head(5).T

Unnamed: 0,3,53,63,28,202
size,2,2,4,2,2


In [61]:
# Hyper-parameter tuning
from sklearn.model_selection import GridSearchCV

model = SVR()

# 5 x 5 grid over C and gamma, with the kernel fixed to RBF (25 candidates).
params = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# 5-fold cross-validation; refit=True retrains the best candidate so that
# `grid` can be used directly for prediction afterwards.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
    refit=True,
    verbose=2,
)


In [62]:
# Run the search. y_train is a one-column DataFrame, so flatten it to the
# 1-D target that fit() expects; otherwise every one of the cross-validation
# fits goes through scikit-learn's implicit conversion and its
# DataConversionWarning.
grid.fit(X_train, y_train.to_numpy().ravel())

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.0

In [63]:
# Parameter combination selected by the grid search.
grid.best_params_

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

In [64]:
# Predict on the held-out test set with the tuned model.
y_pred_tuned = grid.predict(X_test)

# Same two metrics as for the baseline, so the numbers are directly comparable.
tuned_r2 = r2_score(y_test, y_pred_tuned)
tuned_mae = mean_absolute_error(y_test, y_pred_tuned)
print(tuned_r2)
print(tuned_mae)

0.3725966122380918
0.49816594821059773
