In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load seaborn's example "tips" dataset into a DataFrame.
df = sns.load_dataset(name='tips')

In [4]:
# Preview the first rows to see the available columns and value formats.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# Summarize column dtypes and non-null counts (checks for missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [6]:
# Count how many rows fall into each `sex` category.
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [7]:
# Count how many rows fall into each `smoker` category.
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [8]:
# Count rows per `day`; this column has more than two categories.
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [9]:
# Count how many rows fall into each `time` category.
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [10]:
### Feature encoding: label encoding for the binary columns, one-hot encoding for `day`

In [12]:
# List the column names from which the features and the target are chosen.
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [13]:
### Separate the independent features (x) from the dependent target (y)

# Target: the bill amount to be predicted.
y = df.loc[:, 'total_bill']
# Features: every remaining column (tip, sex, smoker, day, time, size).
x = df.drop(columns=['total_bill'])

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=20,
)

In [15]:
# Inspect the training features produced by the split.
x_train

Unnamed: 0,tip,sex,smoker,day,time,size
122,2.50,Male,No,Thur,Lunch,2
80,3.00,Male,Yes,Thur,Lunch,2
63,3.76,Male,Yes,Sat,Dinner,4
50,2.50,Male,No,Sun,Dinner,2
170,10.00,Male,Yes,Sat,Dinner,3
...,...,...,...,...,...,...
156,5.00,Male,No,Sun,Dinner,6
223,3.00,Female,No,Fri,Lunch,3
15,3.92,Male,No,Sun,Dinner,2
218,1.44,Male,Yes,Sat,Dinner,2


In [16]:
# Inspect the training target (total_bill) produced by the split.
y_train

122    14.26
80     19.44
63     18.29
50     12.54
170    50.81
       ...  
156    48.17
223    15.98
15     21.58
218     7.74
99     12.46
Name: total_bill, Length: 183, dtype: float64

In [17]:
### apply label encoding for all binary features

from sklearn.preprocessing import LabelEncoder

In [18]:
# One independent encoder per binary column, so each keeps its own fitted
# classes: le1 -> sex, le2 -> smoker, le3 -> time.
le1, le2, le3 = (LabelEncoder() for _ in range(3))

In [19]:
# Fit each encoder on the training data only and replace the text categories
# with their integer codes.
for col_name, encoder in (('sex', le1), ('smoker', le2), ('time', le3)):
    x_train[col_name] = encoder.fit_transform(x_train[col_name])

In [20]:
# Check the result: sex, smoker and time are now integer codes, while day is
# still a text category.
x_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
122,2.5,1,0,Thur,1,2
80,3.0,1,1,Thur,1,2
63,3.76,1,1,Sat,0,4
50,2.5,1,0,Sun,0,2
170,10.0,1,1,Sat,0,3


In [21]:
# Reuse the encoders fitted on the training data (transform only, no refit)
# so the test set gets the same category-to-integer mapping.
for col_name, encoder in (('sex', le1), ('smoker', le2), ('time', le3)):
    x_test[col_name] = encoder.transform(x_test[col_name])

In [22]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [23]:
# One-hot encode the multi-category `day` column and pass every other column
# through unchanged.
# - The column is selected by name rather than by position, so the right
#   column is encoded even if the feature order changes. Name-based selection
#   requires the transformer to be given a DataFrame; passing an array raises
#   an error instead of silently encoding whatever sits at a fixed position.
# - drop='first' omits the first category's indicator column, which is
#   implied by the remaining ones.
ct = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(drop='first'), ['day'])],
    remainder='passthrough',
)

In [27]:
import sys
import numpy
# Global NumPy setting: print arrays in full instead of summarizing them
# with "..." once they exceed the default element threshold.
numpy.set_printoptions(threshold=sys.maxsize)
# Fit the column transformer on the training features and replace x_train
# with the transformed output.
# NOTE(review): x_train is rebound here, so re-running this cell would fit the
# transformer on already-transformed data - restart and run all instead.
x_train=ct.fit_transform(x_train)

In [28]:
# Apply the transformer fitted on the training data to the test features
# (transform only, no refit). x_test is rebound to the transformed output.
x_test=ct.transform(x_test)

In [29]:
from sklearn.svm import SVR

# Baseline support vector regressor with the library's default settings,
# spelled out here so the untuned starting point is explicit.
# NOTE(review): the features are passed in unscaled; SVMs are sensitive to
# feature scale, so standardizing them first may be worth trying.
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)

In [30]:
# Train the baseline model (fit returns the estimator itself), then predict
# total_bill for the held-out test rows.
y_predict = svr.fit(x_train, y_train).predict(x_test)

In [31]:
from sklearn.metrics import r2_score, mean_absolute_error

In [32]:
# Score the baseline predictions on the test set.
# `accuracy` holds the R^2 score (coefficient of determination), not a
# classification accuracy; `mae` is the mean absolute error in bill units.
accuracy = r2_score(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
print(accuracy, mae, sep='\n')

0.35601683959928343
5.57163825689048


In [33]:
### Hyperparameter tuning using GridSearchCV

In [34]:
# Search space for the RBF-kernel SVR: 5 values of C x 5 values of gamma
# gives 25 candidate combinations.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

In [36]:
from sklearn.model_selection import GridSearchCV

# Exhaustive cross-validated search over param_grid. refit=True retrains the
# best combination on the full training set, so `grid` can be used directly
# for prediction; verbose=3 logs the score of every fold.
grid = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
)

In [37]:
# Run the search on the training data (fit returns the search object), then
# predict the test set with the refitted best estimator.
y_predicted_grid = grid.fit(x_train, y_train).predict(x_test)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.185 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.080 total time=   0.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.004 total time=   0.0s
[CV 4/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.028 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.039 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.1, kernel=rbf;, score=-0.073 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.111 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.087 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.093 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.176 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=0.01, kernel=rbf;, score=-0.177 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=0.01, kernel=rbf;

In [38]:
# Hyperparameter combination with the best mean cross-validated score.
grid.best_params_

{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}

In [39]:
# Mean cross-validated score of the best combination. No `scoring` was passed
# to GridSearchCV, so this is the estimator's default score (R^2 for SVR).
grid.best_score_

np.float64(0.545018152209758)

In [40]:
# Score the tuned model's predictions on the test set, for comparison with
# the baseline. `accuracy_grid` holds the R^2 score, not a classification
# accuracy; `mae_grid` is the mean absolute error in bill units.
accuracy_grid = r2_score(y_test, y_predicted_grid)
mae_grid = mean_absolute_error(y_test, y_predicted_grid)
print(accuracy_grid, mae_grid, sep='\n')

0.4829139708265714
5.088533110514922
