In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = sns.load_dataset('tips')

In [4]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [6]:
df.shape

(244, 7)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [9]:
df['day'].value_counts()

(day
 Sat     87
 Sun     76
 Thur    62
 Fri     19
 Name: count, dtype: int64,
 ['Sun', 'Sat', 'Thur', 'Fri']
 Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun'])

In [10]:
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [12]:
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [14]:
## independent and dependent featuress
X = df.drop('total_bill', axis=1)
y = df['total_bill']

In [15]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [16]:
# Feature Encoding (Label Encoding and One Hot Encoding)
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()

In [18]:
X_train['sex'] = le1.fit_transform(X_train['sex'])
X_train['smoker'] = le2.fit_transform(X_train['smoker'])
X_train['time'] = le3.fit_transform(X_train['time'])

In [19]:
X_train

Unnamed: 0,tip,sex,smoker,day,time,size
115,3.50,0,0,Sun,0,2
181,5.65,1,1,Sun,0,2
225,2.50,0,1,Fri,1,2
68,2.01,1,0,Sat,0,2
104,4.08,0,0,Sat,0,2
...,...,...,...,...,...,...
106,4.06,1,1,Sat,0,2
14,3.02,0,0,Sun,0,2
92,1.00,0,1,Fri,0,2
179,3.55,1,1,Sun,0,2


In [20]:
X_test['sex'] = le1.transform(X_test['sex'])
X_test['smoker'] = le2.transform(X_test['smoker'])
X_test['time'] = le3.transform(X_test['time'])

In [21]:
X_test.head()

Unnamed: 0,tip,sex,smoker,day,time,size
24,3.18,1,0,Sat,0,2
6,2.0,1,0,Sun,0,2
153,2.0,1,0,Sun,0,4
211,5.16,1,1,Sat,0,4
198,2.0,0,1,Thur,1,2


In [22]:
## One Hot Encoding -- Column Transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [24]:
ct = ColumnTransformer(transformers=[('onehot', OneHotEncoder(drop='first'), [3])], remainder='passthrough')

In [27]:
import sys
np.set_printoptions(threshold=sys.maxsize)
X_train = ct.fit_transform(X_train)

In [28]:
X_test = ct.transform(X_test)

In [29]:
X_test

array([[1.  , 0.  , 0.  , 3.18, 1.  , 0.  , 0.  , 2.  ],
       [0.  , 1.  , 0.  , 2.  , 1.  , 0.  , 0.  , 2.  ],
       [0.  , 1.  , 0.  , 2.  , 1.  , 0.  , 0.  , 4.  ],
       [1.  , 0.  , 0.  , 5.16, 1.  , 1.  , 0.  , 4.  ],
       [0.  , 0.  , 1.  , 2.  , 0.  , 1.  , 1.  , 2.  ],
       [0.  , 1.  , 0.  , 2.  , 1.  , 1.  , 0.  , 2.  ],
       [0.  , 0.  , 1.  , 2.56, 1.  , 1.  , 1.  , 2.  ],
       [0.  , 0.  , 1.  , 2.52, 0.  , 0.  , 1.  , 2.  ],
       [0.  , 1.  , 0.  , 3.23, 1.  , 0.  , 0.  , 2.  ],
       [0.  , 0.  , 0.  , 3.  , 0.  , 1.  , 0.  , 2.  ],
       [0.  , 1.  , 0.  , 3.  , 1.  , 0.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 1.47, 1.  , 0.  , 0.  , 2.  ],
       [0.  , 0.  , 1.  , 1.5 , 0.  , 0.  , 1.  , 2.  ],
       [0.  , 1.  , 0.  , 2.  , 1.  , 1.  , 0.  , 2.  ],
       [0.  , 0.  , 1.  , 1.83, 0.  , 0.  , 1.  , 1.  ],
       [0.  , 0.  , 1.  , 1.36, 0.  , 0.  , 1.  , 3.  ],
       [0.  , 0.  , 1.  , 4.  , 1.  , 1.  , 1.  , 3.  ],
       [0.  , 1.  , 0.  , 3.92,

In [31]:
# Import SVR
from sklearn.svm import SVR
svr = SVR()

In [32]:
svr.fit(X_train, y_train)

In [33]:
y_pred = svr.predict(X_test)
y_pred

array([17.5561784 , 14.36577654, 19.63397234, 28.25543047, 14.02663803,
       15.14661512, 15.65622876, 14.69846537, 17.74841731, 17.10279588,
       17.00871008, 13.11497368, 12.69657312, 15.14661512, 12.61920692,
       14.56276428, 22.36183592, 20.05495874, 15.19753962, 27.62501035,
       20.51137651, 20.70196982, 20.81712863, 13.09052072, 21.43691299,
       14.41684672, 13.85511311, 23.80397724, 19.63397234, 27.64931837,
       22.79293822, 13.8188067 , 20.30086231, 18.25263814, 20.98187239,
       21.0145318 , 13.75259705, 26.55345115, 15.35776158, 14.77444494,
       12.88789193, 13.46643546, 15.6772362 , 15.81001   , 14.37448237,
       12.8355147 , 13.91918958, 17.21837273, 12.89649108, 15.99658633,
       14.83989577, 20.674379  , 26.42594064, 13.792125  , 18.67892859,
       13.73233647, 23.77247816, 13.53584868, 18.67078969, 20.11314707,
       28.25510507])

In [34]:
## Model Evaluation
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

0.49798620106004743
4.463296539661225
39.31122612339172


In [35]:
## Hyperparameter Tuning

In [36]:
from sklearn.model_selection import GridSearchCV
# defining parameteres

param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'liner', 'poly']
}

In [37]:
gird = GridSearchCV(SVR(), param_grid, refit=True, verbose=3, n_jobs=-1, cv=5)

In [39]:
gird.fit(X_train, y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=liner;, score=nan total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.019 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.078 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=liner;, score=nan total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=liner;, score=nan total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.563 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.168 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=liner;, score=nan total time=   0.0s
[CV 1/5] END ...C=0.1, gamma=0.01, kernel=poly;, score=-0.145 total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=0.01, kernel=poly;, score=-0.153 total time=   0.0s
[CV 4/5] END ...C=0.1, gamma=0.001, kernel=rbf;, score=-0.028 total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=0.001, kernel=rbf;

125 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/lib/python3/dist-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/usr/lib/python3/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/lib/python3/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.Invalid

In [40]:
gird.best_estimator_

In [41]:
gird.best_params_

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [42]:
gird.best_score_

0.5205632165395404

In [44]:
grid_pred = gird.predict(X_test)

In [45]:
print(r2_score(y_test, grid_pred))
print(mean_absolute_error(y_test, grid_pred))
print(mean_squared_error(y_test, grid_pred))

0.5630727637616455
4.260483973872266
34.21448856486322
