In [2]:
# Notebook title banner, printed once at the top of the run.
NOTEBOOK_TITLE = 'HyperParameter Optimization using GridSearchCV'
print(NOTEBOOK_TITLE)

HyperParameter Optimization using GridSearchCV


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

In [18]:
# Load the advertising dataset from the CSV file in the working directory.
dataset = pd.read_csv('Advertising_data.csv')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appends a stray "None" to the output.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   User ID          400 non-null    int64  
 1   Gender           400 non-null    object 
 2   Age              400 non-null    float64
 3   EstimatedSalary  400 non-null    float64
 4   Purchased        400 non-null    int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 15.8+ KB


None

In [19]:
# Summary statistics of the numeric columns, rendered with rich display.
summary_stats = dataset.describe()
display(summary_stats)

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [20]:
print('Dropping unnecessary features and seperating predictors and target.')

# Keep a handle on the target column before it is removed from the frame.
labels = dataset['Purchased']

# Remove the identifier, the Gender column and the target in place, so that
# `dataset` holds only the predictor columns from here on.
# NOTE: because the drop is in place, re-running this cell on the already
# trimmed frame fails with a KeyError on 'Purchased'; reload the CSV first.
columns_to_remove = ['User ID', 'Gender', 'Purchased']
dataset.drop(columns=columns_to_remove, inplace=True)

print('Preview feature-dataset and target.')
display(dataset.head())
print(labels.iloc[:5])

Dropping unnecessary features and seperating predictors and target.
Preview feature-dataset and target.


Unnamed: 0,Age,EstimatedSalary
0,19.0,19000.0
1,35.0,20000.0
2,26.0,43000.0
3,27.0,57000.0
4,19.0,76000.0


0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64


In [23]:
print('Performing train-test split.')
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition repeatable across runs.
split_parts = train_test_split(dataset, labels, test_size=0.2, random_state=42)
train_data, test_data, train_labels, test_labels = split_parts

print('Train data size: ', train_data.shape, '\nTrain labels size: ', train_labels.shape)
print('Test data size: ', test_data.shape, '\nTest labels size: ', test_labels.shape)

Perform train-test split.
Train data size:  (320, 2) 
Train labels size:  (320,)
Test data size:  (80, 2) 
Test labels size:  (80,)


In [24]:
print('Performing feature scaling:')

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training predictors and wrap the scaled array back
# into a DataFrame; the new frame gets positional integer column labels.
scaler = StandardScaler()
train_data_prep = pd.DataFrame(scaler.fit_transform(train_data))

print('Preview after scaling')
display(train_data_prep.head())

Performing feature scaling:
Preview after scaling


Unnamed: 0,0,1
0,-1.066752,-0.386344
1,0.797535,-1.229939
2,0.110692,1.853544
3,0.601294,-0.909955
4,1.876859,-1.288118


In [43]:
print('Using SVC with dafault parameters(randomly chosen parameters) on the train data.')

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Baseline classifier: linear kernel, every other hyper-parameter left at its
# library default.
svc_1 = SVC(kernel='linear')
svc_1.fit(train_data_prep, train_labels)

# 10-fold cross-validated MSE on the training set. The scorer returns the
# negated MSE, hence the leading minus sign.
train_mse_1 = -cross_val_score(
    svc_1,
    train_data_prep,
    train_labels,
    scoring='neg_mean_squared_error',
    cv=10,
).mean()

# Scale the test set with the statistics already learned from the training
# data. Using fit_transform() here would refit the scaler on the test set, so
# the model would be evaluated on features standardised differently from the
# ones it was trained on.
test_data_prep = pd.DataFrame(scaler.transform(test_data))

prediction_1 = svc_1.predict(test_data_prep)
# Arguments follow the documented (y_true, y_pred) order.
test_mse_1 = mean_squared_error(test_labels, prediction_1)

print('Train MSE: ', train_mse_1, '\nTest MSE: ', test_mse_1)

Using SVC with dafault parameters(randomly chosen parameters) on the train data.
Train MSE:  0.18125 
Test MSE:  0.15


In [40]:
print('Using SVC with custom parameters optimized using GridSearchCV.')

from sklearn.model_selection import GridSearchCV

# Candidate values shared by the per-kernel grids below. Defining them once
# keeps the grids consistent (the poly grid previously listed C=100 twice and
# never tried C=1000).
c_values = [1, 10, 100, 1000]
gamma_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# One sub-grid per kernel, so kernel-specific parameters are searched only
# where they are listed.
parameter = [{'C': c_values, 'kernel': ['linear']},
             {'C': c_values, 'kernel': ['rbf'], 'gamma': gamma_values},
             {'C': c_values, 'kernel': ['poly'], 'gamma': gamma_values, 'degree': [2, 3]}]

# Exhaustive search with 10-fold cross-validation, run on all available cores.
grid_search = GridSearchCV(estimator=svc_1,
                           param_grid=parameter,
                           scoring='neg_mean_squared_error',
                           cv=10,
                           n_jobs=-1)

grid_search.fit(train_data_prep, train_labels)

# best_score_ is the negated MSE of the winning candidate; flip the sign.
train_mse_2 = -grid_search.best_score_
print('Best Params: ', grid_search.best_params_)

Using SVC with custom parameters optimized using GridSearchCV.
Best Params:  {'C': 10, 'gamma': 0.3, 'kernel': 'rbf'}


In [44]:
print('Using the suggested parameter values for SVC model.')

# Build the tuned model straight from the grid-search result instead of
# hard-coding the winning values, so this cell stays correct whenever the
# search grid or the data changes.
svc_2 = SVC(**grid_search.best_params_)
svc_2.fit(train_data_prep, train_labels)

# Arguments follow the documented (y_true, y_pred) order.
test_mse_2 = mean_squared_error(test_labels, svc_2.predict(test_data_prep))

print('Train MSE: ', train_mse_2, '\nTest MSE: ', test_mse_2)

Using the suggested parameter values for SVC model.
Train MSE:  0.0875 
Test MSE:  0.075


In [45]:
#Thus we can see that the train/test MSE before and after GridSearchCV differ considerably,
#and the latter MSE is lower.