Creative Commons CC BY 4.0 Lynd Bacon & Associates, Ltd. Not warranted to be suitable for any particular purpose. (You're on your own!)

#  More Support Vector Classification: Grid Searching

Here we'll try grid searching to find good parameter settings.

# Get Packages

In [113]:
import os
import shelve

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scikitplot as skplt
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, Markdown
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.metrics import roc_curve, precision_score, recall_score
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

# Get Data and Munge It

In [114]:
# Load the risk-factor table, treating '?' entries as missing values (NaN).
cervical = pd.read_csv(
    'risk_factors_cervical_cancer.csv',
    na_values='?',
)

In [115]:
# Label-based column slice (inclusive on both ends) from 'Age' through
# 'Hormonal Contraceptives (years)', copied so it is independent of `cervical`.
cervicalFeats = (
    cervical
    .loc[:, 'Age':'Hormonal Contraceptives (years)']
    .copy()
)

In [116]:
# Place the Biopsy column in front of the selected features, then discard
# every row that has a missing value in any of those columns.
cervical2 = (
    pd.concat([cervical.Biopsy, cervicalFeats], axis='columns')
    .dropna(axis='index')
)

In [117]:
# Column 0 of cervical2 is Biopsy (the target); everything after it is a feature.
X = cervical2.iloc[:, 1:].to_numpy()  # features
y = cervical2['Biopsy'].to_numpy()  # target
# Both shapes are displayed because ast_node_interactivity is set to "all".
X.shape
y.shape

(676, 9)

(676,)

# Set Up Train/Test Split, Pipeline, Grid Search Parameters



In [118]:
# Hold-out split stratified on y; no test_size is passed, so the library
# default applies. The fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=99,
)

In [119]:
# Three named steps: standardize the features, expand them into polynomial
# terms (the degree is tuned by the grid search), then fit a seeded linear SVM
# with a high iteration cap.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("polynom", PolynomialFeatures()),
    ("svm", LinearSVC(random_state=99, max_iter=100000)),
])


In [120]:
# Search space, addressed as <step name>__<parameter>: polynomial degree for
# the "polynom" step crossed with C for the "svm" step (2 x 4 combinations).
param_grid = dict(
    polynom__degree=[1, 2],
    svm__C=[0.001, 0.01, 0.1, 1.0],
)

In [121]:
    # 10-fold cross-validated search over every param_grid combination,
    # also recording the training-fold scores.
    # NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in
    # 0.24; this call assumes an older release -- confirm before upgrading.
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        cv=10,
        iid=False,
        return_train_score=True,
    )

In [122]:
# Run the search on the training split only; the fitted search object is the
# cell's displayed value.
grid.fit(X=X_train, y=y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('polynom', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('svm', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100000,
     multi_class='ovr', penalty='l2', random_state=99, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'polynom__degree': [1, 2], 'svm__C': [0.001, 0.01, 0.1, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [123]:
# Report the winning parameter combination, its best cross-validation score,
# and the score of the search object on the held-out test split.
# (A stray trailing `o` after the last print call made this cell a SyntaxError.)
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'polynom__degree': 1, 'svm__C': 0.001}

Best cross-validation score: 0.93
Test-set score: 0.93


# Multiple Learners in a Grid Search

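Here we treat the learner itself as something to search over: the grid search trains both a linear support vector classifier and a logistic regression, and for each it also tries different feature scalers (or no scaling at all) and several values of the regularization parameter C.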

In [125]:
# Two named slots that the grid search below fills in by name.
# SVC() is only a placeholder for the 'classifier' slot: every entry of the
# param_grid replaces it with LinearSVC or LogisticRegression.
# SVC has to be imported from sklearn.svm in the imports cell; the notebook
# originally imported only LinearSVC, so this cell failed with a NameError on
# a fresh kernel.
pipe = Pipeline(steps=[
    ('preprocessing', StandardScaler()),
    ('classifier', SVC()),
])

In [126]:
# One sub-grid per candidate learner. Each sub-grid swaps the learner into the
# 'classifier' slot, tries three preprocessing choices (standard scaling,
# min-max scaling, or None for no scaling), and sweeps the learner's C.
# The scaler instances are created fresh for every sub-grid.
param_grid = [
    {
        'classifier': [learner],
        'preprocessing': [StandardScaler(), MinMaxScaler(), None],
        'classifier__C': [0.001, 0.01, 0.1, 1],
    }
    for learner in (
        LinearSVC(max_iter=1000000, random_state=99),
        linear_model.LogisticRegression(
            max_iter=1000000, solver='lbfgs', random_state=99),
    )
]


In [127]:
# 10-fold cross-validated search over both learner sub-grids, also recording
# the training-fold scores.
# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in
# 0.24; this call assumes an older release -- confirm before upgrading.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=10,
    iid=False,
    return_train_score=True,
)

In [128]:
# Run the multi-learner search on the training split only; the fitted search
# object is the cell's displayed value.
grid.fit(X=X_train, y=y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid=False, n_jobs=None,
       param_grid=[{'classifier': [LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000000,
     multi_class='ovr', penalty='l2', random_state=99, tol=0.0001,
     verbose=0)], 'preprocessing': [StandardScaler(copy=True, with_mea...rue), MinMaxScaler(copy=True, feature_range=(0, 1)), None], 'classifier__C': [0.001, 0.01, 0.1, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [129]:
# Same three-line report as for the first search, emitted with a single print
# call; sep="\n" puts each message on its own line.
print(
    "Best params:\n{}\n".format(grid.best_params_),
    "Best cross-validation score: {:.2f}".format(grid.best_score_),
    "Test-set score: {:.2f}".format(grid.score(X_test, y_test)),
    sep="\n",
)

Best params:
{'classifier': LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000000,
     multi_class='ovr', penalty='l2', random_state=99, tol=0.0001,
     verbose=0), 'classifier__C': 0.001, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

Best cross-validation score: 0.93
Test-set score: 0.93


In [130]:
# Tabulate the search's cv_results_ mapping: one row per parameter
# combination, one column per recorded quantity.
cvresDF = pd.DataFrame(data=grid.cv_results_)

In [133]:
# Show the row with the highest mean cross-validated test score.
# idxmax() returns an index *label*, so select with label-based .loc; the
# original used positional .iloc, which only agrees with the label while the
# frame keeps its default 0..n-1 index.
cvresDF.loc[cvresDF.mean_test_score.idxmax()]

mean_fit_time                                                 0.00151751
std_fit_time                                                 9.97572e-05
mean_score_time                                              0.000422573
std_score_time                                               6.82548e-05
param_classifier       LinearSVC(C=0.001, class_weight=None, dual=Tru...
param_classifier__C                                                0.001
param_preprocessing    StandardScaler(copy=True, with_mean=True, with...
params                 {'classifier': LinearSVC(C=0.001, class_weight...
split0_test_score                                               0.923077
split1_test_score                                               0.923077
split2_test_score                                               0.923077
split3_test_score                                               0.921569
split4_test_score                                                   0.94
split5_test_score                                  