Finding the best model and tuning hyperparameters using GridSearchCV
For the iris flower dataset in the sklearn library, we are going to find the best model and the best hyperparameters using GridSearchCV

In [5]:
# Load the iris flower dataset that ships with scikit-learn.
import numpy as np
import pandas as pd
from sklearn import datasets, svm

iris = datasets.load_iris()

# Show which fields the returned dataset object exposes.
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [4]:
# Build a feature table with one column per measurement, plus the species name.
# The values in iris.target are used as indices into iris.target_names, so
# indexing the names with the whole target array translates every label in a
# single vectorized step instead of calling a Python lambda once per row.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['flower'] = np.asarray(iris.target_names)[iris.target]
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


Approach 1: Use train_test_split and manually tune parameters by trial and error

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing.
# random_state pins the shuffle so the split, and any score computed from it,
# is identical on every run; stratify keeps the class proportions of
# iris.target the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=42,
    stratify=iris.target,
)

In [12]:
# Manual trial: an RBF-kernel SVM with a hand-picked C, scored on the held-out split.
# fit() returns the estimator itself, so construction and training are chained.
model = svm.SVC(C=30, kernel='rbf', gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

1.0

Approach 2: Use K Fold Cross validation
Manually try supplying models with different parameters to the cross_val_score function with 5-fold cross validation

In [14]:
from sklearn.model_selection import cross_val_score

# 5-fold cross validation of an RBF-kernel SVM with C=10; displays one score per fold.
cross_val_score(
    estimator=svm.SVC(C=10, kernel='rbf', gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [16]:
# Same 5-fold evaluation, this time with a linear kernel and C=10.
cross_val_score(
    estimator=svm.SVC(C=10, kernel='linear', gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [17]:
# Back to the RBF kernel, with a larger penalty term (C=20).
cross_val_score(
    estimator=svm.SVC(C=20, kernel='rbf', gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [20]:
# Trying each combination by hand is tedious, so sweep them programmatically:
# every (kernel, C) pair gets a 5-fold cross validation, and its mean score is
# stored under the key "<kernel>_<C>".
kernels = ['linear', 'rbf']
c = [1, 10, 20]

avg_scores = {
    kernel_name + '_' + str(penalty): np.average(
        cross_val_score(
            svm.SVC(kernel=kernel_name, C=penalty, gamma='auto'),
            iris.data,
            iris.target,
            cv=5,
        )
    )
    for kernel_name in kernels
    for penalty in c
}

avg_scores

{'linear_1': np.float64(0.9800000000000001),
 'linear_10': np.float64(0.9733333333333334),
 'linear_20': np.float64(0.9666666666666666),
 'rbf_1': np.float64(0.9800000000000001),
 'rbf_10': np.float64(0.9800000000000001),
 'rbf_20': np.float64(0.9666666666666668)}

From the above results, we can say that rbf with C=1 or C=10, or linear with C=1, gives the best performance

Approach 3: Use GridSearchCV
GridSearchCV does exactly the same thing as the for loop above, but in a single line of code

In [33]:
from sklearn.model_selection import GridSearchCV

# Exhaustively evaluate every C x kernel combination with 5-fold cross validation.
clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid={
        'kernel': ['rbf', 'linear', 'sigmoid'],
        'C': [1, 10, 20],
    },
    cv=5,
    return_train_score=False,
)
clf.fit(iris.data, iris.target)

# Raw per-candidate results (timings, per-split scores, ranks) as a dict of arrays.
clf.cv_results_

{'mean_fit_time': array([0.00019994, 0.0005949 , 0.00099998, 0.00040421, 0.00040312,
        0.00059958, 0.00040021, 0.00039773, 0.00079751]),
 'std_fit_time': array([3.99875641e-04, 4.85929829e-04, 7.29420592e-07, 4.95118637e-04,
        4.93741942e-04, 4.89551352e-04, 4.90154449e-04, 4.87129438e-04,
        3.98981621e-04]),
 'mean_score_time': array([0.00039973, 0.00020542, 0.        , 0.00019598, 0.00040002,
        0.00039749, 0.00020008, 0.00040259, 0.00039907]),
 'std_score_time': array([0.00048957, 0.00041084, 0.        , 0.00039196, 0.00048992,
        0.00048685, 0.00040016, 0.00049312, 0.00048875]),
 'param_C': masked_array(data=[1, 1, 1, 10, 10, 10, 20, 20, 20],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value=999999),
 'param_kernel': masked_array(data=['rbf', 'linear', 'sigmoid', 'rbf', 'linear', 'sigmoid',
                    'rbf', 'linear', 'sigmoid'],
              mask=[False, False, False, Fal

In [34]:
# Turn the raw results dict into a table: one row per parameter combination.
df1 = pd.DataFrame.from_dict(clf.cv_results_)
df1

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0002,0.0003998756,0.0004,0.00049,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000595,0.0004859298,0.000205,0.000411,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001,7.294206e-07,0.0,0.0,1,sigmoid,"{'C': 1, 'kernel': 'sigmoid'}",0.333333,0.1,0.0,0.033333,0.0,0.093333,0.125433,7
3,0.000404,0.0004951186,0.000196,0.000392,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.000403,0.0004937419,0.0004,0.00049,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
5,0.0006,0.0004895514,0.000397,0.000487,10,sigmoid,"{'C': 10, 'kernel': 'sigmoid'}",0.333333,0.1,0.0,0.033333,0.0,0.093333,0.125433,7
6,0.0004,0.0004901544,0.0002,0.0004,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
7,0.000398,0.0004871294,0.000403,0.000493,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
8,0.000798,0.0003989816,0.000399,0.000489,20,sigmoid,"{'C': 20, 'kernel': 'sigmoid'}",0.333333,0.1,0.0,0.033333,0.0,0.093333,0.125433,7


In [35]:
# Keep only the columns needed to compare candidates: the two parameters and the mean score.
df1.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,1,sigmoid,0.093333
3,10,rbf,0.98
4,10,linear,0.973333
5,10,sigmoid,0.093333
6,20,rbf,0.966667
7,20,linear,0.966667
8,20,sigmoid,0.093333


In [28]:
# List every attribute and method name available on the GridSearchCV object.
dir(clf)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_format_results',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_routed_params_for_fit',
 '_get_scorers',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run

In [36]:
# Best mean cross-validated score found by the grid search.
clf.best_score_

np.float64(0.9800000000000001)

In [37]:
# Parameter combination that the grid search selected as best.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

Use RandomizedSearchCV to reduce the number of iterations by trying only a random subset of the parameter combinations. This is useful when you have too many parameters to try and your training time is long. It helps reduce the cost of computation.

In [43]:
from sklearn.model_selection import RandomizedSearchCV

# Evaluate only n_iter=2 randomly drawn candidates out of the 6 possible
# (C, kernel) combinations instead of trying them all.
# random_state fixes which candidates are drawn, so the resulting table is
# the same on every run rather than changing with each execution.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear']
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)
rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,10,rbf,0.98


In [44]:
# How about different models with different hyperparameters?

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Candidate models and the hyperparameter grid to search for each one.
# Each entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - the parameter grid handed to the search for that estimator
model_params = {
    'svm' : {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Seeded so the forest, and therefore its scores, is reproducible.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        # multi_class is deprecated in recent scikit-learn releases and 'auto'
        # was already its default, so omitting it keeps the same behavior
        # without triggering the deprecation FutureWarning.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

In [46]:
import warnings

# Run a 5-fold grid search per candidate model and record its best result.
scores = []

# Silence FutureWarning only while the searches run. catch_warnings() restores
# the previous filters on exit, so deprecation notices raised by other cells
# stay visible instead of being muted for the whole kernel session.
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)

    for model_name, mp in model_params.items():
        gs = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
        gs.fit(iris.data, iris.target)
        scores.append({
            'model': model_name,
            'best_score': gs.best_score_,
            'best_parameter': gs.best_params_
        })

# One row per model: its best mean CV score and the parameters that produced it.
final = pd.DataFrame(scores, columns=['model', 'best_score', 'best_parameter'])
final

Unnamed: 0,model,best_score,best_parameter
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.966667,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}
