## Grid Search CV 

### Technique for Model Selection in Machine Learning

In [207]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

In [208]:
dataset = load_iris()
# print(dataset.feature_names)
data = pd.DataFrame(dataset.data,columns=dataset.feature_names)
data['target'] = dataset.target
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [209]:
X = dataset.data
Y = dataset.target

In [210]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score

In [211]:
cross_val_score(SVC(C=1,kernel='linear',gamma='auto'),X,Y)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [212]:
kernel = ['linear','rbf']
C = [0.1,1,10,100]
scores = {}
for i in kernel:
    for j in C:
        cvs = cross_val_score(SVC(C=j,kernel=i,gamma='auto'),X,Y)
        scores[i+'_'+str(j)] = np.average(cvs)
scores

{'linear_0.1': 0.9733333333333334,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_100': 0.9666666666666666,
 'rbf_0.1': 0.9466666666666667,
 'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_100': 0.96}

In [213]:
from sklearn.model_selection import GridSearchCV
parms = {
    'kernel': ['linear','rbf'],
    'C': [0.1,1,10,100]
}
clf = GridSearchCV(SVC(),parms,cv=5)
clf.fit(X,Y)
clf.cv_results_

{'mean_fit_time': array([0.0004662 , 0.00058494, 0.0003026 , 0.00034723, 0.00030885,
        0.00030708, 0.00029659, 0.00029845]),
 'std_fit_time': array([9.52176587e-05, 1.21470426e-04, 2.44308206e-05, 1.48369167e-05,
        2.71351767e-05, 2.57390497e-05, 1.30151215e-05, 2.78998232e-05]),
 'mean_score_time': array([0.00022178, 0.00027175, 0.00016594, 0.00019131, 0.00016942,
        0.00017052, 0.00014381, 0.00015087]),
 'std_score_time': array([4.10272309e-05, 6.56768367e-05, 7.17952730e-06, 1.26784894e-05,
        2.53063549e-05, 1.83184924e-05, 5.58815717e-06, 7.00707296e-06]),
 'param_C': masked_array(data=[0.1, 0.1, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf',
                    'linear', 'rbf'],
              mask=[False, False, False, False, False, False, False, False],
        fill_

In [214]:
pd.DataFrame(clf.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,0.1,linear,0.973333
1,0.1,rbf,0.92
2,1.0,linear,0.98
3,1.0,rbf,0.966667
4,10.0,linear,0.973333
5,10.0,rbf,0.98
6,100.0,linear,0.966667
7,100.0,rbf,0.973333


In [215]:
clf.best_estimator_

In [216]:
clf.best_params_,clf.best_score_

({'C': 1, 'kernel': 'linear'}, 0.9800000000000001)

In [217]:
models = {
    'SVM': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [0.1,1,10,100],
            'kernel': ['linear','rbf']
        }
    },
    'logistic_reg': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [0.1,1,10,100]
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1,10,100]
        }
    }
}

In [218]:
from sklearn.model_selection import StratifiedKFold

In [219]:
best_models = []
for model_name,mp in models.items():
    clf = GridSearchCV(mp['model'],mp['params'],cv=StratifiedKFold(n_splits=5,shuffle=True))
    clf.fit(X,Y)
    best_models.append({
        'model_name': model_name,
        'params': clf.best_params_,
        'score': clf.best_score_
    })
best_models

[{'model_name': 'SVM',
  'params': {'C': 1, 'kernel': 'linear'},
  'score': 0.9800000000000001},
 {'model_name': 'logistic_reg',
  'params': {'C': 1},
  'score': 0.9733333333333334},
 {'model_name': 'random_forest',
  'params': {'n_estimators': 10},
  'score': 0.96}]

In [220]:
pd.DataFrame(best_models)

Unnamed: 0,model_name,params,score
0,SVM,"{'C': 1, 'kernel': 'linear'}",0.98
1,logistic_reg,{'C': 1},0.973333
2,random_forest,{'n_estimators': 10},0.96


Here, we can see that score is highest for SVM(C=1,kernel=linear) model. 