In [2]:
import pandas as pd
from sklearn import svm, datasets
# Load the bundled Iris dataset; the cells below read its feature matrix
# (`.data`), integer class labels (`.target`) and the matching name arrays.
iris = datasets.load_iris()

In [4]:
# List the attributes available on the loaded dataset object.
print(dir(iris))

# Wrap the raw measurements in a DataFrame labelled with the feature names.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Attach the numeric class label of each sample as a new 'flower' column.
df = df.assign(flower=iris.target)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
# Label each sample with its species name. Indexing target_names by the
# original integer targets (rather than mapping the current 'flower' column)
# keeps this cell idempotent: it can be re-run without failing on values that
# have already been converted to strings.
df['flower'] = iris.target_names[iris.target]
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [9]:
# standard method includes train_test_split
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. random_state is fixed so the split
# (and therefore any score computed from it) is identical on every run; a
# different seed yields a different split and a different score.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)

In [10]:
# let's assume we are selecting SVM model
# Build an RBF-kernel SVC, fit it on the training split, then report its
# accuracy on the held-out split.
model = svm.SVC(C=30, kernel='rbf', gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

0.9111111111111111

But this accuracy is not reliable, as it changes with different training and test samples. <br>
To get a more stable estimate, we use K-fold cross-validation.

For SVM, we have several kernels to choose from, e.g. linear, rbf and polynomial. Tuning our model means choosing a kernel, selecting the number of iterations, the number of folds, etc. <br>
Doing this by hand would be very time-consuming.

# This is when GridSearchCV comes into play.
## This API will automate the rigorous task of fine-tuning our model

In [15]:
from sklearn.model_selection import GridSearchCV

# Exhaustively evaluate every (C, kernel) pair with 5-fold cross-validation.
classifier = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid={'C': [1, 10, 20], 'kernel': ['rbf', 'linear']},
    cv=5,  # cv is the number of cross-validation folds
    return_train_score=False,
)
classifier.fit(iris.data, iris.target)
classifier.cv_results_

{'mean_fit_time': array([0.00190535, 0.001441  , 0.00170989, 0.00200176, 0.00200062,
        0.00180044]),
 'std_fit_time': array([5.07319322e-04, 5.44669276e-04, 6.10038901e-04, 5.19730629e-06,
        9.46494734e-07, 4.00977030e-04]),
 'mean_score_time': array([0.00119944, 0.00080075, 0.00158978, 0.00099545, 0.00160065,
        0.00120077]),
 'std_score_time': array([4.00547737e-04, 4.00400238e-04, 5.18385090e-04, 4.81440261e-06,
        4.89415555e-04, 4.00148055e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20

In [16]:
# Tabulate the raw cross-validation results so they are easier to read.
gscv_results = pd.DataFrame(data=classifier.cv_results_)
gscv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001905,0.0005073193,0.001199,0.000401,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001441,0.0005446693,0.000801,0.0004,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.00171,0.0006100389,0.00159,0.000518,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.002002,5.197306e-06,0.000995,5e-06,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.002001,9.464947e-07,0.001601,0.000489,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.0018,0.000400977,0.001201,0.0004,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [21]:
# Keep only the tuned parameters and the mean cross-validated test score.
gscv_results.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


So, we can supply all the parameters that we need to assess our model on. GridSearchCV will train and test our model on every combination of the parameters.

But trying out every combination of parameters is computationally expensive, especially when the data is large.
For that, sklearn provides `RandomizedSearchCV`, which evaluates only a random sample of the parameter combinations.

In [23]:
from sklearn.model_selection import RandomizedSearchCV

# Sample only n_iter parameter combinations instead of trying the full grid.
# random_state pins which combinations are drawn, so the resulting table is
# the same on every run.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,  # n_iter is number of combinations
    random_state=42,
)
rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.98
1,20,linear,0.966667


Let's apply the approach with different models to train.

In [27]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Candidate estimators to compare. Each entry pairs an unfitted model with the
# hyper-parameter grid that should be searched for it.
model_params = {
    'svm': {
        'model': svm.SVC(gamma = 'auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Random forests are stochastic; seed the estimator so its
        # cross-validated scores are reproducible between runs.
        'model': RandomForestClassifier(random_state = 42),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver = 'liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

In [28]:
# Run a 5-fold grid search for every candidate model and record the best
# cross-validated score together with the parameters that achieved it.
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(iris.data, iris.target)
    scores.append(dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

In [29]:
# Summarise the per-model search results as a table, one row per model.
df = pd.DataFrame(
    data=scores,
    columns=['model', 'best_score', 'best_params'],
)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.966667,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


Hence, the best model is <b>SVM</b>, which has the highest mean cross-validation score (0.98).