In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets, svm

In [None]:
# Colab-only step: mounts Google Drive at /content/drive inside the runtime.
# NOTE(review): no visible cell reads from the mounted drive (the data comes
# from sklearn's bundled iris loader) — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the Iris dataset bundled with scikit-learn. Later cells read its
# `data`, `target`, `feature_names` and `target_names` fields.
iris = datasets.load_iris()

In [None]:
# Feature table: one column per measurement, plus a 'flower' column holding
# the species name looked up from each row's integer class code.
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
df['flower'] = [iris.target_names[code] for code in iris['target']]

In [None]:
# Peek at rows 33..39 (positional slice, end-exclusive).
df.iloc[33:40]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
33,5.5,4.2,1.4,0.2,setosa
34,4.9,3.1,1.5,0.2,setosa
35,5.0,3.2,1.2,0.2,setosa
36,5.5,3.5,1.3,0.2,setosa
37,4.9,3.6,1.4,0.1,setosa
38,4.4,3.0,1.3,0.2,setosa
39,5.1,3.4,1.5,0.2,setosa


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing. No random_state is passed, so the
# split (and every score derived from it) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.2)


## Approach 1: Use train_test_split and manually tune parameters by trial and error

The score changes every time `train_test_split` is rerun, because no `random_state` is set.

In [None]:
# Hand-picked configuration: RBF-kernel SVM with C=30, fitted on the training
# split and then scored on the held-out split.
model = svm.SVC(C=30, kernel='rbf', gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

0.9333333333333333

## Approach 2: Use K-Fold Cross-Validation
Manually try supplying models with different parameters to the `cross_val_score` function with 5-fold cross-validation.

Note: When using K-Fold Cross-Validation, there is no need to use `train_test_split`.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Per-fold scores (5 folds) for a linear-kernel SVM with C=10.
cross_val_score(
    svm.SVC(kernel='linear', C=10, gamma='auto'),
    iris.data,
    iris.target,
    cv=5,
)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [None]:
# Per-fold scores (5 folds) for an RBF-kernel SVM with C=10.
cross_val_score(
    svm.SVC(kernel='rbf', C=10, gamma='auto'),
    iris.data,
    iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [None]:
# Per-fold scores (5 folds) for an RBF-kernel SVM with C=20.
cross_val_score(
    svm.SVC(kernel='rbf', C=20, gamma='auto'),
    iris.data,
    iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [None]:
# Per-fold scores (5 folds) for a polynomial-kernel SVM with C=15.
cross_val_score(
    svm.SVC(kernel='poly', C=15, gamma='auto'),
    iris.data,
    iris.target,
    cv=5,
)

array([1.        , 0.96666667, 0.9       , 0.93333333, 1.        ])

Here we can get an overall score for each model by averaging its fold scores.

But the scores vary each time the model parameters are altered,

so this approach is very manual, repetitive and tiresome.

### We can use for loop as an alternative

In [None]:
import time
start = time.time()

# Candidate settings: every kernel is paired with every value of C.
kernels = ['rbf', 'linear', 'poly']
C = [1,10,20]

# Mean 5-fold score per combination, keyed as "<kernel>_<C>".
avg_scores = {
    f"{kernel_name}_{c_value}": cross_val_score(
        svm.SVC(kernel=kernel_name, C=c_value, gamma='auto'),
        iris.data,
        iris.target,
        cv=5,
    ).mean()
    for kernel_name in kernels
    for c_value in C
}

end = time.time()
time_taken = end - start
print(f"time taken = {time_taken} seconds")
avg_scores

time taken = 0.3874197006225586 seconds


{'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666,
 'poly_1': 0.9666666666666666,
 'poly_10': 0.9666666666666666,
 'poly_20': 0.9533333333333334,
 'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668}

Even this method is not efficient, as it requires multiple nested for loops (one per hyperparameter) and increases the time complexity.

## **Approach 3: Use GridSearchCV**

GridSearchCV does exactly the same thing as the for loop above, but in a single line of code.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the SVM: each key is an SVC constructor argument and each
# list holds the candidate values to try (3 x 3 = 9 combinations).
parameters = dict(
    C=[1, 10, 20],
    kernel=['rbf', 'linear', 'poly'],
)

In [None]:
# Exhaustive search over every combination in `parameters`, each evaluated
# with 5-fold cross-validation; training-fold scores are not recorded.
clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid=parameters,
    cv=5,
    return_train_score=False,
)

In [None]:
# Time the full grid search. `time` is imported in the earlier manual-loop cell.
start = time.time()

# Fits one model per parameter combination and fold on the whole dataset.
clf.fit(iris.data, iris.target)
end  = time.time()
time_taken = end - start
print(f"time taken = {time_taken} seconds")
print("\n")
# Raw per-combination results as a dict of arrays (displayed as the cell output).
clf.cv_results_


time taken = 0.63323974609375 seconds




{'mean_fit_time': array([0.00414548, 0.00582848, 0.01474929, 0.00452805, 0.00367136,
        0.01745429, 0.00278907, 0.00105066, 0.03494205]),
 'mean_score_time': array([0.00340781, 0.0006969 , 0.0007844 , 0.00209885, 0.00068669,
        0.00081549, 0.00077896, 0.00055909, 0.00443635]),
 'mean_test_score': array([0.98      , 0.98      , 0.96666667, 0.98      , 0.97333333,
        0.96666667, 0.96666667, 0.96666667, 0.95333333]),
 'param_C': masked_array(data=[1, 1, 1, 10, 10, 10, 20, 20, 20],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'poly', 'rbf', 'linear', 'poly',
                    'rbf', 'linear', 'poly'],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel'

The above scores are not easy to read.

So we can load `cv_results_` into a pandas DataFrame for easier reading.

In [None]:
# Tabulate the grid-search results: one row per parameter combination,
# one column per entry of cv_results_.
df1 = pd.DataFrame(clf.cv_results_)
df1

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004145,0.003146,0.003408,0.003184,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.005828,0.004111,0.000697,9.2e-05,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.014749,0.009638,0.000784,8.4e-05,1,poly,"{'C': 1, 'kernel': 'poly'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
3,0.004528,0.005706,0.002099,0.00257,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.003671,0.004791,0.000687,7.4e-05,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
5,0.017454,0.014532,0.000815,9.5e-05,10,poly,"{'C': 10, 'kernel': 'poly'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
6,0.002789,0.002586,0.000779,5.8e-05,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
7,0.001051,6.7e-05,0.000559,4e-05,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
8,0.034942,0.032852,0.004436,0.007288,20,poly,"{'C': 20, 'kernel': 'poly'}",0.966667,0.966667,0.9,0.933333,1.0,0.953333,0.033993,9


Extracting the required columns

In [None]:
# Keep only the columns needed to compare combinations.
df1.loc[
    :,
    ['param_C', 'param_kernel', 'params', 'mean_test_score', 'rank_test_score'],
]

Unnamed: 0,param_C,param_kernel,params,mean_test_score,rank_test_score
0,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.98,1
1,1,linear,"{'C': 1, 'kernel': 'linear'}",0.98,1
2,1,poly,"{'C': 1, 'kernel': 'poly'}",0.966667,6
3,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.98,1
4,10,linear,"{'C': 10, 'kernel': 'linear'}",0.973333,4
5,10,poly,"{'C': 10, 'kernel': 'poly'}",0.966667,6
6,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,5
7,20,linear,"{'C': 20, 'kernel': 'linear'}",0.966667,6
8,20,poly,"{'C': 20, 'kernel': 'poly'}",0.953333,9


Ranking in decreasing order of accuracy, i.e. best model at the top.

In [None]:
# Order combinations by rank (1 = best mean test score), then show only the
# comparison columns.
df1.sort_values(by='rank_test_score')[
    ['param_C', 'param_kernel', 'params', 'mean_test_score', 'rank_test_score']
]

Unnamed: 0,param_C,param_kernel,params,mean_test_score,rank_test_score
0,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.98,1
1,1,linear,"{'C': 1, 'kernel': 'linear'}",0.98,1
3,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.98,1
4,10,linear,"{'C': 10, 'kernel': 'linear'}",0.973333,4
6,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,5
2,1,poly,"{'C': 1, 'kernel': 'poly'}",0.966667,6
5,10,poly,"{'C': 10, 'kernel': 'poly'}",0.966667,6
7,20,linear,"{'C': 20, 'kernel': 'linear'}",0.966667,6
8,20,poly,"{'C': 20, 'kernel': 'poly'}",0.953333,9


In [None]:
# Exploration aid: list the attributes of the fitted search object, including
# best_score_, best_params_ and best_estimator_ used in the next cells.
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_

In [None]:
# Highest mean cross-validated score found by the grid search.
clf.best_score_

0.9800000000000001

In [None]:
# Parameter combination that achieved best_score_.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [None]:
# Estimator configured with best_params_.
clf.best_estimator_

SVC(C=1, gamma='auto')

In our case, the dataset is very small and only a few parameter values are tried (i.e. only 3 values for 'C' and 3 values for 'kernel', so only 9 combinations).

But in cases where the dataset is huge and many parameter combinations (in a range) are to be tried, an exhaustive search may require very heavy computation, and there is still a possibility of missing the optimum or ideal combination.

**Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized counterpart of the grid search: rather than fitting all 9
# combinations in `parameters`, only n_iter of them are sampled.
# The sampled combinations are random and differ every time the code is rerun.
rs = RandomizedSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_distributions=parameters,
    cv=5,
    return_train_score=False,
    n_iter=2,  # number of combinations to try (out of the 9 possible)
)

In [None]:
# Run the randomized search on the full dataset (5-fold CV per sampled combination).
rs.fit(iris.data, iris.target)

RandomizedSearchCV(cv=5, estimator=SVC(gamma='auto'), n_iter=2,
                   param_distributions={'C': [1, 10, 20],
                                        'kernel': ['rbf', 'linear', 'poly']})

In [None]:
# Raw results for the sampled combinations only (n_iter entries per array).
rs.cv_results_

{'mean_fit_time': array([0.00095563, 0.00078773]),
 'mean_score_time': array([0.00075817, 0.00036788]),
 'mean_test_score': array([0.97333333, 0.96666667]),
 'param_C': masked_array(data=[10, 20],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 10, 'kernel': 'linear'}, {'C': 20, 'kernel': 'rbf'}],
 'rank_test_score': array([1, 2], dtype=int32),
 'split0_test_score': array([1.        , 0.96666667]),
 'split1_test_score': array([1., 1.]),
 'split2_test_score': array([0.9, 0.9]),
 'split3_test_score': array([0.96666667, 0.96666667]),
 'split4_test_score': array([1., 1.]),
 'std_fit_time': array([3.35736076e-04, 7.63459327e-05]),
 'std_score_time': array([7.80713001e-04, 2.70333448e-05]),
 'std_test_score': array([0.03887301, 0.03651484])}

In [None]:
# Tabulate the randomized-search results: one row per sampled combination.
df2 = pd.DataFrame(rs.cv_results_)
df2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000956,0.000336,0.000758,0.000781,linear,10,"{'kernel': 'linear', 'C': 10}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,1
1,0.000788,7.6e-05,0.000368,2.7e-05,rbf,20,"{'kernel': 'rbf', 'C': 20}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,2


In [None]:
# Order the sampled combinations by rank (1 = best mean test score), then
# show only the comparison columns.
df2.sort_values(by='rank_test_score')[
    ['param_C', 'param_kernel', 'params', 'mean_test_score', 'rank_test_score']
]

Unnamed: 0,param_C,param_kernel,params,mean_test_score,rank_test_score
0,10,linear,"{'kernel': 'linear', 'C': 10}",0.973333,1
1,20,rbf,"{'kernel': 'rbf', 'C': 20}",0.966667,2


**How about different models with different hyperparameters?**

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Candidate estimators and their search grids, keyed by a display name.
# Each entry holds the unfitted estimator under 'model' and the parameter
# grid handed to GridSearchCV under 'params'.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        # `multi_class` is intentionally not passed: 'auto' is already the
        # library default, and newer scikit-learn releases deprecate the
        # argument (passing it explicitly emits a warning).
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [None]:
scores = []

# Run an independent 5-fold grid search for every candidate in `model_params`
# and record only its best mean CV score and the parameters that produced it.
# NOTE: this rebinds `clf` (left pointing at the last candidate's search) and
# `df` (previously the iris feature table).
for name, spec in model_params.items():
    clf = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=5,
        return_train_score=False,
    ).fit(iris.data, iris.target)
    scores.append(
        dict(model=name, best_score=clf.best_score_, best_params=clf.best_params_)
    )

df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.953333,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}


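Based on the above, I can conclude that SVM with C=1 and kernel='rbf' is the best model for solving my problem of iris flower classification (note that the rbf kernel with C=10 and the linear kernel with C=1 achieved the same mean score of 0.98 in the grid search above).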