In [1]:
# 1. Importing necessary libraries

import pandas as pd

from sklearn import svm, datasets
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.linear_model import LogisticRegression as lr

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,RandomizedSearchCV

In [2]:
# 2. Loaded the iris dataset and changed to a dataframe

iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [3]:
# 3. Adding label column to the dataframe 

df['flower'] = iris.target
df['flower'] = df['flower'].apply(lambda x : iris.target_names[x])

In [4]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [5]:
# 4. Split the iris dataset

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

In [6]:
# 5. simple SVM classifier

svm_model = svm.SVC(kernel = 'rbf', C = 10, gamma = 'auto')
svm_model.fit(X_train, y_train)
svm_model.score(X_test, y_test)

0.9333333333333333

Every time score changes since train & test data split affects. K fold cross validation is one solution by taking average of all scores.

In [7]:
# 6. 6-fold cross validation in SVM classifier

cross_val_score(svm_model, iris.data, iris.target, cv=6)

array([0.96, 1.  , 0.92, 0.92, 0.96, 1.  ])

Parameters that chose may not be optimal. Try as much as possible parameters to yield high accuracy.

In [27]:
# 7. 6-fold cross validation with different parameters in SVM classifier

In [8]:
svm_model_1 = svm.SVC(kernel = 'rbf', C = 10, gamma = 'auto')
svm_model_2 = svm.SVC(kernel = 'linear', C = 50, gamma = 'auto')
svm_model_3 = svm.SVC(kernel = 'poly', C = 100, gamma = 'auto')

In [9]:
first_svm_model = cross_val_score(svm_model_1, iris.data, iris.target, cv=6)

print (first_svm_model)
print (sum(first_svm_model)/6)

[0.96 1.   0.92 0.92 0.96 1.  ]
0.96


In [10]:
second_svm_model = cross_val_score(svm_model_2, iris.data, iris.target, cv=6)

print (second_svm_model)
print (sum(second_svm_model)/6)

[1.   1.   0.96 0.92 0.92 1.  ]
0.9666666666666667


In [11]:
third_svm_model = cross_val_score(svm_model_3, iris.data, iris.target, cv=6)

print (third_svm_model)
print (sum(third_svm_model)/6)

[0.96 1.   0.88 0.96 0.92 1.  ]
0.9533333333333333


Can use for loops to cover all the possible parameters. But, computationally complex when it comes to real world applications due to heavy number of data points.

In [28]:
# 8. GridsearchCV technique to find best parameter for SVM classifier model

In [12]:
grid_svm_model = GridSearchCV(svm.SVC(gamma='auto'), {
    'C': range(1,101),
    'kernel': ['rbf', 'linear', 'poly']
}, cv=5, return_train_score=False)

In [13]:
grid_svm_model.fit(iris.data, iris.target)

GridSearchCV(cv=5, estimator=SVC(gamma='auto'),
             param_grid={'C': range(1, 101),
                         'kernel': ['rbf', 'linear', 'poly']})

In [14]:
grid_svm_model.best_params_

{'C': 4, 'kernel': 'rbf'}

In [15]:
grid_svm_model.best_score_

0.9866666666666667

In [16]:
df2 = pd.DataFrame(grid_svm_model.cv_results_)

In [17]:
df2[['param_C', 'param_kernel', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score,std_test_score,rank_test_score
0,1,rbf,0.980000,0.016330,2
1,1,linear,0.980000,0.016330,2
2,1,poly,0.966667,0.042164,43
3,2,rbf,0.980000,0.016330,2
4,2,linear,0.980000,0.016330,2
...,...,...,...,...,...
295,99,linear,0.966667,0.042164,43
296,99,poly,0.946667,0.033993,299
297,100,rbf,0.960000,0.038873,121
298,100,linear,0.966667,0.042164,43


GridSearchCV uses all the possible parameters. RandomizedSearchCV is faster than GridSearchCV but can't expect best parameters.

In [29]:
# 9. RandomizedsearchCV technique to find best parameter for SVM classifier model

In [18]:
rand_svm_model = RandomizedSearchCV(svm.SVC(gamma='auto'), {
    'C': range(1,101),
    'kernel': ['rbf', 'linear', 'poly']
}, cv=5, return_train_score=False, n_iter=6)

In [19]:
rand_svm_model.fit(iris.data, iris.target)

RandomizedSearchCV(cv=5, estimator=SVC(gamma='auto'), n_iter=6,
                   param_distributions={'C': range(1, 101),
                                        'kernel': ['rbf', 'linear', 'poly']})

In [20]:
rand_svm_model.best_params_

{'kernel': 'linear', 'C': 66}

In [21]:
rand_svm_model.best_score_

0.9666666666666666

In [22]:
pd.DataFrame(rand_svm_model.cv_results_)[['param_C', 'param_kernel', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score,std_test_score,rank_test_score
0,31,linear,0.96,0.04899,3
1,66,linear,0.966667,0.042164,1
2,26,poly,0.953333,0.033993,5
3,2,poly,0.966667,0.042164,1
4,46,poly,0.96,0.024944,3
5,38,poly,0.953333,0.033993,5


We can't guarantee that SVC is the suitable model. We can use hyperparameter technique to determine best model with best parameters too.

In [30]:
# 10. GridsearchCV technique to find best parameter from 3 classifier models

In [23]:
model_params = {
    'svc':{
        'model': svm.SVC(gamma='auto'),
        'params':{
            'C': [1,10,50,100],
            'kernel': ['rbf', 'linear', 'poly'] 
        }
    },
    'dtc':{
        'model': dtc(),
        'params':{
            'criterion': ["gini", "entropy"],
            'splitter': ["best", "random"],
            'max_depth': [1,5,10]
        }        
    },
    'rfc':{
        'model': rfc(),
        'params':{
            'criterion': ["gini", "entropy"],
            'n_estimators': [10, 20, 50, 100, 150, 200],
            'max_depth': [1,5,10]
        }        
    }
}

In [24]:
scores = []

for model_name,model_param in model_params.items():
    clf = GridSearchCV(model_param['model'], model_param['params'], cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)
    scores.append(
    {
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

print (scores)    

[{'model': 'svc', 'best_score': 0.9800000000000001, 'best_params': {'C': 1, 'kernel': 'rbf'}}, {'model': 'dtc', 'best_score': 0.9600000000000002, 'best_params': {'criterion': 'gini', 'max_depth': 10, 'splitter': 'best'}}, {'model': 'rfc', 'best_score': 0.9666666666666668, 'best_params': {'criterion': 'gini', 'max_depth': 5, 'n_estimators': 150}}]


In [31]:
# 11. Best parameters from each ML algorithms

In [26]:
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,svc,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,dtc,0.96,"{'criterion': 'gini', 'max_depth': 10, 'splitt..."
2,rfc,0.966667,"{'criterion': 'gini', 'max_depth': 5, 'n_estim..."


SVM classifier with C = 1 and kernel = rbf is a better model for this iris dataset