In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
from sklearn.datasets import load_iris

In [3]:
# Load the iris dataset; later cells read its data, target, feature_names
# and target_names attributes.
iris = load_iris()

In [4]:
# Assemble the measurement columns and the integer class label into one frame.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
# Replace the integer class codes with their species names.
# The names are looked up from iris.target rather than from df['target'], so
# the cell is idempotent: re-running it after the column already holds strings
# produces the same result instead of raising an indexing error.
df['target'] = iris.target_names[iris.target]
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
#Split the dataset
from sklearn.model_selection import train_test_split

In [7]:
# Feature matrix: every column of df except the label.
X = df.drop(columns='target')
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
# Labels come from the dataset's integer class codes, not from df['target'],
# which an earlier cell converted to species names.
y = iris.target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
# Hold out 30% of the rows for testing. No random_state is passed, so the
# split (and therefore any score computed from it) differs from run to run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [10]:
# Implement SVM
from sklearn.svm import SVC

In [11]:
# RBF-kernel support vector classifier with regularization parameter C=30.
svc = SVC(kernel='rbf',C=30,gamma='auto')

In [12]:
# Fit the classifier on the training split only.
svc.fit(X_train,y_train)

In [13]:
# Score the fitted classifier on the held-out test split.
svc.score(X_test,y_test)

0.9555555555555556

As we saw, the score from a single train_test_split changes from run to run, because the data is split differently each time. To get a more stable estimate, we use k-fold cross-validation.

In [14]:
from sklearn.model_selection import cross_val_score

In [15]:
# 5-fold cross-validation of an RBF-kernel SVC with C=10; returns one score per fold.
cross_val_score(SVC(kernel='rbf',C=10,gamma='auto'),iris.data,iris.target,cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [16]:
# Same 5-fold evaluation with a linear kernel and C=10.
cross_val_score(SVC(kernel='linear',C=10,gamma='auto'),iris.data,iris.target,cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [17]:
# RBF kernel again, this time with C=20.
cross_val_score(SVC(kernel='rbf',C=20,gamma='auto'),iris.data,iris.target,cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

As we can see, this is very repetitive. We can do the same thing with a for loop.

In [18]:
# Evaluate every (kernel, C) pair with 5-fold cross-validation and store the
# mean fold score under a "<kernel>_<C>" key.
kernels = ['rbf', 'linear']
penalties = [1, 10, 20, 30]
score = {}
for kernel in kernels:
    for penalty in penalties:
        estimator = SVC(kernel=kernel, C=penalty, gamma='auto')
        fold_scores = cross_val_score(estimator, iris.data, iris.target, cv=5)
        score[f'{kernel}_{penalty}'] = np.average(fold_scores)

score

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'rbf_30': 0.96,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666,
 'linear_30': 0.96}

We can do all of that with **GridSearchCV**.

In [19]:
from sklearn.model_selection import GridSearchCV

In [43]:
# Exhaustive search over every (C, kernel) pair, each scored with 5-fold
# cross-validation on the full dataset.
param_grid = [
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
]
clf = GridSearchCV(
    SVC(gamma='auto'),
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
)
clf.fit(iris.data, iris.target)

In [44]:
# Parameter combination with the best mean cross-validation score.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [45]:
# Mean cross-validation score of the best parameter combination.
clf.best_score_

0.9800000000000001

In [23]:
# Full per-candidate results: timings, parameters, per-fold and mean test scores.
clf.cv_results_

{'mean_fit_time': array([0.00045195, 0.00034065, 0.00039167, 0.00034776, 0.00039945,
        0.00034146]),
 'std_fit_time': array([3.86710526e-05, 1.55286470e-05, 9.21591898e-06, 2.62211015e-05,
        1.71791134e-05, 6.78618186e-06]),
 'mean_score_time': array([0.00027113, 0.00022459, 0.00024233, 0.00021377, 0.00023527,
        0.00021901]),
 'std_score_time': array([2.40003089e-05, 1.38996191e-05, 1.38419183e-05, 3.28844827e-06,
        1.27827938e-05, 1.36216945e-05]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'ker

In [24]:
# The raw cv_results_ dict is hard to read, so load it into a DataFrame.
# NOTE(review): this rebinds `df`, which earlier in the notebook held the iris table.
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000452,3.9e-05,0.000271,2.4e-05,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000341,1.6e-05,0.000225,1.4e-05,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000392,9e-06,0.000242,1.4e-05,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000348,2.6e-05,0.000214,3e-06,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000399,1.7e-05,0.000235,1.3e-05,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000341,7e-06,0.000219,1.4e-05,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [25]:
# Keep only the param_C, param_kernel and mean_test_score columns.
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


The best row of this table is also available directly from `clf.best_params_` and `clf.best_score_`.

### Important
#### GridSearchCV can be hard to manage and can take a lot of computational time when there are many features and many parameters.
#### **To handle that, we have RandomizedSearchCV**
RandomizedSearchCV does not try every combination of parameters. Instead it tries a random subset of them, and we choose how many combinations (iterations) it tries.

**RandomizedSearchCV**

In [26]:
from sklearn.model_selection import RandomizedSearchCV

In [27]:
# Randomized search: instead of fitting every (C, kernel) pair, sample only
# n_iter of them and report the best, which saves computation on large grids.
# random_state pins which candidates are sampled, so best_params_ and
# best_score_ are reproducible across kernel restarts.
rsc = RandomizedSearchCV(
    SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,  # evaluate only two sampled parameter combinations
    random_state=42,
)
rsc.fit(iris.data, iris.target)

In [28]:
# Best of the sampled parameter combinations.
rsc.best_params_

{'kernel': 'rbf', 'C': 10}

In [29]:
# Mean cross-validation score of the best sampled combination.
rsc.best_score_

0.9800000000000001

# Important

## How to Choose best model?

In [30]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [34]:
# Candidate estimators and the hyperparameter grid to search for each.
# Each entry maps a display name to {'model': estimator, 'params': grid}.
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        # Fixed seed so the forest's cross-validation score is the same on
        # every run and the model comparison is reproducible.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        # multi_class='auto' is the default, and passing the argument is
        # deprecated in recent scikit-learn releases, so it is omitted.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
}

In [36]:
# Inspect the (name, spec) pairs stored in model_params.
model_params.items()

dict_items([('svm', {'model': SVC(gamma='auto'), 'params': {'C': [1, 10, 20], 'kernel': ['rbf', 'linear']}}), ('random_forest', {'model': RandomForestClassifier(), 'params': {'n_estimators': [1, 5, 10]}}), ('logistic_regression', {'model': LogisticRegression(solver='liblinear'), 'params': {'C': [1, 5, 10]}})])

In [38]:
# Print each model name followed by its estimator/grid specification.
# (The loop was commented out while its printed output was kept; it is restored
# so the output can be reproduced on a fresh run.)
for name, spec in model_params.items():
    print(name)
    print(spec)

svm
{'model': SVC(gamma='auto'), 'params': {'C': [1, 10, 20], 'kernel': ['rbf', 'linear']}}
random_forest
{'model': RandomForestClassifier(), 'params': {'n_estimators': [1, 5, 10]}}
logistic_regression
{'model': LogisticRegression(solver='liblinear'), 'params': {'C': [1, 5, 10]}}


In [39]:
# Grid-search each candidate model with 5-fold cross-validation and record its
# best parameters and best mean score.
# Note: `clf` is rebound on every iteration, so after this cell it refers to
# the search for the last model in model_params.
best_model = []
for name, spec in model_params.items():
    clf = GridSearchCV(
        spec['model'],
        spec['params'],
        cv=5,
        return_train_score=False,
    ).fit(iris.data, iris.target)
    best_model.append(
        dict(
            model=name,
            best_params=clf.best_params_,
            best_score=clf.best_score_,
        )
    )

In [40]:
# One record per model: its name, best parameters and best score.
best_model

[{'model': 'svm',
  'best_params': {'C': 1, 'kernel': 'rbf'},
  'best_score': 0.9800000000000001},
 {'model': 'random_forest',
  'best_params': {'n_estimators': 10},
  'best_score': 0.9533333333333334},
 {'model': 'logistic_regression',
  'best_params': {'C': 5},
  'best_score': 0.9666666666666668}]

In [41]:
# Tabulate the per-model results for side-by-side comparison.
models = pd.DataFrame(best_model)
models.head()

Unnamed: 0,model,best_params,best_score
0,svm,"{'C': 1, 'kernel': 'rbf'}",0.98
1,random_forest,{'n_estimators': 10},0.953333
2,logistic_regression,{'C': 5},0.966667
