<a href="https://colab.research.google.com/github/Venkatpandey/DataScience_ML/blob/main/ml/Hyper_parameter_Tuning_(GridSearchCV).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import pandas as pd

In [2]:
from sklearn.datasets import load_digits
# Load the digits dataset bundled with scikit-learn.
digits = load_digits()
# List the attributes available on the returned dataset object.
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [3]:
# Show the class labels stored on the dataset object.
print(digits.target_names)

[0 1 2 3 4 5 6 7 8 9]


In [4]:
# .size is the total number of elements in the feature matrix
# (rows x columns), not the number of samples; .shape shows both dimensions.
print(digits.data.size)

115008


In [5]:
# Wrap the feature matrix in a DataFrame, labelling each column with its
# feature name, and preview the first five rows.
df = pd.DataFrame(
    data=digits.data,
    columns=digits.feature_names,
)
df.head(n=5)

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [6]:
# Hold out 20% of the samples for testing.
# A fixed random_state pins the shuffle, so the split -- and every score
# computed from it -- is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.2,
    random_state=42,
)
# .size counts every element of the training matrix (samples x features).
print(X_train.size)

91968


In [7]:
# Untuned baseline: fit an RBF-kernel SVC on the training split and report its
# score on the held-out split.
# NOTE(review): per the scikit-learn docs gamma='auto' is 1 / n_features and
# ignores the feature scale; on these unscaled pixel values that is the likely
# reason for the low RBF scores -- consider gamma='scale' or scaling the
# features. Confirm before changing, since later cells use the same setting.
svc_model = svm.SVC(C=30, kernel='rbf', gamma='auto').fit(X_train, y_train)
svc_model.score(X_test, y_test)

0.5138888888888888

In [8]:
from sklearn.model_selection import cross_val_score


In [9]:
# 5-fold cross-validation of an RBF-kernel SVC with C=30 on the full dataset.
cross_val_score(
    estimator=svm.SVC(C=30, kernel='rbf', gamma='auto'),
    X=digits.data,
    y=digits.target,
    cv=5,
)

array([0.45277778, 0.46944444, 0.47910864, 0.47910864, 0.50139276])

In [10]:
# Same 5-fold evaluation with the regularisation strength lowered to C=10.
cross_val_score(
    estimator=svm.SVC(C=10, kernel='rbf', gamma='auto'),
    X=digits.data,
    y=digits.target,
    cv=5,
)

array([0.45277778, 0.46944444, 0.47910864, 0.47910864, 0.50139276])

In [11]:
# 5-fold evaluation of a linear-kernel SVC with C=20.
cross_val_score(
    estimator=svm.SVC(C=20, kernel='linear', gamma='auto'),
    X=digits.data,
    y=digits.target,
    cv=5,
)

array([0.96388889, 0.91944444, 0.96657382, 0.9637883 , 0.92479109])

In [12]:
# Manual grid search: cross-validate every kernel / C pairing and store the
# mean of the 5 fold scores under a "<kernel>_<C>" key.
import numpy as np

kernel = ['rbf', 'linear']
C = [1, 10, 15, 20, 25, 30]
avg_score = {}
for kernel_name in kernel:
    for c_value in C:
        fold_scores = cross_val_score(
            svm.SVC(kernel=kernel_name, C=c_value, gamma='auto'),
            digits.data,
            digits.target,
            cv=5,
        )
        avg_score[f'{kernel_name}_{c_value}'] = np.mean(fold_scores)
print(avg_score)

{'rbf_1': 0.448545341999381, 'rbf_10': 0.47636645001547506, 'rbf_15': 0.47636645001547506, 'rbf_20': 0.47636645001547506, 'rbf_25': 0.47636645001547506, 'rbf_30': 0.47636645001547506, 'linear_1': 0.9476973073351903, 'linear_10': 0.9476973073351903, 'linear_15': 0.9476973073351903, 'linear_20': 0.9476973073351903, 'linear_25': 0.9476973073351903, 'linear_30': 0.9476973073351903}


In [13]:
# Let GridSearchCV do the same exhaustive kernel / C sweep with 5-fold CV.
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid={
        'C': [1, 10, 15, 20, 25, 30],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
)

clf.fit(X=digits.data, y=digits.target)
# Raw per-candidate results (timings, parameters, fold scores) as a dict.
clf.cv_results_

{'mean_fit_time': array([0.26690397, 0.02176003, 0.27214403, 0.02209406, 0.27358966,
        0.02047515, 0.27042265, 0.02184854, 0.27040639, 0.0205359 ,
        0.27318821, 0.020717  ]),
 'std_fit_time': array([0.00278707, 0.00048225, 0.00663633, 0.00092846, 0.00417603,
        0.00075723, 0.00449801, 0.00058792, 0.0067463 , 0.00078374,
        0.00401265, 0.00080757]),
 'mean_score_time': array([0.07663274, 0.00604458, 0.07832985, 0.00623722, 0.08035188,
        0.00565047, 0.08097401, 0.00603862, 0.07965736, 0.0056704 ,
        0.08365369, 0.00576124]),
 'std_score_time': array([2.19733219e-03, 3.06217769e-04, 2.25141702e-03, 2.39780845e-04,
        2.44288348e-03, 9.13283708e-05, 7.97870354e-04, 2.91400661e-04,
        2.10190417e-03, 1.56447401e-04, 3.18052455e-03, 1.67756883e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 15, 15, 20, 20, 25, 25, 30, 30],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],

In [14]:
# Tabulate the search results and show only the parameters and mean score.
cv_result = pd.DataFrame(data=clf.cv_results_)
print(cv_result.loc[:, ['param_C', 'param_kernel', 'mean_test_score']])

   param_C param_kernel  mean_test_score
0        1          rbf         0.448545
1        1       linear         0.947697
2       10          rbf         0.476366
3       10       linear         0.947697
4       15          rbf         0.476366
5       15       linear         0.947697
6       20          rbf         0.476366
7       20       linear         0.947697
8       25          rbf         0.476366
9       25       linear         0.947697
10      30          rbf         0.476366
11      30       linear         0.947697


In [15]:
# Best mean cross-validated score among the candidates tried by `clf`.
clf.best_score_

0.9476973073351903

In [16]:
# Parameter combination that produced the best score in `clf`.
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [17]:
# Randomized search: score only n_iter randomly drawn parameter combinations
# instead of the whole grid, trading completeness for speed.
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1, 10, 15, 20, 25, 30],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    # Pin which candidates are drawn so reruns report the same rows.
    random_state=42,
)

In [18]:
# Fit the randomized search on the full dataset, then print the sampled
# candidates with their mean test score.
rs.fit(X=digits.data, y=digits.target)
print(
    pd.DataFrame(data=rs.cv_results_).loc[
        :, ['param_C', 'param_kernel', 'mean_test_score']
    ]
)

  param_C param_kernel  mean_test_score
0      20          rbf         0.476366
1      30       linear         0.947697


In [19]:
# Candidate models for the comparison below. Each entry maps a display name to
# an unfitted estimator ('model') and the grid of hyper-parameters to search
# ('params').
model_param = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20, 30],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        # Seeded so the bootstrap / feature sampling, and therefore the
        # reported best score, is the same on every run.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regressions': {
        # multi_class='auto' was already the default and the argument is
        # deprecated in recent scikit-learn releases, so it is not passed.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
    'gaussian': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': [1.0, 2.5],
        },
    },
    'multinomial': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [1.5, 2.5],
        },
    },
    'decisionTree': {
        # Seeded for the same reason as the random forest: split selection
        # is randomised, so an unseeded tree scores differently per run.
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [10],
        },
    },
}

In [20]:
# Run a 5-fold grid search for every entry in model_param and record the best
# score and parameters of each model as one dict per model.
scores = []

# Each search is bound to its own loop-local name so the global `clf` (the
# fitted SVC grid search from the earlier cell) is not overwritten; re-running
# the cells that read `clf` after this loop therefore still shows SVC results.
for model_name, spec in model_param.items():
    search = GridSearchCV(
        spec['model'],
        spec['params'],
        cv=5,
        return_train_score=False,
    )
    search.fit(digits.data, digits.target)
    scores.append({
        'model': model_name,
        'best_score': search.best_score_,
        'best_params': search.best_params_,
    })

In [21]:
# Summarise the per-model results as a table: one row per model.
print(
    pd.DataFrame(
        data=scores,
        columns=('model', 'best_score', 'best_params'),
    )
)

                  model  best_score                                best_params
0                   svm    0.947697               {'C': 1, 'kernel': 'linear'}
1         random_forest    0.895953                       {'n_estimators': 10}
2  logistic_regressions    0.922114                                   {'C': 1}
3              gaussian    0.882030                     {'var_smoothing': 1.0}
4           multinomial    0.872021                             {'alpha': 2.5}
5          decisionTree    0.810251  {'criterion': 'entropy', 'max_depth': 10}
