<a href="https://colab.research.google.com/github/Ruchit77/Data_Science_learning/blob/main/Hyperparameter_Optimization_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperparameter Optimization 

We will cover this topic in three parts: running a model without any tuning, tuning the hyperparameters manually, and tuning them automatically.

## Model run without any tuning

We will run a Support Vector Machine (SVM) model on the ionosphere dataset and evaluate it with repeated stratified k-fold cross-validation.

In [None]:
# load dataset
import pandas as pd
import numpy as np


# header=None: the first row is read as data rather than as column names,
# so the columns get integer labels.
df = pd.read_csv('ionosphere.csv', header=None)

In [None]:
# Separate the raw array into the feature matrix (every column but the last)
# and the target vector (the last column).
data = df.values
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)


(351, 34) (351,)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from numpy import mean
from numpy import std

# Baseline: an SVC with every hyperparameter left at its default.
model = SVC()

# 10 stratified folds, repeated 3 times with a fixed seed -> 30 scores.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)

# Score each fold; error_score='raise' surfaces a failing fit as an exception
# instead of recording a placeholder score.
model_scores = cross_val_score(
    model, X, y,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    error_score='raise',
)
print('Accuracy: %.3f (%.3f)' % (model_scores.mean(), model_scores.std()))

Accuracy: 0.937 (0.038)


The default model already reaches an accuracy of about 94%, which is quite good, but we would still like to see whether tuning can improve it.

## Manual tuning of Hyperparameters

> Looking at the SVC API, we select the hyperparameters we would like to tune: C (the regularization parameter), the kernel type, the polynomial degree, and gamma.

> To tune these hyperparameters, we first give each one a range (or set) of allowed values, so that the optimizer can try different combinations by trial and error.

In [None]:
pip install scikit-optimize



In [None]:
# Hyperparameter search space for the SVC, written as one list literal.
# Each dimension is named after the SVC keyword argument it tunes.

import skopt.space.space as s
search_space = [
    s.Real(1e-6, 100.0, 'log-uniform', name='C'),
    s.Categorical(['linear', 'poly', 'rbf', 'sigmoid'], name='kernel'),
    s.Integer(1, 5, name='degree'),
    s.Real(1e-6, 100.0, 'log-uniform', name='gamma'),
]

In [None]:
# show the dimensions that make up the search space
print(search_space)

[Real(low=1e-06, high=100.0, prior='log-uniform', transform='identity'), Categorical(categories=('linear', 'poly', 'rbf', 'sigmoid'), prior=None), Integer(low=1, high=5, prior='uniform', transform='identity'), Real(low=1e-06, high=100.0, prior='log-uniform', transform='identity')]


In [None]:
# Objective function: scores one hyperparameter configuration.
from skopt.utils import use_named_args


@use_named_args(search_space)
def evaluate_model(**params):
    """Return ``1 - mean accuracy`` of an SVC configured with ``params``.

    ``use_named_args`` lets the optimizer call this function with a plain
    list of values; they arrive here as keyword arguments named after the
    dimensions of ``search_space``.
    """
    # SVC with the candidate hyperparameters applied
    candidate = SVC().set_params(**params)

    # test harness: 10 stratified folds, repeated 3 times with a fixed seed
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    # one accuracy score per fold
    fold_scores = cross_val_score(candidate, X, y, cv=folds, n_jobs=-1, scoring='accuracy')

    # the optimizer minimizes, so convert mean accuracy into an error rate
    return 1.0 - mean(fold_scores)

In [None]:

# perform optimization
# The earlier `import skopt.space.space as s` binds only `s`, so the name
# `skopt` must be imported explicitly before it can be used here.
import skopt

result = skopt.gp_minimize(evaluate_model, search_space)



In [None]:
# result.fun is the lowest objective value found (1 - accuracy); invert it back
print('Best Accuracy: %.3f' % (1.0 - result.fun))
# result.x lists the best values in search_space order: C, kernel, degree, gamma
print('Best Parameters: %s' % (result.x))

Best Accuracy: 0.950
Best Parameters: [26.336031435121907, 'rbf', 3, 0.3269725428431265]


## Automatically tuning the Hyperparameters

> scikit-learn also provides classes that search for good hyperparameter values automatically, such as RandomizedSearchCV and GridSearchCV. scikit-optimize adds BayesSearchCV, which is used in the same way.

> 1. BayesSearchCV

In [None]:
# BayesSearchCV comes from scikit-optimize. The earlier imports never bind the
# name `skopt` itself, so import it explicitly here.
import skopt

# define search space
# (low, high, prior) -> real-valued range, (low, high) -> integer range,
# list -> categorical choices
params = {
    'C': (1e-6, 100.0, 'log-uniform'),
    'gamma': (1e-6, 100.0, 'log-uniform'),
    'degree': (1, 5),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

# define evaluation: 10 stratified folds, repeated 3 times with a fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# define the search
search = skopt.BayesSearchCV(estimator=SVC(), search_spaces=params, n_jobs=-1, cv=cv)

# perform the search
search.fit(X, y)

# report the best result
print(search.best_score_)
print(search.best_params_)

0.9221272554605888
OrderedDict([('C', 0.001207682064766801), ('degree', 2), ('gamma', 4.020302939608209), ('kernel', 'poly')])


> 2. GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm, datasets

# Candidate values per hyperparameter.
# NOTE: GridSearchCV tries each listed value exactly as given, so every tuple
# below is a pair of candidates (its two endpoints), not a continuous range.
params = {
    'C': (1e-6, 100.0),
    'gamma': (1e-6, 100.0),
    'degree': (1, 5),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

svc = svm.SVC()

# exhaustive search over all 2 * 2 * 2 * 4 = 32 combinations
clf = GridSearchCV(param_grid=params, estimator=svc)

clf.fit(X, y)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1e-06, 100.0), 'degree': (1, 5),
                         'gamma': (1e-06, 100.0),
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
# hyperparameter combination with the best mean cross-validated score
clf.best_params_

{'C': 100.0, 'degree': 1, 'gamma': 1e-06, 'kernel': 'linear'}

In [None]:
# mean cross-validated score achieved by that best combination
clf.best_score_

0.8833400402414486

> 3. RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

from sklearn import svm, datasets

# Values to sample from for each hyperparameter.
# NOTE: plain tuples/lists are sampled as discrete candidates, so each tuple
# below offers only its two endpoints, not a continuous range.
params = {
    'C': (1e-6, 100.0),
    'gamma': (1e-6, 100.0),
    'degree': (1, 5),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

svc = svm.SVC()

# no random_state is passed, so the sampled candidates vary from run to run
clf = RandomizedSearchCV(estimator=svc, param_distributions=params)

clf.fit(X, y)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'C': (1e-06, 100.0), 'degree': (1, 5),
                                        'gamma': (1e-06, 100.0),
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [None]:
# best hyperparameter combination among the sampled candidates
clf.best_params_

{'C': 100.0, 'degree': 5, 'gamma': 1e-06, 'kernel': 'linear'}

In [None]:
# mean cross-validated score achieved by that best combination
clf.best_score_

0.8833400402414486

> Using RandomizedSearchCV on a Random Forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Discrete candidate values for each random-forest hyperparameter.
# 'auto' is intentionally not offered for max_features: for a classifier it
# meant the same as 'sqrt', and recent scikit-learn releases reject it.
param_space = {
    "bootstrap": [True],
    "max_depth": [6, 8, 10, 12, 14],
    "max_features": ['sqrt', 'log2'],
    "min_samples_leaf": [2, 3, 4],
    "min_samples_split": [2, 3, 4, 5],
    "n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
}

forest_clf = RandomForestClassifier()

# Sample 32 combinations and score each with 5-fold CV; random_state fixes
# which combinations are drawn.
forest_rand_search = RandomizedSearchCV(forest_clf, param_space, n_iter=32,
                                        scoring="accuracy", verbose=True, cv=5,
                                        n_jobs=-1, random_state=42)

forest_rand_search.fit(X, y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  2.0min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [None]:
# best random-forest hyperparameter combination among the sampled candidates
forest_rand_search.best_params_

{'bootstrap': True,
 'max_depth': 14,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 300}

In [None]:
# mean cross-validated accuracy achieved by that best combination
forest_rand_search.best_score_

0.9401609657947686