# Hyperparameter Optimization Techinique

##  GridSearchCV

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [2]:
df = pd.read_csv('iris.csv')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Perform train test split

In [3]:
x = df.drop('species',axis=1)
y = df['species']

In [4]:
x.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
y.head()

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: species, dtype: object

In [6]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=21)

### First Apply RandomizedSearchCV then GridSearchCV

In [13]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [14]:
n_estimators = [int(x) for x in np.linspace(200,2000,10)]
max_features = ['auto','sqrt','log2']
max_depth = [int(x) for x in np.linspace(10,1000,10)]
min_samples_split = [1,3,5,6,8,9,10,13]
min_samples_leaf = [1,2,4,6,7,9,10,13]

In [15]:
rf = RandomForestClassifier()
rf_randomCV = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                                random_state=100,n_jobs=-1)
rf_randomCV.fit(x_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.3min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 7, 9,
                                                             10, 13],
                                        'min_samples_split': [1, 3, 5, 6, 8, 9,
                                                              10, 13],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
             

In [16]:
param_grid = {
    'criterion':[rf_randomCV.best_params_['criterion']],
    'max_depth':[rf_randomCV.best_params_['max_depth']],
    'max_features':[rf_randomCV.best_params_['max_features']],
    'min_samples_leaf':[rf_randomCV.best_params_['min_samples_leaf'],
                       rf_randomCV.best_params_['min_samples_leaf']+2,
                       rf_randomCV.best_params_['min_samples_leaf']+4],
    'min_samples_split':[rf_randomCV.best_params_['min_samples_split']-2,
                        rf_randomCV.best_params_['min_samples_split']-1,
                        rf_randomCV.best_params_['min_samples_split'],
                        rf_randomCV.best_params_['min_samples_split']+1,
                        rf_randomCV.best_params_['min_samples_split']+2],
    'n_estimators':[rf_randomCV.best_params_['n_estimators']-200,
                   rf_randomCV.best_params_['n_estimators']-100,
                   rf_randomCV.best_params_['n_estimators'],
                   rf_randomCV.best_params_['n_estimators']+100,
                   rf_randomCV.best_params_['n_estimators']+200],
}
print(param_grid)

{'criterion': ['gini'], 'max_depth': [230], 'max_features': ['sqrt'], 'min_samples_leaf': [2, 4, 6], 'min_samples_split': [4, 5, 6, 7, 8], 'n_estimators': [0, 100, 200, 300, 400]}


#### Fit GridSearchCV to data

In [18]:
rf =RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:   35.5s finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [230],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [2, 4, 6],
                         'min_samples_split': [4, 5, 6, 7, 8],
                         'n_estimators': [0, 100, 200, 300, 400]},
             verbose=2)

In [19]:
grid_est = rf_randomCV.best_estimator_
print(grid_est)

RandomForestClassifier(max_depth=230, max_features='sqrt', min_samples_leaf=2,
                       min_samples_split=6, n_estimators=200)


In [20]:
prediction = grid_est.predict(x_test)
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[11  0  0]
 [ 0 10  2]
 [ 0  0  7]]
0.9333333333333333
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        11
  versicolor       1.00      0.83      0.91        12
   virginica       0.78      1.00      0.88         7

    accuracy                           0.93        30
   macro avg       0.93      0.94      0.93        30
weighted avg       0.95      0.93      0.93        30

