### Techniques of Hyper Parameter Optimization
1. GridSearchCV
2. RandomizedSearchCV
3. Bayesian Optimization -Automate Hyperparameter Tuning (Hyperopt)
4. Sequential Model Based Optimization(Tuning a scikit-learn estimator with skopt)
5. Optuna- Automate Hyperparameter Tuning
6. Genetic Algorithms (TPOT Classifier)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Lifecycle of Data Science Project
1. EDA
2. Feature Engineering
3. Feature Creation
4. Model Creation.
5. Hyperparameter Tuning (RFC)

In [3]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df['Glucose'].median()

np.float64(117.0)

In [4]:
# replacing 0 by median
def impute_zero(df, variable):
    df[variable] = np.where(df[variable]==0,df[variable].median(),df[variable])

In [7]:
impute_zero(df, 'Glucose')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35,0,33.6,0.627,50,1
1,1,85.0,66,29,0,26.6,0.351,31,0
2,8,183.0,64,0,0,23.3,0.672,32,1
3,1,89.0,66,23,94,28.1,0.167,21,0
4,0,137.0,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76,48,180,32.9,0.171,63,0
764,2,122.0,70,27,0,36.8,0.340,27,0
765,5,121.0,72,23,112,26.2,0.245,30,0
766,1,126.0,60,0,0,30.1,0.349,47,1


In [8]:
df['Glucose'].unique()

array([148.,  85., 183.,  89., 137., 116.,  78., 115., 197., 125., 110.,
       168., 139., 189., 166., 100., 118., 107., 103., 126.,  99., 196.,
       119., 143., 147.,  97., 145., 117., 109., 158.,  88.,  92., 122.,
       138., 102.,  90., 111., 180., 133., 106., 171., 159., 146.,  71.,
       105., 101., 176., 150.,  73., 187.,  84.,  44., 141., 114.,  95.,
       129.,  79.,  62., 131., 112., 113.,  74.,  83., 136.,  80., 123.,
        81., 134., 142., 144.,  93., 163., 151.,  96., 155.,  76., 160.,
       124., 162., 132., 120., 173., 170., 128., 108., 154.,  57., 156.,
       153., 188., 152., 104.,  87.,  75., 179., 130., 194., 181., 135.,
       184., 140., 177., 164.,  91., 165.,  86., 193., 191., 161., 167.,
        77., 182., 157., 178.,  61.,  98., 127.,  82.,  72., 172.,  94.,
       175., 195.,  68., 186., 198., 121.,  67., 174., 199.,  56., 169.,
       149.,  65., 190.])

In [9]:
impute_zero(df, 'Insulin')

In [10]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35,30.5,33.6,0.627,50,1
1,1,85.0,66,29,30.5,26.6,0.351,31,0
2,8,183.0,64,0,30.5,23.3,0.672,32,1
3,1,89.0,66,23,94.0,28.1,0.167,21,0
4,0,137.0,40,35,168.0,43.1,2.288,33,1


In [11]:
impute_zero(df, 'SkinThickness')

In [12]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [None]:
X = df.drop(['Outcome'], axis = 1)
y = df['Outcome']
X 

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76,48.0,180.0,32.9,0.171,63
764,2,122.0,70,27.0,30.5,36.8,0.340,27
765,5,121.0,72,23.0,112.0,26.2,0.245,30
766,1,126.0,60,23.0,30.5,30.1,0.349,47


In [15]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 33)

In [18]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 10)

In [19]:
rfc.fit(X_train, y_train)

In [23]:
y_pred = rfc.predict(X_test)

In [24]:
y.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [25]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score

In [27]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))


[[84 15]
 [25 30]]
              precision    recall  f1-score   support

           0       0.77      0.85      0.81        99
           1       0.67      0.55      0.60        55

    accuracy                           0.74       154
   macro avg       0.72      0.70      0.70       154
weighted avg       0.73      0.74      0.73       154

0.7402597402597403


### Randomized Search CV
* First we should always used Randomized Search CV as it will narrow down our results.
* Then we will apply GridSearch CV.

In [30]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 1000, 10)]
min_samples_split = [1,3,4,5,7,9]
min_samples_leaf = [1, 2, 4, 6, 8]

random_grid = {'n_estimators' : n_estimators,
               'max_features' : max_features,
               'max_depth' : max_depth,
               'min_samples_split' : min_samples_split,
               'min_samples_leaf' : min_samples_leaf,
               'criterion' : ['entropy', 'gini']}

In [31]:
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 3, 4, 5, 7, 9], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [33]:
rf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(estimator = rf,
                                param_distributions= random_grid,
                                n_iter = 100,
                                cv = 3,
                                verbose = 2,
                                random_state = 100,
                                n_jobs = -1)

In [34]:
rf_randomcv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


120 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
53 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HP\OneDrive\Desktop\hyper_tuning\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HP\OneDrive\Desktop\hyper_tuning\myenv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\HP\OneDrive\Desktop\hyper_tuning\myenv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
   

In [35]:
rf_randomcv.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 560,
 'criterion': 'gini'}

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 560,
 'criterion': 'gini'}

 * By performing Randomized Search CV, I found that this is the region where my model is gonna be performing well.

In [36]:
rf_randomcv

In [37]:
rf_randomcv.best_estimator_

In [39]:
best_model = rf_randomcv.best_estimator_
y_pred1 = best_model.predict(X_test)
print(confusion_matrix(y_test, y_pred1))
print(classification_report(y_test, y_pred1))
print(accuracy_score(y_test, y_pred1))

[[85 14]
 [25 30]]
              precision    recall  f1-score   support

           0       0.77      0.86      0.81        99
           1       0.68      0.55      0.61        55

    accuracy                           0.75       154
   macro avg       0.73      0.70      0.71       154
weighted avg       0.74      0.75      0.74       154

0.7467532467532467


In [40]:
rfcc = RandomForestClassifier(max_depth=560, min_samples_leaf=2, min_samples_split=5,
                       n_estimators=200, criterion = 'gini', max_features = 'sqrt')
rfcc.fit(X_train, y_train)
y_pred2 = rfcc.predict(X_test)
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))


[[85 14]
 [24 31]]
              precision    recall  f1-score   support

           0       0.78      0.86      0.82        99
           1       0.69      0.56      0.62        55

    accuracy                           0.75       154
   macro avg       0.73      0.71      0.72       154
weighted avg       0.75      0.75      0.75       154

0.7532467532467533


### Grid Search CV
* Now we will perform search around the best paramters given by RandomizedSearch CV.
* Here we dont have option for iteration.
* Iterations will be the product of the no. of values in each parameter.

In [41]:
[rf_randomcv.best_params_['min_samples_split'],
                                       rf_randomcv.best_params_['min_samples_split'] + 1,
                                       rf_randomcv.best_params_['min_samples_split'] - 1,
                                       rf_randomcv.best_params_['min_samples_split'] + 2,
                                       rf_randomcv.best_params_['min_samples_split'] - 2]

[5, 6, 4, 7, 3]

In [42]:
[rf_randomcv.best_params_['min_samples_leaf'],
                                   rf_randomcv.best_params_['min_samples_leaf'] + 2,
                                    rf_randomcv.best_params_['min_samples_leaf'] + 4]

[2, 4, 6]

In [43]:
[rf_randomcv.best_params_['n_estimators'] - 200,
                                  rf_randomcv.best_params_['n_estimators'],
                                  rf_randomcv.best_params_['n_estimators'] + 200]

[0, 200, 400]

In [46]:
param_grid = {'criterion' : [rf_randomcv.best_params_['criterion']],
              'max_depth' : [rf_randomcv.best_params_['max_depth']],
              'max_features' : [rf_randomcv.best_params_['max_features']],
              'min_samples_leaf' : [rf_randomcv.best_params_['min_samples_leaf'],
                                   rf_randomcv.best_params_['min_samples_leaf'] + 2,
                                    rf_randomcv.best_params_['min_samples_leaf'] + 4],
                'min_samples_split' : [rf_randomcv.best_params_['min_samples_split'],
                                       rf_randomcv.best_params_['min_samples_split'] + 1,
                                       rf_randomcv.best_params_['min_samples_split'] - 1,
                                       rf_randomcv.best_params_['min_samples_split'] + 2,
                                       rf_randomcv.best_params_['min_samples_split'] - 2],
                'n_estimators' : [rf_randomcv.best_params_['n_estimators'] - 200,
                                  rf_randomcv.best_params_['n_estimators'] - 100,
                                  rf_randomcv.best_params_['n_estimators'],
                                  rf_randomcv.best_params_['n_estimators'] + 100,
                                  rf_randomcv.best_params_['n_estimators'] + 200]
            }

In [47]:
rf = RandomForestClassifier()
grid = GridSearchCV(estimator = rf, 
                    param_grid = param_grid,
                    cv = 10,
                    n_jobs = -1, 
                    verbose = 2)

grid.fit(X_train, y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


150 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HP\OneDrive\Desktop\hyper_tuning\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HP\OneDrive\Desktop\hyper_tuning\myenv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\HP\OneDrive\Desktop\hyper_tuning\myenv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  

In [48]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 560,
 'max_features': 'sqrt',
 'min_samples_leaf': 6,
 'min_samples_split': 6,
 'n_estimators': 100}

In [49]:
grid.best_estimator_

In [50]:
best_grid = grid.best_estimator_

In [51]:
y_pred3 = best_grid.predict(X_test)
print(confusion_matrix(y_test, y_pred3))
print(classification_report(y_test, y_pred3))
print(accuracy_score(y_test, y_pred3))

[[88 11]
 [26 29]]
              precision    recall  f1-score   support

           0       0.77      0.89      0.83        99
           1       0.72      0.53      0.61        55

    accuracy                           0.76       154
   macro avg       0.75      0.71      0.72       154
weighted avg       0.76      0.76      0.75       154

0.7597402597402597


In [53]:
rf = RandomForestClassifier(max_depth=450, min_samples_leaf=3, min_samples_split=3, criterion = 'entropy', max_features = 'log2')
rf.fit(X_train, y_train)
y_pred4 = rf.predict(X_test)
print(confusion_matrix(y_test, y_pred4))
print(classification_report(y_test, y_pred4))
print(accuracy_score(y_test, y_pred4))

[[84 15]
 [22 33]]
              precision    recall  f1-score   support

           0       0.79      0.85      0.82        99
           1       0.69      0.60      0.64        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154

0.7597402597402597
