## Automated Hyper Paramter Tuning
### Techniques:
1. Bayesian Optimization
2. Gradient Descent
3. Evolutionary Algorithms


### 1. Bayesian Optimization
* Uses probability to find the minimum of the function.
* Final aim is to find the input value to the function which can gives us the lowest possible output value.
* Better than random, grid and manual search.
* It takes 3 parameters:

  1. Objective Function      =  defines the loss function to minimize.
  2. Domain Space            =  defines the range of input values to test.
  3. Optimization Algorithm  =  defines search algorithm to use to select the best input value. 

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

In [5]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# replacing 0 by median
def impute_zero(df, variable):
    df[variable] = np.where(df[variable]==0,df[variable].median(),df[variable])

impute_zero(df, 'Glucose')
impute_zero(df, 'Insulin')
impute_zero(df, 'SkinThickness')

In [7]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [8]:
X = df.drop(['Outcome'], axis = 1)
y = df['Outcome']
X 

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76,48.0,180.0,32.9,0.171,63
764,2,122.0,70,27.0,30.5,36.8,0.340,27
765,5,121.0,72,23.0,112.0,26.2,0.245,30
766,1,126.0,60,23.0,30.5,30.1,0.349,47


In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 33)

In [41]:
space = {'criterion' : hp.choice('criterion', ['entropy', 'gini']),
         'max_depth' : hp.quniform('max_depth', 10, 1200, 10),
         'max_features' : hp.choice('max_features', ['sqrt', 'log2', None]),
         'min_samples_leaf' : hp.uniform('min_samples_leaf', 0, 0.5),
         'min_samples_split' : hp.uniform('min_samples_split', 0,1),
         'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200, 1300, 1500])}

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

In [44]:
def objective(space):
    space['max_depth'] = int(space['max_depth'])
    space['n_estimators'] = int(space['n_estimators'])

    model = RandomForestClassifier(criterion = space['criterion'],
                                   max_depth = space['max_depth'],
                                   max_features = space['max_features'],
                                   min_samples_leaf = space['min_samples_leaf'],
                                   min_samples_split = space['min_samples_split'],
                                   n_estimators = space['n_estimators'])
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    return {'loss' : -accuracy, 'status' : STATUS_OK}

In [45]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80,
            trials = trials)

best

100%|██████████| 80/80 [04:34<00:00,  3.43s/trial, best loss: -0.7720111955217913]


{'criterion': np.int64(1),
 'max_depth': np.float64(80.0),
 'max_features': np.int64(2),
 'min_samples_leaf': np.float64(0.002012455471628427),
 'min_samples_split': np.float64(0.11608133100312432),
 'n_estimators': np.int64(3)}

In [46]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'sqrt', 1: 'log2', 2: None}
estm = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200, 5: 1300, 6: 1500} 

print(crit[best['criterion']])
print(feat[best['max_features']])
print(estm[best['n_estimators']])

gini
None
750


In [49]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']],
                                       max_features = feat[best['max_features']],
                                       min_samples_leaf = best['min_samples_leaf'],
                                        min_samples_split = best['min_samples_split'],
                                        n_estimators = estm[best['n_estimators']])

trainedforest.fit(X_train, y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test, predictionforest))
print(accuracy_score(y_test, predictionforest))
print(classification_report(y_test, predictionforest))

[[86 13]
 [27 28]]
0.7402597402597403
              precision    recall  f1-score   support

           0       0.76      0.87      0.81        99
           1       0.68      0.51      0.58        55

    accuracy                           0.74       154
   macro avg       0.72      0.69      0.70       154
weighted avg       0.73      0.74      0.73       154



### Genetic Algorithms
* Genetic Algorithms tries to apply natural selection mechanisms to Machine Learning contexts.
* Let's immagine we create a population of N Machine Learning models with some predifined Hyperparameters. We can then calculate the accuracy of each model and decide to keep just half of the models (the ones that performs best). We can now generate some offsprings having similar Hyperparameters to the ones of the best models so that go get again a population of N models. At this point we can again caltulate the accuracy of each model and repeate the cycle for a defined number of generations. In this way, just the best models will survive at the end of the process.


In [58]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['sqrt','log2']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]

# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [None]:
# param = {
#     'n_estimators': [100, 200, 300],  # Start with fewer estimators
#     'max_features': ['auto', 'sqrt'],  # Keep fewer options for features
#     'max_depth': [10, 50, 100],  # Limited depth options
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2],
#     'criterion': ['gini', 'entropy']
# }


In [62]:
from tpot import TPOTClassifier

# Setup TPOT with the custom configuration dictionary
tpot_classifier = TPOTClassifier(
    generations=5,
    population_size=24,
    offspring_size=12,
    verbosity=2,
    early_stop=12,
    config_dict={'sklearn.ensemble.RandomForestClassifier': param},  # Ensure the correct format
    cv=4,
    scoring='accuracy'
)

# Fit the model
tpot_classifier.fit(X_train, y_train)


                                                                   
Generation 1 - Current best internal CV score: -inf
                                                                              

IndexError: list index out of range