In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

### As in Glucose, insulin column , some values are 0 . In real world we dont have the glucose and insulin as 0 , so we are simply replacing 0 with median value

In [6]:
df['Glucose'] = np.where(df['Glucose'] == 0, df['Glucose'].median(), df['Glucose'])
df['Insulin'] = np.where(df['Insulin'] == 0, df['Insulin'].median(), df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness'] == 0, df['SkinThickness'].median(), df['SkinThickness'])
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [7]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [8]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33


In [9]:
# spplitting the data into train test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

In [10]:
X_train.shape

(614, 8)

In [11]:
X_test.shape

(154, 8)

### Type 1: Simply we are tuning with the no. of estimators and observing the performance of the model

In [12]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
rf_predict = rf_clf.predict(X_test)

In [13]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

print(classification_report(y_test, rf_predict))
print(accuracy_score(y_test, rf_predict))
print(confusion_matrix(y_test, rf_predict))

              precision    recall  f1-score   support

           0       0.73      0.94      0.82       100
           1       0.76      0.35      0.48        54

    accuracy                           0.73       154
   macro avg       0.74      0.65      0.65       154
weighted avg       0.74      0.73      0.70       154

0.7337662337662337
[[94  6]
 [35 19]]


In [14]:
###  Manual hyperparameter tuning 
rf_manual = RandomForestClassifier(n_estimators=250, criterion='entropy', max_depth=10, max_features='sqrt', min_samples_leaf=10, random_state=200).fit(X_train, y_train)
rf_man_predict = rf_manual.predict(X_test)

In [15]:
print(classification_report(y_test, rf_man_predict))
print(accuracy_score(y_test, rf_man_predict))
print(confusion_matrix(y_test, rf_man_predict))

              precision    recall  f1-score   support

           0       0.75      0.91      0.82       100
           1       0.73      0.44      0.55        54

    accuracy                           0.75       154
   macro avg       0.74      0.68      0.69       154
weighted avg       0.74      0.75      0.73       154

0.7467532467532467
[[91  9]
 [30 24]]


###  Randomized Search CV

###  <font color='red'>Randomized search cv will pick the hyperparameters randomly and apply the algo. and whichever gives the best result among the randomly taken hyperparameters those will be best parameters for that model</font>

In [16]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [17]:
rf_clf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(estimator=rf_clf, param_distributions=random_grid, n_iter = 100,cv=4,random_state=100,n_jobs=-1)

In [18]:
rf_randomcv.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
rf_randomcv.best_params_

In [None]:
rf_randomcv.best_estimator_

In [None]:
best_randomcv = rf_randomcv.best_estimator_

In [None]:
y_pred=best_randomcv.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

###  Grid Search CV

###  <font color='red'>In Gridsearch cv , set of combinations of hyperparameters are applied on the algoand whichever performs best it will treat those hyperparameters as best .</font>

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [rf_randomcv.best_params_['criterion']],
    'max_depth': [rf_randomcv.best_params_['max_depth']],
    'max_features': [rf_randomcv.best_params_['max_features']],
    'min_samples_leaf': [rf_randomcv.best_params_['min_samples_leaf'], 
                         rf_randomcv.best_params_['min_samples_leaf']+2, 
                         rf_randomcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_randomcv.best_params_['min_samples_split'] - 2,
                          rf_randomcv.best_params_['min_samples_split'] - 1,
                          rf_randomcv.best_params_['min_samples_split'], 
                          rf_randomcv.best_params_['min_samples_split'] +1,
                          rf_randomcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rf_randomcv.best_params_['n_estimators'] - 200, rf_randomcv.best_params_['n_estimators'] - 100, 
                     rf_randomcv.best_params_['n_estimators'], 
                     rf_randomcv.best_params_['n_estimators'] + 100, rf_randomcv.best_params_['n_estimators'] + 200]
}

print(param_grid)

In [None]:
rf = RandomForestClassifier()
rf_Gridcv = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1,cv = 10)

In [None]:
rf_Gridcv.fit(X_train, y_train)

In [None]:
best_grid = rf_Gridcv.best_estimator_

In [None]:
grid_y_pred = best_grid.predict(X_test)

In [None]:
print(classification_report(y_test, grid_y_pred))
print(accuracy_score(y_test, grid_y_pred))
print(confusion_matrix(y_test, grid_y_pred))

###  Automated hyperparameter technique


Automated Hyperparameter Tuning can be done by using techniques such as

- a) Bayesian Optimization
- b) Gradient Descent
- c) Evolutionary Algorithms

#### a) Bayesian Optimization
Bayesian optimization uses probability to find the minimum of a function. The final aim is to find the input value to a function which can gives us the lowest possible output value.It usually performs better than random,grid and manual search providing better performance in the testing phase and reduced optimization time. In Hyperopt, Bayesian Optimization can be implemented giving 3 three main parameters to the function fmin.

- Domain Space = defines the range of input values to test (in Bayesian Optimization this space creates a probability distribution for each of the used Hyperparameters).
- Objective Function = defines the loss function to minimize.
- Optimization Algorithm = defines the search algorithm to use to select the best input values to use in each new iteration.

In [24]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

In [21]:
# step-1 : defining domain space
space = {
         'criterion':         hp.choice('criterion',['entropy','gini']),
         'n_estimators':      hp.choice('n_estimators',[200, 400, 600, 800, 1000]), 
         'max_features':      hp.choice('max_features',['auto', 'sqrt', 'log2',None]),
         'max_depth':         hp.quniform('max_depth', 10,1200,10), 
         'min_samples_split': hp.uniform('min_samples_split',0,1),
         'min_samples_leaf':  hp.uniform('min_samples_leaf',0,0.5)
}

In [22]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x12577a5bdc0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x12577a5b8b0>,
 'max_features': <hyperopt.pyll.base.Apply at 0x12577a5bc10>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x12577a5b880>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x12577a5b790>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x12577a184f0>}

In [23]:
print(space['criterion'])

0 switch
1   hyperopt_param
2     Literal{criterion}
3     randint
4       Literal{2}
5   Literal{entropy}
6   Literal{gini}


In [25]:
#step-2 : define objective function
def objective(space):
    model = RandomForestClassifier(criterion=space['criterion'],max_depth=space['max_depth'],
                           max_features=space['max_features'],
                           min_samples_split=space['min_samples_split'],
                           min_samples_leaf= space['min_samples_leaf'],
                           n_estimators = space['n_estimators']
                          )
    accuracy = cross_val_score(model,X_train,y_train,cv=5).mean()  # cross_val_sore returns Array of scores of the estimator for each run of the cross validation.
    
    return{'loss':-accuracy,'status':STATUS_OK}

In [27]:
# step-3 : define optimization algorithm

# trials is responsible for minimizing the function
trials = Trials()
# fmin Minimizes a function over a hyperparameter space.
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=70,
            trials=trials)

100%|███████████████████████████████████████████████| 70/70 [10:02<00:00,  8.61s/trial, best loss: -0.7817672930827669]


In [28]:
best

{'criterion': 1,
 'max_depth': 780.0,
 'max_features': 2,
 'min_samples_leaf': 0.001993421747630005,
 'min_samples_split': 0.2988911495649224,
 'n_estimators': 0}

In [29]:
crit = {0:'entropy', 1:'gini'}
feat = {0:'auto',1:'sqrt', 2:'log2',3:'None'}
est  = {0:200, 1:400, 2:600, 3:800, 4:1000}

In [30]:
print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
log2
200


In [34]:
# again we are training our model with best hyperparametrs that we got by hyperopt
trained_hyperopt_model = RandomForestClassifier(n_estimators=est[best['n_estimators']],max_features=feat[best['max_features']],criterion=crit[best['criterion']],max_depth=best['max_depth'],min_samples_leaf=best['min_samples_leaf'],min_samples_split=best['min_samples_split']).fit(X_train,y_train)
hyperopt_predict = trained_hyperopt_model.predict(X_test)

In [36]:
print(classification_report(y_test,hyperopt_predict))
print(accuracy_score(y_test,hyperopt_predict))
print(confusion_matrix(y_test,hyperopt_predict))

              precision    recall  f1-score   support

           0       0.73      0.93      0.82       100
           1       0.74      0.37      0.49        54

    accuracy                           0.73       154
   macro avg       0.74      0.65      0.66       154
weighted avg       0.74      0.73      0.71       154

0.7337662337662337
[[93  7]
 [34 20]]



### Genetic Algorithms
Genetic Algorithms tries to apply natural selection mechanisms to Machine Learning contexts.

Let's imagine we create a population of N Machine Learning models with some predifined Hyperparameters. We can then calculate the accuracy of each model and decide to keep just half of the models (the ones that performs best). We can now generate some offsprings having similar Hyperparameters to the ones of the best models so that go get again a population of N models. At this point we can again caltulate the accuracy of each model and repeate the cycle for a defined number of generations. In this way, just the best models will survive at the end of the process.

In [37]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [40]:
from tpot import TPOTClassifier
# Set up the genetic programming algorithm for pipeline optimization.
tpot_model = TPOTClassifier(generations=4,population_size=20,offspring_size=10,n_jobs=-1,early_stop=10,verbosity=1,config_dict={'sklearn.ensemble.RandomForestClassifier':random_grid},cv=4,scoring='accuracy')

In [41]:
tpot_model.fit(X_train,y_train)

Best pipeline: RandomForestClassifier(CombineDFs(CombineDFs(input_matrix, input_matrix), RandomForestClassifier(input_matrix, criterion=gini, max_depth=450, max_features=auto, min_samples_leaf=2, min_samples_split=14, n_estimators=1200)), criterion=gini, max_depth=890, max_features=auto, min_samples_leaf=6, min_samples_split=10, n_estimators=1800)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['entropy',
                                                                                      'gini'],
                                                                        'max_depth': [10,
                                                                                      120,
                                                                                      230,
                                                                                      340,
                                                                                      450,
                                                                                      560,
                                                                                      670,
                                                                                      780,
                                                                                 

In [42]:
score_acc = tpot_model.score(X_test,y_test)
print(score_acc)

0.7272727272727273



### Optimize hyperparameters of the model using Optuna

The hyperparameters of the above algorithm are n_estimators and max_depth for which we can try different values to see if the model accuracy can be improved. The objective function is modified to accept a trial object. This trial has several methods for sampling hyperparameters. We create a study to run the hyperparameter optimization and finally read the best hyperparameters.

In [44]:
# defining the objective function 
import optuna
import sklearn.svm

def objective_optuna(trial):
    classifier = trial.suggest_categorical('classifier',['RandomForest','SVC'])
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))
        clf = sklearn.ensemble.RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(clf,X_train,y_train, n_jobs=-1, cv=3).mean()

In [46]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_optuna, n_trials=100)

[32m[I 2021-09-23 12:41:50,452][0m A new study created in memory with name: no-name-a670d638-f352-4fd2-a94c-44026e4cb52d[0m
[32m[I 2021-09-23 12:41:59,329][0m Trial 0 finished with value: 0.7752271640363463 and parameters: {'classifier': 'RandomForest', 'n_estimators': 610, 'max_depth': 56.360337679753684}. Best is trial 0 with value: 0.7752271640363463.[0m
[32m[I 2021-09-23 12:42:06,318][0m Trial 1 finished with value: 0.7687549816674637 and parameters: {'classifier': 'RandomForest', 'n_estimators': 780, 'max_depth': 13.072628618298504}. Best is trial 0 with value: 0.7752271640363463.[0m
[32m[I 2021-09-23 12:42:06,554][0m Trial 2 finished with value: 0.6514666029013232 and parameters: {'classifier': 'SVC', 'svc_c': 235139.7276517323}. Best is trial 0 with value: 0.7752271640363463.[0m
[32m[I 2021-09-23 12:42:08,741][0m Trial 3 finished with value: 0.7638370795472661 and parameters: {'classifier': 'RandomForest', 'n_estimators': 400, 'max_depth': 53.42933663656593}. Best 

[32m[I 2021-09-23 12:44:28,369][0m Trial 35 finished with value: 0.7752670173760561 and parameters: {'classifier': 'RandomForest', 'n_estimators': 660, 'max_depth': 35.492867163023185}. Best is trial 35 with value: 0.7752670173760561.[0m
[32m[I 2021-09-23 12:44:28,485][0m Trial 36 finished with value: 0.6514666029013232 and parameters: {'classifier': 'SVC', 'svc_c': 2.8548981170447867e-06}. Best is trial 35 with value: 0.7752670173760561.[0m
[32m[I 2021-09-23 12:44:31,796][0m Trial 37 finished with value: 0.7671130240714171 and parameters: {'classifier': 'RandomForest', 'n_estimators': 630, 'max_depth': 10.470007385194952}. Best is trial 35 with value: 0.7752670173760561.[0m
[32m[I 2021-09-23 12:44:34,013][0m Trial 38 finished with value: 0.7687310696636378 and parameters: {'classifier': 'RandomForest', 'n_estimators': 410, 'max_depth': 69.1121548023174}. Best is trial 35 with value: 0.7752670173760561.[0m
[32m[I 2021-09-23 12:44:38,369][0m Trial 39 finished with value: 0

[32m[I 2021-09-23 12:46:37,827][0m Trial 70 finished with value: 0.6514666029013232 and parameters: {'classifier': 'SVC', 'svc_c': 0.0005479148226379179}. Best is trial 66 with value: 0.7768770923003347.[0m
[32m[I 2021-09-23 12:46:39,716][0m Trial 71 finished with value: 0.7654710664753707 and parameters: {'classifier': 'RandomForest', 'n_estimators': 360, 'max_depth': 18.403199860186582}. Best is trial 66 with value: 0.7768770923003347.[0m
[32m[I 2021-09-23 12:46:41,098][0m Trial 72 finished with value: 0.763860991551092 and parameters: {'classifier': 'RandomForest', 'n_estimators': 270, 'max_depth': 21.596643867648957}. Best is trial 66 with value: 0.7768770923003347.[0m
[32m[I 2021-09-23 12:46:43,929][0m Trial 73 finished with value: 0.7573330145066156 and parameters: {'classifier': 'RandomForest', 'n_estimators': 560, 'max_depth': 18.615452924685247}. Best is trial 66 with value: 0.7768770923003347.[0m
[32m[I 2021-09-23 12:46:45,613][0m Trial 74 finished with value: 0

In [47]:

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

Accuracy: 0.7768770923003347
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 470, 'max_depth': 17.71070186005702}


In [48]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 470,
 'max_depth': 17.71070186005702}

In [49]:
rf = RandomForestClassifier(n_estimators=470,max_depth=17.71070186005702).fit(X_train,y_train)

In [50]:
optuna_predict = rf.predict(X_test)

In [51]:
print(classification_report(y_test,optuna_predict))
print(accuracy_score(y_test,optuna_predict))
print(confusion_matrix(y_test,optuna_predict))

              precision    recall  f1-score   support

           0       0.75      0.89      0.81       100
           1       0.69      0.44      0.54        54

    accuracy                           0.73       154
   macro avg       0.72      0.67      0.68       154
weighted avg       0.73      0.73      0.72       154

0.7337662337662337
[[89 11]
 [30 24]]
