In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
df = pd.read_csv('C:\diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
import numpy as np
df['Glucose'] = np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin'] = np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [4]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [5]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33


In [6]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=0)

In [8]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
prediction = classifier.predict(X_test)

In [9]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[92 15]
 [19 28]]
0.7792207792207793
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       107
           1       0.65      0.60      0.62        47

    accuracy                           0.78       154
   macro avg       0.74      0.73      0.73       154
weighted avg       0.77      0.78      0.78       154



In [10]:
## Bayesian optimization


In [11]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [19]:
space = { 'criterion' : hp.choice('criterion', ['entropy', 'gini']),
         'max_depth' : hp.quniform('max_depth', 10, 1200,10),
         'max_features' : hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
         'min_samples_leaf' : hp.uniform('min_samples_leaf', 0,0.5),
         'min_samples_split' : hp.uniform('min_samples_split',0,1),
         'n_estimators' : hp.choice('n_estimators', [10,50,300,750,1200,1300])
    
}

In [20]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x27d317ceee0>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x27d317cef10>,
 'max_features': <hyperopt.pyll.base.Apply at 0x27d317cee50>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x27d317e9850>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x27d317e91c0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x27d317e9640>}

In [21]:
space['criterion']

<hyperopt.pyll.base.Apply at 0x27d317ceee0>

In [22]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'],
                                   max_depth=space['max_depth'],
                                   max_features=space['max_features'],
                                  min_samples_leaf=space['min_samples_leaf'],
                                   min_samples_split=space['min_samples_split'],
                                   n_estimators=space['n_estimators'])
    accuracy = cross_val_score(model,X_train, y_train, cv=5).mean()
    
    
    return{'loss': -accuracy, 'status' : STATUS_OK}
    
    

In [23]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn=objective,
           space=space,
           algo=tpe.suggest,
           max_evals = 80,
           trials = trials)
best

100%|███████████████████████████████████████████████| 80/80 [04:13<00:00,  3.17s/trial, best loss: -0.7638144742103159]


{'criterion': 1,
 'max_depth': 110.0,
 'max_features': 2,
 'min_samples_leaf': 0.16987392597623263,
 'min_samples_split': 0.2859110326057796,
 'n_estimators': 1}

In [24]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])


gini
log2
50


In [25]:
best['min_samples_leaf']


0.16987392597623263

In [27]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(accuracy_score(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

[[99  8]
 [30 17]]
0.7532467532467533
              precision    recall  f1-score   support

           0       0.77      0.93      0.84       107
           1       0.68      0.36      0.47        47

    accuracy                           0.75       154
   macro avg       0.72      0.64      0.66       154
weighted avg       0.74      0.75      0.73       154



In [28]:
##GENETIC ALGORITHM

In [29]:
##TPOT CLASSIFIER

In [30]:
import numpy as np

In [31]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt','log2']
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
min_samples_split = [2, 5, 10,14]
min_samples_leaf = [1, 2, 4,6,8]
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [32]:
from tpot import TPOTClassifier
tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_train,y_train)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=84.0, style=ProgressStyle(des…


Generation 1 - Current best internal CV score: 0.7573423308717426
Generation 2 - Current best internal CV score: 0.7573423308717426
Generation 3 - Current best internal CV score: 0.7622124607418725
Generation 4 - Current best internal CV score: 0.7622124607418725
Generation 5 - Current best internal CV score: 0.7622124607418725
Best pipeline: RandomForestClassifier(RandomForestClassifier(input_matrix, criterion=gini, max_depth=1000, max_features=sqrt, min_samples_leaf=6, min_samples_split=10, n_estimators=400), criterion=entropy, max_depth=340, max_features=log2, min_samples_leaf=6, min_samples_split=5, n_estimators=1000)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['entropy',
                                                                                      'gini'],
                                                                        'max_depth': [10,
                                                                                      120,
                                                                                      230,
                                                                                      340,
                                                                                      450,
                                                                                      560,
                                                                                      670,
                                                                                      780,
                                                                                 

In [33]:
accuracy = tpot_classifier.score(X_test, y_test)
print(accuracy)

0.8571428571428571
