In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV,train_test_split,RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from scipy.stats import randint as sp_randint

import warnings
warnings.filterwarnings("ignore")

In [3]:
#Importing the Data set
df = pd.read_csv('data/diabetes_data.csv')

#Checking the data structure
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
#Taking the target variables:
X = df.drop(columns = ['diabetes'])
y = df['diabetes']

In [5]:
#Splitting the data into train and test sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [47]:
seed=123
#KNN Model:
#intiate the KNN Model:
knn = KNeighborsClassifier()
#Listing the parameters to best tested :
params_knn = {'n_neighbors': sp_randint(1,80),
              'algorithm' : ( 'ball_tree', 'kd_tree', 'brute'),
              'weights':['distance','uniform'],
              'p':[1,2,3,4],
             'leaf_size': sp_randint(1,100)}
#Randomized Grid Search to get the best parameters for the model

n_iter_search = 20
knn_gs = RandomizedSearchCV(knn, param_distributions=params_knn,
                                   n_iter=n_iter_search, cv=5,random_state=998766)


 
#fitting the model to training data
knn_gs.fit(X_train, y_train)

#saving the best model:
knn_best = knn_gs.best_estimator_

#checking the best perameters values:
print(knn_gs.best_params_)

{'algorithm': 'brute', 'leaf_size': 64, 'n_neighbors': 11, 'p': 1, 'weights': 'distance'}


In [48]:
y_pred_knn=knn_best.predict(X_test)
print('knn roc_auc: {}'.format(roc_auc_score(y_test,y_pred_knn)))



knn roc_auc: 0.7134567901234568


In [56]:
# Random Forest:
#Initiate the Random Forest Model:
rf = RandomForestClassifier()

#Listing the parameters to best tested :
params_rf = {'n_estimators': sp_randint(50,700), 
             'max_leaf_nodes': sp_randint(20,600),
            'bootstrap':[True,False]}

#Grid Search to get the best parameters for the model
n_iter_search = 20
rf_gs = RandomizedSearchCV(rf, param_distributions=params_rf,
                                   n_iter=n_iter_search, cv=5,random_state=998766)

#fitting the model to training data
rf_gs.fit(X_train, y_train)

#saving the best model:
rf_best = rf_gs.best_estimator_

#check best n_estimators value
print(rf_gs.best_params_)

y_pred_rf=rf_best.predict(X_test)
print('rf roc_auc: {}'.format(roc_auc_score(y_test,y_pred_rf)))


{'bootstrap': False, 'max_leaf_nodes': 364, 'n_estimators': 442}
rf roc_auc: 0.7101234567901235


In [57]:
# Logistic Regression:
#create a new logistic regression model
log_reg = LogisticRegression()
#Listing the parameters to best tested :
params_lg = {'C': [0.01, 0.1, 1,2], 'max_iter':[50,100,500]}
#Grid Search to get the best parameters for the model

lg_gs = GridSearchCV(log_reg, params_lg, cv=15)

#fitting the model to training data
lg_gs.fit(X_train, y_train)
#saving the best model:
lg_best = lg_gs.best_estimator_

#check best n_estimators value
print(lg_gs.best_params_)

{'C': 2, 'max_iter': 50}


In [58]:
#Extra Tree:
#create a new Extra tree model
ET = ExtraTreesClassifier()
#Listing the parameters to best tested :
params_ET = {'n_estimators':[100,500],'max_depth':[ 5,10,20,100]}
#Grid Search to get the best parameters for the model
ET_gs = GridSearchCV(ET, params_ET, cv=15)

#fitting the model to training data
ET_gs.fit(X_train, y_train)
#saving the best model:
ET_best = ET_gs.best_estimator_

#check best n_estimators value
print(ET_gs.best_params_)



{'max_depth': 20, 'n_estimators': 100}


In [59]:
#Adaboost
#create a new Extra tree model
Ada = AdaBoostClassifier()
#Listing the parameters to best tested :
params_ada = {'n_estimators':[100,500],'learning_rate':[ 0.001,0.01]}
#Grid Search to get the best parameters for the model
Ada_gs = GridSearchCV(Ada, params_ada, cv=15)

#fitting the model to training data
Ada_gs.fit(X_train, y_train)

#saving the best model:
Ada_best = Ada_gs.best_estimator_

#check best n_estimators value
print(Ada_gs.best_params_)

{'learning_rate': 0.01, 'n_estimators': 500}


In [60]:
#LightGBM
#create a new Extra tree model
lgb = LGBMClassifier( metric='auc',max_depth= -1)
#Listing the parameters to best tested :
params_lgb = {'n_estimators':[500,1000],'learning_rate':[ 0.0089,0.01], 
              'num_leaves': [10,20,50],    'feature_fraction': [0.22,0.033,0.04]}
#Grid Search to get the best parameters for the model
lgb_gs = GridSearchCV(lgb, params_lgb, cv=15)

#fitting the model to training data
lgb_gs.fit(X_train, y_train)

#saving the best model:
lgb_best = lgb_gs.best_estimator_

#check best n_estimators value
print(lgb_gs.best_params_)

{'feature_fraction': 0.22, 'learning_rate': 0.01, 'n_estimators': 500, 'num_leaves': 20}


In [61]:
#Printing the accuracy scores of the models

print('knn: {}'.format(knn_best.score(X_test, y_test)))
print('rf: {}'.format(rf_best.score(X_test, y_test)))
print('log_reg: {}'.format(lg_best.score(X_test, y_test)))
print('ExtraTree: {}'.format(ET_best.score(X_test, y_test)))
print('AdaBoost: {}'.format(Ada_best.score(X_test, y_test)))
print('LightGBM: {}'.format(lgb_best.score(X_test, y_test)))

knn: 0.7532467532467533
rf: 0.7489177489177489
log_reg: 0.7748917748917749
ExtraTree: 0.7316017316017316
AdaBoost: 0.7532467532467533
LightGBM: 0.70995670995671


In [62]:
# Calculating the predicted values of y with the  models;
y_pred_knn=knn_best.predict(X_test)
y_pred_rf=rf_best.predict(X_test)
y_pred_lg=lg_best.predict(X_test)
y_pred_ET=ET_best.predict(X_test)
y_pred_Ada=Ada_best.predict(X_test)
y_pred_lgb=lgb_best.predict(X_test)

#Printing the ROC AUC values of the seperate models:
print('knn roc_auc: {}'.format(roc_auc_score(y_test,y_pred_knn)))
print('rf roc_auc: {}'.format(roc_auc_score(y_test,y_pred_rf)))
print('lg_reg roc_auc: {}'.format(roc_auc_score(y_test,y_pred_lg)))
print('Extra Tree roc_auc: {}'.format(roc_auc_score(y_test,y_pred_ET)))
print('Adaboost roc_auc: {}'.format(roc_auc_score(y_test,y_pred_Ada)))
print('LightGBM roc_auc: {}'.format(roc_auc_score(y_test,y_pred_lgb)))

knn roc_auc: 0.7134567901234568
rf roc_auc: 0.7101234567901235
lg_reg roc_auc: 0.7358024691358025
Extra Tree roc_auc: 0.6911111111111111
Adaboost roc_auc: 0.7049382716049383
LightGBM roc_auc: 0.6488888888888888


In [63]:
### The Stacked Ensemble:
#Creating a dictionnary of the  models:
estimators=[('knn', knn_best), ('rf', rf_best), ('log_reg', lg_best),('Extra tree', ET_best), ('Adaboost',Ada_best), 
           ('lightGBM',lgb_best)]

#using the voting classifier to stack the models:
ensemble = VotingClassifier(estimators, voting='hard')

#fitting the new stacked model:
ensemble.fit(X_train, y_train)

#testing the stacked model against the test set:
ensemble.score(X_test, y_test)
y_pred_ensemble=ensemble.predict(X_test)
#Printing the ROC AUC value for the new stacked model:
print('ensemble: {}'.format(roc_auc_score(y_test,y_pred_ensemble)))

ensemble: 0.7020987654320988


An other way to Ensemble :

In [83]:
### An other way to ensembling / Hyper Parameer tunning
voting = VotingClassifier(estimators=[('knn',KNeighborsClassifier() ), 
                                      ('rf', RandomForestClassifier()),('lg',LogisticRegression())],
                                      voting='hard')

#Use the key for the classifier followed by __ and the attribute
params = {'knn__n_neighbors': np.arange(1, 20,50),
          'knn__weights':['distance','uniform'],
          'knn__p':[1,2],
          'rf__max_leaf_nodes':[10,20,50],          
          'rf__n_estimators': [50,25],
           'lg__C':  [0.01, 0.1, 1,2,6,9],
           'lg__max_iter':[50,100,200]
         }

Voting_grid = GridSearchCV(estimator=voting, param_grid=params, cv=15,n_jobs=-1)

Voting_grid.fit(X_train,y_train)

print (Voting_grid.best_params_)

#saving the best model:
VT_best = Voting_grid.best_estimator_

#check best n_estimators value
print(Voting_grid.best_params_)




{'knn__n_neighbors': 1, 'knn__p': 1, 'knn__weights': 'uniform', 'lg__C': 9, 'lg__max_iter': 200, 'rf__max_leaf_nodes': 50, 'rf__n_estimators': 25}
{'knn__n_neighbors': 1, 'knn__p': 1, 'knn__weights': 'uniform', 'lg__C': 9, 'lg__max_iter': 200, 'rf__max_leaf_nodes': 50, 'rf__n_estimators': 25}


In [84]:
y_pred_Voting=VT_best.predict(X_test)

print('ensemble: {}'.format(roc_auc_score(y_test,y_pred_Voting)))

ensemble: 0.7424691358024691
