## Predicting Diabetes using Ensemble Learning - Voting

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

Load the dataset

In [2]:
dataset=pd.read_csv("diabetes.csv")
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
dataset.size

6912

In [4]:
dataset.shape

(768, 9)

Pre-processing

In [5]:
#Check for missing values
print(dataset.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [6]:
#Handle missing values
dataset.fillna(dataset.mean(),inplace=True)

In [7]:
#Normalize the data
scaler=MinMaxScaler()
dataset[dataset.columns]=scaler.fit_transform(dataset[dataset.columns])

In [8]:
# X as the Feature Matrix
# y as the response vector

X=dataset[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
y=dataset['Outcome']

In [9]:
#Split the data into training and testing sets
X_trainset,X_testset,y_trainset,y_testset=train_test_split(X,y,test_size=0.2,random_state=0)

Modeling

In [10]:
#Create the base models
rf=RandomForestClassifier(random_state=0)
gb=GradientBoostingClassifier(random_state=0)

In [11]:
# Create an instance of the voting classifier
voting_clf = VotingClassifier(estimators=[('rf', rf), ('gb', gb)], voting='soft')

In [12]:
# Fit the voting classifier to the training data
voting_clf.fit(X_trainset, y_trainset)

VotingClassifier(estimators=[('rf', RandomForestClassifier(random_state=0)),
                             ('gb',
                              GradientBoostingClassifier(random_state=0))],
                 voting='soft')

In [13]:
# Make predictions using the voting classifier
y_pred = voting_clf.predict(X_testset)

In [14]:
# Getting prediction results for tetsing part of the report (last 15 outcomes)
y_pred[y_pred.size-15:y_pred.size]

array([0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [15]:
# Getting actual results for tetsing part of the report (last 15 outcomes)
y_testset
y_testset.tail(15)

653    0.0
331    0.0
568    0.0
196    0.0
76     0.0
64     1.0
671    0.0
52     0.0
310    0.0
416    0.0
476    1.0
482    0.0
230    1.0
527    0.0
380    0.0
Name: Outcome, dtype: float64

Evaluation

In [16]:
print("Ensemble Accuracy: %0.4f" % accuracy_score(y_testset, y_pred))
print("Precision: %0.4f" % precision_score(y_testset, y_pred))
print("Recall: %0.4f" % recall_score(y_testset, y_pred))
print("F1-score: %0.4f" % f1_score(y_testset, y_pred))

Ensemble Accuracy: 0.8247
Precision: 0.7083
Recall: 0.7234
F1-score: 0.7158


In [17]:
# To get the AUC
y_pred_proba = voting_clf.predict_proba(X_testset)[:,1]

# Compute the AUC score
auc = roc_auc_score(y_testset, y_pred_proba)
print("Ensemble AUC: %0.2f" % roc_auc_score(y_testset, y_pred_proba))

Ensemble AUC: 0.85


In [18]:
# Perform k-fold cross validation
scoresAcc = cross_val_score(voting_clf, X, y, cv=20, scoring='roc_auc')
scoresAuc = cross_val_score(voting_clf, X, y, cv=20, scoring='accuracy')

# Print the mean and standard deviation of the scores
print("Ensemble Cross-Validated AUC: %0.2f (+/- %0.2f)" % (scoresAcc.mean(), scoresAcc.std()))
print("Ensemble Cross-Validated Accuracy: %0.4f (+/- %0.4f)" % (scoresAuc.mean(), scoresAuc.std()))

Ensemble Cross-Validated AUC: 0.83 (+/- 0.08)
Ensemble Cross-Validated Accuracy: 0.7698 (+/- 0.0853)
