# **Diabetes Prediction Model**

------------

In [None]:
import numpy as np 
import pandas as pd 
from scipy import stats 
import seaborn as sns

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
data= pd.read_csv('/content/gdrive/MyDrive/UIA Hackathon/diabetes.csv')
data.head()

Unnamed: 0,Glucose,BloodPressure,Insulin,BMI,Age,Outcome
0,148,72,0,33.6,50,1
1,85,66,0,26.6,31,0
2,183,64,0,23.3,32,1
3,89,66,94,28.1,21,0
4,137,40,168,43.1,33,1


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Glucose        768 non-null    int64  
 1   BloodPressure  768 non-null    int64  
 2   Insulin        768 non-null    int64  
 3   BMI            768 non-null    float64
 4   Age            768 non-null    int64  
 5   Outcome        768 non-null    int64  
dtypes: float64(1), int64(5)
memory usage: 36.1 KB


In [None]:
data.dtypes

Glucose            int64
BloodPressure      int64
Insulin            int64
BMI              float64
Age                int64
Outcome            int64
dtype: object

This data does not contain any null values. So we don't need to worry about filling/dropping values.

In [None]:
data.isnull().sum()

Glucose          0
BloodPressure    0
Insulin          0
BMI              0
Age              0
Outcome          0
dtype: int64

In [None]:
data.duplicated()
data=data.drop_duplicates()
data.head()

Unnamed: 0,Glucose,BloodPressure,Insulin,BMI,Age,Outcome
0,148,72,0,33.6,50,1
1,85,66,0,26.6,31,0
2,183,64,0,23.3,32,1
3,89,66,94,28.1,21,0
4,137,40,168,43.1,33,1


The data doesn't contain any duplicate values also. Hence we need not worry about them.

In [None]:
data.describe()

Unnamed: 0,Glucose,BloodPressure,Insulin,BMI,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0
mean,120.894531,69.105469,79.799479,31.992578,33.240885,0.348958
std,31.972618,19.355807,115.244002,7.88416,11.760232,0.476951
min,0.0,0.0,0.0,0.0,21.0,0.0
25%,99.0,62.0,0.0,27.3,24.0,0.0
50%,117.0,72.0,30.5,32.0,29.0,0.0
75%,140.25,80.0,127.25,36.6,41.0,1.0
max,199.0,122.0,846.0,67.1,81.0,1.0


All bioparameters are in the range. Hence the data does not have observational/structural errors in them. Hence we need to worry about them.

In [None]:
for x in data.columns:
    z=np.abs(stats.zscore(data[x]))
    print(x+str(z))

Glucose0      0.848324
1      1.123396
2      1.943724
3      0.998208
4      0.504055
         ...   
763    0.622642
764    0.034598
765    0.003301
766    0.159787
767    0.873019
Name: Glucose, Length: 768, dtype: float64
BloodPressure0      0.149641
1      0.160546
2      0.263941
3      0.160546
4      1.504687
         ...   
763    0.356432
764    0.046245
765    0.149641
766    0.470732
767    0.046245
Name: BloodPressure, Length: 768, dtype: float64
Insulin0      0.692891
1      0.692891
2      0.692891
3      0.123302
4      0.765836
         ...   
763    0.870031
764    0.692891
765    0.279594
766    0.692891
767    0.692891
Name: Insulin, Length: 768, dtype: float64
BMI0      0.204013
1      0.684422
2      1.103255
3      0.494043
4      1.409746
         ...   
763    0.115169
764    0.610154
765    0.735190
766    0.240205
767    0.202129
Name: BMI, Length: 768, dtype: float64
Age0      1.425995
1      0.190672
2      0.105584
3      1.041549
4      0.020496
         

According to the z-score statistical parameter, the above datapoints are considered as outliers. But I dont think these are outliers and I consider that these data are essential for identfying diabetes condition.

The next step is feature scaling. I am doing a comparison of ML and DL for this dataset. For ML, i am considering SVM, which is a distance based algorithm. Hence normalization of data would be appropriate. On the other hand, for DL I am considering ANN, which is gradient descent based algorithm for which standardisation of data would be appropriate, since it can help in faster identification of local minima.

In [None]:
data_norm=data.copy()
for column in data.columns:
    data_norm[column] = (data_norm[column] - data_norm[column].min()) / (data_norm[column].max() - data_norm[column].min()) 
data_norm.head()

Unnamed: 0,Glucose,BloodPressure,Insulin,BMI,Age,Outcome
0,0.743719,0.590164,0.0,0.500745,0.483333,1.0
1,0.427136,0.540984,0.0,0.396423,0.166667,0.0
2,0.919598,0.52459,0.0,0.347243,0.183333,1.0
3,0.447236,0.540984,0.111111,0.418778,0.0,0.0
4,0.688442,0.327869,0.198582,0.642325,0.2,1.0


In [None]:
lis = ['Glucose','BloodPressure','BMI','Age']
def standartization(x):
    x_std = x.copy(deep=True)
    for column in lis:
        x_std[column] = (x_std[column] - x_std[column].mean()) / x_std[column].std() 
    return x_std

data = standartization(data)
data.head()

Unnamed: 0,Glucose,BloodPressure,Insulin,BMI,Age,Outcome
0,0.847771,0.149543,0,0.20388,1.425067,1
1,-1.122665,-0.160441,0,-0.683976,-0.190548,0
2,1.942458,-0.263769,0,-1.102537,-0.105515,1
3,-0.997558,-0.160441,94,-0.493721,-1.040871,0
4,0.503727,-1.503707,168,1.408828,-0.020483,1


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 767
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Glucose        768 non-null    float64
 1   BloodPressure  768 non-null    float64
 2   Insulin        768 non-null    int64  
 3   BMI            768 non-null    float64
 4   Age            768 non-null    float64
 5   Outcome        768 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 42.0 KB


In [None]:
data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [None]:
y=data['Outcome']
x=data.drop(['Outcome'],axis=1)

In [None]:
x.shape

(768, 5)

In [None]:
y.shape

(768,)

In [None]:
yn=data_norm['Outcome']
xn=data_norm.drop(['Outcome'],axis=1)

I am creating separate train and test sets for standardised and normalised data. the ones having n as suffix are normalised.

In [None]:
xtrain,xtest,ytrain,ytest= train_test_split(x,y,test_size=0.15,stratify=y)
print("Xtrain :- " + str(xtrain.shape) + "\n" + "Xtest :- " + str(xtest.shape) + "\n" + "Ytrain :- " + str(ytrain.shape) + "\n" + "Ytest :- " + str(ytest.shape))

Xtrain :- (652, 5)
Xtest :- (116, 5)
Ytrain :- (652,)
Ytest :- (116,)


In [None]:
xntrain,xntest,yntrain,yntest= train_test_split(xn,yn,test_size=0.15,stratify=y)
print("Xntrain :- " + str(xntrain.shape) + "\n" + "Xntest :- " + str(xntest.shape) + "\n" + "Yntrain :- " + str(yntrain.shape) + "\n" + "Yntest :- " + str(yntest.shape))

Xntrain :- (652, 5)
Xntest :- (116, 5)
Yntrain :- (652,)
Yntest :- (116,)


In [None]:
model = Sequential()
model.add(Dense(256,activation='relu',input_shape=([5])))
model.add(Dense(256,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.summary()
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               1536      
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dense_2 (Dense)             (None, 1)                 257       
                                                                 
Total params: 67,585
Trainable params: 67,585
Non-trainable params: 0
_________________________________________________________________


In [None]:
ytest

176    0
765    0
727    0
487    0
590    1
      ..
637    0
101    0
427    1
503    0
249    0
Name: Outcome, Length: 116, dtype: int64

In [None]:
nepochs = 50
history = model.fit(xtrain ,
                    ytrain ,
                    epochs= nepochs ,
                    steps_per_epoch=200,
                    validation_data=(xtest ,ytest))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50





In [None]:
model.evaluate(xtrain,ytrain)



[0.38407522439956665, 0.808282196521759]

In [None]:
model.evaluate(xtest,ytest)



[0.7537275552749634, 0.7155172228813171]

As you can see, the ANN produced 68% test accuracy which is way less than SVM. Hence we can say that the ML algorithm produced well than that of DL algorithm. Can we stop with this conclusion or are we missing something??

In [None]:
print(data['Outcome'].value_counts())
df_class_0 = data[data['Outcome'] == 0]
df_class_1 = data[data['Outcome'] == 1]

0    500
1    268
Name: Outcome, dtype: int64


As you can see, there is class imbalance, the amount of diabetes negative is twice than that of diabetes positive. In this scenario, we cant compare the performance of algorithms based on accuracy. So to overcome the class imbalance I oversampled the minority class to the samples of majority class (500). So the total data consits of 1000 samples with equal distribution. I repeated this process for the standardised and normalized datasets.


There isn't much change in the accuracy of the algorithm but there is huge improvement in the classification report before and after sampling, especially for the diabetes class.

In [None]:
nepochs = 50
history = model.fit(xntrain ,
                    yntrain ,
                    epochs= nepochs ,
                    steps_per_epoch=200,
                    validation_data=(xntest ,yntest))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50





In [None]:
model.evaluate(xtest,ytest)



[93.18208312988281, 0.681034505367279]

The ANN trained on the standardised and upsampled data performed the best result with 68% test accuracy.

In [None]:
model.save("DiaCare.h5")

In [None]:
from tensorflow import keras
model1 = keras.models.load_model("DiaCare.h5")

In [None]:
model1.predict()

TypeError: ignored

In [None]:
def prediction(arr):
  temp_vr = np.array(arr)
  temp_vr = temp_vr.reshape(-1,6,)
  return np.argmax(model.predict(temp_vr))

In [None]:
print(type(xtrain))

make a dataframe -> with the same format of xtrain -> row of user input add -> we will give this to predict to get the outcome