In [None]:
import numpy as np 
import pandas as pd 
from scipy import stats 
import seaborn as sns 
# Read the diabetes CSV into a DataFrame; every later cell works from `data`.
data= pd.read_csv('../input/diabetes-data-set/diabetes.csv')
# Show the first rows as the cell output.
data.head()

In [None]:
# Summarise column dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Count missing (NaN) entries per column.
data.isnull().sum()

In [None]:
# Report how many fully duplicated rows exist before removing them. The original
# bare `data.duplicated()` computed the mask and then discarded it, so the cell
# never actually showed whether duplicates were present.
n_duplicates = int(data.duplicated().sum())
print('duplicate rows: ' + str(n_duplicates))
data=data.drop_duplicates()
data.head()

The data doesn't contain any duplicate rows either, so we need not worry about them.

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

All bioparameters are within range, so the data does not appear to contain observational/structural errors. Hence we need not worry about them.

In [None]:
# Flag candidate outliers per column with the conventional |z| > 3 rule.
# Printing the full z-score vector for every column (as before) floods the output
# and never identifies which datapoints are actually extreme.
Z_THRESHOLD = 3
for x in data.columns:
    z = np.abs(stats.zscore(data[x]))
    # np.asarray: zscore may hand back either an ndarray or a Series depending on version.
    flagged = data.index[np.asarray(z) > Z_THRESHOLD].tolist()
    print(x + ': ' + str(len(flagged)) + ' value(s) with |z| > ' + str(Z_THRESHOLD)
          + ' at rows ' + str(flagged))

According to the z-score statistic, the above datapoints are considered outliers. However, I don't think they are genuine outliers, and I consider these data essential for identifying the diabetes condition.

The next step is feature scaling. I am comparing ML and DL on this dataset. For ML, I am considering SVM, which is a distance-based algorithm, so normalization (min-max scaling) of the data is appropriate. For DL, I am considering an ANN, which is trained by gradient descent, so standardisation of the data is appropriate, since it can help the optimiser converge to a (local) minimum faster.

In [None]:
# Min-max scale every column of `data` (the 'Outcome' column included) in one
# vectorised step: (value - column minimum) / (column maximum - column minimum).
col_min = data.min()
col_range = data.max() - col_min
data_norm = (data - col_min) / col_range
data_norm.head()

In [None]:
# Feature columns that are standardised by default ('Outcome' is deliberately absent).
lis=['Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Pregnancies']
def standartization(x, columns=None):
    """Return a z-score standardised copy of ``x``.

    Each selected column is replaced by ``(value - mean) / std`` where ``std`` is
    the pandas default (sample standard deviation, ddof=1). The input frame is
    not modified.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing the columns to scale.
    columns : iterable of str, optional
        Columns to standardise. Defaults to the module-level ``lis`` so existing
        one-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``x`` with the selected columns standardised; all other
        columns are left untouched.
    """
    if columns is None:
        columns = lis
    x_std = x.copy(deep=True)
    for column in columns:
        values = x_std[column]
        x_std[column] = (values - values.mean()) / values.std()
    return x_std

# Rebind `data` to its standardised version. `data_norm` was already derived from
# the unscaled frame in the preceding cell, so from here on `data` is the
# standardised set and `data_norm` the min-max scaled one.
data= standartization(data)
data.head()

In [None]:
# Re-inspect dtypes and non-null counts after standardisation.
data.info()

In [None]:
# Number of rows per class label in 'Outcome'.
data['Outcome'].value_counts()

In [None]:
# Standardised data: feature matrix (everything except the label) and target vector.
x = data.drop(columns=['Outcome'])
y = data['Outcome']

In [None]:
# Min-max normalised data: feature matrix (everything except the label) and target vector.
xn = data_norm.drop(columns=['Outcome'])
yn = data_norm['Outcome']

I am creating separate train and test sets for the standardised and normalised data. The variables with an `n` in their name (e.g. `xntrain`, `yntest`) hold the normalised data.

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 85/15 hold-out split of the standardised data.
# random_state is fixed so that the split (and every metric computed from it)
# is reproducible across kernel restarts; the unseeded call gave a different
# partition on every run.
xtrain,xtest,ytrain,ytest= train_test_split(x,y,test_size=0.15,stratify=y,random_state=0)
print(xtrain.shape)
print(xtest.shape)
print(ytrain.shape)
print(ytest.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 85/15 hold-out split of the min-max normalised data.
# Stratify on this split's own labels (`yn`) rather than on `y` from the
# standardised frame, and fix random_state so the partition is reproducible.
xntrain,xntest,yntrain,yntest= train_test_split(xn,yn,test_size=0.15,stratify=yn,random_state=0)
print(xntrain.shape)
print(xntest.shape)
print(yntrain.shape)
print(yntest.shape)

In [None]:
from sklearn.svm import SVC
# Build an RBF-kernel SVM (gamma=8) and fit it on the normalised training split.
# `fit` returns the estimator itself, so construction and training can be chained.
svm_model = SVC(kernel='rbf', gamma=8).fit(xntrain, yntrain)
svm_model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# --- Training split -------------------------------------------------------
train_predictions = svm_model.predict(xntrain)
print("Training confusion matrix")
print(confusion_matrix(yntrain, train_predictions))

# --- Held-out split -------------------------------------------------------
predictions = svm_model.predict(xntest)
res = confusion_matrix(yntest, predictions)
print("validation confusion matrix")
print(res)
# The report must be scored against `yntest`, the labels that belong to `xntest`.
# The previous code passed `ytest`, which comes from the separately shuffled
# standardised split and therefore does not line up row-for-row with these
# predictions.
print(classification_report(yntest, predictions))

# Accuracy on both splits, in percent.
percentage = svm_model.score(xntest, yntest)
print('training accuracy = '+str(svm_model.score(xntrain, yntrain)*100))
print('testing accuracy = '+str(percentage*100))

I also trained the SVM without feature scaling and with standardisation. These produced test accuracies of 55% (no feature scaling) and 62% (standardisation). Hence normalisation works well for distance-based algorithms like SVM.

For the DL part, I am considering an ANN with 2 hidden layers of 256 neurons each. Using more neurons or layers resulted in overfitting, so I limited the model to these hyperparameters. It was compiled with the Adam optimiser and the binary cross-entropy loss function.

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
# Fully connected binary classifier: 8 input features -> 256 -> 256 -> 1 (sigmoid).
dl_model = Sequential([
    Dense(256, activation='relu', input_shape=[8]),  # first hidden layer; also declares the input width
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
dl_model.summary()

# Adam + binary cross-entropy; track accuracy, precision, recall and AUC during training.
dl_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'Precision', 'Recall', 'AUC'],
)

In [None]:
# Train the ANN on the standardised training split, monitoring the held-out split.
# `steps_per_epoch` is intentionally left unset: with in-memory data Keras derives
# it from the sample count and batch size. The previous hard-coded value of 200
# asked for far more batches per epoch than this dataset provides at the default
# batch size, which can exhaust the input and cut training short.
num_epochs = 50
history = dl_model.fit(
    xtrain,
    ytrain,
    epochs=num_epochs,
    validation_data=(xtest, ytest),
)

In [None]:
# Loss and compiled metrics on the standardised training split.
dl_model.evaluate(xtrain,ytrain)

In [None]:
# Loss and compiled metrics on the standardised held-out split.
dl_model.evaluate(xtest,ytest)

As you can see, the ANN produced 68% test accuracy, which is much lower than the SVM. Hence we could say that the ML algorithm performed better than the DL algorithm. Can we stop with this conclusion, or are we missing something?

In [None]:
# Show the class balance, then slice the standardised frame by label.
print(data['Outcome'].value_counts())
df_class_0 = data.loc[data['Outcome'].eq(0)]
df_class_1 = data.loc[data['Outcome'].eq(1)]

As you can see, there is class imbalance: the number of diabetes-negative samples is about twice the number of diabetes-positive samples. In this scenario, we can't compare the performance of the algorithms based on accuracy alone. To overcome the class imbalance, I oversampled the minority class up to the number of samples in the majority class (500), so the total data consists of 1000 samples with an equal class distribution. I repeated this process for both the standardised and the normalised datasets.

In [None]:
# Show the class balance, then slice the normalised frame by label.
print(data_norm['Outcome'].value_counts())
df_n_class_0 = data_norm.loc[data_norm['Outcome'].eq(0)]
df_n_class_1 = data_norm.loc[data_norm['Outcome'].eq(1)]

In [None]:
# Oversample the minority class (with replacement) up to the size of the majority
# class. The target size is taken from the data instead of a hard-coded 500, and
# random_state is fixed so the resampled set is reproducible.
# NOTE(review): the rows drawn here are exact copies of existing rows, and the
# train/test split happens afterwards, so copies of one row can end up on both
# sides of that split.
df_class_1_over = df_class_1.sample(len(df_class_0), replace=True, random_state=0)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)
df_test_over.info()

In [None]:
# Oversample the minority class of the normalised data (with replacement) up to
# the size of the majority class. The target size is taken from the data instead
# of a hard-coded 500, and random_state is fixed so the result is reproducible.
# NOTE(review): the rows drawn here are exact copies of existing rows, and the
# train/test split happens afterwards, so copies of one row can end up on both
# sides of that split.
df_n_class_1_over = df_n_class_1.sample(len(df_n_class_0), replace=True, random_state=0)
df_test_n_over = pd.concat([df_n_class_0, df_n_class_1_over], axis=0)
df_test_n_over.info()

In [None]:
# Pull the label out of the oversampled standardised frame, then strip it from the
# frame. `df_test_over` is rebound to the label-free frame and `X1` refers to it.
y1 = df_test_over['Outcome']
X1 = df_test_over = df_test_over.drop(columns=['Outcome'])

In [None]:
# Pull the label out of the oversampled normalised frame, then strip it from the
# frame. `df_test_n_over` is rebound to the label-free frame and `X1n` refers to it.
y1n = df_test_n_over['Outcome']
X1n = df_test_n_over = df_test_n_over.drop(columns=['Outcome'])

The algorithm parameters are the same as before; note, however, that the test split used here is 20% (it was 15% in the earlier experiments). Now the SVM and ANN are trained using the upsampled datasets. As before, the normalized dataset is used for the SVM and the standardized dataset for the ANN.

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the oversampled, standardised data with a fixed seed.
X1_s_train, X1_s_test, y1_s_train, y1_s_test = train_test_split(
    X1, y1, test_size=0.2, random_state=0, shuffle=True, stratify=y1
)

# Report the shape of each piece of the split.
for template, part in (
    ('training data shape is :{}.', X1_s_train),
    ('training label shape is :{}.', y1_s_train),
    ('testing data shape is :{}.', X1_s_test),
    ('testing label shape is :{}.', y1_s_test),
):
    print(template.format(part.shape))

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the oversampled, normalised data with a fixed seed.
X1_s_n_train, X1_s_n_test, y1_s_n_train, y1_s_n_test = train_test_split(
    X1n, y1n, test_size=0.2, random_state=0, shuffle=True, stratify=y1n
)

# Report the shape of each piece of the split.
for template, part in (
    ('training data shape is :{}.', X1_s_n_train),
    ('training label shape is :{}.', y1_s_n_train),
    ('testing data shape is :{}.', X1_s_n_test),
    ('testing label shape is :{}.', y1_s_n_test),
):
    print(template.format(part.shape))

In [None]:
from sklearn.svm import SVC
# Same RBF-kernel SVM configuration (gamma=8), fitted on the oversampled normalised
# training split. `fit` returns the estimator, so it is chained onto construction.
svc_s_model = SVC(kernel='rbf', gamma=8).fit(X1_s_n_train, y1_s_n_train)
svc_s_model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# --- Training split -------------------------------------------------------
train_pred = svc_s_model.predict(X1_s_n_train)
print("Training confusion matrix")
print(confusion_matrix(y1_s_n_train, train_pred))

# --- Held-out split -------------------------------------------------------
predictions = svc_s_model.predict(X1_s_n_test)
res = confusion_matrix(y1_s_n_test, predictions)
percentage = svc_s_model.score(X1_s_n_test, y1_s_n_test)
print("validation confusion matrix")
print(res)
print(classification_report(y1_s_n_test, predictions))

# Accuracy on both splits, in percent.
train_score = svc_s_model.score(X1_s_n_train, y1_s_n_train)
print('training accuracy = ' + str(train_score * 100))
print('testing accuracy = ' + str(percentage * 100))

There isn't much change in the accuracy of the algorithm but there is huge improvement in the classification report before and after sampling, especially for the diabetes class. 

In [None]:
# Start from freshly initialised weights. `dl_model` was already trained on the
# original split, so calling `fit` on it again would merely continue that run with
# weights that have seen rows now sitting in this experiment's test set.
# clone_model rebuilds the same architecture with new weights; it must be
# compiled again with the same settings.
dl_model = tf.keras.models.clone_model(dl_model)
dl_model.compile(optimizer = 'adam' , loss = 'binary_crossentropy' ,metrics = ['accuracy','Precision','Recall','AUC'])

# `steps_per_epoch` is left unset so Keras derives it from the sample count and
# batch size; the previous hard-coded 200 exceeded the batches actually available.
num_epochs = 50
history = dl_model.fit(
    X1_s_train,
    y1_s_train,
    epochs=num_epochs,
    validation_data=(X1_s_test, y1_s_test),
)

In [None]:
# Loss and compiled metrics on the oversampled standardised training split.
dl_model.evaluate(X1_s_train ,
                    y1_s_train)

In [None]:
# Loss and compiled metrics on the oversampled standardised held-out split.
dl_model.evaluate(X1_s_test ,y1_s_test)

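The ANN trained on the standardised and upsampled data gave the best result, with 93% test accuracy. One caveat: because the minority class was oversampled with replacement before the train/test split, copies of the same sample can appear in both the training and the test set, so this test accuracy is probably optimistic; splitting first and oversampling only the training set would give a fairer estimate. I have done a lot of work in this notebook, hope this deserves an upvote!! Thanks...
Please do mention if I have done something wrong.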