In [None]:
import pandas as pd

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so its files are reachable by path
drive.mount(
    '/content/drive'
)

In [None]:
# Switch the working directory to the Drive notebooks folder so relative paths resolve there
%cd /content/drive/My Drive/Colab Notebooks

In [None]:
# Load the diabetes dataset from the current working directory
DATA_FILE = 'diabetes.csv'
df = pd.read_csv(DATA_FILE)

In [None]:
# Preview the first rows; a bare last expression uses the notebook's rich table display
df.head()



*   Pregnancies: Number of previous pregnancies
*   Glucose: Plasma glucose concentration

*   BloodPressure: Diastolic blood pressure
*   SkinThickness: Skin fold thickness measured from the triceps


*   Insulin: Blood serum insulin concentration
*   BMI: Body Mass Index

*   DiabetesPedigreeFunction: A summarized score that indicates the genetic predisposition of the patient for diabetes, as extrapolated from the patient's family record for diabetes
*   Age: Age in years


*   Outcome: The target variable we are trying to predict: 1 for patients that developed diabetes within 5 years of the initial measurement, and 0 otherwise











In [None]:
import matplotlib.pyplot as plt

# Draw one histogram per column to inspect each variable's distribution
df.hist()


In [None]:
# Render any pending matplotlib figure(s)
plt.show()

In [None]:
import seaborn as sns

# Plot only the input features: 'Outcome' is the grouping variable, and a KDE of
# a constant 0/1 subset is degenerate (it previously filled the 9th panel, which
# was then hidden).
feature_cols = [col for col in df.columns if col != 'Outcome']

# Create a 3x3 grid of axes; panels without a feature are hidden below
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
axes = axes.ravel()

# Plot a density curve per feature, split by outcome.
# sns.kdeplot replaces the deprecated sns.distplot(hist=False).
for ax, col in zip(axes, feature_cols):
    sns.kdeplot(df.loc[df.Outcome == 0, col], ax=ax, linestyle='-', color='black', label="No Diabetes")
    sns.kdeplot(df.loc[df.Outcome == 1, col], ax=ax, linestyle='--', color='black', label="Diabetes")
    ax.yaxis.set_ticklabels([])  # density magnitudes are not of interest here
    ax.set_xlabel('')            # the title already names the variable
    ax.set_title(col)
    ax.legend()

# Hide any unused panels (bottom right when there are only 8 features)
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)

plt.show()


100mg/dL

*   Dashed line: diabetics
*   Solid line: non-diabetics


150mg/dL


In [None]:
# Report, per column, whether any value is missing (NaN)
print(df.isna().any())

In [None]:
# Summary statistics per column; a bare last expression uses the notebook's rich table display
df.describe()

In [None]:
# Count, for every column, how many rows hold the value 0
print("Number of rows with 0 values for each variable")
for column_name in df.columns:
    zero_count = (df[column_name] == 0).sum()
    print(f"{column_name}: {zero_count}")

There are several techniques to handle these missing values:


*   Remove (discard) any rows with missing values
*   Replace the missing values with the mean/median/mode of the non-missing values.


*   Predict the actual values using a separate machine learning model





In [None]:
import numpy as np

# Columns in which a 0 is treated as a missing measurement
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Replace 0 with NaN in each of those columns
for column_name in zero_as_missing_cols:
    df[column_name] = df[column_name].replace(0, np.nan)

In [None]:
# Re-count the 0 values per column after the replacement step
print("Number of rows with 0 values for each variable")
zero_counts = df.eq(0).sum()
for column_name, zero_count in zero_counts.items():
    print(column_name + ": " + str(zero_count))

In [None]:
# Replace NaN values with the mean of the non-missing values by using fillna()

In [None]:
# Fill each column's NaN entries with that column's mean (NaNs are skipped by mean())
for column_name in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    column_mean = df[column_name].mean()
    df[column_name] = df[column_name].fillna(column_mean)

The goal of data standardization is to transform the numeric variables so that each variable has zero mean and unit variance.


*   Insulin and DiabetesPedigreeFunction have vastly different scales. The maximum value for Insulin is 846, while the maximum value for DiabetesPedigreeFunction is only 2.42.




In [None]:
from sklearn import preprocessing

# Standardize every column, then put the original (unscaled) target column back
scaled_values = preprocessing.scale(df)
df_scaled = pd.DataFrame(scaled_values, columns=df.columns)
df_scaled['Outcome'] = df['Outcome']
df = df_scaled

# Show the mean, standard deviation and maximum of each column after scaling
summary_stats = df.describe().loc[['mean', 'std', 'max']]
print(summary_stats.round(2).abs())



1.   Training set: The neural network will be trained on this subset of the data
2.   Validation set: This set of data allows us to perform hyperparameter tuning (that is, tuning the number of hidden layers) using an unbiased source of data
3.   Testing set: The final evaluation of the neural network will be based on this subset of the data.
The purpose of splitting the data into training, testing and validation sets is to avoid overfitting and to provide an unbiased source of data for evaluating performance

*   If we allocate most of our data for training purposes, model performance will increase at the detriment of our ability to avoid overfitting.
*   If we allocate most of our data for validation and testing purposes, model performance will decrease as there might be insufficient data for training.

There are some steps:

*   The original data is first split into a training set (80%) and a testing set (20%)
*   The training set is then split again into the final training set and a validation set.
Splitting data must be done at random
*   First, let's separate the dataset into X (input features) and y (target variable)











In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the random splits are reproducible across Restart & Run All
SPLIT_SEED = 42

# Separate the input features from the target variable
X = df.loc[:, df.columns != 'Outcome']
y = df.loc[:, 'Outcome']

# Split the data into the training set (80%) and the testing set (20%).
# stratify keeps the class proportions of the target the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED, stratify=y
)

# Make a second split to create the final training set and the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=SPLIT_SEED, stratify=y_train
)

In [None]:
from keras.models import Sequential
# Start with an empty Sequential model; layers are added to it in later cells
model = Sequential()



*   The first hidden layer will have 32 nodes
*   The input dimension will be 8 because there are 8 columns in X_train



In [None]:
from keras.layers import Dense

# Derive the input width from the training data instead of hard-coding 8,
# so the network stays consistent if the feature set changes
n_features = X_train.shape[1]

model.add(Dense(32, activation='relu', input_dim=n_features))  # first hidden layer
model.add(Dense(16, activation='relu'))  # second hidden layer
model.add(Dense(1, activation='sigmoid'))  # output layer: a single sigmoid unit for the binary target

There are three different parameters we need to define for the training process

*   Optimizer: Let's use the adam optimizer, which is a popular optimizer in Keras. For most datasets, the adam optimizer will work well without much tuning
*   Loss function: We will use binary_crossentropy as our loss function since the problem at hand is a binary classification problem.

*   Metrics: We will use accuracy (that is the percentage of correctly classified samples) as our evaluation metric.






In [None]:
# Configure training: adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the model for 200 epochs.
# The validation split (X_val, y_val) is passed in so that loss/accuracy on
# held-out data is reported after every epoch; it is not used for weight updates.
model.fit(X_train, y_train, epochs=200, validation_data=(X_val, y_val))

In [None]:
# Accuracy on the data the model was trained on
scores = model.evaluate(X_train, y_train)
train_accuracy_pct = scores[1] * 100
print("Training Accuracy: %.2f%%\n" % train_accuracy_pct)

# Accuracy on the held-out testing set
scores = model.evaluate(X_test, y_test)
test_accuracy_pct = scores[1] * 100
print("Testing Accuracy: %.2f%%\n" % test_accuracy_pct)



*   True negative: Actual class is negative (no diabetes), and the model predicted negative (no diabetes)
*   False positive : Actual class is negative (no diabetes), but the model predicted positive (diabetes)


*   False negative: Actual class is positive (diabetes), but the model predicted negative (no diabetes)
*   True positive: Actual class is positive (diabetes), and the model predicted positive (diabetes)





In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output, threshold the predicted probabilities at 0.5 instead.
y_test_pred = (model.predict(X_test) > 0.5).astype("int32").ravel()

# Rows are the actual classes, columns the predicted classes
c_matrix = confusion_matrix(y_test, y_test_pred)

# fmt='d' prints the cell counts as integers (the default format can render
# counts of 100 or more in scientific notation)
ax = sns.heatmap(
    c_matrix,
    annot=True,
    fmt='d',
    xticklabels=['No Diabetes', 'Diabetes'],
    yticklabels=['No Diabetes', 'Diabetes'],
    cbar=False,
    cmap='Blues',
)
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")



*   True positive rate (TPR) = True positive / (True positive + false negative)
*   False positive rate (FPR) = False positive / (True negative + False positive)



In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Model outputs for the testing set, used as scores for the ROC curve
y_test_pred_probs = model.predict(X_test)

# False/true positive rates at each decision threshold (thresholds are discarded)
FPR, TPR, _ = roc_curve(y_true=y_test, y_score=y_test_pred_probs)

In [None]:
# ROC curve of the model
plt.plot(FPR, TPR)
# Dashed diagonal from (0, 0) to (1, 1) as a reference line
plt.plot([0, 1], [0, 1], linestyle='--', color='black')

# Axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')