In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
# importing basic python libraries
import pandas as pd
import numpy as np

# importing data visualisation libraries
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import math
import seaborn as sns

# import scikit-learn split function
from sklearn.model_selection import train_test_split

# import K-Fold Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# import scikit-learn module to encode classes into one-hot coding matrix
from sklearn.preprocessing import LabelEncoder

# import keras module
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

Using TensorFlow backend.


## Python code to load the dataset into a Pandas dataframe:

In [3]:
# Specify what and where is the data file
filename = 'pima_diabetes.csv'

In [4]:
# Specify the fields with their names
col_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [5]:
# Load the data into a Pandas DataFrame
df = pd.read_csv(filename, names=col_names)

## Preprocess the dataset, find and clean missing values

In [6]:
# mark zero values as missing or NaN
df[['plas','pres','skin','test','mass','pedi','age']]=df[['plas','pres','skin','test','mass','pedi','age']].replace(0,np.NaN)

In [7]:
# checking percentage of unwanted data in dataset

def missing(df):
    print(df.isnull().sum() * 100/ len(df))

missing(df)

preg      0.000000
plas      0.651042
pres      4.557292
skin     29.557292
test     48.697917
mass      1.432292
pedi      0.000000
age       0.000000
class     0.000000
dtype: float64


In [8]:
# skin and test columns have considerable effect on data removal
# thus removing NaN values is not a good idea
# Instead replace the values with mean/median values

new_df=df.fillna(df.mean())
new_df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


# seperate dataset into input and output arrays

In [9]:
array = new_df.values
X = array[:,0:8]
Y = array[:,-1]

# split data into train ad test data

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Encode Class Values - One-Hot Coding

In [11]:
#encode class values as integers
encoder_train = LabelEncoder()
encoder_train.fit(Y_train)
encoded_Y_train = encoder_train.transform(Y_train)

#convert integers into one-hot coding format
onehot_Y_train = np_utils.to_categorical(encoded_Y_train)

#----------
#encode class values as integers
encoder_test = LabelEncoder()
encoder_test.fit(Y_test)
encoded_Y_test = encoder_test.transform(Y_test)

#convert integers into one-hot coding format
onehot_Y_test = np_utils.to_categorical(encoded_Y_test)

# MLP with Keras

In [12]:
# define a function to create a baseline model
# the network: 2-layered MLP = INPUT neurons + 1 hidden layer + OUTPUT layer
# optimization algorith: the adam
# loss function : cross entropy

def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(16, input_dim=8, activation='relu'))
    model.add(Dense(64, activation='sigmoid'))
    model.add(Dense(32, activation='sigmoid'))
    model.add(Dense(2, activation='softmax'))
    
    #compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [13]:
# create the model
model = baseline_model()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [14]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_2 (Dense)              (None, 64)                1088      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 66        
Total params: 3,378
Trainable params: 3,378
Non-trainable params: 0
_________________________________________________________________


# train the model

In [15]:
# train the model
model.fit(X_train, onehot_Y_train, epochs=1000, batch_size=70)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoc

<keras.callbacks.callbacks.History at 0x7f37f81a9e90>

## Evaluate model with K-Fold cross validation

In [16]:
# create kerasClassifier to evaluate the model
evaluator = KerasClassifier(build_fn=baseline_model, epochs=1500, batch_size=70)

In [17]:
# evaluate the model
kfold = KFold(n_splits=10, shuffle = True, random_state=42)

In [18]:
# using k-fold cross validation to evaluate the model
results = cross_val_score(evaluator, X_test, onehot_Y_test, cv=kfold)

print("baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Epoch 1/1500
Epoch 2/1500
Epoch 3/1500
Epoch 4/1500
Epoch 5/1500
Epoch 6/1500
Epoch 7/1500
Epoch 8/1500
Epoch 9/1500
Epoch 10/1500
Epoch 11/1500
Epoch 12/1500
Epoch 13/1500
Epoch 14/1500
Epoch 15/1500
Epoch 16/1500
Epoch 17/1500
Epoch 18/1500
Epoch 19/1500
Epoch 20/1500
Epoch 21/1500
Epoch 22/1500
Epoch 23/1500
Epoch 24/1500
Epoch 25/1500
Epoch 26/1500
Epoch 27/1500
Epoch 28/1500
Epoch 29/1500
Epoch 30/1500
Epoch 31/1500
Epoch 32/1500
Epoch 33/1500
Epoch 34/1500
Epoch 35/1500
Epoch 36/1500
Epoch 37/1500
Epoch 38/1500
Epoch 39/1500
Epoch 40/1500
Epoch 41/1500
Epoch 42/1500
Epoch 43/1500
Epoch 44/1500
Epoch 45/1500
Epoch 46/1500
Epoch 47/1500
Epoch 48/1500
Epoch 49/1500
Epoch 50/1500
Epoch 51/1500
Epoch 52/1500
Epoch 53/1500
Epoch 54/1500
Epoch 55/1500
Epoch 56/1500
Epoch 57/1500
Epoch 58/1500
Epoch 59/1500
Epoch 60/1500
Epoch 61/1500
Epoch 62/1500
Epoch 63/1500
Epoch 64/1500
Epoch 65/1500
Epoch 66/1500
Epoch 67/1500
Epoch 68/1500
Epoch 69/1500
Epoch 70/1500
Epoch 71/1500
Epoch 72/1500
E

# Final Training and Validation Accuracy

In [19]:
#training accuracy
trainng_scores = model.evaluate(X_train, onehot_Y_train)
print("\n%s: %.2f%%" % (model.metrics_names[1], trainng_scores[1]*100))


accuracy: 93.77%


In [20]:
#validation accuracy
test_scores = model.evaluate(X_test, onehot_Y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], test_scores[1]*100))


accuracy: 66.54%
