In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
# importing basic python libraries
import pandas as pd
import numpy as np

# importing data visualisation libraries
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import math
import seaborn as sns

# import scikit-learn split function
from sklearn.model_selection import train_test_split

# import K-Fold Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# import scikit-learn module to encode classes into one-hot coding matrix
from sklearn.preprocessing import LabelEncoder

# import keras module
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

Using TensorFlow backend.


## Python code to load the dataset into a Pandas dataframe:

In [3]:
# Specify what and where is the data file
filename = 'pima_diabetes.csv'

In [4]:
# Specify the fields with their names
col_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [5]:
# Load the data into a Pandas DataFrame
df = pd.read_csv(filename, names=col_names)

## Preprocess the dataset, find and clean missing values

In [6]:
# mark zero values as missing or NaN
df[['plas','pres','skin','test','mass','pedi','age']]=df[['plas','pres','skin','test','mass','pedi','age']].replace(0,np.NaN)

In [7]:
# checking percentage of unwanted data in dataset

def missing(df):
    print(df.isnull().sum() * 100/ len(df))

missing(df)

preg      0.000000
plas      0.651042
pres      4.557292
skin     29.557292
test     48.697917
mass      1.432292
pedi      0.000000
age       0.000000
class     0.000000
dtype: float64


In [8]:
# skin and test columns have considerable effect on data removal
# thus removing NaN values is not a good idea
# Instead replace the values with mean/median values

new_df=df.fillna(df.mean())
new_df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


# seperate dataset into input and output arrays

In [9]:
array = new_df.values
X = array[:,0:8]
Y = array[:,-1]

# split data into train ad test data

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Encode Class Values - One-Hot Coding

In [11]:
#encode class values as integers
encoder_train = LabelEncoder()
encoder_train.fit(Y_train)
encoded_Y_train = encoder_train.transform(Y_train)

#convert integers into one-hot coding format
onehot_Y_train = np_utils.to_categorical(encoded_Y_train)

#----------
#encode class values as integers
encoder_test = LabelEncoder()
encoder_test.fit(Y_test)
encoded_Y_test = encoder_test.transform(Y_test)

#convert integers into one-hot coding format
onehot_Y_test = np_utils.to_categorical(encoded_Y_test)

# MLP with Keras

In [12]:
# define a function to create a baseline model
# the network: 2-layered MLP = INPUT neurons + 1 hidden layer + OUTPUT layer
# optimization algorith: the adam
# loss function : cross entropy

def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(16, input_dim=8, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    
    #compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [13]:
# create the model
model = baseline_model()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [14]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 18        
Total params: 298
Trainable params: 298
Non-trainable params: 0
_________________________________________________________________


# train the model

In [15]:
# train the model
model.fit(X_train, onehot_Y_train, epochs=150, batch_size=10)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoc

<keras.callbacks.callbacks.History at 0x7f2fcdeee5d0>

## Evaluate model with K-Fold cross validation

In [16]:
# create kerasClassifier to evaluate the model
evaluator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5)

In [17]:
# evaluate the model
kfold = KFold(n_splits=10, shuffle = True, random_state=42)

In [18]:
# using k-fold cross validation to evaluate the model
results = cross_val_score(evaluator, X_test, onehot_Y_test, cv=kfold)

print("baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

# Final Training and Validation Accuracy

In [19]:
#training accuracy
trainng_scores = model.evaluate(X_train, onehot_Y_train)
print("\n%s: %.2f%%" % (model.metrics_names[1], trainng_scores[1]*100))


accuracy: 35.41%


In [20]:
#validation accuracy
test_scores = model.evaluate(X_test, onehot_Y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], test_scores[1]*100))


accuracy: 33.86%
