In [2]:
#IMPORTS
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split

In [4]:
#READ IN THE DATA
dataframe = pd.read_csv("heart_failure_clinical_records_dataset.csv")
dataframe.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [6]:
#SPLIT THE DATASET INTO TRAIN, VALIDATION AND TEST SETS
train, test = train_test_split(dataframe, test_size=0.2)
train, valid = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(valid), 'validation examples')
print(len(test), 'test examples')

191 train examples
48 validation examples
60 test examples


In [8]:
#Creating an input pipeline using tf.data
#wrapping the dataframe with tf.data, this enables us to use feature columns as a bridge
#to map from the columns in the Pandas dataframe to features used to train the model.

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('DEATH_EVENT')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)
valid_ds = df_to_dataset(valid, batch_size=batch_size)
test_ds = df_to_dataset(test, batch_size=batch_size)

In [10]:
#what does it return?
for feature_batch, label_batch in train_ds.take(1):
    print('Features:', list(feature_batch.keys()))
    print('A batch of ages:', feature_batch['age'])
    print('A batch of targets:', label_batch)

Features: ['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time']
A batch of ages: tf.Tensor([72. 72. 53. 49. 75.], shape=(5,), dtype=float64)
A batch of targets: tf.Tensor([1 1 0 1 1], shape=(5,), dtype=int64)


In [15]:
#The output of a feature column becomes the input to the model.
#NUMERIC COLUMNS
creatinine_count = feature_column.numeric_column('creatinine_phosphokinase')


In [17]:
#BUCKETIZED COLUMNS: we don't want to feed some numbers directly into the model, but instead split its value into categories of numerical ranges.
#AGE
age = feature_column.numeric_column('age')
age_buckets = feature_column.bucketized_column(age, boundaries=[45, 60, 75])

#creatinine_phosphokinase
cre = feature_column.numeric_column('creatinine_phosphokinase')
cre_buckets = feature_column.bucketized_column(cre, boundaries=[115, 250, 582])

#ejection_fraction
eje = feature_column.numeric_column('ejection_fraction')
eje_buckets = feature_column.bucketized_column(eje, boundaries=[30, 38, 45])

#platelets
pla = feature_column.numeric_column('platelets')
pla_buckets = feature_column.bucketized_column(pla, boundaries=[212000, 262000, 304000])

#serum_creatinine
ser_cre = feature_column.numeric_column('serum_creatinine')
ser_cre_buckets = feature_column.bucketized_column(ser_cre, boundaries=[0.9, 1.1, 1.4])

#serum_sodium
ser_sod = feature_column.numeric_column('serum_sodium')
ser_sod_buckets = feature_column.bucketized_column(ser_sod, boundaries=[134, 137, 140])

#time (Follow-up period (days))
time = feature_column.numeric_column('time')
time_buckets = feature_column.bucketized_column(time, boundaries=[73, 115, 205])


In [19]:
#CHOOSE WHICH COLUMNS TO USE
feature_columns = []

#numeric columns
for header in ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']:
    feature_columns.append(feature_column.numeric_column(header))

#bucketized columns
for buckets in (age_buckets, cre_buckets, eje_buckets, pla_buckets, ser_cre_buckets, ser_sod_buckets, time_buckets):
    feature_columns.append(buckets)


In [21]:
#CREATE A FEATURE LAYER
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

#new input pipeline with larger batch size
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
valid_ds = df_to_dataset(valid, batch_size=batch_size)
test_ds = df_to_dataset(test, batch_size=batch_size)

In [23]:
#CREATE, COMPILE AND TRAIN THE MODEL
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dropout(.1),
    layers.Dense(1)
])

model.compile(optimizer='adam',
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=['accuracy'])

model.fit(train_ds,
          validation_data=valid_ds, 
          epochs=15)

Epoch 1/15
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x2469b548400>

In [24]:
loss, accuracy = model.evaluate(test_ds)
print('Accuracy', accuracy)

Accuracy 0.8166666626930237
