#### Problem Statement
The heart disease dataset is provided by the Cleveland Clinic Foundation.
https://archive.ics.uci.edu/ml/datasets/heart+Disease

It's a CSV file with 303 rows, each containing one patient's information. We use these features to predict whether or not a patient has heart disease (binary classification).

##### Import Modules

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.layers import IntegerLookup
from tensorflow.keras.layers import Normalization
from tensorflow.keras.layers import StringLookup

import pydot

In [2]:
# Print the installed TensorFlow version so the run can be reproduced.
print(tf.__version__)

2.8.0


##### Preparing the data

In [3]:
# Download the heart-disease CSV and load it into a DataFrame.
# The file is fetched over HTTPS (rather than plain HTTP) so the data
# cannot be tampered with in transit.
file_url = "https://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
dataframe = pd.read_csv(file_url)

In [4]:
# Inspect the (rows, columns) dimensions of the loaded table.
dataframe.shape

(303, 14)

In [5]:
# Preview the first rows of the table.
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [6]:
# Splitting the data into training and validation sets

In [7]:
# Hold out a random 20% of the rows for validation (fixed seed so the split
# is reproducible); every row not sampled stays in the training frame.
val_dataframe = dataframe.sample(frac=0.2, random_state=1337)
train_dataframe = dataframe.drop(index=val_dataframe.index)

In [8]:
# Report how many rows ended up in each split.
print(
    "Using %d samples for training and %d for validation"
    % tuple(map(len, (train_dataframe, val_dataframe)))
)

Using 242 samples for training and 61 for validation


In [9]:
# Let's generate tf.data.Dataset objects for each dataframe

In [10]:
def dataframe_to_dataset(dataframe):
    """Turn a DataFrame into a shuffled dataset of (features, label) pairs.

    Args:
        dataframe: Table containing a "target" column plus feature columns.
            It is copied first, so the caller's frame is left untouched.

    Returns:
        A `tf.data.Dataset` whose elements are `(features_dict, label)`,
        shuffled with a buffer as large as the number of rows.
    """
    features = dataframe.copy()
    # Popping removes the label column so it is not also fed in as a feature.
    targets = features.pop("target")
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    return dataset.shuffle(buffer_size=len(features))

In [11]:
# Convert both splits to tf.data datasets (training first, then validation).
train_ds, val_ds = map(dataframe_to_dataset, (train_dataframe, val_dataframe))

In [12]:
# Note: Each dataset yields a tuple (input, target) where input is a dictionary of features
# and target is the value 0 or 1

In [13]:
# Peek at a single element of the training dataset to inspect its structure:
# `x` is the dictionary of features and `y` is the target.
for x, y in train_ds.take(1):
    print("Input: ", x)
    print("Target: ", y)

Input:  {'age': <tf.Tensor: shape=(), dtype=int64, numpy=53>, 'sex': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'cp': <tf.Tensor: shape=(), dtype=int64, numpy=4>, 'trestbps': <tf.Tensor: shape=(), dtype=int64, numpy=142>, 'chol': <tf.Tensor: shape=(), dtype=int64, numpy=226>, 'fbs': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'restecg': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'thalach': <tf.Tensor: shape=(), dtype=int64, numpy=111>, 'exang': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'oldpeak': <tf.Tensor: shape=(), dtype=float64, numpy=0.0>, 'slope': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'ca': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'thal': <tf.Tensor: shape=(), dtype=string, numpy=b'reversible'>}
Target:  tf.Tensor(0, shape=(), dtype=int64)


In [14]:
# Let's batch the dataset -

In [15]:
# Group elements into batches of 32 for both splits.
train_ds = train_ds.batch(batch_size=32)
val_ds = val_ds.batch(batch_size=32)

In [16]:
# Display the batched dataset to check its element spec.
train_ds

<BatchDataset element_spec=({'age': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'sex': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'cp': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'trestbps': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'chol': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'fbs': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'restecg': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'thalach': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'exang': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'oldpeak': TensorSpec(shape=(None,), dtype=tf.float64, name=None), 'slope': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'ca': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'thal': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

##### Feature pre-processing with Keras layers
The following features are categorical features encoded as integers -
* sex
* cp
* fbs
* restecg
* exang
* ca

We will encode these features using one-hot encoding. We have two options here -
1. Use CategoryEncoding(), which requires knowing the range of input values and will error on input outside the range
2. Use IntegerLookup() which will build a lookup table for inputs and reserve an output index for unknown input values

Here we want a solution which will handle out of range inputs at inference, so we will use IntegerLookup()

We also have a categorical feature encoded as a string: "thal". We will create an index of all possible features and encode output using the StringLookup() layer.

Finally, the following features are continuous numerical features -
* age
* trestbps
* chol
* thalach
* oldpeak
* slope

For each of these features, we will use a Normalization() layer to make sure the mean of each feature is 0 and its standard deviation is 1.

In [17]:
# To apply featurewise normalization to numerical features

def encode_numerical_feature(feature, name, dataset):
    """Normalize a numerical input with statistics learned from `dataset`.

    Args:
        feature: Keras input (symbolic tensor) to normalize.
        name: Key of this feature inside each element's feature dictionary.
        dataset: Dataset of `(features_dict, label)` elements used to adapt
            the normalization layer.

    Returns:
        The output of the adapted `Normalization` layer applied to `feature`.
    """
    # Keep only the requested feature and give it a trailing axis of size 1.
    column_ds = dataset.map(
        lambda features, _label: tf.expand_dims(features[name], -1)
    )

    # Fit the layer's statistics on that feature, then apply it to the input.
    scaler = Normalization()
    scaler.adapt(column_ds)
    return scaler(feature)

In [18]:
def encode_categorical_feature(feature, name, dataset, is_string):
    """Encode a categorical input with a lookup layer adapted on `dataset`.

    Args:
        feature: Keras input (symbolic tensor) to encode.
        name: Key of this feature inside each element's feature dictionary.
        dataset: Dataset of `(features_dict, label)` elements used to learn
            the vocabulary.
        is_string: If true, use `StringLookup`; otherwise use `IntegerLookup`.

    Returns:
        The output of the adapted lookup layer applied to `feature`.
    """
    lookup_class = StringLookup if is_string else IntegerLookup

    # "multi_hot" is the current name of the mode previously spelled "binary"
    # (a deprecated alias). The layer outputs an indicator vector over the
    # learned vocabulary rather than raw integer indices.
    lookup = lookup_class(output_mode="multi_hot")

    # Prepare a dataset that only yields our feature, with a trailing axis.
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the set of values present in the data and fix their positions.
    lookup.adapt(feature_ds)

    # Encode the input against the learned vocabulary.
    encoded_feature = lookup(feature)

    return encoded_feature

##### Build a Model
Let's build our end-to-end model

In [19]:
# Categorical features encoded as integers

In [20]:
# One int64 input of shape (1,) per integer-coded categorical feature,
# created in the same order as the names listed below.
sex, cp, fbs, restecg, exang, ca = (
    keras.Input(shape=(1,), name=feature_name, dtype="int64")
    for feature_name in ("sex", "cp", "fbs", "restecg", "exang", "ca")
)

In [21]:
# Categorical feature encoded as string

In [22]:
# "thal" is the only string-typed categorical feature, so it gets a string input.
thal = keras.Input(shape = (1, ), name = "thal", dtype = "string")

In [23]:
# Numerical Features -

In [24]:
# One input of shape (1,) per numerical feature; no dtype is given, so the
# Keras default input dtype applies.
age, trestbps, chol, thalach, oldpeak, slope = [
    keras.Input(shape=(1,), name=feature_name)
    for feature_name in ("age", "trestbps", "chol", "thalach", "oldpeak", "slope")
]

In [25]:
# Every model input, grouped by kind: integer categoricals, the string
# categorical, then the numerical features.
all_inputs = [
    sex, cp, fbs, restecg, exang, ca,
    thal,
    age, trestbps, chol, thalach, oldpeak, slope,
]

In [26]:
# Integer Categorical Features -

In [27]:
# Encode each integer-coded categorical input; the lookup layers are adapted
# on the training dataset only.
sex_encoded = encode_categorical_feature(sex, "sex", train_ds, is_string=False)
cp_encoded = encode_categorical_feature(cp, "cp", train_ds, is_string=False)
fbs_encoded = encode_categorical_feature(fbs, "fbs", train_ds, is_string=False)
restecg_encoded = encode_categorical_feature(
    restecg, "restecg", train_ds, is_string=False
)
exang_encoded = encode_categorical_feature(
    exang, "exang", train_ds, is_string=False
)
ca_encoded = encode_categorical_feature(ca, "ca", train_ds, is_string=False)

In [28]:
# String Categorical Features -

In [29]:
# Encode the string-valued categorical input, adapted on the training dataset.
thal_encoded = encode_categorical_feature(
    thal, "thal", train_ds, is_string=True
)

In [30]:
# Numerical Features - 

In [31]:
# Normalize each numerical input with statistics from the training dataset.
# The pairs are processed in the listed order.
(
    age_encoded,
    trestbps_encoded,
    chol_encoded,
    thalach_encoded,
    oldpeak_encoded,
    slope_encoded,
) = (
    encode_numerical_feature(feature_input, feature_name, train_ds)
    for feature_input, feature_name in (
        (age, "age"),
        (trestbps, "trestbps"),
        (chol, "chol"),
        (thalach, "thalach"),
        (oldpeak, "oldpeak"),
        (slope, "slope"),
    )
)

In [32]:
# Join every encoded feature along the last axis into one feature vector.
# The order of the pieces fixes the layout of that vector, so it is kept
# exactly as originally written.
all_features = layers.Concatenate()(
    [
        sex_encoded,
        cp_encoded,
        fbs_encoded,
        restecg_encoded,
        exang_encoded,
        slope_encoded,
        ca_encoded,
        thal_encoded,
        age_encoded,
        trestbps_encoded,
        chol_encoded,
        thalach_encoded,
        oldpeak_encoded,
    ]
)

In [33]:
# One hidden ReLU layer with dropout, followed by a single sigmoid unit that
# produces the binary-classification output.
x = layers.Dense(units=32, activation="relu")(all_features)
x = layers.Dropout(rate=0.5)(x)
output = layers.Dense(units=1, activation="sigmoid")(x)
model = keras.Model(inputs=all_inputs, outputs=output)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [34]:
# Let's visualize our connectivity graph - 

In [35]:
keras.utils.plot_model(model, show_shapes=True, rankdir="LR") # "LR" to make the graph horizontal

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


#### Train the model

In [36]:
# Train for 50 epochs on the training dataset, passing the validation dataset
# as `validation_data`.
model.fit(train_ds, epochs=50, validation_data=val_ds)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2687d919490>

#### Inference on New Data

To get the prediction for a new sample, you can simply call model.predict(). There are just two things you need to do -

1. Wrap scalars into a list so as to have a batch dimension (models only process batches of data, and not single samples)

2. Call convert_to_tensor on each feature

In [37]:
# A single hand-written patient record. Keys match the model's input names.
sample = dict(
    age=60,
    sex=1,
    cp=1,
    trestbps=145,
    chol=233,
    fbs=1,
    restecg=2,
    thalach=150,
    exang=0,
    oldpeak=2.3,
    slope=3,
    ca=0,
    thal="fixed",
)

In [38]:
# Wrap each scalar in a one-element list so every tensor has a batch dimension.
input_dict = {
    key: tf.convert_to_tensor([sample[key]])
    for key in sample
}

In [39]:
# Display the converted tensors to check their shapes and dtypes.
input_dict

{'age': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([60])>,
 'sex': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1])>,
 'cp': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1])>,
 'trestbps': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([145])>,
 'chol': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([233])>,
 'fbs': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1])>,
 'restecg': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([2])>,
 'thalach': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([150])>,
 'exang': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0])>,
 'oldpeak': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.3], dtype=float32)>,
 'slope': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([3])>,
 'ca': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0])>,
 'thal': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'fixed'], dtype=object)>}

In [40]:
# Run the trained model on the single-sample batch.
predictions = model.predict(input_dict)

In [41]:
# Display the raw model output for the sample.
predictions

array([[0.23753887]], dtype=float32)

In [43]:
# Report the single model output as a percentage with one decimal place.
print(
    "This particular patient had a %.1f percent probability "
    "of having a heart disease, as evaluated by our model."
    % (predictions[0, 0] * 100,)
)

This particular patient had a 23.8 percent probability of having a heart disease, as evaluated by our model.
