## Load data

In [1]:
import pandas as pd

In [2]:
DATA_DIR = "dataset/heart.csv"
CONTINIOUS_ATTRIBUTES = ["age", "trestbps", "chol", "thalach", "oldpeak"]
DISCRETE_ATTRIBUTES = ["cp", "restecg", "slope", "ca", "thal"]
BINARY_ATTRIBUTES = ["sex", "fbs", "exang", "target"]

data = pd.read_csv(DATA_DIR)
data.loc[:, CONTINIOUS_ATTRIBUTES] = data.loc[:, CONTINIOUS_ATTRIBUTES].astype("float64", copy=False)
data.loc[:, DISCRETE_ATTRIBUTES] = data.loc[:, DISCRETE_ATTRIBUTES].astype("int8", copy=False)
data.loc[:, BINARY_ATTRIBUTES] = data.loc[:, BINARY_ATTRIBUTES].astype("int8", copy=False)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null float64
sex         303 non-null int8
cp          303 non-null int8
trestbps    303 non-null float64
chol        303 non-null float64
fbs         303 non-null int8
restecg     303 non-null int8
thalach     303 non-null float64
exang       303 non-null int8
oldpeak     303 non-null float64
slope       303 non-null int8
ca          303 non-null int8
thal        303 non-null int8
target      303 non-null int8
dtypes: float64(5), int8(9)
memory usage: 14.6 KB


## Train-test split

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train_set, test_set = train_test_split(data, test_size=100, stratify=data.target, random_state=42)

## Prepare data

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [7]:
# Since we're gonna use cross validation, we cannot normalize train data beforehand, 
# instead, we will normalize train-validation data on the go while training.
def normalize(train_set, test_set, attributes=None):
    """
    If attributes is not define, then, all attributes will be apply.
    """
    if attributes is not None:
        _train_set = train_set.loc[:, attributes]
        _test_set = test_set.loc[:, attributes]
    else:
        _train_set = train_set
        _test_set = test_set
    
    scaler = StandardScaler()
    scaler.fit(_train_set)
    
    if attributes is not None:
        train_set.loc[:, attributes] = scaler.transform(_train_set)
        test_set.loc[:, attributes] = scaler.transform(_test_set)
    else:
        train_set = scaler.transform(_train_set)
        test_set = scaler.transform(_test_set)
    
    return train_set, test_set

In [8]:
# One-hot encoder
def encode(train_set, test_set, attributes=None):
    """
    If attributes is not define, then, all attributes will be apply.
    """
    if attributes is not None:
        _train_set = train_set.loc[:, attributes]
        _test_set = test_set.loc[:, attributes]
    else:
        _train_set = train_set
        _test_set = test_set
        
    encoder = OneHotEncoder(categories="auto", drop="first")
    encoder.fit(_train_set)
    
    _train_set = encoder.transform(_train_set)
    _test_set = encoder.transform(_test_set)
    
    
    if attributes is not None:
        _train_set = pd.DataFrame(_train_set.toarray(), columns=encoder.get_feature_names(attributes), index=train_set.index)
        _test_set = pd.DataFrame(_test_set.toarray(), columns=encoder.get_feature_names(attributes), index=test_set.index)

        train_set = pd.concat([train_set.drop(attributes, axis=1), _train_set], axis=1, sort=False)
        test_set = pd.concat([test_set.drop(attributes, axis=1), _test_set], axis=1, sort=False)
    else:
        train_set = _train_set
        test_set = _test_set
        
    return train_set, test_set

In [11]:
train_prepared, test_prepared = normalize(train_set, test_set, attributes=CONTINIOUS_ATTRIBUTES)
train_prepared, test_prepared = encode(train_prepared, test_prepared, attributes=DISCRETE_ATTRIBUTES)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [12]:
# Split X-Y
train_X, train_Y = train_prepared.drop("target", axis=1).values, train_prepared.loc[:, "target"].values
test_X, test_Y = test_prepared.drop("target", axis=1).values, test_prepared.loc[:, "target"].values

## Build model

In [13]:
import numpy as np
from keras.models import Sequential
from keras import layers, optimizers
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [14]:
def model_evaluate(nodes=[8], optimizer="RMSprop", epochs=200, batch_size=32):
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    val_scores = []
    for train_index, val_index in kfold.split(train_X, train_Y):
        # create model
        model = build_model(nodes)
        # Compile model
        model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["acc"])
        # Fit the model
        model.fit(train_X[train_index], train_Y[train_index], epochs=epochs, batch_size=batch_size, verbose=0)
        # Evaluate the model
        scores = model.evaluate(train_X[val_index], train_Y[val_index], verbose=0)
        val_scores.append(scores[1] * 100)
    print("%.2f%% (+/- %.2f%%)" % (np.mean(val_scores), np.std(val_scores)))

In [15]:
def build_model(nodes=[8]):
    model = Sequential()
    model.add(layers.Dense(nodes[0], activation="relu", input_shape=(train_X.shape[-1], )))
    for i in range(len(nodes) - 1):
        model.add(layers.Dense(nodes[i], activation="relu"))
    model.add(layers.Dense(1, activation="sigmoid"))
    
    return model

In [17]:
# Vary node size
nodes = [[8], [8, 8], [8, 8, 8], [8 ,8, 8, 8]]
optimizer = optimizers.RMSprop(lr=0.001)
epochs = 200
batch_size = 32
for _nodes in nodes:
    model_evaluate(_nodes, optimizer, epochs, batch_size)
    print("------------------------------------")

83.23% (+/- 8.88%)
------------------------------------
81.30% (+/- 10.25%)
------------------------------------
80.28% (+/- 6.73%)
------------------------------------
80.78% (+/- 10.36%)
------------------------------------


In [None]:
# Vary number of nodes
nodes = [[8], [8, 8], [8, 8, 8]]
optimizer = optimizers.RMSprop(lr=0.001)
epochs = 200
batch_size = 32
for _nodes in nodes:
    model_evaluate(_nodes, optimizer, epochs, batch_size)
    print("------------------------------------")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
84.30% (+/- 9.26%)
------------------------------------


In [None]:
# Vary optimizers
nodes = [[8], [8, 8], [8, 8, 8]]
optimizer = optimizers.RMSprop(lr=0.001)
epochs = 200
batch_size = 32
for _optimizer in optimizer:
    model_evaluate(nodes, _optimizer, epochs, batch_size)
    print("------------------------------------")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
84.30% (+/- 9.26%)
------------------------------------


In [None]:
# Vary learning rate
nodes = [[8], [8, 8], [8, 8, 8]]
optimizer = optimizers.RMSprop(lr=0.001)
epochs = 200
batch_size = 32
for _optimizer in optimizer:
    model_evaluate(nodes, _optimizer, epochs, batch_size)
    print("------------------------------------")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
84.30% (+/- 9.26%)
------------------------------------
