In [1]:
import random
random.seed(564155161)
import os
import pandas as pd
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Activation, BatchNormalization, Embedding, Input, Reshape, Concatenate
from keras import metrics
from keras.optimizers import Adam, rmsprop
from keras.callbacks import TensorBoard, ReduceLROnPlateau, ModelCheckpoint
from time import time
from IPython.display import display
from sklearn.model_selection import StratifiedKFold
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# Label for this experiment; used as the prefix of the output CSV file name.
experiment_name = 'mlhw-mlp-with-preprocess'

## Load Dataset

In [3]:
x_train = pd.read_csv('dataset/train_values.csv')
y_train = pd.read_csv('dataset/train_labels.csv')
x_test = pd.read_csv('dataset/test_values.csv')


# Convert string values to integers.
# The category -> code mapping is derived from the training frame and reused
# for the test frame. Encoding each frame independently would assign different
# codes to the same "thal" value whenever the two frames do not contain exactly
# the same set of categories. Test values that never occur in the training
# frame are encoded as -1.
thal_categories = x_train["thal"].astype('category').cat.categories
x_train["thal"] = pd.Categorical(x_train["thal"], categories=thal_categories).codes
x_test["thal"] = pd.Categorical(x_test["thal"], categories=thal_categories).codes


# Drop the identifier column from the training frames. x_test keeps
# "patient_id" because it is read again when the output file is written.
x_train = x_train.drop("patient_id", axis=1)
y_train = y_train.drop("patient_id", axis=1)

xsize = x_train.shape[1]
ysize = 1
label_column = 'heart_disease_present'
cols = list(x_train.columns)



In [4]:
# Show the first rows of the training features after the "thal" column has
# been integer-encoded and "patient_id" has been dropped.
display(x_train.head())

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,1,1,128,2,0,0,2,308,0.0,1,45,170,0
1,2,1,110,3,0,0,0,214,1.6,0,54,158,0
2,1,1,125,4,3,0,2,304,0.0,1,77,162,1
3,1,2,152,4,0,0,0,223,0.0,1,40,181,0
4,3,2,178,1,0,0,2,270,4.2,1,59,145,0


## Preprocessing

### Data Whitening
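Standardize the continuous features to zero mean and unit variance, using statistics computed on the training set, so that features with large magnitudes do not dominate the others.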

In [5]:
# Columns treated as categorical; each one gets its own embedding in the model.
cat_cols = [
    'slope_of_peak_exercise_st_segment',
    'thal',
    'chest_pain_type',
    'fasting_blood_sugar_gt_120_mg_per_dl',
    'resting_ekg_results',
    'sex',
    'exercise_induced_angina',
]
# Columns treated as continuous; these are standardised below.
cont_cols = [
    'num_major_vessels',
    'resting_blood_pressure',
    'serum_cholesterol_mg_per_dl',
    'oldpeak_eq_st_depression',
    'age',
    'max_heart_rate_achieved',
]

# Per-column statistics are taken from the training frame only.
x_train_mean = x_train[cont_cols].mean(axis=0)
x_train_std = x_train[cont_cols].std(axis=0)

# Apply the same training statistics to both frames so that the test data is
# scaled exactly like the data the model is fitted on.
x_train[cont_cols] = x_train[cont_cols].sub(x_train_mean).div(x_train_std)
x_test[cont_cols] = x_test[cont_cols].sub(x_train_mean).div(x_train_std)

display(x_train[cols].head())

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,1,1,-0.194652,2,-0.716404,0,2,1.115158,-0.900694,1,-1.051032,0.929891,0
1,2,1,-1.252825,3,-0.716404,0,0,-0.667915,0.526148,0,-0.086892,0.386007,0
2,1,1,-0.371014,4,2.378462,0,2,1.039283,-0.900694,1,2.377024,0.567302,1
3,1,2,1.216246,4,-0.716404,0,0,-0.497195,-0.900694,1,-1.586666,1.428452,0
4,3,2,2.744719,1,-0.716404,0,2,0.394342,2.844768,1,0.448742,-0.203201,0


### Re-arrange Data for Categorical and Continuous Variables

In [6]:

def preproc_input(input_data):
    """Split a feature frame into the list of arrays the model takes as input.

    The result holds one 1-D array per column in ``cat_cols`` (in that order),
    followed by a single 2-D array with all ``cont_cols`` columns.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame containing every column named in ``cat_cols`` and ``cont_cols``.

    Returns
    -------
    list of numpy.ndarray
        ``len(cat_cols) + 1`` arrays, categorical inputs first.
    """
    categorical_inputs = [input_data[name].values for name in cat_cols]
    continuous_input = input_data[cont_cols].values
    return categorical_inputs + [continuous_input]
    

## Define Model 

In [13]:
initial_lr = 0.01
opt_name = 'adam'
# opt_name = 'rmsprop'

def create_model():
    """Build and compile the embedding + MLP binary classifier.

    One ``Input``/``Embedding`` pair is created for every column in
    ``cat_cols``; the columns in ``cont_cols`` enter through a single input
    followed by batch normalisation. All branches are concatenated and passed
    through two ReLU dense layers (32 and 16 units) and a one-unit sigmoid
    output.

    The inputs are ordered like the list returned by ``preproc_input``:
    categorical columns first, then the continuous block.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy, Adam (``lr=initial_lr``)
        and binary accuracy as metric.
    """
    inputs = []
    embeddings = []

    # For Categorical Variables
    for cat_col in cat_cols:
        no_of_unique_cat = x_train[cat_col].nunique()
        embedding_size = int(min(np.ceil(no_of_unique_cat / 2), 50))

        # An embedding table must have more rows than the largest index it is
        # asked to look up. Counting distinct values alone is not enough when
        # the codes have gaps (e.g. {1, 5}), so the largest observed code is
        # taken into account as well. For contiguous codes this yields the
        # same size as ``nunique + 1``.
        largest_code = int(x_train[cat_col].max())
        vocab = max(no_of_unique_cat, largest_code) + 1

        emb_input = Input(shape=(1,))
        emb = Embedding(vocab, embedding_size, input_length=1)(emb_input)
        emb = Dropout(.01)(emb)
        # (batch, 1, embedding_size) -> (batch, embedding_size)
        emb = Reshape(target_shape=(embedding_size,))(emb)
        inputs.append(emb_input)
        embeddings.append(emb)

    # Continuous variables
    input_cont = Input(shape=(len(cont_cols),))
    embedding_cont = BatchNormalization()(input_cont)
    inputs.append(input_cont)
    embeddings.append(embedding_cont)

    x = Concatenate()(embeddings)
    x = Dense(32)(x)
    x = Activation('relu')(x)
#     x = BatchNormalization()(x)
#     x = Dropout(.15)(x)

    x = Dense(16)(x)
    x = Activation('relu')(x)
#     x = BatchNormalization()(x)
#     x = Dropout(.15)(x)

    x = Dense(1)(x)
    output = Activation('sigmoid')(x)

    model = Model(
        inputs=inputs,
        outputs=[output]
    )

#     opt = rmsprop(lr=initial_lr, decay=1e-6)
    opt = Adam(lr=initial_lr)

    model.compile(
        loss='binary_crossentropy',
        optimizer=opt,
        metrics=[metrics.binary_accuracy]
    )
    return model

## Training

### K-Fold Validation

In [15]:
epochs = 15
batch_size = 64
K = 10
loss_list = []   # validation loss of each fold
acc_list = []    # validation accuracy of each fold
# One column of test-set predictions per fold.
pred_list = np.zeros((np.shape(x_test)[0], K))

x_test_f = preproc_input(x_test)

# Make sure the checkpoint directory exists before the callback writes to it.
os.makedirs('models', exist_ok=True)

# NOTE(review): a single checkpoint callback is shared by all folds, so its
# "best" val_loss carries over from one fold to the next (see the fold logs:
# later folds compare against the best value of earlier ones). The saved file
# is therefore the single best epoch across all folds, measured on different
# validation subsets. Kept as-is because the output cell loads this file.
modelCkpt = ModelCheckpoint('models/mlhw-mlp.h5', save_best_only=True, verbose=1)

callbacks = [modelCkpt]

kfold = StratifiedKFold(n_splits=K,
                        random_state=1,
                        shuffle=True)
for i, (train_idxs, val_indxs) in enumerate(kfold.split(x_train.values, y_train.values)):
    print('\nFold ', i + 1)
    # split() yields positional indices, so select rows by position (.iloc)
    # instead of by label (.loc); the two only coincide for a default index.
    x_train_f = preproc_input(x_train.iloc[train_idxs])
    y_train_f = y_train.iloc[train_idxs].values

    x_val_f = preproc_input(x_train.iloc[val_indxs])
    y_val_f = y_train.iloc[val_indxs].values

    # A fresh model per fold so no weights leak between folds.
    model = create_model()
    model.fit(
        x_train_f,
        y_train_f,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(x_val_f, y_val_f),
        verbose=0,
        callbacks=callbacks
    )
    model_eval = model.evaluate(x_val_f, y_val_f)
    loss_list.append(model_eval[0])
    acc_list.append(model_eval[1])
    # Keep this fold's test-set probabilities (model output has one column).
    pred_list[:, i] = model.predict(x_test_f).ravel()
    print("Loss: {}".format(model_eval[0]))
    print("Acc: {}".format(model_eval[1]))

print('\nMean loss over {} folds: {}'.format(K, np.mean(loss_list)))
print('Mean acc over {} folds: {}'.format(K, np.mean(acc_list)))



Fold  1

Epoch 00001: val_loss improved from inf to 0.67682, saving model to models/mlhw-mlp.h5

Epoch 00002: val_loss improved from 0.67682 to 0.63271, saving model to models/mlhw-mlp.h5

Epoch 00003: val_loss improved from 0.63271 to 0.60119, saving model to models/mlhw-mlp.h5

Epoch 00004: val_loss did not improve from 0.60119

Epoch 00005: val_loss did not improve from 0.60119

Epoch 00006: val_loss did not improve from 0.60119

Epoch 00007: val_loss did not improve from 0.60119

Epoch 00008: val_loss did not improve from 0.60119

Epoch 00009: val_loss did not improve from 0.60119

Epoch 00010: val_loss did not improve from 0.60119

Epoch 00011: val_loss did not improve from 0.60119

Epoch 00012: val_loss did not improve from 0.60119

Epoch 00013: val_loss did not improve from 0.60119

Epoch 00014: val_loss did not improve from 0.60119

Epoch 00015: val_loss did not improve from 0.60119
Loss: 0.6522514820098877
Acc: 0.7777777910232544

Fold  2

Epoch 00001: val_loss improved from 


Epoch 00006: val_loss did not improve from 0.10853

Epoch 00007: val_loss did not improve from 0.10853

Epoch 00008: val_loss did not improve from 0.10853

Epoch 00009: val_loss did not improve from 0.10853

Epoch 00010: val_loss did not improve from 0.10853

Epoch 00011: val_loss did not improve from 0.10853

Epoch 00012: val_loss did not improve from 0.10853

Epoch 00013: val_loss did not improve from 0.10853

Epoch 00014: val_loss did not improve from 0.10853

Epoch 00015: val_loss did not improve from 0.10853
Loss: 1.0324243307113647
Acc: 0.7222222089767456

Fold  10

Epoch 00001: val_loss did not improve from 0.10853

Epoch 00002: val_loss did not improve from 0.10853

Epoch 00003: val_loss did not improve from 0.10853

Epoch 00004: val_loss did not improve from 0.10853

Epoch 00005: val_loss did not improve from 0.10853

Epoch 00006: val_loss did not improve from 0.10853

Epoch 00007: val_loss did not improve from 0.10853

Epoch 00008: val_loss did not improve from 0.10853

Epoc

## Write output to File

In [None]:
# Rebuild the architecture and restore the weights written by the checkpoint
# callback during training.
final_model = create_model()
final_model.load_weights('models/mlhw-mlp.h5')
predictions = final_model.predict(preproc_input(x_test))

# The network ends in a single sigmoid unit, so `predictions` has exactly one
# column; column 0 holds the predicted probability. Indexing column 1 raises
# an IndexError.
out_df = pd.DataFrame({
    'patient_id': x_test['patient_id'],
    'heart_disease_present': predictions[:, 0]
})
out_df = out_df[['patient_id', 'heart_disease_present']]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(out_df)

out_dir = 'output'
os.makedirs(out_dir, exist_ok=True)
out_file = os.path.join(out_dir, experiment_name + '-' + 'out.csv')
out_df.to_csv(out_file, index=False)