In [1]:
import os
import pandas as pd
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Activation, BatchNormalization, Embedding, Input, Reshape, Concatenate
from keras import metrics
from keras.optimizers import Adam, rmsprop
from keras.callbacks import TensorBoard, ReduceLROnPlateau
from time import time
from IPython.display import display
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [2]:
experiment_name='mlhw-mlp-with-preprocess'

## Load Dataset

In [3]:
x_train = pd.read_csv('dataset/train_values.csv')
y_train = pd.read_csv('dataset/train_labels.csv')
x_test = pd.read_csv('dataset/test_values.csv')


# Convert string values to integers
x_train["thal"] = x_train["thal"].astype('category')
x_train["thal"] = x_train["thal"].cat.codes

x_test["thal"] = x_test["thal"].astype('category')
x_test["thal"] = x_test["thal"].cat.codes



# Drop column
x_train = x_train.drop("patient_id", axis=1)
y_train = y_train.drop("patient_id", axis=1)

xsize = x_train.shape[1]
ysize = 1
label_column = 'heart_disease_present'
cols = list(x_train.columns)



In [4]:
display(x_train.head())

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,1,1,128,2,0,0,2,308,0.0,1,45,170,0
1,2,1,110,3,0,0,0,214,1.6,0,54,158,0
2,1,1,125,4,3,0,2,304,0.0,1,77,162,1
3,1,2,152,4,0,0,0,223,0.0,1,40,181,0
4,3,2,178,1,0,0,2,270,4.2,1,59,145,0


## Preprocessing

### Data Whitening
This will normalize our values so higher order values won't dominate other values

In [5]:
x_train_mean = x_train[cols].mean(axis=0)
x_train_std = x_train[cols].std(axis=0)
x_train[cols] = (x_train[cols] - x_train_mean) / x_train_std

x_test[cols] = (x_test[cols] - x_train_mean) / x_train_std


display(x_train[cols].head())

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,-0.888762,-0.645259,-0.194652,-1.23134,-0.716404,-0.437019,0.951196,1.115158,-0.900694,0.670152,-1.051032,0.929891,-0.678852
1,0.727169,-0.645259,-1.252825,-0.165757,-0.716404,-0.437019,-1.051322,-0.667915,0.526148,-1.483908,-0.086892,0.386007,-0.678852
2,-0.888762,-0.645259,-0.371014,0.899825,2.378462,-0.437019,0.951196,1.039283,-0.900694,0.670152,2.377024,0.567302,1.464891
3,-0.888762,1.114538,1.216246,0.899825,-0.716404,-0.437019,-1.051322,-0.497195,-0.900694,0.670152,-1.586666,1.428452,-0.678852
4,2.3431,1.114538,2.744719,-2.296923,-0.716404,-0.437019,0.951196,0.394342,2.844768,0.670152,0.448742,-0.203201,-0.678852


### Re-arrange Data for Categorical and Continous Variables

In [6]:

cat_cols = [
    'thal',
    'chest_pain_type',
    'fasting_blood_sugar_gt_120_mg_per_dl',
    'resting_ekg_results',
    'sex',
    'exercise_induced_angina'
]
cont_cols = [
    'slope_of_peak_exercise_st_segment',
    'num_major_vessels',
    'resting_blood_pressure',
    'serum_cholesterol_mg_per_dl',
    'oldpeak_eq_st_depression',
    'age',
    'max_heart_rate_achieved',
]


def preproc_input(input_data):
    input_list = []
    for cat_col in cat_cols:
        input_list.append(input_data[cat_col].values)

    input_list.append(input_data[cont_cols].values)
    return input_list
    

## Define Model 

In [7]:
initial_lr = 0.001
opt_name = 'adam'
# opt_name = 'rmsprop'

def create_model():
    inputs = []
    embeddings = []

    # For Categorical Variables
    for cat_col in cat_cols :

        no_of_unique_cat  = x_train[cat_col].nunique()
        embedding_size = min(np.ceil((no_of_unique_cat)/2), 50)
        embedding_size = int(embedding_size)
        vocab  = no_of_unique_cat + 1

        emb_input = Input(shape=(1,))
        emb = Embedding(vocab, embedding_size, input_length = 1)(emb_input)
        emb = Dropout(.01)(emb)
        emb = Reshape(target_shape=(embedding_size, ))(emb)
        inputs.append(emb_input)
        embeddings.append(emb)

    # Continuous variables
    input_cont = Input(shape=(len(cont_cols),))
    embedding_cont = BatchNormalization()(input_cont) 
    inputs.append(input_cont)
    embeddings.append(embedding_cont)

    x = Concatenate()(embeddings)
    x = Dense(100)(x)
    x = Activation('relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(.15)(x)

    x = Dense(50)(x)
    x = Activation('relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(.15)(x)

    x = Dense(20)(x)
    x = Activation('relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(.15)(x)

    x = Dense(1)(x)
    output = Activation('sigmoid')(x)

    model = Model(
        inputs=inputs, 
        outputs=[output]
    )


#     opt = rmsprop(lr=initial_lr, decay=1e-6)

    opt = Adam(lr=initial_lr)


    model.compile(
        loss='binary_crossentropy',
        optimizer=opt,
        metrics=[metrics.binary_accuracy]
    )
    return model

## Training

### Stratified K-Fold Cross Validation

In [10]:
epochs = 20
batch_size = 32
K = 10
runs_per_fold = 3
loss_list=[]
acc_list=[]
val_pred_list = np.zeros((np.shape(x_test)[0],K))
pred_list = np.zeros((np.shape(x_test)[0],K))

x_test_f = preproc_input(x_test)


kfold = StratifiedKFold(n_splits = K, 
                            random_state = 1, 
                            shuffle = True)  
for i, (train_idxs, val_indxs) in enumerate(kfold.split(x_train.values, y_train.values)):
    print('\nFold ', i + 1)
    x_train_f = x_train.loc[train_idxs]
    y_train_f = y_train.loc[train_idxs]

    x_val_f = x_train.loc[val_indxs]
    y_val_f = y_train.loc[val_indxs]
    
    # Upsampling
    # Add positive samples
    
    positive_samples = y_train_f.query('heart_disease_present==True')
    positive_samples = pd.Series(positive_samples['heart_disease_present'])

    x_train_f = pd.concat([x_train, x_train.loc[positive_samples]], axis=0)
    y_train_f = pd.concat([y_train, y_train.loc[positive_samples]], axis=0)
    
    # Shuffle data
    idx = np.arange(len(x_train_f))
    np.random.shuffle(idx)
    x_train_f = x_train_f.iloc[idx]
    y_train_f = y_train_f.iloc[idx]
    
    x_train_f = preproc_input(x_train_f)
    x_val_f = preproc_input(x_val_f)
    
    for j in range(runs_per_fold):
        model = create_model()
        model.fit(
            x_train_f,
            y_train_f.values,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(x_val_f, y_val_f),
            verbose=1,
        )
        model_eval = model.evaluate(x_val_f, y_val_f)
        loss_list.append(model_eval[0])
        val_pred_list[:, i] += model_eval[1] / runs_per_fold
        pred_list[:, i] += model.predict(x_test_f)[:,0] / runs_per_fold

mean_loss = np.mean(loss_list)
mean_val_pred = np.mean(val_pred_list)
mean_pred = np.mean(pred_list, axis = 1)
print("Mean loss: {}".format(mean_loss))
print("Mean val_pred: {}".format(mean_val_pred))
print("Mean pred: {}".format(mean_pred))

## Save model and weights
# save_dir=os.path.join(os.getcwd(), 'saved_models')
# model_name= '{}.h5'.format(exp_stamp)
# if not os.path.isdir(save_dir):
#     os.makedirs(save_dir)
# model_path = os.path.join(save_dir, model_name)
# model.save(model_path)
# print('Saved  trained model at %s ' % model_path)


Fold  1
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Fold  2
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Fold  3
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Fold  4
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Fold  5
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Fold  6
Train on 252 samples, validate on 18 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Fold  7
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Fold  8
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Fold  9
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Fold  10
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 252 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mean loss: 0.3217837651570638
Mean val_pred: 0.8666666607062022
Mean pred: [0.41034813 0.10681037 0.97834922 0.29263243 0.9827021  0.14690767
 0.80595184 0.99317165 0.13300972 0.24177785 0.7097801  0.70386635
 0.10586856 0.98307906 0.09736705 0.20354109 0.13605277 0.12568479
 0.99409736 0.13864687 0.97920821 0.26563346 0.33854739 0.090685

## Write output to File

In [18]:
out_df = pd.DataFrame({
    'patient_id': x_test['patient_id'],
    'heart_disease_present': mean_pred[:]
})
out_df = out_df[['patient_id', 'heart_disease_present']]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(out_df)

out_dir = 'output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
out_file = out_dir + '/' + experiment_name + '-' + 'out.csv'
out_df.to_csv(out_file, index=False)

Unnamed: 0,patient_id,heart_disease_present
0,olalu7,0.410348
1,z9n6mx,0.10681
2,5k4413,0.978349
3,mrg7q5,0.292632
4,uki4do,0.982702
5,kev1sk,0.146908
6,9n6let,0.805952
7,jxmtyg,0.993172
8,51s2ff,0.13301
9,wi9mcs,0.241778
