## Load data

In [16]:
import os, sys, time
import numpy as np
import pandas as pd
from tqdm import tqdm
import cPickle as pk
np.random.seed(1) # to be reproductive

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Conv2D, MaxPooling2D, Reshape
from keras.models import Sequential
from keras.models import load_model
from keras.utils.np_utils import to_categorical
from keras.callbacks import TensorBoard
from keras.callbacks import Callback, EarlyStopping
import keras.backend as K

In [2]:
# paths
NOTE_DATA_DIR = '/local/XW/DATA/MIMIC/noteevents_by_sid/'
ICD_FPATH = 'data/subject_diag_icds.txt'
PK_FPATH = 'data/diag_processed_data.pk' # './processed_data_small.pk'
MODEL_PATH = './models/'
LOG_PATH = './logs/'
# constants
N_LABELS = 50
K_ICDS_TOKEEP = N_LABELS - 1 # predict only on top K frequent icd codes
N_SUBJECTS = 41886
# word2vec configurations
GLOVE_DIR = '/local/XW/DATA/glove.6B/'
MAX_SEQ_LEN = 1000 # max length of input sequence (pad/truncate to fix length)
MAX_NB_WORDS = 20000 # top 20k most freq words
EMBEDDING_DIM = 100
# learning configurations
VALIDATION_SPLIT = 0.2
N_EPOCHS = 20
SZ_BATCH = 128

In [3]:
# load pickled data
pk_data = pk.load(open(PK_FPATH, 'rb'))
embedding_matrix = pk_data['embedding_matrix']
X_train, Y_train = pk_data['X_train'], pk_data['Y_train']
X_val, Y_val = pk_data['X_val'], pk_data['Y_val']
nb_words = MAX_NB_WORDS # forgot to pickle this number...

In [4]:
# found one row that is ALL ) (strange?)
print np.min( np.sum(Y_train, axis=1) ), np.min( np.sum(Y_val, axis=1) )
print np.argmin( np.sum(Y_train, axis=1) )
Y_train[11730]
Y_train = np.delete(Y_train, 11730, axis=0)
X_train = np.delete(X_train, 11730, axis=0)
print X_train.shape, Y_train.shape

0.0 1.0
11730
(36916, 1000) (36916, 50)


## Define evaluation metrics

**NB:** this metrics is the continus relaxation of what we really want, so the acc output during training is not precise.

In [5]:
def relax_acc(y_true, y_pred): # shape: (None,N_LABELS)
    '''relaxed accuracy for the case when y_true is K-hot 
    if the predicted icd code is in the patient's icds, then it's good
    
    **note:**
    the y_pred is the softmax output, we need to make it into 1-hot encoding 
    * via K.round() -- doesn't work well , lots of 0s
    * by hand -- doesn't work either: 
    >InvalidArgumentError: You must feed a value for placeholder tensor 'embedding_input_4' with dtype int32
    
    ==> so the output is not the accuracy as we defined, but a *continus relaxation* version...
    '''
    y_int = y_pred * y_true # element-wise mul, intersection

    return K.mean( K.sum(y_int, axis=-1) )

def multlabel_prec(y_true, y_pred):
    y_pred = K.round(K.clip(y_pred, 0, 1)) # turn to 0/1 
    tp = K.sum(y_true * y_pred, axis =-1)
    sum_true = K.sum(y_true, axis=-1)
    sum_pred = K.sum(y_pred, axis=-1)
    return K.mean(tp/(sum_pred+1e-10)) # to avoid NaN precision
    
def multlabel_recall(y_true, y_pred):
    y_pred = K.round(K.clip(y_pred, 0, 1)) # turn to 0/1 
    tp = K.sum(y_true * y_pred, axis =-1)
    sum_true = K.sum(y_true, axis=-1)
    sum_pred = K.sum(y_pred, axis=-1)
    return K.mean(tp/sum_true) 

def multlabel_F1(y_true, y_pred):
    y_pred = K.round(K.clip(y_pred, 0, 1)) # turn to 0/1 
    tp = K.sum(y_true * y_pred, axis =-1)
    sum_true = K.sum(y_true, axis=-1)
    sum_pred = K.sum(y_pred, axis=-1)
    return 2*K.mean(tp/(sum_true+sum_pred))

def multlabel_acc(y_true, y_pred):
    y_pred = K.round(K.clip(y_pred, 0, 1)) # turn to 0/1 
    intersect = y_true * y_pred
    intersect = K.sum(intersect, axis=-1)
    union = K.clip(y_true+y_pred, 0, 1)
    union = K.sum(union, axis=-1)
    return K.mean(intersect/union)

In [18]:
def evaluate_model(model):
    print 'evaluation on training set:'
    print model.evaluate(X_train, Y_train, batch_size=128)
    print 'evaluation on validation set:'
    print model.evaluate(X_val, Y_val, batch_size=128)

In [19]:
# wraps up operations on models
def compile_fit_evaluate(model, quick_test=False, print_summary=True,
                         save_log=True, save_model=True, del_model=False):
    
    model.compile(loss='binary_crossentropy',
             optimizer='rmsprop',
             metrics=[multlabel_prec, multlabel_recall, multlabel_F1, multlabel_acc])
    if print_summary:
        print model.summary()
        
    if quick_test: # use tiny data for quick test
        print '(quick test mode)'
        model.fit(X_train[:100], Y_train[:100], nb_epoch=1)
        return  
    
    _callbacks = [EarlyStopping(monitor='val_loss', patience=2)] #[RelaxAccHistory()]
    if save_log:
        logdir = os.path.join( LOG_PATH, time.strftime('%m%d')+'_'+str(model.name) )
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        _callbacks.append(TensorBoard(log_dir=logdir))
        print 'run "tensorboard --logdir=%s" to launch tensorboard'%logdir
    
    model.fit( X_train, Y_train, 
              validation_data=(X_val, Y_val),
              nb_epoch=N_EPOCHS, batch_size=SZ_BATCH,
              callbacks=_callbacks )
    
    print 'evaluating model...'
    evaluate_model(model)
    
    if save_model: 
        model_fpath = os.path.join( MODEL_PATH, '%s.h5'% str(model.name) )
        model.save(model_fpath)
    
    if del_model:
        del model # delete the model to save memory

In [8]:
# ''' ***NOTE***
# To load models from file, we have to modify metrics.py at: 
# `/local/XW/SOFT/anaconda2/envs/thesis_nb/lib/python2.7/site-packages/keras` 
# to add the `multlabel_XXX` function, otherwise throws exception ! 

# cf issue: https://github.com/fchollet/keras/issues/3911
# '''
# m = load_model(os.path.sep.join([MODEL_PATH, 'model_1conv1d.h5']))

## Try different models

In [9]:
flag_quick_test = 0 # set to False/0 to run on whole data

In [20]:
model_1conv1d_dropout = Sequential(
        [ Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
              input_length=MAX_SEQ_LEN, trainable=False # keep the embeddings fixed
             ),# embedding layer
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Flatten(),
            Dropout(p=0.2),
            Dense(N_LABELS, activation='sigmoid') 
        ], 
        name='model_1conv1d_dropout')
compile_fit_evaluate(model_1conv1d, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 1000, 100)     0           embedding_input_1[0][0]          
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 996, 128)      64128       embedding_1[0][0]                
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D)    (None, 199, 128)      0           convolution1d_1[0][0]            
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 25472)         0           maxpooling1d_1[0][0]             
___________________________________________________________________________________________

In [21]:
# 2 conv1d layers
model_2conv1d_dropout = Sequential(
        [ Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM, 
                  weights=[embedding_matrix],input_length=MAX_SEQ_LEN, trainable=False ),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Flatten(),
            Dropout(p=0.2),
            Dense(N_LABELS, activation='sigmoid') ],
        name = 'model_2conv1d_dropout')
compile_fit_evaluate(model_2conv1d_dropout, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_9 (Embedding)          (None, 1000, 100)     0           embedding_input_9[0][0]          
____________________________________________________________________________________________________
convolution1d_9 (Convolution1D)  (None, 996, 128)      64128       embedding_9[0][0]                
____________________________________________________________________________________________________
maxpooling1d_9 (MaxPooling1D)    (None, 199, 128)      0           convolution1d_9[0][0]            
____________________________________________________________________________________________________
convolution1d_10 (Convolution1D) (None, 195, 128)      82048       maxpooling1d_9[0][0]             
___________________________________________________________________________________________

In [22]:
model_2conv1d_dropout = Sequential(
        [ Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM, 
                  weights=[embedding_matrix],input_length=MAX_SEQ_LEN, trainable=False ),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Flatten(),
            Dropout(p=0.5),
            Dense(N_LABELS, activation='sigmoid') ],
        name = 'model_2conv1d_dropout0.5')
compile_fit_evaluate(model_2conv1d_dropout, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_10 (Embedding)         (None, 1000, 100)     0           embedding_input_10[0][0]         
____________________________________________________________________________________________________
convolution1d_11 (Convolution1D) (None, 996, 128)      64128       embedding_10[0][0]               
____________________________________________________________________________________________________
maxpooling1d_11 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_11[0][0]           
____________________________________________________________________________________________________
convolution1d_12 (Convolution1D) (None, 195, 128)      82048       maxpooling1d_11[0][0]            
___________________________________________________________________________________________

In [23]:
model_3conv1d_dropout =Sequential(
        [ Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM, 
                  weights=[embedding_matrix],input_length=MAX_SEQ_LEN, trainable=False ),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Flatten(),
            Dropout(p=0.5),
            Dense(N_LABELS, activation='sigmoid') ],
        name = 'model_3conv1d_dropout')

compile_fit_evaluate(model_3conv1d_dropout, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_11 (Embedding)         (None, 1000, 100)     0           embedding_input_11[0][0]         
____________________________________________________________________________________________________
convolution1d_13 (Convolution1D) (None, 996, 128)      64128       embedding_11[0][0]               
____________________________________________________________________________________________________
maxpooling1d_13 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_13[0][0]           
____________________________________________________________________________________________________
convolution1d_14 (Convolution1D) (None, 195, 128)      82048       maxpooling1d_13[0][0]            
___________________________________________________________________________________________

In [10]:
# with only 1 conv1d layer
model_1conv1d = Sequential(
        [ Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
              input_length=MAX_SEQ_LEN, trainable=False # keep the embeddings fixed
             ),# embedding layer
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Flatten(),
            Dense(N_LABELS, activation='sigmoid') 
        ], 
        name='model_1conv1d')
compile_fit_evaluate(model_1conv1d, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 1000, 100)     0           embedding_input_1[0][0]          
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 996, 128)      64128       embedding_1[0][0]                
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D)    (None, 199, 128)      0           convolution1d_1[0][0]            
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 25472)         0           maxpooling1d_1[0][0]             
___________________________________________________________________________________________

In [11]:
# 2 conv1d layers
model_2conv1d = Sequential(
        [ Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM, 
                  weights=[embedding_matrix],input_length=MAX_SEQ_LEN, trainable=False ),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Flatten(),
            Dense(N_LABELS, activation='sigmoid') ],
        name = 'model_2conv1d')
compile_fit_evaluate(model_2conv1d, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 1000, 100)     0           embedding_input_2[0][0]          
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D)  (None, 996, 128)      64128       embedding_2[0][0]                
____________________________________________________________________________________________________
maxpooling1d_2 (MaxPooling1D)    (None, 199, 128)      0           convolution1d_2[0][0]            
____________________________________________________________________________________________________
convolution1d_3 (Convolution1D)  (None, 195, 128)      82048       maxpooling1d_2[0][0]             
___________________________________________________________________________________________

In [12]:
# 3 conv1d layers 
model_3conv1d =Sequential(
        [ Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM, 
                  weights=[embedding_matrix],input_length=MAX_SEQ_LEN, trainable=False ),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Conv1D(128, 5, activation='relu'),
            MaxPooling1D(5),
            Flatten(),
            Dense(N_LABELS, activation='sigmoid') ],
        name = 'model_3conv1d')

compile_fit_evaluate(model_3conv1d, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_3 (Embedding)          (None, 1000, 100)     0           embedding_input_3[0][0]          
____________________________________________________________________________________________________
convolution1d_4 (Convolution1D)  (None, 996, 128)      64128       embedding_3[0][0]                
____________________________________________________________________________________________________
maxpooling1d_4 (MaxPooling1D)    (None, 199, 128)      0           convolution1d_4[0][0]            
____________________________________________________________________________________________________
convolution1d_5 (Convolution1D)  (None, 195, 128)      82048       maxpooling1d_4[0][0]             
___________________________________________________________________________________________

In [13]:
# 2d conv models
'''for 2d conv, the nb_filters cann't be too big: 
   128*MAX_SEQ_LEN*EMBEDDING_DIM is too much memory
   nb_filter = 64 is fine for 1 conv2d layer
'''
model_1conv2d = Sequential(
        [ Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
              input_length=MAX_SEQ_LEN, trainable=False),
            Reshape( (MAX_SEQ_LEN, EMBEDDING_DIM, 1) ), # **need to manually reshape and add a channel**
            Conv2D(8, 5, 5, activation='relu' ), # , input_shape=(MAX_SEQ_LEN, EMBEDDING_DIM, 1)
            MaxPooling2D((10,10)),# need to downsample heavily to reduce parameters... 
            Flatten(),
            Dense(N_LABELS, activation='sigmoid') ],
        name = 'model_1conv2d')
# model_1conv2d.summary()
compile_fit_evaluate(model_1conv2d, flag_quick_test) 

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_4 (Embedding)          (None, 1000, 100)     0           embedding_input_4[0][0]          
____________________________________________________________________________________________________
reshape_1 (Reshape)              (None, 1000, 100, 1)  0           embedding_4[0][0]                
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 996, 96, 8)    208         reshape_1[0][0]                  
____________________________________________________________________________________________________
maxpooling2d_1 (MaxPooling2D)    (None, 99, 9, 8)      0           convolution2d_1[0][0]            
___________________________________________________________________________________________

In [14]:
model_2conv2d = Sequential(
        [ Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
              input_length=MAX_SEQ_LEN, trainable=False),
            Reshape( (MAX_SEQ_LEN, EMBEDDING_DIM, 1) ), # **need to manually reshape and add a channel**
            Conv2D(32, 5, 5, activation='relu' ), # , input_shape=(MAX_SEQ_LEN, EMBEDDING_DIM, 1)
            MaxPooling2D((5,5)),
            Conv2D(8, 5, 5, activation='relu' ), 
            MaxPooling2D((2,2)),
            Flatten(),
            Dense(N_LABELS, activation='sigmoid') ],
        name = 'model_2conv2d')
compile_fit_evaluate(model_2conv2d, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 1000, 100)     0           embedding_input_5[0][0]          
____________________________________________________________________________________________________
reshape_2 (Reshape)              (None, 1000, 100, 1)  0           embedding_5[0][0]                
____________________________________________________________________________________________________
convolution2d_2 (Convolution2D)  (None, 996, 96, 32)   832         reshape_2[0][0]                  
____________________________________________________________________________________________________
maxpooling2d_2 (MaxPooling2D)    (None, 199, 19, 32)   0           convolution2d_2[0][0]            
___________________________________________________________________________________________

In [15]:
model_3conv2d = Sequential(
        [ Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
              input_length=MAX_SEQ_LEN, trainable=False),
            Reshape( (MAX_SEQ_LEN, EMBEDDING_DIM, 1) ), # **need to manually reshape and add a channel**
            Conv2D(64, 5, 5, activation='relu' ), # , input_shape=(MAX_SEQ_LEN, EMBEDDING_DIM, 1)
            MaxPooling2D((5,5)),
            Conv2D(32, 5, 5, activation='relu' ), 
            MaxPooling2D((2,2)),
            Conv2D(8, 5, 5, activation='relu' ), 
            MaxPooling2D((2,2)),
            Flatten(),
            Dense(N_LABELS, activation='softmax') ],
        name='model_3conv2d')
print model_3conv2d.summary()
# maybe this is too slow to compute? estimated time: 100 * 30 * (N_EPOCH+1) ~= 9hours ...
# compile_fit_evaluate(model_3conv2d, flag_quick_test) 

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_6 (Embedding)          (None, 1000, 100)     0           embedding_input_6[0][0]          
____________________________________________________________________________________________________
reshape_3 (Reshape)              (None, 1000, 100, 1)  0           embedding_6[0][0]                
____________________________________________________________________________________________________
convolution2d_4 (Convolution2D)  (None, 996, 96, 64)   1664        reshape_3[0][0]                  
____________________________________________________________________________________________________
maxpooling2d_4 (MaxPooling2D)    (None, 199, 19, 64)   0           convolution2d_4[0][0]            
___________________________________________________________________________________________