**In this notebook: run the model with the last label ("others") removed**

## Load data

In [1]:
import os, sys, time
import numpy as np
import pandas as pd
from tqdm import tqdm
import cPickle as pk
np.random.seed(1) # to be reproductive

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Conv2D, MaxPooling2D, Reshape
from keras.models import Sequential
from keras.models import load_model
from keras.utils.np_utils import to_categorical
from keras.callbacks import TensorBoard
from keras.callbacks import Callback, EarlyStopping
import keras.backend as K

Using TensorFlow backend.


In [19]:
# ---- paths ----
# NOTE_DATA_DIR and ICD_FPATH are not referenced by the cells visible in this notebook.
NOTE_DATA_DIR = '/local/XW/DATA/MIMIC/noteevents_by_sid/'
ICD_FPATH = 'data/subject_diag_icds.txt'
PK_FPATH = 'data/diag_processed_data.pk' # pickle read by the data-loading cell; alternative: './processed_data_small.pk'
MODEL_PATH = './models/' # trained models are saved here as <model name>.h5
LOG_PATH = './logs/' # parent directory of the per-model TensorBoard log dirs
# ---- constants ----
N_LABELS = 49 # width of the sigmoid output layer, *** after removing the last "others" label ***
# NOTE(review): with the "others" column removed, N_LABELS - 1 may be one too few
# ICD codes -- confirm before relying on this value (it is unused in the visible cells).
K_ICDS_TOKEEP = N_LABELS - 1 # predict only on top K frequent icd codes
N_SUBJECTS = 41886 # unused in the visible cells
# ---- word2vec configurations ----
GLOVE_DIR = '/local/XW/DATA/glove.6B/' # unused in the visible cells
MAX_SEQ_LEN = 1000 # max length of input sequence (pad/truncate to fixed length); input_length of the Embedding layers
MAX_NB_WORDS = 20000 # top 20k most frequent words; reused as nb_words when sizing the Embedding layers
EMBEDDING_DIM = 100 # output_dim of the Embedding layers
# ---- learning configurations ----
VALIDATION_SPLIT = 0.2 # unused in the visible cells (the pickle already holds separate X_val / Y_val)
N_EPOCHS = 20 # upper bound on epochs; training uses EarlyStopping as well
SZ_BATCH = 512 # training batch size (original note: "large batch size ?")

In [3]:
# load pickled data
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
# The `with` block makes sure the file handle is closed once the data is read.
with open(PK_FPATH, 'rb') as pk_file:
    pk_data = pk.load(pk_file)
embedding_matrix = pk_data['embedding_matrix']
X_train, Y_train = pk_data['X_train'], pk_data['Y_train']
X_val, Y_val = pk_data['X_val'], pk_data['Y_val']
nb_words = MAX_NB_WORDS # forgot to pickle this number...

In [5]:
# found one row that is ALL 0) (strange?)
print np.min( np.sum(Y_train, axis=1) ), np.min( np.sum(Y_val, axis=1) )
print np.argmin( np.sum(Y_train, axis=1) )
Y_train[11730]
Y_train = np.delete(Y_train, 11730, axis=0)
X_train = np.delete(X_train, 11730, axis=0)
print X_train.shape, Y_train.shape

0.0 0.0
53
(36916, 1000) (36916, 49)


In [4]:
# *** remove last column of Y_train and Y_val ***
Y_train = Y_train[:,:-1]
Y_val = Y_val[:, :-1]
print Y_train.shape, Y_val.shape

(36917, 49) (9229, 49)


In [43]:
# Number of positive labels per training sample; passed to fit() as per-sample
# weights so that samples with more 1s count more.
# NOTE(review): a sample without any positive label gets weight 0 -- confirm
# that excluding such samples from the loss is intended.
y_n_poslabels = np.sum(Y_train, axis=1)

## Define evaluation metrics

**NB:** these metrics are a continuous relaxation of what we really want, so the accuracy reported during training is not exact.

In [24]:
def multlabel_prec(y_true, y_pred):
    """Sample-averaged precision for multi-label predictions.

    Predictions are clipped to [0, 1] and rounded to hard 0/1 decisions. For
    every sample the number of true positives is divided by the number of
    predicted positives (summing over the last axis), and the per-sample
    ratios are averaged.

    Args:
        y_true: tensor of 0/1 ground-truth labels (labels on the last axis).
        y_pred: tensor of predicted scores with the same shape as `y_true`.

    Returns:
        Scalar tensor holding the mean per-sample precision. A sample with no
        predicted positive contributes 0 (the small constant in the
        denominator avoids a NaN).
    """
    hard_pred = K.round(K.clip(y_pred, 0, 1)) # turn to 0/1
    tp = K.sum(y_true * hard_pred, axis=-1)
    sum_pred = K.sum(hard_pred, axis=-1)
    return K.mean(tp / (sum_pred + 1e-10)) # to avoid NaN precision
    
def multlabel_recall(y_true, y_pred):
    """Sample-averaged recall for multi-label predictions.

    Predictions are clipped to [0, 1] and rounded to hard 0/1 decisions. For
    every sample the number of true positives is divided by the number of
    ground-truth positives (summing over the last axis), and the per-sample
    ratios are averaged.

    Args:
        y_true: tensor of 0/1 ground-truth labels (labels on the last axis).
        y_pred: tensor of predicted scores with the same shape as `y_true`.

    Returns:
        Scalar tensor holding the mean per-sample recall. A sample with no
        ground-truth positive contributes 0 (the small constant in the
        denominator avoids a NaN).
    """
    hard_pred = K.round(K.clip(y_pred, 0, 1)) # turn to 0/1
    tp = K.sum(y_true * hard_pred, axis=-1)
    sum_true = K.sum(y_true, axis=-1)
    return K.mean(tp / (sum_true + 1e-10))

def multlabel_F1(y_true, y_pred):
    """Sample-averaged F1 score for multi-label predictions.

    After clipping the predictions to [0, 1] and rounding them to 0/1, each
    sample is scored as 2 * TP / (#true positives labels + #predicted labels),
    summing over the last axis; the result is the mean over samples.
    """
    hard_pred = K.round(K.clip(y_pred, 0, 1)) # binarise the scores
    n_hits = K.sum(y_true * hard_pred, axis=-1)
    n_labels_total = K.sum(y_true, axis=-1) + K.sum(hard_pred, axis=-1)
    per_sample = n_hits / (n_labels_total + 1e-10) # epsilon guards against 0/0
    return 2 * K.mean(per_sample)

def multlabel_acc(y_true, y_pred):
    """Sample-averaged intersection-over-union of true and predicted label sets.

    Predictions are clipped to [0, 1] and rounded to 0/1. Per sample, the
    number of labels that are both true and predicted is divided by the
    number of labels that are true or predicted (sums over the last axis);
    the ratios are then averaged over samples.
    """
    hard_pred = K.round(K.clip(y_pred, 0, 1)) # binarise the scores
    n_common = K.sum(y_true * hard_pred, axis=-1)
    n_either = K.sum(K.clip(y_true + hard_pred, 0, 1), axis=-1)
    return K.mean(n_common / (n_either + 1e-10)) # epsilon guards against 0/0

In [6]:
def evaluate_model(model):
    print 'evaluation on training set:'
    print model.evaluate(X_train, Y_train, batch_size=128)
    print 'evaluation on validation set:'
    print model.evaluate(X_val, Y_val, batch_size=128)

In [44]:
# wraps up operations on models
def compile_fit_evaluate(model, quick_test=False, print_summary=True,
                         save_log=True, save_model=True, del_model=False):
    
    model.compile(loss='binary_crossentropy',
             optimizer='rmsprop',
             metrics=[multlabel_prec, multlabel_recall, multlabel_F1, multlabel_acc])
    if print_summary:
        print model.summary()
        
    if quick_test: # use tiny data for quick test
        print '(quick test mode)'
        model.fit(X_train[:100], Y_train[:100], nb_epoch=1)
        return  
    
    _callbacks = [EarlyStopping(monitor='val_loss', patience=2)] #[RelaxAccHistory()]
    if save_log:
        logdir = os.path.join( LOG_PATH, time.strftime('%m%d')+'_'+str(model.name) )
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        _callbacks.append(TensorBoard(log_dir=logdir))
        print 'run "tensorboard --logdir=%s" to launch tensorboard'%logdir
    
    model.fit( X_train, Y_train, 
              validation_data=(X_val, Y_val),
              nb_epoch=N_EPOCHS, batch_size=SZ_BATCH,
              sample_weight = y_n_poslabels,
              callbacks=_callbacks )
    
    print 'evaluating model...'
    evaluate_model(model)
    
    if save_model: 
        model_fpath = os.path.join( MODEL_PATH, '%s.h5'% str(model.name) )
        model.save(model_fpath)
    
    if del_model:
        del model # delete the model to save memory

In [8]:
# ''' ***NOTE***
# To load models from file, we have to modify metrics.py at: 
# `/local/XW/SOFT/anaconda2/envs/thesis_nb/lib/python2.7/site-packages/keras` 
# to add the `multlabel_XXX` function, otherwise throws exception ! 

# cf issue: https://github.com/fchollet/keras/issues/3911
# '''
# m = load_model(os.path.sep.join([MODEL_PATH, 'model_1conv1d.h5']))

## Try different models

In [45]:
# Passed as `quick_test` to compile_fit_evaluate for every model below:
# a truthy value fits one epoch on the first 100 training samples only and skips
# logging, evaluation and saving.
flag_quick_test = 0 # set to False/0 to run on whole data

In [46]:
# One Conv1D + max-pooling stage on top of a frozen embedding layer initialised
# from embedding_matrix, then dropout and one sigmoid output per label.
model_1conv1d_dropout = Sequential(name='model_1conv1d_dropout')
model_1conv1d_dropout.add(
    Embedding(input_dim=nb_words+1,
              output_dim=EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQ_LEN,
              trainable=False))  # keep the embeddings fixed
model_1conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_1conv1d_dropout.add(MaxPooling1D(5))
model_1conv1d_dropout.add(Flatten())
model_1conv1d_dropout.add(Dropout(p=0.2))
model_1conv1d_dropout.add(Dense(N_LABELS, activation='sigmoid'))

compile_fit_evaluate(model_1conv1d_dropout, quick_test=flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_19 (Embedding)         (None, 1000, 100)     0           embedding_input_19[0][0]         
____________________________________________________________________________________________________
convolution1d_33 (Convolution1D) (None, 996, 128)      64128       embedding_19[0][0]               
____________________________________________________________________________________________________
maxpooling1d_33 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_33[0][0]           
____________________________________________________________________________________________________
flatten_19 (Flatten)             (None, 25472)         0           maxpooling1d_33[0][0]            
___________________________________________________________________________________________

In [47]:
# Two Conv1D + max-pooling stages on top of a frozen embedding layer initialised
# from embedding_matrix, then dropout and one sigmoid output per label.
model_2conv1d_dropout = Sequential(name='model_2conv1d_dropout')
model_2conv1d_dropout.add(
    Embedding(input_dim=nb_words+1,
              output_dim=EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQ_LEN,
              trainable=False))
# first convolution stage
model_2conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_2conv1d_dropout.add(MaxPooling1D(5))
# second convolution stage
model_2conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_2conv1d_dropout.add(MaxPooling1D(5))
# classifier head
model_2conv1d_dropout.add(Flatten())
model_2conv1d_dropout.add(Dropout(p=0.5))
model_2conv1d_dropout.add(Dense(N_LABELS, activation='sigmoid'))

compile_fit_evaluate(model_2conv1d_dropout, quick_test=flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_20 (Embedding)         (None, 1000, 100)     0           embedding_input_20[0][0]         
____________________________________________________________________________________________________
convolution1d_34 (Convolution1D) (None, 996, 128)      64128       embedding_20[0][0]               
____________________________________________________________________________________________________
maxpooling1d_34 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_34[0][0]           
____________________________________________________________________________________________________
convolution1d_35 (Convolution1D) (None, 195, 128)      82048       maxpooling1d_34[0][0]            
___________________________________________________________________________________________

In [48]:
# Three identical Conv1D(128, 5) + MaxPooling1D(5) stages on top of a frozen
# embedding layer initialised from embedding_matrix, then dropout and one
# sigmoid output per label.
model_3conv1d_dropout = Sequential(name='model_3conv1d_dropout')
model_3conv1d_dropout.add(
    Embedding(input_dim=nb_words+1,
              output_dim=EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQ_LEN,
              trainable=False))
# first convolution stage
model_3conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_3conv1d_dropout.add(MaxPooling1D(5))
# second convolution stage
model_3conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_3conv1d_dropout.add(MaxPooling1D(5))
# third convolution stage
model_3conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_3conv1d_dropout.add(MaxPooling1D(5))
# classifier head
model_3conv1d_dropout.add(Flatten())
model_3conv1d_dropout.add(Dropout(p=0.5))
model_3conv1d_dropout.add(Dense(N_LABELS, activation='sigmoid'))

compile_fit_evaluate(model_3conv1d_dropout, quick_test=flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_21 (Embedding)         (None, 1000, 100)     0           embedding_input_21[0][0]         
____________________________________________________________________________________________________
convolution1d_36 (Convolution1D) (None, 996, 128)      64128       embedding_21[0][0]               
____________________________________________________________________________________________________
maxpooling1d_36 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_36[0][0]           
____________________________________________________________________________________________________
convolution1d_37 (Convolution1D) (None, 195, 128)      82048       maxpooling1d_36[0][0]            
___________________________________________________________________________________________

In [49]:
# Three Conv1D stages with shrinking filter counts (128 -> 64 -> 32) and pool
# sizes (5 -> 3 -> 2) on top of a frozen embedding layer initialised from
# embedding_matrix, followed by a 500-unit hidden layer and one sigmoid output
# per label, with dropout before each dense layer.
model_3conv1d_2FC = Sequential(name='model_3conv1d_2FC')
model_3conv1d_2FC.add(
    Embedding(input_dim=nb_words+1,
              output_dim=EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQ_LEN,
              trainable=False))
# first convolution stage
model_3conv1d_2FC.add(Conv1D(128, 5, activation='relu'))
model_3conv1d_2FC.add(MaxPooling1D(5))
# second convolution stage
model_3conv1d_2FC.add(Conv1D(64, 5, activation='relu'))
model_3conv1d_2FC.add(MaxPooling1D(3))
# third convolution stage
model_3conv1d_2FC.add(Conv1D(32, 5, activation='relu'))
model_3conv1d_2FC.add(MaxPooling1D(2))
# two fully connected layers
model_3conv1d_2FC.add(Flatten())
model_3conv1d_2FC.add(Dropout(p=0.5))
model_3conv1d_2FC.add(Dense(500, activation='relu'))
model_3conv1d_2FC.add(Dropout(p=0.5))
model_3conv1d_2FC.add(Dense(N_LABELS, activation='sigmoid'))

compile_fit_evaluate(model_3conv1d_2FC, quick_test=flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_22 (Embedding)         (None, 1000, 100)     0           embedding_input_22[0][0]         
____________________________________________________________________________________________________
convolution1d_39 (Convolution1D) (None, 996, 128)      64128       embedding_22[0][0]               
____________________________________________________________________________________________________
maxpooling1d_39 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_39[0][0]           
____________________________________________________________________________________________________
convolution1d_40 (Convolution1D) (None, 195, 64)       41024       maxpooling1d_39[0][0]            
___________________________________________________________________________________________