**In this notebook: run the models with the last label ("others") removed**

## Load data

In [114]:
import os, sys, time
import numpy as np
import pandas as pd
from tqdm import tqdm
import cPickle as pk
np.random.seed(1) # fix numpy's global seed so that runs are reproducible

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Conv2D, MaxPooling2D, Reshape
from keras.models import Sequential
from keras.models import load_model
from keras.utils.np_utils import to_categorical
from keras.callbacks import TensorBoard
from keras.callbacks import Callback, EarlyStopping
import keras.backend as K

In [115]:
# paths
# NOTE(review): NOTE_DATA_DIR and GLOVE_DIR are absolute, machine-specific paths;
# neither is read by the cells visible in this notebook.
NOTE_DATA_DIR = '/local/XW/DATA/MIMIC/noteevents_by_sid/'
ICD_FPATH = 'data/subject_diag_icds.txt'
# pickle that the "Load data" cell reads (embedding matrix + train/validation arrays)
PK_FPATH = 'data/diag_processed_data.pk' # './processed_data_small.pk'
MODEL_PATH = './models/' # trained models are saved here as <model name>.h5
LOG_PATH = './logs/' # TensorBoard logs go to <LOG_PATH>/<mmdd>_<model name>
# constants
N_LABELS = 49 # width of every model's output layer; *** the last "others" label is removed ***
# NOTE(review): this evaluates to 48 although the label matrices keep 49 columns once
# "others" is dropped; it is not used in the visible cells -- confirm the intended value.
K_ICDS_TOKEEP = N_LABELS - 1 # predict only on top K frequent icd codes
N_SUBJECTS = 41886
# word embedding (GloVe) configurations
GLOVE_DIR = '/local/XW/DATA/glove.6B/'
MAX_SEQ_LEN = 1000 # max length of input sequence (pad/truncate to fixed length)
MAX_NB_WORDS = 20000 # top 20k most frequent words
EMBEDDING_DIM = 100 # output dimension of the Embedding layers
# learning configurations
VALIDATION_SPLIT = 0.2
N_EPOCHS = 20 # upper bound on epochs; training also uses early stopping on val_loss
SZ_BATCH = 512 # large batch size ?

In [116]:
# load pickled data
# NOTE: unpickling can execute arbitrary code, so PK_FPATH must only ever point to a
# file produced by this project's own preprocessing.
with open(PK_FPATH, 'rb') as pk_file:  # the context manager closes the file handle once loaded
    pk_data = pk.load(pk_file)
embedding_matrix = pk_data['embedding_matrix']
X_train, Y_train = pk_data['X_train'], pk_data['Y_train']
X_val, Y_val = pk_data['X_val'], pk_data['Y_val']
# The Embedding layers below are built with nb_words + 1 rows.
# NOTE(review): presumably this must equal embedding_matrix.shape[0] -- confirm if the pickle is regenerated.
nb_words = MAX_NB_WORDS # forgot to pickle this number...

In [117]:
# Found one training row whose labels are ALL 0 (strange?) -- drop every such row.
# The rows are located with a mask instead of a hard-coded row index, so the cell is
# idempotent: re-running it (or loading a different pickle) never deletes a valid sample.
train_label_counts = np.sum(Y_train, axis=1)
# smallest number of positive labels per sample, training set then validation set
print('%s %s' % (np.min(train_label_counts), np.min(np.sum(Y_val, axis=1))))
# index of the (first) training sample with the fewest positive labels
print(np.argmin(train_label_counts))
has_label = train_label_counts > 0
X_train, Y_train = X_train[has_label], Y_train[has_label]
print('%s %s' % (X_train.shape, Y_train.shape))

0.0 1.0
11730
(36916, 1000) (36916, 50)


In [118]:
# *** remove last column ("others") of Y_train and Y_val ***
# Keep the first N_LABELS columns instead of "everything but the last one": on freshly
# loaded data the result is the same, but re-running this cell can no longer strip a
# real label column.
Y_train = Y_train[:, :N_LABELS]
Y_val = Y_val[:, :N_LABELS]
print('%s %s' % (Y_train.shape, Y_val.shape))

(36916, 49) (9229, 49)


In [119]:
# Number of positive labels of each training sample. Candidate sample weight:
# samples carrying more 1s would be weighted more heavily.
y_n_poslabels = np.sum(Y_train, axis=1)

In [133]:
# exploratory peek: per-sample positive-label count raised to the power 1.5 (displayed only, not stored)
y_n_poslabels**(1.5)

array([ 22.627417  ,   5.19615242,   5.19615242, ...,   5.19615242,
         5.19615242,   1.        ])

In [134]:
# Per-label weight: (column sum of Y_train for that label) ** -1.5, scaled by 1e6.
# The negative exponent makes the weight larger for labels with a smaller column sum.
inv_freq = np.power(Y_train.sum(axis=0), -1.5) * 1e6
inv_freq

array([  0.60232587,   1.25496374,   1.36692973,   1.43641759,
         2.06435057,   2.15704779,   2.22239335,   2.62282387,
         3.22839241,   3.24966444,   3.437979  ,   3.64780256,
         3.67914395,   4.02508074,   4.86485374,   5.02411415,
         5.28468609,   6.04044627,   6.46337026,   6.71336902,
         6.64585067,   7.05709269,   7.24824906,   7.81638511,
         8.71124363,   9.04224537,   8.78927112,   9.1492535 ,
         9.52807083,   9.70417161,   9.52165102,   9.59916762,
         9.85850607,  10.18691224,  10.94959546,  11.01470692,
        12.10316638,  12.54696038,  12.52666333,  12.40602183,
        12.59794352,  12.9325765 ,  13.2377194 ,  14.10470332,
        15.02935251,  15.02935251,  15.08443868,  15.75776903,  15.63965989])

In [135]:
# Per-sample training weight: row-wise sum of Y_train scaled by the per-label weights
# (for 0/1 targets this is the sum of the weights of the sample's positive labels).
# Variant left disabled: additionally add the row-wise maximum, (inv_freq * Y_train).max(axis=1)
sample_weight = np.sum(Y_train * inv_freq, axis=1)

In [136]:
# display the per-sample weights that are later passed to model.fit
sample_weight

array([ 50.89430951,  25.72348956,  16.54614951, ...,  13.33349412,
        13.33349412,   1.25496374])

## Define evaluation metrics

**NB:** these metrics are a continuous relaxation of what we really want, so the accuracy reported during training is not exact.

In [137]:
def multlabel_prec(y_true, y_pred):
    """Mean per-sample precision of multi-label predictions.

    Predictions are clipped to [0, 1] and rounded to hard 0/1 decisions. For each
    sample, the sum of `y_true * y_pred` over the last axis is divided by the number
    of predicted labels; the ratios are then averaged.

    Args:
        y_true: backend tensor of targets (assumed 0/1, labels on the last axis -- TODO confirm).
        y_pred: backend tensor of predicted scores with the same shape as `y_true`.

    Returns:
        Scalar backend tensor. A sample with no predicted label contributes 0.
    """
    y_pred = K.round(K.clip(y_pred, 0, 1)) # turn to 0/1
    tp = K.sum(y_true * y_pred, axis=-1)
    sum_pred = K.sum(y_pred, axis=-1)
    return K.mean(tp/(sum_pred+1e-10)) # the small constant avoids a NaN precision (0/0)
    
def multlabel_recall(y_true, y_pred):
    """Mean per-sample recall of multi-label predictions.

    Predictions are clipped to [0, 1] and rounded to hard 0/1 decisions. For each
    sample, the sum of `y_true * y_pred` over the last axis is divided by the sum of
    `y_true`; the ratios are then averaged.

    Args:
        y_true: backend tensor of targets (assumed 0/1, labels on the last axis -- TODO confirm).
        y_pred: backend tensor of predicted scores with the same shape as `y_true`.

    Returns:
        Scalar backend tensor. A sample whose targets sum to 0 contributes 0.
    """
    y_pred = K.round(K.clip(y_pred, 0, 1)) # turn to 0/1
    tp = K.sum(y_true * y_pred, axis=-1)
    sum_true = K.sum(y_true, axis=-1)
    return K.mean(tp/(sum_true+1e-10)) # the small constant avoids a NaN recall (0/0)

def multlabel_F1(y_true, y_pred):
    """Mean per-sample F1 score of multi-label predictions.

    Per sample: 2 * sum(y_true * hard_pred) / (sum(y_true) + sum(hard_pred)), where
    hard_pred is `y_pred` clipped to [0, 1] and rounded; sums run over the last axis.
    A tiny constant in the denominator prevents a 0/0 NaN.

    Returns:
        Scalar backend tensor (mean over all samples).
    """
    hard_pred = K.round(K.clip(y_pred, 0, 1))
    n_hits = K.sum(y_true * hard_pred, axis=-1)
    n_true = K.sum(y_true, axis=-1)
    n_pred = K.sum(hard_pred, axis=-1)
    per_sample = n_hits / (n_true + n_pred + 1e-10)
    return 2 * K.mean(per_sample)

def multlabel_acc(y_true, y_pred):
    """Mean per-sample intersection-over-union between targets and hard predictions.

    Predictions are clipped to [0, 1] and rounded. The intersection is the sum of
    `y_true * hard_pred` over the last axis; the union is the sum of
    `y_true + hard_pred` clipped to [0, 1]. A tiny constant in the denominator
    prevents a 0/0 NaN.

    Returns:
        Scalar backend tensor (mean over all samples).
    """
    hard_pred = K.round(K.clip(y_pred, 0, 1))
    n_intersect = K.sum(y_true * hard_pred, axis=-1)
    n_union = K.sum(K.clip(y_true + hard_pred, 0, 1), axis=-1)
    return K.mean(n_intersect / (n_union + 1e-10))

In [138]:
def evaluate_model(model):
    """Print `model.evaluate` results on the training set, then on the validation set.

    Reads the notebook globals X_train / Y_train and X_val / Y_val; both evaluations
    use a batch size of 128.
    """
    print('evaluation on training set:')
    train_scores = model.evaluate(X_train, Y_train, batch_size=128)
    print(train_scores)

    print('evaluation on validation set:')
    val_scores = model.evaluate(X_val, Y_val, batch_size=128)
    print(val_scores)

# wraps up operations on models
def compile_fit_evaluate(model, quick_test=False, print_summary=True,
                         save_log=True, save_model=True, del_model=False):
    """Compile, train, evaluate and optionally save a Keras model.

    Relies on notebook globals: X_train / Y_train / X_val / Y_val, sample_weight,
    N_EPOCHS, SZ_BATCH, LOG_PATH, MODEL_PATH and the four `multlabel_*` metrics.

    Args:
        model: Keras model to train; `model.name` is used for the log directory
            and for the saved file name.
        quick_test: if truthy, only fit 1 epoch on the first 100 training samples
            and return right away (no evaluation, no logging, no saving).
        print_summary: if truthy, print the layer summary after compiling.
        save_log: if truthy, write TensorBoard logs to
            <LOG_PATH>/<mmdd>_<model name> (created if missing).
        save_model: if truthy, save the trained model to <MODEL_PATH>/<model name>.h5.
        del_model: if truthy, drop this function's reference to the model at the end.
            This frees nothing while the caller still holds its own reference.

    Returns:
        None.
    """
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=[multlabel_prec, multlabel_recall, multlabel_F1, multlabel_acc])
    if print_summary:
        # summary() prints the table itself; printing its return value (None)
        # only appended a stray "None" line.
        model.summary()

    if quick_test: # use tiny data for quick test
        print('(quick test mode)')
        model.fit(X_train[:100], Y_train[:100], nb_epoch=1)
        return

    # stop training once val_loss has not improved for 2 consecutive epochs
    _callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
    if save_log:
        logdir = os.path.join(LOG_PATH, time.strftime('%m%d') + '_' + str(model.name))
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        _callbacks.append(TensorBoard(log_dir=logdir))
        print('run "tensorboard --logdir=%s" to launch tensorboard' % logdir)

    # Create the output directory *before* training: a missing directory would
    # otherwise only surface in model.save(), after the whole training run.
    if save_model and not os.path.exists(MODEL_PATH):
        os.makedirs(MODEL_PATH)

    model.fit(X_train, Y_train,
              validation_data=(X_val, Y_val),
              nb_epoch=N_EPOCHS, batch_size=SZ_BATCH,
              sample_weight=sample_weight,
              callbacks=_callbacks)

    print('evaluating model...')
    evaluate_model(model)

    if save_model:
        model_fpath = os.path.join(MODEL_PATH, '%s.h5' % str(model.name))
        model.save(model_fpath)

    if del_model:
        # Only removes the local name; the caller's variable keeps the model alive.
        del model

In [139]:
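# ''' ***NOTE***
# To load models from file, we had to modify metrics.py at:
# `/local/XW/SOFT/anaconda2/envs/thesis_nb/lib/python2.7/site-packages/keras`
# and add the `multlabel_XXX` functions there; otherwise `load_model` throws an exception!
#
# cf issue: https://github.com/fchollet/keras/issues/3911
#
# NOTE(review): `load_model` also takes a `custom_objects` argument meant for
# user-defined functions -- check whether the installed Keras version accepts the
# custom metrics that way before patching site-packages.
# '''
# m = load_model(os.path.sep.join([MODEL_PATH, 'model_1conv1d.h5']))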

## Try different models

In [140]:
# Passed as `quick_test` to compile_fit_evaluate: a truthy value only fits 1 epoch on
# the first 100 training samples (smoke test); set to False/0 to run on whole data.
flag_quick_test = 0

In [141]:
# 1 conv1d layer + 2 fully connected layers:
# frozen embedding -> Conv1D/MaxPooling1D -> dropout -> Dense(500) -> dropout -> sigmoid outputs
model_1conv1d_2FC = Sequential(name='model_1conv1d_2FC')
model_1conv1d_2FC.add(Embedding(input_dim=nb_words+1, output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix], input_length=MAX_SEQ_LEN,
                                trainable=False))  # keep the embeddings fixed
model_1conv1d_2FC.add(Conv1D(128, 5, activation='relu'))
model_1conv1d_2FC.add(MaxPooling1D(5))
model_1conv1d_2FC.add(Flatten())
model_1conv1d_2FC.add(Dropout(p=0.5))
model_1conv1d_2FC.add(Dense(500, activation='relu'))
model_1conv1d_2FC.add(Dropout(p=0.5))
model_1conv1d_2FC.add(Dense(N_LABELS, activation='sigmoid'))  # one independent sigmoid per label

compile_fit_evaluate(model_1conv1d_2FC, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_38 (Embedding)         (None, 1000, 100)     0           embedding_input_38[0][0]         
____________________________________________________________________________________________________
convolution1d_74 (Convolution1D) (None, 996, 128)      64128       embedding_38[0][0]               
____________________________________________________________________________________________________
maxpooling1d_74 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_74[0][0]           
____________________________________________________________________________________________________
flatten_38 (Flatten)             (None, 25472)         0           maxpooling1d_74[0][0]            
___________________________________________________________________________________________

In [142]:
# 2 conv1d layers, dropout, then the sigmoid output layer directly
model_2conv1d_dropout = Sequential(name='model_2conv1d_dropout')
model_2conv1d_dropout.add(Embedding(input_dim=nb_words+1, output_dim=EMBEDDING_DIM,
                                    weights=[embedding_matrix], input_length=MAX_SEQ_LEN,
                                    trainable=False))  # embedding weights stay frozen
model_2conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_2conv1d_dropout.add(MaxPooling1D(5))
model_2conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_2conv1d_dropout.add(MaxPooling1D(5))
model_2conv1d_dropout.add(Flatten())
model_2conv1d_dropout.add(Dropout(p=0.5))
model_2conv1d_dropout.add(Dense(N_LABELS, activation='sigmoid'))  # one independent sigmoid per label

compile_fit_evaluate(model_2conv1d_dropout, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_39 (Embedding)         (None, 1000, 100)     0           embedding_input_39[0][0]         
____________________________________________________________________________________________________
convolution1d_75 (Convolution1D) (None, 996, 128)      64128       embedding_39[0][0]               
____________________________________________________________________________________________________
maxpooling1d_75 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_75[0][0]           
____________________________________________________________________________________________________
convolution1d_76 (Convolution1D) (None, 195, 128)      82048       maxpooling1d_75[0][0]            
___________________________________________________________________________________________

In [143]:
# 2 conv1d layers followed by 2 fully connected layers (Dense(500) + sigmoid outputs)
model_2conv1d_2FC = Sequential(name='model_2conv1d_2FC')
model_2conv1d_2FC.add(Embedding(input_dim=nb_words+1, output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix], input_length=MAX_SEQ_LEN,
                                trainable=False))  # embedding weights stay frozen
model_2conv1d_2FC.add(Conv1D(128, 5, activation='relu'))
model_2conv1d_2FC.add(MaxPooling1D(5))
model_2conv1d_2FC.add(Conv1D(128, 5, activation='relu'))
model_2conv1d_2FC.add(MaxPooling1D(5))
model_2conv1d_2FC.add(Flatten())
model_2conv1d_2FC.add(Dropout(p=0.5))
model_2conv1d_2FC.add(Dense(500, activation='relu'))
model_2conv1d_2FC.add(Dropout(p=0.5))
model_2conv1d_2FC.add(Dense(N_LABELS, activation='sigmoid'))  # one independent sigmoid per label

compile_fit_evaluate(model_2conv1d_2FC, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_40 (Embedding)         (None, 1000, 100)     0           embedding_input_40[0][0]         
____________________________________________________________________________________________________
convolution1d_77 (Convolution1D) (None, 996, 128)      64128       embedding_40[0][0]               
____________________________________________________________________________________________________
maxpooling1d_77 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_77[0][0]           
____________________________________________________________________________________________________
convolution1d_78 (Convolution1D) (None, 195, 128)      82048       maxpooling1d_77[0][0]            
___________________________________________________________________________________________

In [144]:
# 3 conv1d layers (128 filters each, pooling size 5), dropout, then the sigmoid output layer
model_3conv1d_dropout = Sequential(name='model_3conv1d_dropout')
model_3conv1d_dropout.add(Embedding(input_dim=nb_words+1, output_dim=EMBEDDING_DIM,
                                    weights=[embedding_matrix], input_length=MAX_SEQ_LEN,
                                    trainable=False))  # embedding weights stay frozen
model_3conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_3conv1d_dropout.add(MaxPooling1D(5))
model_3conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_3conv1d_dropout.add(MaxPooling1D(5))
model_3conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_3conv1d_dropout.add(MaxPooling1D(5))
model_3conv1d_dropout.add(Flatten())
model_3conv1d_dropout.add(Dropout(p=0.5))
model_3conv1d_dropout.add(Dense(N_LABELS, activation='sigmoid'))  # one independent sigmoid per label

compile_fit_evaluate(model_3conv1d_dropout, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_41 (Embedding)         (None, 1000, 100)     0           embedding_input_41[0][0]         
____________________________________________________________________________________________________
convolution1d_79 (Convolution1D) (None, 996, 128)      64128       embedding_41[0][0]               
____________________________________________________________________________________________________
maxpooling1d_79 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_79[0][0]           
____________________________________________________________________________________________________
convolution1d_80 (Convolution1D) (None, 195, 128)      82048       maxpooling1d_79[0][0]            
___________________________________________________________________________________________

In [145]:
# 3 conv1d layers with shrinking filter counts (128 -> 64 -> 32) and pooling sizes
# (5 -> 3 -> 2), followed by 2 fully connected layers (Dense(500) + sigmoid outputs)
model_3conv1d_2FC = Sequential(name='model_3conv1d_2FC')
model_3conv1d_2FC.add(Embedding(input_dim=nb_words+1, output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix], input_length=MAX_SEQ_LEN,
                                trainable=False))  # embedding weights stay frozen
model_3conv1d_2FC.add(Conv1D(128, 5, activation='relu'))
model_3conv1d_2FC.add(MaxPooling1D(5))
model_3conv1d_2FC.add(Conv1D(64, 5, activation='relu'))
model_3conv1d_2FC.add(MaxPooling1D(3))
model_3conv1d_2FC.add(Conv1D(32, 5, activation='relu'))
model_3conv1d_2FC.add(MaxPooling1D(2))
model_3conv1d_2FC.add(Flatten())
model_3conv1d_2FC.add(Dropout(p=0.5))
model_3conv1d_2FC.add(Dense(500, activation='relu'))
model_3conv1d_2FC.add(Dropout(p=0.5))
model_3conv1d_2FC.add(Dense(N_LABELS, activation='sigmoid'))  # one independent sigmoid per label

compile_fit_evaluate(model_3conv1d_2FC, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_42 (Embedding)         (None, 1000, 100)     0           embedding_input_42[0][0]         
____________________________________________________________________________________________________
convolution1d_82 (Convolution1D) (None, 996, 128)      64128       embedding_42[0][0]               
____________________________________________________________________________________________________
maxpooling1d_82 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_82[0][0]           
____________________________________________________________________________________________________
convolution1d_83 (Convolution1D) (None, 195, 64)       41024       maxpooling1d_82[0][0]            
___________________________________________________________________________________________