In [1]:
import os, sys, time, re
import numpy as np
import pandas as pd
from tqdm import tqdm
import cPickle as pk
np.random.seed(1) # fix the NumPy RNG seed so that runs are reproducible

In [2]:
# paths
PK_FPATH = 'data/processed_data_sidhid.pk' # pickled dict of prepared data (it carries its own 'description' entry)
MODEL_FPATH = './models/1124_model_2embed_2conv1d_2FC.h5' # path of best trained model 
NOTES_DIR = '/local/XW/DATA/MIMIC/noteevents_by_sid_hid/' # in the visible cells only used by the commented-out tokenizer fitting code
# constants
MAX_NB_WORDS = 20000 # top 20k most freq words
MAX_SEQ_LEN = 1000 # `maxlen` passed to pad_sequences before text is fed to the model
N_LABELS = 50
N_SIDHID = 58328 # NOTE(review): the pickle's description mentions 58361 unique (sid,hid) pairs -- confirm which figure is right

### Load data

In [3]:
# load pickled data
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open(PK_FPATH, 'rb') as f: # the context manager closes the file handle once loading is done
    pk_data = pk.load(f)

In [4]:
# training inputs from the pickle; a few rows are used below to try out the embedding extractor
X_train = pk_data['X_train']

In [5]:
# show the human-readable description stored alongside the data
print(pk_data['description'])

This file contains the prepared data for note2vec training, 
* sidhids:     list of the 58361 unique (sid,hid) pairs
* sidhid2icds: mapping from (sid,hid) pair --> set of icd codes
* sidhid2khot: mapping from (sid,hid) pair --> khot-encoding correponding to this sidhid pair
* sidhid2seq:  mapping from (sid,hid) pair --> fix-length sequences (len=1000) of word ids
* word2idx:    mapping from a word to its id used in the sequence
* embedding_w2v／embedding_glove: matrices for the embedding layer (used as the weights parameter)
* train_sidhids/val_sidhids: list of (sid,hid) pairs used as training/validation set
* X_train/Y_train/X_val/Y_val: ndarray generated for training/validation

And here are 2 useful functions' source code: 

def to_khot(sidhid2icds, K=N_LABELS): # generate khot encoding (useful if want to change the K)
    icds = zip( *icd_ctr.most_common(N_LABELS-1) )[0] + ('other',)
    sidhid2khot = {} # map subject_id to k-hot vector
    for sid,hid in sidhid2icds.keys():
       

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [7]:
# ***NOTE***
# To load models from file, Keras' metrics.py under
# `/local/XW/SOFT/anaconda2/envs/thesis_nb/lib/python2.7/site-packages/keras/`
# has to be modified to contain the custom metric function,
# otherwise `load_model` throws an exception!
# cf issue: https://github.com/fchollet/keras/issues/3911
# NOTE(review): `load_model` may accept a `custom_objects` argument in the installed
# Keras version, which would avoid patching site-packages -- TODO confirm.
from keras.models import load_model
model = load_model(MODEL_FPATH)

In [8]:
# print the layer-by-layer summary of the loaded model
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 1000, 200)     0           main_input[0][0]                 
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 1000, 200)     0           main_input[0][0]                 
____________________________________________________________________________________________________
convolution1d_4 (Convolution1D)  (None, 996, 128)      128128      embedding_3[0][0]                
___________________________________________________________________________________________

In [9]:
# Inspect the two tensors used to build the embedding extractor below:
# the model input and the output of layer 11.
print(model.layers[0].input)
print(model.layers[11].output)

Tensor("main_input:0", shape=(?, 1000), dtype=int32)
Tensor("Relu_3:0", shape=(?, 500), dtype=float32)


In [10]:
# Build a backend function that maps (token-id batch, learning phase) to the
# output of layer 11, which is used as the embedding vector.
from keras import backend as K
_embed_inputs = [model.layers[0].input, K.learning_phase()]
_embed_outputs = [model.layers[11].output]
get_embedvec = K.function(_embed_inputs, _embed_outputs)

def embedvec(X):
    """Return the layer-11 output for the batch X, evaluated with learning phase 0."""
    return get_embedvec([X, 0])[0]

In [11]:
# try the extractor on the first 10 training rows (learning phase 0 = test mode)
layer_output = embedvec(X_train[:10])
print(layer_output.shape)

(10, 500)


## Turn a paragraph into a 500-dimensional input vector

In [12]:
# The commented-out code below fits a Tokenizer on all note texts under NOTES_DIR
# and pickles it to data/tokenizer.pk; it is kept as a record of how that file
# was produced. The live code at the bottom only reloads the pickled tokenizer.
# sidhids = []
# texts = [] # text bodies
# for fname in tqdm(os.listdir(NOTES_DIR)): # the data is 3.7G in size, can hold in memory...
#     sid,hid = map( int, fname[:-4].split('_') )
#     sidhids.append( (sid,hid) )
#     fpath = os.path.join(NOTES_DIR, fname)
#     df = pd.read_csv(fpath)
#     texts.append( '\n=======\n\n\n'.join(df['text']) )
# print('found %d texts' % len(texts))

# tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, # filter out numbers, otherwise lots of numbers
#                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+'0123456789') 
# print 'fitting on whole text corpus...',
# tokenizer.fit_on_texts(texts) # this might take some time
# print 'done. '

# pk.dump(tokenizer, open('data/tokenizer.pk', 'wb'), pk.HIGHEST_PROTOCOL)

# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open('data/tokenizer.pk', 'rb') as f:
    tokenizer = pk.load(f)

In [13]:
def paragraph2vec(paragraph):
    """Embed a single paragraph of text.

    The paragraph is converted to word ids with the module-level `tokenizer`,
    passed through `pad_sequences` with maxlen=MAX_SEQ_LEN, and fed to
    `embedvec` as a one-row batch; the result of `embedvec` is returned as is.
    """
    token_ids = tokenizer.texts_to_sequences([paragraph])
    return embedvec(pad_sequences(token_ids, maxlen=MAX_SEQ_LEN))

In [15]:
# sample paragraph used to check the output shape of paragraph2vec
paragraph_sample = ''' The imaged portions of the abdomen show a few [**Last Name (un) 36399**]-filled loops of bowel
   within the left abdomen.  No abnormal soft tissue mass or calcifications.  No
   free interperitoneal air.  The imaged bony structures are unremarkable.'''
paragraph2vec(paragraph_sample).shape

(1, 500)

In [16]:
# Paragraph separator: two newlines, each optionally preceded by a run of
# non-word characters. Written as a raw string so that `\W` is not treated as
# an (invalid) string escape, and compiled once instead of on every call.
_PARAGRAPH_SEP = re.compile(r'\W*\n\W*\n')

def divide_paragraphs(text):
    """Split `text` into a list of paragraphs.

    A paragraph boundary is any match of `_PARAGRAPH_SEP`. Non-word characters
    immediately before the boundary (e.g. a trailing period) are part of the
    separator and therefore removed. Text without a boundary is returned as a
    one-element list.
    """
    return _PARAGRAPH_SEP.split(text)

## Helper functions to extract histvec from query/article

In [17]:
from tqdm import tqdm
from lxml import etree

In [18]:
PMC_PATH = '/local/XW/DATA/TREC/PMCs/'
pmcid2fpath = {}

# Index every file found two directory levels below PMC_PATH.
# The key is the file name with its last 5 characters cut off.
for top_name in os.listdir(PMC_PATH):
    top_dir = os.path.join(PMC_PATH, top_name)
    for sub_name in os.listdir(top_dir):
        leaf_dir = os.path.join(top_dir, sub_name)
        pmcid2fpath.update(
            (fname[:-5], os.path.join(leaf_dir, fname))
            for fname in os.listdir(leaf_dir))

In [19]:
topic_tree = etree.parse('data/topics2016.xml')

def get_query_paragraphs(i):
    """Return the paragraphs of the note of topic number `i`.

    Takes the first text node of the topic's <note> element, lower-cases it
    and splits it with `divide_paragraphs`.
    """
    xpath_expr = '//topic[@number="%d"]/note/text()' % i
    first_note = topic_tree.xpath(xpath_expr)[0]
    lowered = str(first_note).lower()
    return divide_paragraphs(lowered)

In [20]:
# number of paragraphs found in the note of topic 1
len( get_query_paragraphs(1) )

2

In [21]:
def get_article_paragraphs(pmcid):
    """Return a list of texts, one per <p> element of the article.

    The article file is looked up in `pmcid2fpath` and parsed with lxml; only
    <p> elements below the first <body> element are considered, and each is
    reduced to its string value via XPath `string(.)`.
    """
    article_tree = etree.parse(pmcid2fpath[pmcid])
    first_body = article_tree.xpath('//body')[0]
    return [para.xpath('string(.)') for para in first_body.xpath('.//p')]

In [22]:
# get_article_paragraphs('107838')

In [23]:
# Padding entry appended to short queries by pad_query; get_histvec returns an all-zero vector for it.
PARA_PLACEHOLDER = '</s>'
from numpy.linalg import norm

In [26]:
def get_histvec(query_para, pmcid, n_bins=30):
    """Histogram feature of one query paragraph against one article.

    Every paragraph of article `pmcid` is embedded with `paragraph2vec` and
    compared to the embedded query paragraph by cosine similarity. The
    similarities are binned into `n_bins` equal-width bins over [0, 1] and
    the log of (count + 1) per bin is returned.

    Parameters
    ----------
    query_para : str
        Query paragraph, or PARA_PLACEHOLDER for a padding slot.
    pmcid : str
        Key into `pmcid2fpath` identifying the article.
    n_bins : int, optional
        Number of histogram bins (default 30, the width the model expects).

    Returns
    -------
    ndarray of shape (n_bins,)
        All zeros for a placeholder paragraph or an article without paragraphs.
    """
    if query_para == PARA_PLACEHOLDER:
        return np.zeros(n_bins)

    doc_paras = get_article_paragraphs(pmcid)
    if not doc_paras:
        # np.vstack([]) would raise; an empty histogram gives log(0 + 1) = 0 anyway
        return np.zeros(n_bins)

    # paragraph2vec returns a one-row batch; flatten it so that the dot
    # product and the norms below are all 1-D with one entry per paragraph.
    qvec = np.ravel(paragraph2vec(query_para))
    dvecs = np.vstack([paragraph2vec(p.encode('ascii', 'ignore')) for p in doc_paras])

    dots = np.dot(dvecs, qvec)
    # Element-wise denominator. Dividing the (n, 1) dot product by the (n,)
    # norm vector, as done previously, broadcast to an (n, n) matrix and
    # filled the histogram with n*n mostly meaningless values.
    denoms = norm(qvec) * norm(dvecs, axis=1)

    # Cosine similarity is undefined for zero vectors: leave those pairs out
    # instead of producing NaN (which falls outside the histogram range).
    valid = denoms > 0
    cossims = dots[valid] / denoms[valid]
    # Rounding can push an exact match marginally above 1, which would drop
    # it from the histogram range; clamp so that it lands in the last bin.
    cossims = np.minimum(cossims, 1.0)

    hist, _ = np.histogram(cossims, bins=n_bins, range=(0, 1))
    return np.log(hist + 1)

In [32]:
# get_histvec(get_query_paragraphs(1)[1], '107838')

In [28]:
# NOTE(review): these look like scratch values for trying out the helpers interactively;
# `pmcid` is reassigned by the qrels loop further down -- confirm whether this cell is still needed.
query_para = get_query_paragraphs(1)[1] 
pmcid = '107838'
qvec = paragraph2vec(query_para)

In [29]:
def get_query_doc_feature(query, pmcid):
    """Build the feature array of a query against article `pmcid`.

    query: list of paragraphs. One `get_histvec` result is computed per
    paragraph, in order, and the results are stacked into a numpy array.
    """
    hist_rows = []
    for para in query:
        hist_rows.append(get_histvec(para, pmcid))
    return np.array(hist_rows)

In [31]:
# get_query_doc_feature(get_query_paragraphs(1), '107838')

## Prepare data: padded queries, positive and negative histograms

In [33]:
# paragraphs of topics number 1 to 30, one list per topic
QUERIES = []
for topic_number in xrange(1, 31):
    QUERIES.append(get_query_paragraphs(topic_number))

In [34]:
# number of paragraphs per query
print([len(query) for query in QUERIES])

[2, 10, 1, 2, 2, 2, 5, 4, 9, 3, 2, 1, 5, 2, 1, 2, 3, 7, 5, 3, 1, 4, 6, 2, 2, 3, 6, 4, 4, 3]


In [35]:
# N = length (in paragraphs) of the longest query
N = max(len(query) for query in QUERIES)
print(N)

10


In [36]:
def pad_query(q, SZ=N):
    """Return `q` extended with PARA_PLACEHOLDER entries up to length SZ.

    A query that already has SZ or more paragraphs is returned with nothing
    appended (it is not truncated).
    """
    n_missing = SZ - len(q)
    return q + [PARA_PLACEHOLDER] * n_missing

# pad every query to the same number of paragraphs
QUERIES = [pad_query(query) for query in QUERIES]

In [39]:
# pmcid_2relevance[t] maps pmcid -> relevance for topic number t (index 0 stays empty).
pmcid_2relevance = [{} for i in xrange(31)] # list of dict mapping pmcid to relevance
skipped = {} # reason -> number of qrels lines that were left out
with open('data/qrels.txt') as f:
    for line in tqdm(f, total=37707): 
        topicid, _, pmcid, relevance = line.split()
        try:
            # articles without any paragraph cannot yield a histogram: leave them out
            if len( get_article_paragraphs(pmcid) )==0:
                skipped['no paragraphs'] = skipped.get('no paragraphs', 0) + 1
                continue
            topicid = int(topicid)
            pmcid_2relevance[topicid][pmcid] = int(relevance)
        except Exception as err:
            # Best effort: an entry that cannot be processed is skipped, but it is
            # counted by error type instead of being dropped silently.
            # `except Exception` (unlike a bare `except`) still lets
            # KeyboardInterrupt stop this long-running loop.
            reason = type(err).__name__
            skipped[reason] = skipped.get(reason, 0) + 1
print('skipped qrels entries by reason: %s' % skipped)

100%|██████████| 37707/37707 [11:43<00:00, 53.61it/s]


---

## Define model: no gating this time ???

In [40]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, InputLayer, Flatten, Input, Merge, merge, Reshape
import keras.backend as K
from keras.callbacks import EarlyStopping, TensorBoard
import tensorflow as tf

In [42]:
# Bias-free feed-forward network: 30 inputs -> 10 (relu) -> 5 (relu) -> 1 (tanh).
feed_forward = Sequential(name='feed_forward_nw')
feed_forward.add(Dense(input_dim=30, output_dim=10, activation='relu', bias=False))
feed_forward.add(Dense(output_dim=5, activation='relu', bias=False))
feed_forward.add(Dense(output_dim=1, activation='tanh', bias=False))

In [43]:
# ***note: have to wrap ops into Lambda layers !!***
# cf: https://groups.google.com/forum/#!topic/keras-users/fbRS-FkZw_Q
from keras.layers.core import Lambda

# input: hist vectors, one 30-bin histogram per (padded) query paragraph
input_hists = Input(shape=(N,30), name='input_hists')

def slicei(x, i):
    """Select the i-th entry along axis 1 of a 3-D tensor."""
    return x[:,i,:]

def slicei_output_shape(input_shape):
    """Output shape of `slicei`: axis 1 is dropped."""
    return (input_shape[0], input_shape[2])

# Apply the same feed_forward network to every paragraph slice.
# `i=i` binds the current index at definition time, so each Lambda keeps its
# own slice index instead of looking up the loop variable when it is called.
zs_per_paragraph = [
    feed_forward( Lambda(lambda x, i=i: slicei(x,i), slicei_output_shape, name='slice%d'%i)(input_hists) )
    for i in xrange(N) ]

def concat(x):
    """Concatenate a list of tensors along the last axis."""
    return K.concatenate(x)

def concat_output_shape(input_shape):
    """Output shape of `concat` for the N per-paragraph outputs."""
    return (input_shape[0][0], N)

zs = Lambda(concat, concat_output_shape, name='concat_zs')(zs_per_paragraph)

def mean(x):
    """Average over the last axis only, keeping it as a size-1 axis.

    Without `axis`, K.mean also averages over the batch axis and returns a
    single scalar for the whole batch, which contradicts `mean_output_shape`
    and makes every sample share one score.
    """
    return K.mean(x, axis=-1, keepdims=True)

def mean_output_shape(input_shape):
    """Output shape of `mean`: one value per sample."""
    return (input_shape[0],1)

scores = Lambda(mean, mean_output_shape, name='mean')(zs)

scoring_model = Model(input=input_hists, output=scores, name='scoring_model')

In [46]:
# second input -- the negative hists vector
input_hists_neg = Input(shape=(N,30), name='input_hists_neg')

# Same layers as for the positive input, applied to the negative input.
# `i=i` binds the current index at definition time, so each Lambda keeps its
# own slice index instead of looking up the loop variable when it is called.
zs_neg = [ feed_forward( Lambda(lambda x, i=i: slicei(x,i), slicei_output_shape, name='slice%d_neg'%i)(input_hists_neg) )
          for i in xrange(N) ]

zs_neg = Lambda(concat, concat_output_shape, name='concat_zs_neg')(zs_neg)

scores_neg =  Lambda(mean, mean_output_shape, name='mean_neg')(zs_neg)

# model exposing both scores
two_score_model = Model(input=[input_hists, input_hists_neg], 
                        output=[scores, scores_neg], 
                        name='two_score_model')

def diff(x):
    """Return x[0] - x[1] for a pair of tensors.

    The plain operator is used instead of `tf.sub`, which only exists in old
    TensorFlow releases (later renamed `tf.subtract`) and ties the model to
    the TensorFlow backend.
    """
    return x[0] - x[1]

def diff_output_shape(input_shape):
    """Output shape of `diff`: same as the first input."""
    return input_shape[0]

posneg_score_diff = Lambda(diff, diff_output_shape, name='posneg_score_diff')([scores, scores_neg])
# model whose single output is score_pos - score_neg
ranking_model = Model(input=[input_hists,  input_hists_neg]
                      , output=[posneg_score_diff]
                      , name='ranking_model')

# define my loss function: hinge of score_pos - score_neg
def pairwise_hinge(y_true, y_pred):
    """Pairwise hinge loss: mean of max(1 - y_pred, 0).

    y_pred is score_pos - score_neg. y_true is ignored; it is only present
    because Keras passes it to every loss. The floor is the constant 0. rather
    than `y_true*0.0`, so NaN/inf values in the ignored y_true cannot leak
    into the loss.
    """
    return K.mean( K.maximum(1. - y_pred, 0.) )

# self-defined metrics
def ranking_acc(y_true, y_pred):
    """Mean of the indicator `y_pred > 0`; y_true is not used."""
    is_positive = y_pred > 0
    return K.mean(is_positive)

# train with Adagrad on the pairwise hinge loss and report ranking_acc as a metric
ranking_model.compile(optimizer='adagrad', loss=pairwise_hinge, metrics=[ranking_acc])