In [1]:
import os, sys, time
import numpy as np
import pandas as pd
from tqdm import tqdm
import cPickle as pk

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [14]:
# Seed NumPy's global RNG so the np.random.shuffle call later in this notebook is repeatable.
np.random.seed(1) 
import settings
# Re-execute the settings module so edits to it are picked up without restarting the kernel.
reload(settings)
# NOTE(review): the star import presumably supplies the UPPER_CASE constants used below
# (ICD_FPATH, N_LABELS, NOTES_DIR, MAX_NB_WORDS, ...), none of which are defined in this
# notebook -- confirm against settings.py.
from settings import * 

## 1. Prepare Y: k-hot encoding (`sidhid2khot`)

In [5]:
# Parse the ICD file into per-stay code sets plus a corpus-wide code frequency table.
from collections import Counter

sidhid2icds = {}     # dict[(int,int), set(str)]: (subject_id, hadm_id) ---> icd codes of that stay
icd_ctr = Counter()  # icd code ---> number of times it was listed in the file
# every record is "<sid>,<hid>,<whitespace-separated icd codes>"
with open(ICD_FPATH) as icd_file:
    for record in icd_file:
        sid_field, hid_field, codes_field = record.split(',')
        stay_key = (int(sid_field), int(hid_field))
        codes = codes_field.split()
        icd_ctr.update(codes)  # counts the raw list, so a code repeated in a record counts twice
        sidhid2icds[stay_key] = set(codes)

def to_khot(sidhid2icds, K=N_LABELS):
    '''generate khot encoding dict 
    * sidhid2icds is a dict[(int,int), set(str)], maps (sid,hid) pair to all icd codes for this patient/stay
    * keep top K-1 most freq icd codes (plus one 'other' label) 
    returns `sidhid2khot:dict[(int,int), np.array]`, that maps (sid,hid) to a khot encoding vector 
    returns the topicds as well 
    '''
    topicds = zip( *icd_ctr.most_common(K-1) )[0] + ('other',)
    # now turn each subject into a k-hot vector
    sidhid2khot = {} # map subject_id to k-hot vector
    for sid,hid in sidhid2icds.keys():
        _khot = np.zeros(K)
        for _icd in sidhid2icds[(sid,hid)]:
            if _icd in topicds: 
                _khot[topicds.index(_icd)] = 1
            else: # label 'other icds'
                _khot[-1] = 1
        if sum(_khot) == 0: print 'strange case: ', (sid,hid)
        sidhid2khot[(sid,hid)] = _khot
    return sidhid2khot, topicds 
# Encode every stay's icd codes as a length-N_LABELS vector; topicds gives the label of each slot.
sidhid2khot, topicds = to_khot(sidhid2icds, K=N_LABELS)

In [6]:
# print topicds

In [7]:
# print sidhid2icds[(73816, 159209)]

In [9]:
# sidhid2khot[(73816, 159209)]

## 2. Prepare X: turn notes into fixed-length sequences of word ids (`sidhid2seq`)

In [10]:
sidhids = []
texts = [] # text bodies
for fname in tqdm(os.listdir(NOTES_DIR)): # the data is 3.7G in size, can hold in memory...
    sid,hid = map( int, fname[:-4].split('_') )
    sidhids.append( (sid,hid) )
    fpath = os.path.join(NOTES_DIR, fname)
    df = pd.read_csv(fpath)
    texts.append( '\n=======\n\n\n'.join(df['text']) )
print('found %d texts' % len(texts))

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, # filter out numbers, otherwise lots of numbers
                     filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+'0123456789') 
print 'fitting on whole text corpus...',
tokenizer.fit_on_texts(texts) # this might take some time
print 'done. '

seqs = tokenizer.texts_to_sequences(texts) # turn article into seq of ids
word2idx = tokenizer.word_index # dictionary mapping words (str) ---> their index (int)

print 'found %s unique tokens, use most frequent %d of them'%(len(word2idx), MAX_NB_WORDS)

# print sorted(word2idx.items(), key=lambda (k,v): v)[:100] # TODO: remove stopwords
print 'padding sequences...',
seqs_padded = pad_sequences(seqs, maxlen=MAX_SEQ_LEN) # turn into fix-length sequences
print 'done.'

sidhid2seq = {}
for (sid,hid), seq in zip(sidhids,seqs_padded):
    sidhid2seq[(sid,hid)] = seq

del texts

100%|██████████| 58328/58328 [02:54<00:00, 334.23it/s]


found 58328 texts
fitting on whole text corpus... done. 
found 356391 unique tokens, use most frequent 100000 of them


## 3. Prepare embedding matrix

In [11]:
# build index mapping: map word to its vector
word2vec = {} # maps word ---> embedding vector
with open(W2V_FPATH) as f:
    for line in tqdm(f, total=5443657):
        vals = line.split()
        word = vals[0]
        if word in word2idx or word=='</s>':
            word2vec[word] = np.asarray(vals[1:], dtype='float')
print 'found %d word vectors.' % len(word2vec)

nb_words = min(MAX_NB_WORDS, len(word2idx))
embedding_w2v = np.zeros( (nb_words+1, EMBEDDING_DIM) ) # +1 because ids in sequences starts from 1 ?
for word,wd_id in word2idx.items(): 
    if wd_id > MAX_NB_WORDS or word not in word2vec: # there might be 0 rows in embedding matrix
        continue # word_id>MAX_NB_WORDS, this id is not in the generated sequences, discard
    embedding_w2v[wd_id,:] = word2vec[word]

100%|██████████| 5443657/5443657 [01:14<00:00, 72777.92it/s]


found 86588 word vectors.


In [12]:
glove = {} # maps word ---> embedding vector
with open(GLOVE_FPATH) as f:
    for line in tqdm(f, total=400000):
        vals = line.split()
        word = vals[0]
        if word in word2idx or word=='</s>':
            glove[word] = np.asarray(vals[1:], dtype='float')
print 'found %d word vectors.' % len(glove)

embedding_glove = np.zeros( (nb_words+1, EMBEDDING_DIM) ) # +1 because ids in sequences starts from 1 ?
for word,wd_id in word2idx.items(): 
    if wd_id > MAX_NB_WORDS or word not in glove: # there might be 0 rows in embedding matrix
        continue # word_id>MAX_NB_WORDS, this id is not in the generated sequences, discard
    embedding_glove[wd_id,:] = glove[word]

100%|██████████| 400000/400000 [00:07<00:00, 53594.01it/s]


found 50708 word vectors.


## 4. Split data

We now have all (sid,hid) pairs in the list `sidhids`. For each (sid,hid) pair, the dict `sidhid2seq` gives the padded sequence of word ids and the dict `sidhid2khot` gives the k-hot label vector. To get the full set of ICD codes (instead of the k-hot representation), use the dict `sidhid2icds`.

In [13]:
# split data: shuffle the positions of `sidhids`, then hold out the last
# VALIDATION_SPLIT fraction of the shuffled order for validation
n_total = len(sidhids)
indices = np.arange(n_total)
np.random.shuffle(indices)
validset_sz = int(VALIDATION_SPLIT*n_total)
# slice with an explicit boundary: `[:-validset_sz]` would return an empty training set
# (and `[-validset_sz:]` the whole list) when validset_sz is 0
n_train = n_total - validset_sz
# index through the shuffled positions so the split is actually random, not listing order
train_sidhids = [sidhids[i] for i in indices[:n_train]]
val_sidhids = [sidhids[i] for i in indices[n_train:]]

def getXY(sidhid_lst, sidhid2seq=sidhid2seq, sidhid2khot=sidhid2khot):
    '''Stack the inputs and labels for a list of (sid,hid) pairs.

    * sidhid_lst is the list of (sid,hid) pairs to look up, in output row order
    * sidhid2seq / sidhid2khot are the lookup dicts (default: the module-level ones,
      bound when this function is defined)
    returns `(X, Y)`: np.array of the looked-up sequences and np.array of the looked-up
    khot vectors, row i of both belonging to sidhid_lst[i]
    '''
    # look up both values per pair in a single pass
    pairs = [(sidhid2seq[key], sidhid2khot[key]) for key in sidhid_lst]
    X = np.array([seq for seq, _ in pairs])
    Y = np.array([khot for _, khot in pairs])
    return X,Y

# Build the training arrays and print their shapes as a quick sanity check.
X_train, Y_train = getXY(train_sidhids)
print X_train.shape, Y_train.shape
# Same for the validation pairs.
X_val, Y_val = getXY(val_sidhids)
print X_val.shape, Y_val.shape

(46663, 1000) (46663, 50)
(11665, 1000) (11665, 50)


## 5. Dump to pk file

In [15]:
# Human-readable summary stored inside the pickle next to the data.
# The number of (sid,hid) pairs is filled in from `sidhids` rather than hard-coded, so the
# text cannot drift from the data. The text must not contain any other '%' character.
description = '''This file contains the prepared data for note2vec training, 
* sidhids:     list of the %d unique (sid,hid) pairs
* sidhid2icds: mapping from (sid,hid) pair --> set of icd codes
* sidhid2khot: mapping from (sid,hid) pair --> khot-encoding correponding to this sidhid pair
* sidhid2seq:  mapping from (sid,hid) pair --> fix-length sequences (len=1000) of word ids
* tokenizer: the tokenizer fit on corpus, toeknizer.word_index maps from a word to its idx used in the sequence
* embedding_w2v／embedding_glove: matrices for the embedding layer (used as the weights parameter)
* train_sidhids/val_sidhids: list of (sid,hid) pairs used as training/validation set
* X_train/Y_train/X_val/Y_val: ndarray generated for training/validation

And here are 2 useful functions' source code: 

def to_khot(sidhid2icds, K=N_LABELS): # generate khot encoding (useful if want to change the K)
    icds = zip( *icd_ctr.most_common(K-1) )[0] + ('other',)
    sidhid2khot = {} # map subject_id to k-hot vector
    for sid,hid in sidhid2icds.keys():
        _khot = np.zeros(K)
        for _icd in sidhid2icds[(sid,hid)]:
            if _icd in icds: 
                _khot[icds.index(_icd)] = 1
            else: # label 'other icds'
                _khot[-1] = 1
        sidhid2khot[(sid,hid)] = _khot
    return sidhid2khot

def getXY(sidhid_lst): # give a list of (sid, hid) pairs, generate the X and Y
    data, labels = [], []
    for sidhid in sidhid_lst:
        data.append(sidhid2seq[sidhid])
        labels.append(sidhid2khot[sidhid])
    X = np.array(data)
    Y = np.array(labels)
    return X,Y
''' % len(sidhids)

data_to_pickle = {
    'description'     : description,
    'sidhids'         : sidhids,
    'sidhid2icds'     : sidhid2icds,
    'sidhid2khot'     : sidhid2khot,
    'sidhid2seq'      : sidhid2seq,
    'tokenizer'       : tokenizer,
    'embedding_w2v'   : embedding_w2v,
    'embedding_glove' : embedding_glove,
    'train_sidhids'   : train_sidhids,
    'val_sidhids'     : val_sidhids,
    'X_train'         : X_train,
    'Y_train'         : Y_train,
    'X_val'           : X_val,
    'Y_val'           : Y_val,
}
with open(PK_FPATH, 'wb') as fout:
    pk.dump(data_to_pickle, fout, pk.HIGHEST_PROTOCOL)
print 'processed data is written into %s' % PK_FPATH

processed data is written into ../data/CNN_embedding_preprocessed.pk
