## Generating the raw feature matrix Xraw from (semtype, cui) pairs, as k-hot encoded rows

# Getting X_raw

In [1]:
import os, sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from pprint import pprint
import cPickle as pk
np.random.seed(1)  # fix NumPy's global RNG seed so the shuffles below are reproducible

In [2]:
# Input directories on the local machine.
# NOTE_DATA_DIR is defined here but not read by any cell visible in this notebook.
NOTE_DATA_DIR = '/local/XW/DATA/MIMIC/noteevents_by_sid/'
# Directory of pickles named "<sid>.pk"; read by the cells below.
UMLS_DATA_DIR = '/local/XW/DATA/MIMIC/UMLS_by_sid/'

In [3]:
'''
in UMLS_DATA_DIR, each .pk file contains a list, each element in list corresponds to a noteevent

each noteevent is represented as a list of lists of dicts (one list per sentence?), 
the concepts are stored in the dicts

CUI: The Concept Unique Identifier
semtype: Semantic Type - One of the broad categories 

see: https://www.nlm.nih.gov/research/umls/new_users/online_learning/glossary.html
'''
# example:
# with open(os.path.join(UMLS_DATA_DIR, '2.pk')) as f:
#     example = pk.load(f)
#     pprint(example,indent=2, depth=3)
#     pprint(example[0][0])

'\nin UMLS_DATA_DIR, each .pk file contains a list, each element in list corresponds to a noteevent\n\neach noteevent is represented as a list of lists of dicts (one list per sentence?), \nthe concepts are stored in the dicts\n\nCUI: The Concept Unique Identifier\nsemtype: Semantic Type - One of the broad categories \n\nsee: https://www.nlm.nih.gov/research/umls/new_users/online_learning/glossary.html\n'

In [4]:
# Number of entries in UMLS_DATA_DIR; the loops below treat every entry as one "<sid>.pk" pickle.
len(os.listdir(UMLS_DATA_DIR))

46146

In [5]:
unique_cui_semtype_pair = set()
for fn in tqdm(os.listdir(UMLS_DATA_DIR)[:]):
    fpath = os.path.join(UMLS_DATA_DIR, fn)
    with open(fpath) as f:
        concepts_per_sid = pk.load(f)
        for concepts_per_note in concepts_per_sid:
            for concept_per_sentence in concepts_per_note:
                for concept in concept_per_sentence:
                    cui = concept['cui']
                    for st in concept['semtypes']:
                        unique_cui_semtype_pair.add((st,cui))
print len(unique_cui_semtype_pair)

100%|██████████| 46146/46146 [40:52<00:00, 18.82it/s]

69574





In [6]:
# Give every distinct (semtype, cui) pair -- a "concept" -- its own column id,
# numbered 0..N-1 in the iteration order of the set.
concept2id = {pair: idx for idx, pair in enumerate(unique_cui_semtype_pair)}

In [7]:
# For every pickle in UMLS_DATA_DIR, gather the distinct concept ids it contains.
X = []     # X[r]: list of distinct concept ids for the subject stored in row r
sids = []  # sids[r]: integer subject id parsed from the file name of row r
for fn in tqdm(os.listdir(UMLS_DATA_DIR)):
    # File names are expected to look like "<sid>.pk"; strip the extension.
    sids.append(int(os.path.splitext(fn)[0]))
    fpath = os.path.join(UMLS_DATA_DIR, fn)
    # Binary mode for pickle data; the file is closed right after loading.
    with open(fpath, 'rb') as f:
        concepts_per_sid = pk.load(f)
    concept_ids = set()  # a set, so a concept repeated across notes counts once
    for concepts_per_note in concepts_per_sid:
        for concept_per_sentence in concepts_per_note:
            for concept in concept_per_sentence:
                cui = concept['cui']
                for st in concept['semtypes']:
                    # Look the pair up directly instead of rebinding the
                    # loop variable `concept` while it is being iterated.
                    concept_ids.add(concept2id[(st, cui)])
    X.append(list(concept_ids))

100%|██████████| 46146/46146 [1:12:11<00:00, 10.65it/s]


In [8]:
# turn X into sparse matrix
from scipy.sparse import csr_matrix
data, rows, cols = [], [], []
for r in xrange(len(X)):
    for c in X[r]:
        rows.append(r)
        cols.append(c)
        data.append(1.0)
X = csr_matrix((data, (rows, cols)))
print X.shape
sids = np.array(sids)
print sids

(46146, 69574)
[    2     3     4 ..., 99992 99995 99999]


In [9]:
# Every stored entry is 1.0, so a row sum is the number of distinct concepts of that subject.
X[0].sum(), X[1].sum()  # 1st and 2nd rows, which correspond to sid=2 and sid=3

(90.0, 1419.0)

In [10]:
# Shuffle X's rows. sids is permuted with the same index so that sids[r]
# still names the subject stored in row r of X.
shuffle_index = np.arange(X.shape[0])
np.random.shuffle(shuffle_index)  # in-place; uses NumPy's global RNG (seeded at the top)
X = X[shuffle_index]
sids = sids[shuffle_index]

In [11]:
# Lookup table: sid (int) -> row index of that subject in the (shuffled)
# raw feature matrix X.
sid2rowidx = {sid: rowidx for rowidx, sid in enumerate(sids)}

In [12]:
# Sanity check of the sid -> row mapping: the row sums for sid=2 and sid=3
# should equal the ones seen for rows 0 and 1 before the shuffle.
X[sid2rowidx[2]].sum(), X[sid2rowidx[3]].sum()

(90.0, 1419.0)

In [13]:
# OUT_FILENAME = './data/umls_raw_features.pk'
# data_to_pickle ={
#     'X_raw': X,
#     'unique_concepts': unique_concepts,
#     'unique_semtypes': unique_semtypes,
#     'concept2id': concept2id,
#     'sid2rowidx': sid2rowidx 
# }
# with open(OUT_FILENAME,'wb') as f:
#     pk.dump(data_to_pickle, f, pk.HIGHEST_PROTOCOL)

In [14]:
# Number of subjects in the mapping; should equal the number of rows of X.
len(sid2rowidx)

46146

In [15]:
# Alias the shuffled k-hot matrix under the name the remaining cells use.
Xraw = X

---

# Training denoising autoencoder

In [16]:
import time
from keras.layers import Dense, Input
from keras.callbacks import Callback, EarlyStopping, TensorBoard
from keras.models import Model

Using TensorFlow backend.


In [17]:
ENCODING_DIM = 500  # width of the hidden (encoding) layer
INPUT_DIM = Xraw.shape[-1]  # number of columns of Xraw (one per (semtype, cui) pair)
NOISE_PORTION = 0.5  # fraction of input features randomly masked (zeroed) by add_noise
VALIDATION_SPLIT = 0.2  # fraction of rows held out for validation
BATCH_SZ = 128  # rows per batch, for training and for the encoding pass
NB_EPOCH = 50  # upper bound on training epochs (early stopping may end sooner)

In [18]:
# Single-hidden-layer autoencoder: INPUT_DIM -> ENCODING_DIM -> INPUT_DIM.
input_raw = Input(shape=(INPUT_DIM,))
# The two layer objects are kept in variables because a later cell applies
# them again to build the stacked encoder.
hiddenlayer = Dense(ENCODING_DIM, activation='relu')
outputlayer = Dense(INPUT_DIM, activation='sigmoid')

encoded = hiddenlayer(input_raw)
decoded = outputlayer(encoded)

# this model maps an input to its reconstruction
autoencoder = Model(input=input_raw, output=decoded, name='autoencoder')
# Sigmoid outputs with binary cross-entropy: every feature is scored as a 0/1 target.
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

In [19]:
def add_noise(X, noise_portion=None):
    """Return a corrupted copy of X with a random subset of features zeroed.

    An independent mask is drawn for every row; each mask zeroes exactly
    ``int(n_features * noise_portion)`` randomly chosen columns and keeps the
    others unchanged. (Appending one shared list that is shuffled in place
    would give every row of the batch the same mask, so masks are built per row.)

    Args:
        X: dense 2-D array of shape (n_samples, n_features).
        noise_portion: fraction of features to zero per row. ``None`` (the
            default) uses the module-level NOISE_PORTION.

    Returns:
        A new array with the same shape as X; X itself is not modified.
    """
    if noise_portion is None:
        noise_portion = NOISE_PORTION
    nb_samples, nb_features = X.shape
    nb_masked = int(nb_features * noise_portion)
    # Pre-allocating the full 2-D mask keeps the shape right for empty batches.
    masks = np.ones((nb_samples, nb_features), dtype=int)
    for mask_row in masks:
        # The first nb_masked entries of a random permutation are nb_masked
        # distinct column indices chosen uniformly at random.
        mask_row[np.random.permutation(nb_features)[:nb_masked]] = 0
    return X * masks

In [20]:
def batch_generator(X, batch_size=BATCH_SZ):
    """Endlessly yield (noisy, clean) dense mini-batches drawn from sparse X.

    Rows are visited in a random order that is reshuffled after every full
    pass. Each pass covers every row exactly once: when the number of rows is
    not a multiple of ``batch_size`` the last batch of the pass is smaller
    instead of being dropped.

    Args:
        X: row-indexable sparse matrix whose row selections provide
            ``.toarray()`` (e.g. a scipy CSR matrix).
        batch_size: maximum number of rows per yielded batch.

    Yields:
        Tuple ``(X_batch_noisy, X_batch)``: the corrupted batch produced by
        ``add_noise`` (model input) and the original dense batch (target).
    """
    samples_per_epoch = X.shape[0]
    # Ceiling division so the trailing partial batch is part of the pass.
    number_of_batches = (samples_per_epoch + batch_size - 1) // batch_size
    # Shuffling the index alone is enough; X itself is never copied/reordered.
    shuffle_index = np.arange(samples_per_epoch)
    np.random.shuffle(shuffle_index)
    counter = 0
    while True:
        offset = batch_size * counter
        # Slicing clamps at the end of the array, giving the short last batch.
        index_batch = shuffle_index[offset:offset + batch_size]
        X_batch = X[index_batch, :].toarray()
        X_batch_noisy = add_noise(X_batch)
        counter += 1
        if counter >= number_of_batches:
            # Pass finished: draw a new order for the next one. The current
            # batch has already been materialized, so reshuffling is safe here.
            np.random.shuffle(shuffle_index)
            counter = 0
        yield (X_batch_noisy, X_batch)  # X: corrupted (input), y: original (label)

In [21]:
train_sz = int(Xraw.shape[0]*(1-VALIDATION_SPLIT))
train_sz = (train_sz//BATCH_SZ) * BATCH_SZ # make train_sz divisible by BATCH_SX
X_train = Xraw[:train_sz]
X_val = Xraw[train_sz:]
print X_train.shape, X_val.shape

(36864, 69574) (9282, 69574)


In [22]:
# TensorBoard log directory, named after the current month/day and hour/minute.
logdir = os.path.join('logs/autoencoder', time.strftime('%m%d_%Hh%M'))

# Early stopping watches val_loss with patience=2; TensorBoard logs go to logdir.
_callbacks = [EarlyStopping(monitor='val_loss', patience=2),
              TensorBoard(log_dir=logdir, histogram_freq=0, write_graph=False)
             ]

# Train on (noisy input, clean target) batches; the validation generator
# corrupts its inputs the same way, so val_loss is also a denoising loss.
autoencoder.fit_generator( # memory usage is ~11G 
    generator=batch_generator(X_train),
    samples_per_epoch=X_train.shape[0],
    validation_data = batch_generator(X_val),
    nb_val_samples = X_val.shape[0],
    nb_epoch=NB_EPOCH, 
    callbacks = _callbacks )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


NameError: name 'logdirb' is not defined

In [23]:
print 'run "tensorboard --logdir=%s" to launch tensorboard'%logdir
MODEL_PATH = './models/'
model_fpath = os.path.join( MODEL_PATH, 'autoencoder_%s.h5' % time.strftime('%m%d_%Hh%M') )
autoencoder.save(model_fpath)

run "tensorboard --logdir=logs/autoencoder/1108_19h37" to launch tensorboard


---

# Getting deep patient features using the trained autoencoder

In [24]:
# Re-apply the same two layer objects to the reconstruction, giving the chain
# input -> encode -> decode -> encode -> decode -> encode.
# No new layers are created here; only hiddenlayer/outputlayer are reused.
encoded2 = hiddenlayer(decoded)
decoded2 = outputlayer(encoded2)
encoded3 = hiddenlayer(decoded2)  # third encoding; used as the output of dpencoder

In [25]:
# Model mapping a raw k-hot row to its third-pass encoding (encoded3).
dpencoder = Model(input=input_raw, output=encoded3)
# Saved next to the autoencoder in MODEL_PATH, with a time-stamped file name.
model_fpath = os.path.join(MODEL_PATH, 'dpencoder_%s.h5'% time.strftime('%m%d_%Hh%M') )
dpencoder.save(model_fpath)

In [26]:
# Encode all rows of Xraw in chunks of BATCH_SZ (only one chunk is densified
# at a time), then stack the per-chunk results, preserving the row order of Xraw.
nb_rows = Xraw.shape[0]
dpvecs = []
for start in tqdm(xrange(0, nb_rows, BATCH_SZ)):
    stop = min(start + BATCH_SZ, nb_rows)
    dpvecs.append(dpencoder.predict(Xraw[start:stop].toarray()))
Xdp = np.vstack(dpvecs)

100%|██████████| 361/361 [05:10<00:00,  1.38it/s]


In [28]:
# Final consistency check before writing: Xdp and Xraw must have the same
# number of rows, and the mapping/pair/id collections must agree in size.
print Xdp.shape, Xraw.shape
print len(sid2rowidx), len(unique_cui_semtype_pair), len(concept2id)

# Free-text note stored inside the pickle so the file describes itself.
description = '''
contains sid2rowidx mapping, and deep patient feature Xdp and raw feature Xraw
here the Xraw uses (semtype,cui) pair and each row is k-hot encoding
all unique such pairs are in unique_cui_semtype_pair
(st,cui) pair to encoding id is in concept2id.
'''

# Everything needed to use the features later: row lookup by sid, both
# feature matrices (rows aligned with sid2rowidx), and the column vocabulary.
data_to_pickle = {
    'description': description,
    'sid2rowidx': sid2rowidx,
    'Xdp': Xdp,
    'Xraw': Xraw,
    'unique_cui_semtype_pair': unique_cui_semtype_pair,
    'concept2id': concept2id,
}
# NOTE(review): ./data/ must already exist; nothing in this cell creates it.
with open('./data/feature_DP_st_cui.pk', 'wb') as f:
    pk.dump(data_to_pickle, f, pk.HIGHEST_PROTOCOL)

(46146, 500) (46146, 69574)
46146 69574 69574
