In [94]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from pprint import pprint
import cPickle as pk
np.random.seed(1) # fix NumPy's global RNG seed so that runs are reproducible

In [95]:
# Input directories on the local machine.
# UMLS_DATA_DIR holds the '<sid>.pk' concept files read by the cells below.
UMLS_DATA_DIR = '/local/XW/DATA/MIMIC/UMLS_by_sid/'
# NOTE(review): NOTE_DATA_DIR is not read by any cell visible here; presumably
# it holds the raw note events per sid -- confirm before relying on it.
NOTE_DATA_DIR = '/local/XW/DATA/MIMIC/noteevents_by_sid/'

In [96]:
# Layout of UMLS_DATA_DIR:
#   - each .pk file contains a list; each element corresponds to one noteevent
#   - each noteevent is a list of lists of dicts
#     (apparently one inner list per sentence -- TODO confirm)
#   - each dict stores one extracted concept
#
# Concept fields used in this notebook:
#   cui      -- the Concept Unique Identifier
#   semtypes -- Semantic Types, i.e. the broad categories of the concept
#
# see: https://www.nlm.nih.gov/research/umls/new_users/online_learning/glossary.html

# Example: peek at the nested structure stored for one subject.
# Pickle data is binary, so the file is opened in 'rb' mode (text mode only
# happens to work on some platforms).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(os.path.join(UMLS_DATA_DIR, '2.pk'), 'rb') as f:
    pprint(pk.load(f), indent=2, depth=3)

[ [ [{...}],
    [{...}, {...}, {...}, {...}],
    [{...}],
    [{...}],
    [{...}, {...}],
    [{...}, {...}],
    [{...}],
    [{...}, {...}, {...}],
    [{...}, {...}, {...}],
    [{...}],
    [{...}, {...}, {...}],
    [{...}, {...}, {...}],
    [{...}],
    [{...}],
    [{...}, {...}],
    [{...}],
    [{...}],
    [{...}]],
  [ [{...}, {...}],
    [{...}, {...}, {...}, {...}, {...}],
    [{...}, {...}],
    [{...}, {...}],
    [{...}, {...}],
    [ {...},
      {...},
      {...},
      {...},
      {...},
      {...},
      {...},
      {...},
      {...},
      {...},
      {...}],
    [{...}],
    [ {...},
      {...},
      {...},
      {...},
      {...},
      {...},
      {...},
      {...},
      {...},
      {...},
      {...}],
    [{...}, {...}, {...}],
    [{...}, {...}, {...}, {...}, {...}, {...}, {...}, {...}, {...}],
    [{...}, {...}],
    [{...}, {...}],
    [{...}],
    [{...}, {...}],
    [{...}, {...}],
    [{...}],
    [{...}],
    [{...}],
    [{...}, {...}

In [None]:
# Number of entries in UMLS_DATA_DIR (the cells below treat each entry as a '<sid>.pk' file).
len(os.listdir(UMLS_DATA_DIR))

46146

In [None]:
# Scan every pickle in UMLS_DATA_DIR once and collect the distinct CUIs and
# semantic types that occur anywhere in the data.
unique_concepts = set()
unique_semtypes = set()
for fn in tqdm(os.listdir(UMLS_DATA_DIR)):
    fpath = os.path.join(UMLS_DATA_DIR, fn)
    # Pickle data is binary: read it in 'rb' mode, mirroring the 'wb' mode
    # this notebook uses when it writes its own pickle.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(fpath, 'rb') as f:
        concepts_per_sid = pk.load(f)
    # Nesting: notes of this sid -> inner lists of a note -> concept dicts.
    for concepts_per_note in concepts_per_sid:
        for concept_per_sentence in concepts_per_note:
            for concept in concept_per_sentence:
                unique_concepts.add(concept['cui'])  # CUI = Concept Unique Identifier
                # assumes 'semtypes' is an iterable of type names -- TODO confirm
                unique_semtypes.update(concept['semtypes'])
print(len(unique_concepts))
print(len(unique_semtypes))

  0%|          | 188/46146 [00:14<49:45, 15.39it/s]  

In [None]:
# Give every distinct CUI a dense integer id, numbered from 0 in the
# iteration order of the `unique_concepts` set.
concept2id = {cui: idx for idx, cui in enumerate(unique_concepts)}

In [None]:
# For every subject (sid) collect the ids of all concepts that occur in any
# of its notes.  X[k] is the list of concept ids belonging to sids[k].
X = []
sids = []
# os.listdir returns names in arbitrary order; sort by numeric sid so the row
# order -- and with it the seeded shuffle applied later -- is reproducible.
filenames = sorted(os.listdir(UMLS_DATA_DIR), key=lambda name: int(name[:-3]))
for fn in tqdm(filenames):
    sid = int(fn[:-3])  # strip the 3-character '.pk' suffix
    sids.append(sid)
    fpath = os.path.join(UMLS_DATA_DIR, fn)
    # Pickle data is binary, hence 'rb'.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(fpath, 'rb') as f:
        concepts_per_sid = pk.load(f)
    x = set()  # a set, so a concept repeated within one sid is counted once
    for concepts_per_note in concepts_per_sid:
        for concept_per_sentence in concepts_per_note:
            for concept in concept_per_sentence:
                # The lookup table is `concept2id`; the previous name
                # `concept_to_id` was never defined and raised a NameError.
                x.add(concept2id[concept['cui']])
    X.append(list(x))


In [None]:
# Turn X (one list of concept ids per subject) into a sparse 0/1 matrix:
# row k describes sids[k], column j stands for the concept with id j.
from scipy.sparse import csr_matrix
data, rows, cols = [], [], []
for r, col_ids in enumerate(X):
    for c in col_ids:
        rows.append(r)
        cols.append(c)
        data.append(1.0)
# Pass the shape explicitly.  When csr_matrix infers it from the largest
# row/column index, trailing subjects without any concept (or trailing unused
# concept ids) are silently dropped and X no longer lines up with `sids`.
n_rows, n_cols = len(X), len(concept2id)
X = csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
print(X.shape)
sids = np.array(sids)
print(sids)

In [None]:
X[0].sum(), X[1].sum() # number of concepts in the 1st and 2nd row (sids[0], sids[1]); presumably sid=2,3 -- verify against sids[:2]

In [None]:
# Draw one random row permutation and apply it to both X and sids, so that
# row k of X still belongs to sids[k] afterwards.
shuffle_index = np.random.permutation(X.shape[0])
X, sids = X[shuffle_index], sids[shuffle_index]

In [None]:
sid2rowidx = {} # map sid(int) to the row index in the (shuffled) raw feature matrix X
for sid,rowidx in zip(sids, range(len(sids))):
    sid2raw[sid] = rowidx

In [None]:
X[sid2raw[2]].sum(), X[sid2raw[3]].sum() # check the mapping is correct...

In [None]:
# Persist the raw feature matrix together with the lookup tables needed to
# interpret its rows (sid2rowidx) and columns (concept2id).
OUT_FILENAME = './data/umls_raw_features.pk'
data_to_pickle = dict([
    ('X_raw', X),
    ('unique_concepts', unique_concepts),
    ('unique_semtypes', unique_semtypes),
    ('concept2id', concept2id),
    ('sid2rowidx', sid2rowidx),
])
with open(OUT_FILENAME, 'wb') as out_file:
    pk.dump(data_to_pickle, out_file, pk.HIGHEST_PROTOCOL)

---

### Below is some toy code for training a model and extracting deep patient feature vectors

In [None]:
import os, sys, time
import numpy as np
import pandas as pd
from tqdm import tqdm
import cPickle as pk
np.random.seed(1) # fix NumPy's global RNG seed so that runs are reproducible

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Conv2D, MaxPooling2D, Reshape
from keras.models import Sequential
from keras.models import load_model
from keras.utils.np_utils import to_categorical
from keras.callbacks import TensorBoard
from keras.callbacks import Callback, EarlyStopping
import keras.backend as K
from keras.layers import Input, Dense
from keras.models import Model

In [None]:
# Toy run: keep only the first 100 rows and convert them to a dense ndarray.
# NOTE: this rebinds X (sparse matrix -> dense array), so the cell cannot be
# re-run without first rebuilding the sparse X.
X = X[:100].toarray()

In [None]:
# Model dimensions
INPUT_DIM = X.shape[-1]   # size of X's last axis
ENCODING_DIM = 500        # number of units in the hidden (encoding) layer

# Training settings
# NOTE(review): none of the four constants below is referenced by the cells visible here.
NOISE_PORTION = 0.5       # randomly mask this portion
VALIDATION_SPLIT = 0.2
BATCH_SZ = 128
NB_EPOCH = 50

In [None]:
# Autoencoder with a single hidden layer: INPUT_DIM -> ENCODING_DIM -> INPUT_DIM.
input_raw = Input(shape=(INPUT_DIM,))
# Open question (author): for the final dpvec, should the activations before the ReLU be taken instead?
# The two layer objects are kept in variables so they can be applied more than once.
hiddenlayer = Dense(ENCODING_DIM, activation='relu')
outputlayer = Dense(INPUT_DIM, activation='sigmoid')

encoded = hiddenlayer(input_raw)

decoded = outputlayer(encoded)

# this model maps an input to its reconstruction
# (the `input=` / `output=` keywords are presumably the Keras 1.x spelling -- confirm against the installed version)
autoencoder = Model(input=input_raw, output=decoded)

# NOTE(review): no input masking/noise is applied anywhere in this cell.
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

In [None]:
# Train the autoencoder to reproduce X from X: a single epoch with batches of 10.
# NOTE(review): the values are hard-coded here rather than taken from NB_EPOCH / BATCH_SZ.
autoencoder.fit(X, X, nb_epoch=1, batch_size=10)

In [None]:
# Feed the reconstruction back through the same two layer objects:
# decoded -> hiddenlayer -> outputlayer -> hiddenlayer.
encoded2 = hiddenlayer(decoded)
decode2 =outputlayer(encoded2)
encoded3 = hiddenlayer(decode2)

In [None]:
# Model that maps the raw input to `encoded3`, the last application of the hidden layer.
encoder = Model(input=input_raw, output=encoded3)

In [None]:
dpvec = encoder.predict(X) # deep patient feature vectors: the encoder's output for X

In [None]:
# Show the dimensions of the encoded feature array.
print(dpvec.shape)