In [1]:
import os, sys, time
import numpy as np
import pandas as pd
from tqdm import tqdm
import cPickle as pk
np.random.seed(1) # seed NumPy's global RNG so that shuffling and noise masks are reproducible (only NumPy is seeded here)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Conv2D, MaxPooling2D, Reshape
from keras.models import Sequential
from keras.models import load_model
from keras.utils.np_utils import to_categorical
from keras.callbacks import TensorBoard
from keras.callbacks import Callback, EarlyStopping
import keras.backend as K
from keras.layers import Input, Dense, Activation
from keras.models import Model

Using TensorFlow backend.


In [2]:
ENCODING_DIM = 500 # width of the hidden (encoding) layer
INPUT_DIM = 59790 # number of input features; also the width of the reconstruction layer
NOISE_PORTION = 0.5 # fraction of features randomly zeroed out (masked) in each sample
VALIDATION_SPLIT = 0.2 # fraction of rows held out for validation
BATCH_SZ = 128 # default mini-batch size used by batch_generator
NB_EPOCH = 50 # maximum number of training epochs (early stopping may end training sooner)

In [3]:
# Single-hidden-layer autoencoder: INPUT_DIM -> ENCODING_DIM (relu) -> INPUT_DIM (sigmoid).
input_raw = Input(shape=(INPUT_DIM,))
# TODO: for the final dpvec, decide whether to take the hidden activations before the relu.
hiddenlayer = Dense(ENCODING_DIM, activation='relu')
outputlayer = Dense(INPUT_DIM, activation='sigmoid')

# Apply the layers to the input tensor: `encoded` is the hidden representation,
# `decoded` is the reconstruction of the input.
encoded = hiddenlayer(input_raw)
decoded = outputlayer(encoded)

# this model maps an input to its reconstruction
autoencoder = Model(input=input_raw, output=decoded)
# NOTE(review): binary cross-entropy on a sigmoid output presumably expects features in [0, 1] -- confirm.
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

In [4]:
# Load the pickled raw feature matrix.
# The file is opened in binary mode (pickles are binary data) and closed
# deterministically by the context manager instead of leaking the handle.
# NOTE(security): unpickling can execute arbitrary code -- only load trusted files.
with open('./data/umls_raw_feature.pk', 'rb') as feature_file:
    X = pk.load(feature_file)

In [5]:
# Debugging aid: uncomment the next line to run on only the first 100 rows.
# X = X[:100]

In [6]:
# Shuffle the rows of X once, up front, by indexing with a random permutation
# of the row numbers.
shuffle_index = np.arange(X.shape[0])
np.random.shuffle(shuffle_index)
X = X[shuffle_index]

In [7]:
train_sz = int(X.shape[0]*(1-VALIDATION_SPLIT))
X_train = X[:train_sz]
X_val = X[train_sz:]
print X_train.shape, X_val.shape

(36916, 59790) (9230, 59790)


In [8]:
def add_noise(X, noise_portion=None):
    """Return a corrupted copy of X with a random subset of each row zeroed out.

    Every row receives its own independently drawn mask in which exactly
    int(n_features * noise_portion) entries are 0 and the rest are 1.
    X itself is not modified.

    Parameters
    ----------
    X : 2-D numpy array of shape (n_samples, n_features)
        Clean input batch.
    noise_portion : float in [0, 1], optional
        Fraction of features to mask in each row. When omitted, the
        module-level NOISE_PORTION is used.

    Returns
    -------
    numpy array with the same shape as X (X multiplied element-wise by the masks).

    Raises
    ------
    ValueError
        If noise_portion is outside [0, 1].
    """
    if noise_portion is None:
        noise_portion = NOISE_PORTION
    if not 0 <= noise_portion <= 1:
        raise ValueError('noise_portion must be in [0, 1], got %r' % (noise_portion,))

    # Use the actual width of X rather than a global constant so that any
    # 2-D batch can be corrupted.
    nb_samples, nb_features = X.shape
    nb_masked = int(nb_features*noise_portion)
    base_mask = np.array([0]*nb_masked + [1]*(nb_features-nb_masked), dtype=int)

    masks = np.empty((nb_samples, nb_features), dtype=int)
    for i in range(nb_samples):
        # permutation() returns a fresh shuffled copy, so each row keeps its
        # own mask. (Appending one in-place-shuffled list repeatedly would make
        # every row share the same, final mask.)
        masks[i] = np.random.permutation(base_mask)
    return X * masks

In [9]:
def batch_generator(X, batch_size=BATCH_SZ):
    """Endlessly yield (noisy_batch, clean_batch) pairs for denoising training.

    The rows of X are visited in a random order; once every row has been
    served the order is reshuffled and a new pass begins. Each pass consists
    of ceil(n_rows / batch_size) batches, so the last batch of a pass may be
    smaller than batch_size and the batches of one pass add up to exactly
    n_rows samples.

    Parameters
    ----------
    X : sparse matrix supporting row indexing and .todense()
        Clean feature matrix, one sample per row.
    batch_size : int
        Maximum number of rows per yielded batch.

    Yields
    ------
    (X_batch_noisy, X_batch) : tuple of dense numpy arrays
        The corrupted input (via add_noise) and the original target.
    """
    samples_per_epoch = X.shape[0]
    # Ceiling division: also serve the trailing partial batch instead of
    # silently dropping n_rows % batch_size samples on every pass.
    number_of_batches = (samples_per_epoch + batch_size - 1)//batch_size

    # Only the index array is shuffled; X itself is never copied or permuted.
    shuffle_index = np.arange(samples_per_epoch)
    np.random.shuffle(shuffle_index)

    counter = 0
    while 1:
        # Slicing clamps at the end of the array, which yields the short last batch.
        index_batch = shuffle_index[batch_size*counter: batch_size*(counter+1)]
        X_batch = np.array(X[index_batch,:].todense())
        X_batch_noisy = add_noise(X_batch)
        counter += 1
        if (counter >= number_of_batches):
            # End of a pass: draw a new row order for the next one.
            np.random.shuffle(shuffle_index)
            counter = 0
        yield ( X_batch_noisy, X_batch ) # X: corrupted, y: original

In [None]:
logdir = 'logs/autoencoder'
_callbacks = [EarlyStopping(monitor='val_loss', patience=2),
              TensorBoard(log_dir=logdir, histogram_freq=0, write_graph=False) # 
             ]
# ~11G memory usage
autoencoder.fit_generator(
    generator=batch_generator(X_train, batch_size=BATCH_SZ),
    samples_per_epoch=X_train.shape[0],
    validation_data = batch_generator(X_val, batch_size=BATCH_SZ),
    nb_val_samples = X_val.shape[0],
    nb_epoch=NB_EPOCH, 
    callbacks = _callbacks )
print 'run "tensorboard --logdir=%s" to launch tensorboard'%logdir

Epoch 1/50
  896/36916 [..............................] - ETA: 744s - loss: 0.6931

In [None]:
# Maps sid (int) to that subject's "deep patient" vector: a dense vector of
# dimension 500 (presumably the ENCODING_DIM-wide hidden representation -- confirm).
sid2dpvec = {}