In [2]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.optimizers import Adam
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.
  return f(*args, **kwds)
  return f(*args, **kwds)


### Sequence Classification Problem
#### convert text into word vectors
#### train LSTM
#### have a dense layer (activation is softmax) at the end
#### the output from the dense layer is a number, and we can think of it as a probability

In [3]:
# Data preparation and add a column for label:
path = "../data/"
admission = pd.read_csv(path + 'ADMISSIONS.csv', usecols=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 
                                                   'DEATHTIME', 'ADMISSION_TYPE', 'DISCHARGE_LOCATION', 'DIAGNOSIS'])
# convert admission time and discharge time death time to correct format
admission.ADMITTIME = pd.to_datetime(admission.ADMITTIME, format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
admission.DISCHTIME = pd.to_datetime(admission.DISCHTIME, format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
admission.DEATHTIME = pd.to_datetime(admission.DEATHTIME, format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
# sort before group by
admission = admission.sort_values(['SUBJECT_ID','ADMITTIME'])
admission = admission.reset_index(drop = True)

# add the next admission date and type for each subject 
admission['NEXT_ADMITTIME'] = admission.groupby('SUBJECT_ID').ADMITTIME.shift(-1)
admission['NEXT_ADMISSION_TYPE'] = admission.groupby('SUBJECT_ID').ADMISSION_TYPE.shift(-1)
admission = admission.sort_values(['SUBJECT_ID','ADMITTIME'])
# back fill
admission[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']] = admission.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']].fillna(method = 'bfill')
# compute days elapsed until next readmission
admission['DAYS_NEXT_ADMIT']=  (admission.NEXT_ADMITTIME - admission.DISCHTIME).dt.total_seconds()/(24*60*60)
# number of records that were readmitted in less than or equal to 30 days: 3390
records = admission[admission.DAYS_NEXT_ADMIT <= 30]
# read the notes table 
notes = pd.read_csv(path + "NOTEEVENTS.csv")
discharge_sum = notes.loc[notes.CATEGORY == 'Discharge summary']
notes_dis_sum_last = (discharge_sum.groupby(['SUBJECT_ID','HADM_ID']).nth(-1)).reset_index()
# perform a left join of the two datatable
dt_table = pd.merge(admission,notes_dis_sum_last[['SUBJECT_ID','HADM_ID','TEXT']], on = ['SUBJECT_ID','HADM_ID'],how = 'left')
# filter out new born records because a lot of them don't have discharge summary
dt_table = dt_table[dt_table.ADMISSION_TYPE != 'NEWBORN']
# add a column for label
dt_table['LABEL'] = (dt_table.DAYS_NEXT_ADMIT <= 30).astype('int')
# split the dataset into 80% training, 10% validation, and 10% testing:
# shuffle the dataset first:
dt_table_shuffled = dt_table.sample(n=len(dt_table), random_state=42)
dt_table_shuffled = dt_table_shuffled.reset_index(drop=True)
dt_train = dt_table_shuffled.sample(frac=0.80, random_state=42)
dt_val_test = dt_table_shuffled.drop(dt_train.index)
dt_val = dt_val_test.sample(frac=0.50, random_state=42)
dt_test = dt_val_test.drop(dt_val.index)
# skip sub-sampling


  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Tokenize words:
text = dt_table_shuffled.TEXT.values.astype(str)
label = dt_table_shuffled.LABEL.values
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

In [5]:
MAX_SEQUENCE_LENGTH = 500
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
word_index = tokenizer.word_index

In [6]:
print(dt_table_shuffled.shape)
print(dt_table_shuffled.shape[0])

(51113, 14)
51113


In [13]:
# split tokenized data into 80% training, 10% validation, and 10% testing
indices = np.arange(len(dt_table_shuffled))
#np.random.shuffle(indices)
#data = 
n_rows = int(dt_table_shuffled.shape[0])
x_train = data[:int(0.8 * n_rows), :]
x_val = data[int(0.8 * n_rows):int(0.9 * n_rows), :]
x_test = data[int(0.9 * n_rows):, :]
y_train = label[:int(0.8 * n_rows)]
y_val = label[int(0.8 * n_rows):int(0.9 * n_rows)]
y_test = label[int(0.9 * n_rows):]

In [8]:
# prepare the embedding
embeddingIdx = {}
f = open("../glove.6B/glove.6B.300d.txt")
for line in f:
    values = line.split()
    word = values[0]
    coeffs = np.asarray(values[1:], dtype='float32')
    embeddingIdx[word] = coeffs
f.close()
print("Size of the GloVe vocabulary: ", len(embeddingIdx))

Size of the GloVe vocabulary:  400000


In [9]:
EMBEDDING_DIM = 300
embeddingMatrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embeddingVector = embeddingIdx.get(word)
    if embeddingVector is not None:
        embeddingMatrix[i] = embeddingVector
    #else:
        #embeddingMatrix[i] = np.random.uniform(-0.25, 0.25, 300)
        



In [10]:
from keras.layers import Embedding
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embeddingMatrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [29]:
# save data for furture work:
# import pickle
# with open("embedding_layer.pkl", "wb") as f:
#     pickle.dump(embedding_layer, f)
# with open("embeddingMatrix.pkl", "wb") as f:    
#     pickle.dump(embeddingMatrix, f)

In [11]:
# Bulid LSTM model:
hidden_size = 64
max_epochs = 10
batch_size = 1280
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(hidden_size, dropout=0.5, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='softmax'))

model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=Adam(lr=0.0001))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 300)          65456700  
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 65,550,205
Trainable params: 93,505
Non-trainable params: 65,456,700
_________________________________________________________________


In [12]:
# Training the network:
model.fit(x_train, y_train, epochs=max_epochs, batch_size=batch_size)

Epoch 1/10
 8960/40890 [=====>........................] - ETA: 18:49 - loss: 14.9869 - acc: 0.0599

KeyboardInterrupt: 

In [None]:
layers = [LSTM(2), Dense(1)]
model = Sequential(layers)