In [33]:
from keras.preprocessing import sequence
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, Input, LSTM, TimeDistributed, Dense
from keras.utils import to_categorical
import numpy as np
import os

In [2]:
TEXT_DIR = "train_sections_new.txt"
SUMM_DIR = "train_summary_new.txt"

In [3]:
# import re
# new_text = "train_sections_new.txt"
# with open(new_text, 'w') as f_out:
#     with open(TEXT_DIR, 'r') as f_in:
#         for line in f_in:
#             line = eval(line)
#             new_line = [' '.join(section) for section in line]
#             new_line = ' '.join(new_line)
#             re.sub("\n",  " ", new_line)
#             f_out.write(str(new_line.split())+'\n')
# import re
# with open(SUMM_DIR, 'w') as f_out:
#     with open('train_summary_processed.txt', 'r') as f_in:
#         for line in f_in:
#             re.sub("\n",  " ", line)
#             f_out.write(str(line.split())+'\n')

In [4]:
#set hyper parameters
MAX_NUM_WORDS = 10000 #vocab_size
EMBEDDING_DIM = 50
HIDDEN_UNITS = 32
VAL_SPLIT = 0.1
ENCODER_MAX_LEN = 4096 #for one entire doc
DECODER_MAX_LEN = 200
BATCH_SIZE = 10
EPOCHS = 10
MODEL_NAME = "seq2seq"

In [5]:
# read dataset
# first model: treat each section as one input for the encoder
X = []
with open(TEXT_DIR, 'r') as f:
    for line in f:
        if line != None:
            X.append(' '.join(eval(line)))
print (len(X))

Y = []
with open(SUMM_DIR, 'r') as f:
    f_l = list(f)
    for line in f_l:
        if line != None:
            Y.append(' '.join(eval(line)))
print (len(Y))

100
100


In [6]:
print (np.mean([len(line.split()) for line in X]))
print (np.mean([len(line.split()) for line in Y]))

2940.41
206.12


In [7]:
print (max([len(line.split()) for line in X]))
print (max([len(line.split()) for line in Y]))

10759
373


In [8]:
# 0 is reserved fot padding, 1 for <UNK>, word idx starts from 2
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token=1)
# must feed in a list of list of strings
tokenizer.fit_on_texts(X+Y)

In [9]:
word_index = tokenizer.word_index
print ("Found %s unique tokens." % len(word_index))

Found 15479 unique tokens.


In [8]:
word_index
# index 0: for padding

{1: 1,
 'the': 2,
 'of': 3,
 'and': 4,
 'in': 5,
 'to': 6,
 'a': 7,
 'with': 8,
 'for': 9,
 'was': 10,
 'is': 11,
 'were': 12,
 'that': 13,
 'as': 14,
 'by': 15,
 'or': 16,
 '1': 17,
 'on': 18,
 'from': 19,
 'be': 20,
 'are': 21,
 'this': 22,
 '0': 23,
 'patients': 24,
 '2': 25,
 'at': 26,
 'study': 27,
 'not': 28,
 'an': 29,
 '3': 30,
 'have': 31,
 '5': 32,
 'which': 33,
 'after': 34,
 'between': 35,
 'also': 36,
 'p': 37,
 'group': 38,
 'may': 39,
 'these': 40,
 'we': 41,
 'it': 42,
 '4': 43,
 'mice': 44,
 '6': 45,
 'been': 46,
 'high': 47,
 'than': 48,
 'using': 49,
 'can': 50,
 'has': 51,
 'all': 52,
 'other': 53,
 'cells': 54,
 'more': 55,
 'their': 56,
 'had': 57,
 'used': 58,
 'one': 59,
 '8': 60,
 'levels': 61,
 'significant': 62,
 'there': 63,
 'but': 64,
 'who': 65,
 'data': 66,
 'no': 67,
 'such': 68,
 'our': 69,
 'treatment': 70,
 'b': 71,
 '7': 72,
 'studies': 73,
 'reported': 74,
 'results': 75,
 'health': 76,
 '10': 77,
 'compared': 78,
 'muscle': 79,
 'age': 80,
 'fat':

In [10]:
# add <sos>, <eos> token, <UNK>(OOV) is idx 0 by default
vocab_word_index = {}
vocab_index_word = {}
for word, idx in word_index.items():
    if idx < MAX_NUM_WORDS - 2:
        vocab_word_index[word] = idx
        vocab_index_word[idx] = word

In [11]:
vocab_word_index['<sos>'] = MAX_NUM_WORDS - 1
vocab_index_word[MAX_NUM_WORDS - 1] = '<sos>'
vocab_word_index['<eos>'] = MAX_NUM_WORDS - 2
vocab_index_word[MAX_NUM_WORDS - 2] = '<eos>'

In [86]:
vocab_index_word[0] = ' '
vocab_index_word[1] = '<unk>'

In [12]:
EOS_IDX = MAX_NUM_WORDS - 2
SOS_IDX = MAX_NUM_WORDS - 1

In [13]:
#convert words to indices
#Note: here we pad each section and use section for encoder
#Another alternative is to use sentence as encoder and pad each sentence
# note: encoder does not need <sos> or <eos>
# for encoder input, pad in front
# decoder input, pad in the end
# note that number of sections is not constant for every article
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, ENCODER_MAX_LEN, padding='pre')
    
Y = tokenizer.texts_to_sequences(Y)
for line in Y:
    line.append(EOS_IDX)
Y = pad_sequences(Y, DECODER_MAX_LEN, padding='post')

In [14]:
X[0]

array([ 21,  36, 187, ..., 742, 155, 226], dtype=int32)

In [15]:
Y[12]

array([  13,  372,    4,  268, 2695,  131,   53,  136,   84,  203,    7,
         89, 5050, 5699,    6, 1792,  424,   95,  185,  631,  151,  268,
         51,   46, 1740,    5,  535, 1236,  372,  301,   51,   28,   46,
        752, 1198,   41,  516,    2,  522,   35,  372,    4,  268,    5,
        535,  185,    4,  631, 3536, 1660, 2883,  469,  193,  775,  596,
        142,  535,    4,  596,  410,  970,   12, 6657,    8,  165,  268,
          4,  165,  372,  364, 1197,   84,    3,  372,   10,  111,    5,
        117,  533,    3,  775,   14,   78,    6,  258,    3,  142,    4,
       1370,    3,  410,  970,  268,   10, 1726,  498,    5, 1667,    3,
        142,    4, 1420,    3,  410,  970,    4, 3141,  372,    5, 1420,
          3,  775, 1291,    3,    2,  469,  498,  786,  268,  268,   10,
       3348, 1455,   18,    2,  109,  634,    3,  939,    3,  142, 2126,
          3,  410,  970,    4,  117,   77,    3,  775, 1811,  268,   10,
        288,    5,  746,    3,  775,  387,    3,  1

In [11]:
# n_samples = len(X)
# x_train = X[:-int(n_samples*VAL_SPLIT)]
# y_train = Y[:-int(n_samples*VAL_SPLIT)]

# x_val = X[-int(n_samples*VAL_SPLIT):]
# y_val = Y[-int(n_samples*VAL_SPLIT):]


In [16]:
print (X.shape)
print (Y.shape)

(100, 4096)
(100, 200)


In [17]:
#use pretrained 50d glove vectors
#with larger dataset, we can try glove woith higher dimension or 
#learn word embedding from scratch
# vector for UNK and those not in glove are randomly initialized
GLOVE_DIR = "../glove.6B.50d.txt"
f_emb = open(GLOVE_DIR, 'r')
embedding_index = {}
for line in f_emb:
    line = line.strip().split()
    word = line[0]
    coefs = np.asarray(line[1:], dtype='float32')
    embedding_index[word] = coefs
f_emb.close()

embedding_matrix = np.random.random((MAX_NUM_WORDS+1, EMBEDDING_DIM))
for word, i in vocab_word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [18]:
#slice sequences into many subsequences
X_split=[]
for i in range(X.shape[0]):
    split1=np.split(X[i],8)
    a=[]
    for j in range(8):
        s=np.split(split1[j],8)
        a.append(s)
    X_split.append(a)

In [19]:
X_split = np.array(X_split)
print (X_split.shape)

(100, 8, 8, 64)


In [38]:
embedding_layer = Embedding(MAX_NUM_WORDS + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=int(ENCODER_MAX_LEN/64),
                            trainable=True)

input1 = Input(shape=(int(ENCODER_MAX_LEN/64),), dtype='int32')
embed = embedding_layer(input1)
lstm1 = LSTM(HIDDEN_UNITS)(embed)
Encoder1 = Model(input1, lstm1)

input2 = Input(shape=(8,int(ENCODER_MAX_LEN/64),), dtype='int32')
embed2 = TimeDistributed(Encoder1)(input2)
lstm2 = LSTM(HIDDEN_UNITS)(embed2)
Encoder2 = Model(input2,lstm2)

all_input = Input(shape=(8,8,int(ENCODER_MAX_LEN/64)), dtype='int32')
embed3 = TimeDistributed(Encoder2)(all_input)
_, encoder_state_h, encoder_state_c = LSTM(HIDDEN_UNITS, return_state=True)(embed3)

encoder_states = [encoder_state_h, encoder_state_c]

#in training this is the summary, in inference this is the previous word
# decode input is one-hot index, not embeddings!
# use None to allow variable decoder length, 
# because in inference we are decoding one at a time
decoder_input = Input(shape=(None,MAX_NUM_WORDS))
decoder_lstm = LSTM(HIDDEN_UNITS, return_state=True, return_sequences=True)
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_input, initial_state=encoder_states)
decoder_dense = Dense(MAX_NUM_WORDS, activation='softmax')
# apply dense to output state of every timestep
# print (decoder_outputs.shape)
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([all_input, decoder_input], decoder_outputs)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

encoder_model = Model(all_input, encoder_states)

#we also need to train a decoder model, used for inference
#the input for the decoder model is not only the summary, but also the initial context vector
#in inference this is the states from the encoder, used at the inital vector for decoding
decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_input, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_input]+decoder_state_inputs, [decoder_outputs]+decoder_states)

In [39]:
for layer in model.layers:
    print (layer.output_shape)

(None, 8, 8, 64)
(None, 8, 32)
(None, None, 10000)
[(None, 32), (None, 32), (None, 32)]
[(None, None, 32), (None, 32), (None, 32)]
(None, None, 10000)


In [40]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           (None, 8, 8, 64)     0                                            
__________________________________________________________________________________________________
time_distributed_6 (TimeDistrib (None, 8, 32)        518994      input_15[0][0]                   
__________________________________________________________________________________________________
input_16 (InputLayer)           (None, None, 10000)  0                                            
__________________________________________________________________________________________________
lstm_11 (LSTM)                  [(None, 32), (None,  8320        time_distributed_6[0][0]         
__________________________________________________________________________________________________
lstm_12 (L

In [41]:
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 8, 8, 64)          0         
_________________________________________________________________
time_distributed_6 (TimeDist (None, 8, 32)             518994    
_________________________________________________________________
lstm_11 (LSTM)               [(None, 32), (None, 32),  8320      
Total params: 527,314
Trainable params: 527,314
Non-trainable params: 0
_________________________________________________________________


In [42]:
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, None, 10000)  0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           (None, 32)           0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 32)           0                                            
__________________________________________________________________________________________________
lstm_12 (LSTM)                  [(None, None, 32), ( 1284224     input_16[0][0]                   
                                                                 input_17[0][0]                   
          

In [3]:
def load_weights(self, weight_file_path):
    if os.path.exists(weight_file_path):
        model.load_weights(weight_file_path)

def get_weight_path(model_dir_path):
    if not os.path.exists(model_dir_path):
        os.makedirs(model_dir_path)
    return model_dir_path + '/' + MODEL_NAME + '-weights.h5'

In [107]:
# # note the x_samples and y_samples are already tokenized and padded
# # in the vocab should add in a <eos> 
# # decoder input: x y <eos>, decoder target: <sos> x y
# def generate_batch(x_samples, y_samples, batch_size=1):
#     num_batches = len(x_samples)//batch_size
#     while True:
#         for start in range(0, num_batches):
#             encoder_input_batch = np.expand_dims(x_samples[start], 0)
#             decoder_target_batch = np.expand_dims(y_samples[start], 0)
#             decoder_input_batch = np.expand_dims(np.array([SOS_IDX] + list(y_samples[start][:-1])), 0)
#             print ('encoder:', encoder_input_batch.shape)
#             print ('decoder input:', decoder_input_batch.shape)
#             print ('decoder target:', decoder_target_batch.shape)

            
#             yield [encoder_input_batch, decoder_input_batch], decoder_target_batch

In [23]:
def generate_batch(x_samples, y_samples, batch_size):
        num_batches = len(x_samples) // batch_size
        while True:
            for batchIdx in range(0, num_batches):
                start = batchIdx * batch_size
                end = (batchIdx + 1) * batch_size
                encoder_input_batch = x_samples[start:end]
                decoder_target_batch = np.zeros(shape=(batch_size, DECODER_MAX_LEN, MAX_NUM_WORDS))
                decoder_input_batch = np.zeros(shape=(batch_size, DECODER_MAX_LEN, MAX_NUM_WORDS))
                for lineIdx, target_words in enumerate(y_samples[start:end]):
                    decoder_target_batch[lineIdx, 0, SOS_IDX] = 1
                    for idx, w in enumerate(target_words):
                        decoder_input_batch[lineIdx, idx, w] = 1
                        if idx < len(target_words)-1:
                            decoder_target_batch[lineIdx, idx+1, w] = 1
                yield [encoder_input_batch, decoder_input_batch], decoder_target_batch

In [43]:
# can add more callback tricks like early stoppping etc
def fit(Xtrain, Ytrain, val_data, epochs=EPOCHS, batch_size=BATCH_SIZE, model_dir_path=None):
    (x_val, y_val) = val_data
    if model_dir_path is None:
        model_dir_path = "./models"
    weight_file_path = get_weight_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path)
    
    train_gen = generate_batch(Xtrain, Ytrain, batch_size)
    val_gen = generate_batch(x_val, y_val, batch_size)
    
    train_num_batches = len(Xtrain) // batch_size
    val_num_batches = len(x_val) // batch_size
    
    
    # can't do validation like that
    # need to implement decoder for inference
#     history = model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
#                                   epochs=epochs, verbose=1, 
#                                   validation_data=val_gen, validation_steps=val_num_batches,
#                                   callbacks=[checkpoint])
    history = model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
                                  epochs=epochs, verbose=1, validation_data=val_gen, 
                                  validation_steps=val_num_batches,
                                  callbacks=[checkpoint])
    model.save_weights(weight_file_path)
    return history


In [45]:
#fit(X_split, Y, epochs=1)

In [51]:
# inference decoder
# the input is tokenized and padded validation/test data
# input shape: (8, 8, encoder_max_len/64)

# seems like it only decode one sentence at a time
# because decoded sentences end at different time
def summarize(input_seq):
    input_seq = np.expand_dims(input_seq, 0)
    state_value = encoder_model.predict(input_seq)
    # prev predicted word
    target_seq = np.zeros((1, 1, MAX_NUM_WORDS))
    target_seq[0, 0, SOS_IDX] = 1
    target_text = ''
    target_text_len = 0
    terminated = False 
    while not terminated:
        # predict one word at a time,
        # based on prev states and prev words
        # output shape of decoder: decoder_outputs, state_h, state_c
        output_tokens, h, c = decoder_model.predict([target_seq] + state_value)
        
        sample_token_idx = np.argmax(output_tokens[0, -1, :])
        sample_word = vocab_index_word[sample_token_idx]
        target_text_len += 1
        
        if sample_word != "<sos>" and sample_word != "<eos>":
            target_text += ' ' + sample_word 
        
        if sample_word == "<eos>" or target_text_len >= DECODER_MAX_LEN:
            terminated = True 
        
        target_seq = np.zeros((1, 1, MAX_NUM_WORDS))
        target_seq[0, 0, sample_token_idx] = 1
        
        state_values = [h, c]
    # the returned value is a string
    return target_text.strip()
        
        

In [79]:
# from a sequence of indices to actual text,
# so that we can evaluate
def from_idx_to_text(seq):
    text = [str(vocab_index_word[idx]) for idx in seq]
    return ' '.join(text)

In [70]:
# get rouge score
# usage of rouge library: https://pypi.org/project/rouge/
from rouge import Rouge
# input hypothesis list and reference list
def get_rouge(hyp, ref):
    rouge = Rouge()
    # avg of scores of all sentences
    scores = rouge.get_scores(hyp, ref, avg=True)
    return scores
    

In [28]:
total_len = X_split.shape[0]
val_split = 0.1
test_split = 0.1
x_train = X_split[: int(total_len*(1-val_split-test_split))]
y_train = Y[: int(total_len*(1-val_split-test_split))]
x_val = X_split[int(total_len*(1-val_split-test_split)) : int(total_len*(1-test_split))]
y_val = Y[int(total_len*(1-val_split-test_split)) : int(total_len*(1-test_split))]
x_test = X_split[int(total_len*(1-test_split)) :]
y_test = Y[int(total_len*(1-test_split)) :]


In [34]:
y_train.shape

(80, 200)

In [44]:
fit(x_train, y_train, val_data=(x_val, y_val), epochs=1)

Epoch 1/1


  '. They will not be included '


<keras.callbacks.History at 0x2bbb005f8>

In [45]:
model.load_weights(get_weight_path("./models"))

In [37]:
x_train.shape

(80, 8, 8, 64)

In [54]:
summarized_texts = []
for text in x_test:
    summarized_texts.append(summarize(text))
# ref_texts = [from_idx_to_text(idx) for idx in y_test]
# print (get_rouge(summarized_texts, ref_texts))

In [56]:
# not enough training, so get <eos> every time
summarized_texts

['', '', '', '', '', '', '', '', '', '']

In [87]:
ref_texts = [from_idx_to_text(idx) for idx in y_test]

In [88]:
ref_texts[0]

"checked and was considered as control the dentin and pulp of extracted teeth were tested for the presence of abo rh antigen at respective time periods by ae technique statistical analysis data were analyzed in proportion for comparison chi square test or fisher 's exact test was used for the small sample results blood group antigens of abo and rh factor were detected in dentin and pulp up to 12 months for both abo and rh factor dentin and pulp showed 100 sensitivity for the samples tested at 0 month and showed a gradual decrease in the sensitivity as time period increased the sensitivity of pulp was better than dentin for both the blood grouping systems and abo blood group antigens were better detected than rh antigens conclusion in dentin and pulp the antigens of abo and rh factor were detected up to 12 months but showed a progressive decrease in the antigenicity as the time period increased when compared the results obtained of dentin and pulp in abo and rh factor grouping showed si

In [72]:
# so you can get the rouge score in this way
hyp = ["hello daniel", "yes"]
ref = ["hi daniel", "yes"]
print (get_rouge(hyp, ref))

{'rouge-1': {'f': 0.749999995, 'p': 0.75, 'r': 0.75}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.7499999999995, 'p': 0.75, 'r': 0.75}}


In [90]:
# cannot get result yet because summarized texts cannot be empty
print (get_rouge(summarized_texts, ref_texts))