In [2]:
from keras.preprocessing import sequence
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import numpy as np
import os

In [3]:
DATA_DIR = "../pubmed/"
TEXT_DIR = "train_sections.txt"
SUMM_DIR = "train_summary_processed.txt"

In [4]:
#set hyper parameters
MAX_NUM_WORDS = 4000 #vocab_size
EMBEDDING_DIM = 50
HIDDEN_UNITS = 32
VAL_SPLIT = 0.1
ENCODER_MAX_LEN = 500 #for one section
DECODER_MAX_LEN = 150
BATCH_SIZE = 100
EPOCHS = 10
MODEL_NAME = "seq2seq"

In [5]:
# read dataset
# first model: treat each section as one input for the encoder
X = []
with open(DATA_DIR+TEXT_DIR, 'r') as f:
    f_l = list(f)
    for line in f_l:
        if line != None:
            line2list = eval(line)
            for sec in range(len(line2list)):
                line2list[sec] = ' '.join(line2list[sec])
            X.append(line2list)
print (len(X))


Y = []
with open(DATA_DIR+SUMM_DIR, 'r') as f:
    f_l = list(f)
    for line in f_l:
        if line != None:
            Y.append(line)
print (len(Y))

100
100


In [6]:
# 0 is reserved fot padding, 1 for <UNK>, word idx starts from 2
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token=1)
# must feed in a list of list of strings
all_words = [sec for sec in line for line in X]
all_words.extend(Y)
tokenizer.fit_on_texts(all_words)

In [7]:
word_index = tokenizer.word_index
print ("Found %s unique tokens." % len(word_index))

Found 4099 unique tokens.


In [8]:
# add <sos>, <eos> token, <UNK>(OOV) is idx 0 by default
vocab_word_index = {}
vocab_index_word = {}
for word, idx in word_index.items():
    if idx <= MAX_NUM_WORDS - 2:
        vocab_word_index[word] = idx
        vocab_index_word[idx] = word

In [9]:
vocab_word_index['<sos>'] = MAX_NUM_WORDS - 1
vocab_index_word[MAX_NUM_WORDS] = '<sos>'
vocab_word_index['<eos>'] = MAX_NUM_WORDS 
vocab_index_word[MAX_NUM_WORDS] = '<eos>'

In [10]:
EOS_IDX = MAX_NUM_WORDS
SOS_IDX = MAX_NUM_WORDS - 1

In [11]:
#convert words to indices
#Note: here we pad each section and use section for encoder
#Another alternative is to use sentence as encoder and pad each sentence
# note: encoder does not need <sos> or <eos>
# for encoder input, pad in front
# decoder input, pad in the end
# note that number of sections is not constant for every article
for i in range(len(X)):
    X[i] = tokenizer.texts_to_sequences(X[i])
    X[i] = pad_sequences(X[i], ENCODER_MAX_LEN, padding='pre')

Y = tokenizer.texts_to_sequences(Y)
for line in Y:
    line.append(EOS_IDX)
Y = pad_sequences(Y, DECODER_MAX_LEN, padding='post')

In [12]:
# n_samples = len(X)
# x_train = X[:-int(n_samples*VAL_SPLIT)]
# y_train = Y[:-int(n_samples*VAL_SPLIT)]

# x_val = X[-int(n_samples*VAL_SPLIT):]
# y_val = Y[-int(n_samples*VAL_SPLIT):]

y_train = np.asarray(Y)

In [13]:
y_train.shape

(100, 150)

In [14]:
#use pretrained 50d glove vectors
#with larger dataset, we can try glove woith higher dimension or 
#learn word embedding from scratch
# vector for UNK and those not in glove are randomly initialized
GLOVE_DIR = "glove.6B.50d.txt"
f_emb = open(GLOVE_DIR, 'r')
embedding_index = {}
for line in f_emb:
    line = line.strip().split()
    word = line[0]
    coefs = np.asarray(line[1:], dtype='float32')
    embedding_index[word] = coefs
f_emb.close()

embedding_matrix = np.random.random((MAX_NUM_WORDS+1, EMBEDDING_DIM))
for word, i in vocab_word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [15]:
#Build the seq2seq model
#add attention, pointer and coverage later
#greedy search for now, use beam search later
embedding_layer = layers.Embedding(MAX_NUM_WORDS+1, EMBEDDING_DIM, 
                                   weights=[embedding_matrix],
                                   trainable=True)

one_section_input = layers.Input(shape=(ENCODER_MAX_LEN,))
one_section_embed = embedding_layer(one_section_input)
one_section_lstm = layers.Bidirectional(layers.LSTM(HIDDEN_UNITS))(one_section_embed)
one_section_encoder = Model(one_section_input, one_section_lstm)

all_section_input = layers.Input(shape=(None,ENCODER_MAX_LEN,))
all_section_embed = layers.TimeDistributed(one_section_encoder)(all_section_input)
all_section_lstm = layers.LSTM(HIDDEN_UNITS, return_state=True)
encoder_outputs, encoder_state_h, encoder_state_c = all_section_lstm(all_section_embed)

encoder_states = [encoder_state_h, encoder_state_c]

# in training this is the summary, in inference this is the previous word
# personally i feel that the decoder input should also be word vectors
decoder_input = layers.Input(shape=(DECODER_MAX_LEN,))
decoder_embed = embedding_layer(decoder_input)
decoder_lstm = layers.LSTM(HIDDEN_UNITS, return_state=True, return_sequences=True)
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_embed, initial_state=encoder_states)
decoder_dense = layers.TimeDistributed(layers.Dense(MAX_NUM_WORDS, activation='softmax'))
# apply dense to output state of every timestep
# print (decoder_outputs.shape)
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([all_section_input, decoder_input], decoder_outputs)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

encoder_model = Model(all_section_input, encoder_states)

# we also need to train a decoder model, used for inference
# the input for the decoder model is not only the summary, but also the initial context vector
# in inference this is the states from the encoder, used at the inital vector for decoding
decoder_state_inputs = [layers.Input(shape=(HIDDEN_UNITS,)), layers.Input(shape=(HIDDEN_UNITS,))]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embed, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_input]+decoder_state_inputs, [decoder_outputs]+decoder_states)

In [16]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, 500)    0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 64)     221298      input_2[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         multiple             200050      input_3[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LS

In [17]:
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None, 500)         0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 64)          221298    
_________________________________________________________________
lstm_2 (LSTM)                [(None, 32), (None, 32),  12416     
Total params: 233,714
Trainable params: 233,714
Non-trainable params: 0
_________________________________________________________________


In [18]:
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         multiple             200050      input_3[0][0]                    
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
lstm_3 (LS

In [19]:
def load_weights(self, weight_file_path):
    if os.path.exists(weight_file_path):
        model.load_weights(weight_file_path)

def get_weight_path(model_dir_path):
    return model_dir_path + '/' + MODEL_NAME + '-weights.h5'

In [20]:
# note the x_samples and y_samples are already tokenized and padded
# in the vocab should add in a <eos> 
# decoder input: x y <eos>, decoder target: <sos> x y
def generate_batch(x_samples, y_samples, batch_size=1):
    num_batches = len(x_samples)//batch_size
    while True:
        for batchIdx in range(0, num_batches):
            start = batchIdx * batch_size 
            end = (batchIdx + 1) * batch_size
            encoder_input_batch = np.array([x_samples[start]])
            print ('encoder', encoder_input_batch.shape)
            decoder_target_batch = (y_samples[start])
            decoder_input_batch = np.array([[SOS_IDX]+list(decoder_target_batch[:-1])])
            print ('decoder', decoder_input_batch.shape)
            
            yield [encoder_input_batch, decoder_input_batch], decoder_target_batch

In [21]:
# can add more callback tricks like early stoppping etc
def fit(Xtrain, Ytrain, epochs=EPOCHS, batch_size=BATCH_SIZE, model_dir_path=None):
    if model_dir_path is None:
        model_dir_path = "./models"
    weight_file_path = get_weight_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path)
    
    train_gen = generate_batch(Xtrain, Ytrain, batch_size)
    
    train_num_batches = len(Xtrain) // batch_size
    
    # can't do validation like that
    # need to implement decoder for inference
#     history = model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
#                                   epochs=epochs, verbose=1, 
#                                   validation_data=val_gen, validation_steps=val_num_batches,
#                                   callbacks=[checkpoint])
    history = model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
                                  epochs=epochs, verbose=1, 
                                  callbacks=[checkpoint])
    model.save_weights(weight_file_path)
    return history


In [22]:
fit(X, y_train, epochs=2)

Epoch 1/2encoder (1, 15, 500)
decoder (1, 150)

encoder (1, 15, 500)
decoder (1, 150)
encoder (1, 15, 500)
decoder (1, 150)


ValueError: Error when checking target: expected time_distributed_2 to have 3 dimensions, but got array with shape (150, 1)

In [None]:
# inference decoder
# the input is tokenized and padded validation/test data
# input shape: (batch_size, encoder_max_len)
def summarize(input_seq):
    batch_size = len(input_seq)
    input_seq_emb = np.zeros(shape=(batch_size, ENCODER_MAX_LEN, EMBEDDING_DIM))
    for idx in input_seq