In [1]:
import numpy as np
from keras.utils import np_utils

SEQ_LENGTH = 25

# ISSUES
# (1) use partially superimposed sequences as input data -> more training data
# (2) need to change output size to (num_sequences, vocab_size)
# (3) use separate text file for predictions; draw sample sequences from the test set
# (4) for predictions, generate a new character with a given sequence, append the character, and remove the first character in that sequence
# (5) add dropout layers

# Load the training text; the file is only needed for the read itself.
with open('abstract_train.txt', encoding='utf-16') as data:
    text = list(data.read())

# Build the vocabulary from a *sorted* list of the distinct characters.
# Iterating a bare set of strings has no stable order across interpreter
# runs (string hashing is randomized), so an unsorted vocabulary would give a
# different char <-> index mapping after every kernel restart and make
# previously saved weights unusable.
chars = sorted(set(text))
VOCAB_SIZE = len(chars)
char_idx = {char: idx for idx, char in enumerate(chars)}
idx_char = dict(enumerate(chars))

# Encode the whole text once as integer vocabulary indices.
encoded_text = np.array([char_idx[c] for c in text], dtype=np.int64)

# Slice the text into partially superimposed sequences: row i of window_idx
# holds the text positions i .. i+SEQ_LENGTH-1, one row per start position in
# text[:-SEQ_LENGTH].
num_windows = max(len(text) - SEQ_LENGTH, 0)
window_idx = np.arange(num_windows)[:, None] + np.arange(SEQ_LENGTH)[None, :]

# One-hot encode every window in a single step by picking rows of an identity
# matrix; the result has shape (num_windows, SEQ_LENGTH, VOCAB_SIZE).
char_vec = np.eye(VOCAB_SIZE, dtype=np.float32)[encoded_text[window_idx]]

# The target for window i is window i+1, i.e. the same text shifted by one
# character. Both arrays are views onto char_vec, so no extra copy is made.
input_seq = char_vec[:-1]
output_seq = char_vec[1:]


# testing: see if input/output sequences are correct.
# The decoded output sequence should be the input sequence shifted left by
# one character.
print(input_seq.shape)
print(output_seq.shape)


def _decode_one_hot(one_hot_seq):
    """Turn a (timesteps, vocab) one-hot array back into the string it encodes."""
    # argmax picks the position of the single 1 in each row without relying on
    # an exact floating-point equality test.
    return ''.join(idx_char[int(k)] for k in np.argmax(one_hot_seq, axis=-1))


print('input sequence:', _decode_one_hot(input_seq[0]))

print('--------------')
print('output sequence:', _decode_one_hot(output_seq[0]))
    


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


(133526, 25, 112)
(133526, 25, 112)
input sequence: The preceding decades hav
--------------
output sequence: he preceding decades have


In [2]:
# build model

from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed, Dropout

# Two stacked 256-unit LSTM layers, each followed by 20% dropout. Both LSTMs
# return the full sequence so the final softmax layer can score the
# vocabulary at every one of the SEQ_LENGTH timesteps.
model = Sequential([
    LSTM(256, return_sequences=True, input_shape=(SEQ_LENGTH, VOCAB_SIZE)),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    TimeDistributed(Dense(VOCAB_SIZE, activation='softmax')),
])
model.summary()




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 25, 256)           377856    
_________________________________________________________________
dropout_1 (Dropout)          (None, 25, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 25, 256)           525312    
_________________________________________________________________
dropout_2 (Dropout)          (None, 25, 256)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 25, 112)           28784     
Total params: 931,952
Trainable params: 931,952
Non-trainable params: 0
_________________________________________________________________


In [3]:

# train model
# Compile with Adam and categorical cross-entropy, fit on the one-hot
# input/target sequences, then write the learned weights to disk.
model.compile(optimizer='Adam', loss='categorical_crossentropy')
fit_options = {'batch_size': 32, 'epochs': 25}
model.fit(input_seq, output_seq, **fit_options)  # shuffle = False if stateful = True
model.save_weights('lstm_weights.h5')
    

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [8]:
# predict
from random import randint
from keras.utils import np_utils

def generate_text(model, length):
    """Print `length` characters generated greedily from a random seed.

    A seed of SEQ_LENGTH characters is drawn at random from
    'abstract_test.txt', one-hot encoded and fed to `model`. At every step the
    most probable next character (argmax over the last timestep of the
    prediction) is appended to the window and the oldest character is
    dropped, so the window always holds SEQ_LENGTH characters.

    Args:
        model: object whose ``predict`` accepts an array of shape
            (1, SEQ_LENGTH, VOCAB_SIZE) and returns per-timestep scores
            indexed as [batch, timestep, vocabulary index].
        length: number of characters to generate.

    Raises:
        ValueError: if the test text has no window of SEQ_LENGTH consecutive
            characters that all belong to the training vocabulary (this
            includes a text shorter than SEQ_LENGTH).
    """
    # NOTE(review): the training file is opened as UTF-16 while this one uses
    # the platform default encoding -- confirm the two files really differ.
    with open('abstract_test.txt') as data:
        text = list(data.read())

    # Only seed from windows made entirely of characters seen during
    # training; any other character has no one-hot index and would otherwise
    # surface as a bare KeyError. A running count of unknown characters gives
    # the number of unknowns inside every window in one pass.
    unknown = np.array([c not in char_idx for c in text], dtype=np.int64)
    unknown_before = np.concatenate(([0], np.cumsum(unknown)))
    unknown_in_window = unknown_before[SEQ_LENGTH:] - unknown_before[:-SEQ_LENGTH]
    valid_starts = np.flatnonzero(unknown_in_window == 0)
    if len(valid_starts) == 0:
        raise ValueError(
            'abstract_test.txt has no run of %d consecutive characters that '
            'are all in the training vocabulary' % SEQ_LENGTH
        )

    # When every character is known, valid_starts is 0..len(text)-SEQ_LENGTH,
    # so this draws the same start position as sampling the range directly.
    idx = int(valid_starts[randint(0, len(valid_starts) - 1)])
    seed = text[idx:idx + SEQ_LENGTH]  # sample seed sequence from raw text

    # one hot encoding of seed text, with a leading batch axis of size 1
    x_test = np.zeros((1, SEQ_LENGTH, VOCAB_SIZE))
    for i, c in enumerate(seed):
        x_test[0, i, char_idx[c]] = 1

    # model predictions
    predicted_text = []
    for _ in range(length):
        probabilities = np.asarray(model.predict(x_test))
        # Only the last timestep predicts a character that is not yet part
        # of the window.
        next_idx = int(np.argmax(probabilities[0, -1]))
        predicted_text.append(idx_char[next_idx])

        # Slide the window: drop the oldest character, append the new one.
        new_char = np.zeros((1, 1, VOCAB_SIZE))
        new_char[0, 0, next_idx] = 1
        x_test = np.concatenate((x_test[:, 1:, :], new_char), axis=1)

    print('Seed text:', ''.join(seed))
    print('-----------')
    print('predicted text:', ''.join(predicted_text))

# Generate and print 300 characters with the trained model.
generate_text(model, 300)

Seed text: red in full however, the 
-----------
predicted text: physical characterisation of the porphyrin changes to a nanomedicine platform early in development of artificial phospholipid bilayer membrane and induce a toxic immune response (73). Antibody fragments, such as sodium cholate (137). They are able to detect solid tumors for PDT therapy (PDT) is a th
