In [59]:
import os
import sys
import numpy as np
import string

In [60]:
from keras.layers import LSTM, Embedding, Input, Dense
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [61]:
# Upper bound on the vocabulary: passed to the Tokenizer as num_words and
# used to cap the number of embedding rows / output classes.
VOCAB_SIZE = 20000
# Width of each word embedding; must match the 50-d GloVe file loaded below.
# (The misspelled name is kept because other cells reference it.)
EMEBDDING_DIM = 50
# Hard cap on the padded sequence length; the effective length is the smaller
# of this and the longest line found in the data.
MAX_SEQUENCE_LENGTH = 100
# Fraction of the training lines held out for validation by model.fit.
VALIDATION_SPLIT = 0.2
# Number of training epochs.
EPOCHS = 1000
# Mini-batch size used by model.fit.
BATCH_SIZE = 128
# Size of the LSTM hidden and cell state vectors.
LATENT_DIM = 25

In [62]:
# Build the training pairs from the poem file, one pair per non-empty line:
# the input is the line prefixed with '<sos>' and the target is the same line
# suffixed with '<eos>', i.e. the target is the input shifted by one token.
input_texts = []
target_texts = []

# Use a context manager so the file handle is closed deterministically
# (the bare open() in a for-loop header never closed it).
with open('robert_frost.txt') as poem_file:
    for line in poem_file:
        line = line.rstrip()
        if not line:
            # skip blank lines between stanzas
            continue

        input_line = '<sos> ' + line
        target_line = line + ' <eos>'

        input_texts.append(input_line)
        target_texts.append(target_line)


In [63]:
# Fit the tokenizer on inputs and targets together so that both '<sos>' and
# '<eos>' end up in the vocabulary.
all_lines = input_texts + target_texts

In [64]:
# filters='' turns off the Tokenizer's default punctuation stripping, so the
# '<sos>' / '<eos>' markers (and any punctuation attached to words) are kept
# as part of the tokens.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters='')
tokenizer.fit_on_texts(all_lines)

In [65]:
# Convert every '<sos> ...' input line into a list of integer word indices.
input_sequences = tokenizer.texts_to_sequences(input_texts)

In [66]:
# Convert every '... <eos>' target line into a list of integer word indices.
target_sequences = tokenizer.texts_to_sequences(target_texts)

In [67]:
# Sanity check: number of training lines.
len(target_sequences)

1436

In [68]:
# Length (in tokens, including '<sos>') of the longest input line.
max_sequence_length_from_data = max(len(s) for s in input_sequences)

In [69]:
# Display the longest line length found in the data.
max_sequence_length_from_data

12

In [70]:
# word -> integer index mapping learned by the tokenizer. Later cells size the
# vocabulary as len(word2idx) + 1, i.e. they treat index 0 as not belonging to
# any real word.
word2idx = tokenizer.word_index

In [71]:
# Effective sequence length: the longest line in the data, capped at
# MAX_SEQUENCE_LENGTH.
max_sequence_length = min(max_sequence_length_from_data, MAX_SEQUENCE_LENGTH)

In [72]:
# Display the sequence length that will be used for padding and the model input.
max_sequence_length

12

In [73]:
# Turn the ragged list of index lists into a 2-D array of width
# max_sequence_length; padding='post' appends the padding after the tokens.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')

In [74]:
# Same padding for the targets so inputs and targets line up step by step.
target_sequences = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

In [75]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup.
# Each line of the file is "<word> <v1> <v2> ... <vN>" separated by spaces.
word2vec = {}

# The GloVe files are UTF-8 and contain non-ASCII tokens; state the encoding
# explicitly so loading does not depend on the platform's default encoding.
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

In [76]:
# Sanity check: number of word vectors loaded from the GloVe file.
len(word2vec)

400000

In [77]:
# Number of embedding rows / output classes: one more than the number of known
# words (presumably so index 0 can serve as padding), capped at VOCAB_SIZE.
num_words = min(VOCAB_SIZE, len(word2idx) + 1)

In [78]:
# Display the resulting vocabulary size.
num_words

3057

In [79]:
# One row per word index, initialised to zeros; rows for words without a
# pre-trained vector stay all-zero.
embedding_matrix = np.zeros((num_words, EMEBDDING_DIM))

In [80]:
# Copy the pre-trained vector of every in-vocabulary word into its row of the
# embedding matrix. Words beyond the vocabulary cap, and words that have no
# pre-trained vector, are skipped and keep their all-zero row.
for token, row in word2idx.items():
    if not row < VOCAB_SIZE:
        continue
    pretrained = word2vec.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [81]:
# One-hot encode the targets: for every (line, time step) holding a real word
# (index > 0) set the entry at that word's index to 1. Padded steps (index 0)
# keep an all-zero row.
one_hot_targets = np.zeros((len(input_sequences), max_sequence_length, num_words))
line_idx, step_idx = np.nonzero(target_sequences > 0)
one_hot_targets[line_idx, step_idx, target_sequences[line_idx, step_idx]] = 1

In [82]:
# Sanity check: the one-hot array adds a vocabulary axis to the target shape.
print(one_hot_targets.shape)
print(target_sequences.shape)

(1436, 12, 3057)
(1436, 12)


In [83]:
# Peek at the one-hot encoding of the second training line.
one_hot_targets[1]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [84]:
# The matching integer-encoded target line (trailing zeros are padding).
target_sequences[1]

array([  5, 541,   6,  65,  31, 934, 141,   2,   0,   0,   0,   0],
      dtype=int32)

In [85]:
# Embedding lookup initialised from the GloVe-based matrix built above.
# No trainable flag is passed, so the layer keeps the Keras default for it.
embedding_layer = Embedding(
    num_words,
    EMEBDDING_DIM,
    weights=[embedding_matrix]
)

In [86]:
# Training graph: a whole padded line goes in, a word distribution comes out
# for every time step.
# Inputs: the token indices plus the LSTM's initial hidden and cell states
# (fed explicitly so the same layers can later be driven step by step).
input_ = Input(shape=(max_sequence_length,))
initial_h = Input(shape=(LATENT_DIM,))
# (the misspelled name `intial_c` is kept because later cells reference it)
intial_c = Input(shape=(LATENT_DIM,))
x = embedding_layer(input_)
# return_sequences=True yields an output per time step; return_state=True also
# returns the final h and c, which are not needed during training.
lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
x,_,_ = lstm(x, initial_state=[initial_h, intial_c])
# Softmax over the vocabulary, applied to each time step's LSTM output.
dense = Dense(num_words, activation='softmax')
output = dense(x)


In [87]:
# Wrap the graph into a trainable model taking (tokens, initial h, initial c).
# NOTE(review): padded time steps have all-zero target rows (the one-hot cell
# only sets entries for word > 0); they likely still count towards the
# reported accuracy -- confirm this is intended.
model = Model([input_, initial_h, intial_c], output)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [88]:
# All-zero initial hidden/cell states, one row per training line; the same
# array is fed for both h and c.
z = np.zeros((len(input_sequences),LATENT_DIM))
# Train to predict the next word at every position of each line.
model.fit(
    [input_sequences, z, z],
    one_hot_targets,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    epochs = EPOCHS
)

ss: 6.0026 - val_accuracy: 0.0839
Epoch 862/1000
Epoch 863/1000
Epoch 864/1000
Epoch 865/1000
Epoch 866/1000
Epoch 867/1000
Epoch 868/1000
Epoch 869/1000
Epoch 870/1000
Epoch 871/1000
Epoch 872/1000
Epoch 873/1000
Epoch 874/1000
Epoch 875/1000
Epoch 876/1000
Epoch 877/1000
Epoch 878/1000
Epoch 879/1000
Epoch 880/1000
Epoch 881/1000
Epoch 882/1000
Epoch 883/1000
Epoch 884/1000
Epoch 885/1000
Epoch 886/1000
Epoch 887/1000
Epoch 888/1000
Epoch 889/1000
Epoch 890/1000
Epoch 891/1000
Epoch 892/1000
Epoch 893/1000
Epoch 894/1000
Epoch 895/1000
Epoch 896/1000
Epoch 897/1000
Epoch 898/1000
Epoch 899/1000
Epoch 900/1000
Epoch 901/1000
Epoch 902/1000
Epoch 903/1000
Epoch 904/1000
Epoch 905/1000
Epoch 906/1000
Epoch 907/1000
Epoch 908/1000
Epoch 909/1000
Epoch 910/1000
Epoch 911/1000
Epoch 912/1000
Epoch 913/1000
Epoch 914/1000
Epoch 915/1000
Epoch 916/1000
Epoch 917/1000
Epoch 918/1000
Epoch 919/1000
Epoch 920/1000
Epoch 921/1000
Epoch 922/1000
Epoch 923/1000
Epoch 924/1000
Epoch 925/1000
Epoch 

<keras.callbacks.History at 0x7f79e9432f70>

In [89]:
# Sampling graph: consumes ONE token at a time and, unlike the training model,
# also returns the updated LSTM states so they can be fed back in on the next
# step. It calls the same embedding_layer, lstm and dense layer objects as the
# training model, so it uses the weights learned above.
input2 = Input(shape=(1,))
x = embedding_layer(input2)
x, h, c = lstm(x, initial_state=[initial_h, intial_c])
output2 = dense(x)
sampling_model = Model([input2, initial_h, intial_c], [output2, h, c])


In [90]:
# Reverse lookup (integer index -> word) used to decode sampled indices.
idx2word = {v:k for k,v in word2idx.items()}

In [160]:
def sample_line():
    """Generate one line of text by sampling the model word by word.

    Starts from the '<sos>' token with all-zero LSTM states, repeatedly asks
    ``sampling_model`` for the next-word distribution and draws the next word
    at random from it, feeding the drawn word and the updated states back in.
    Generation stops when '<eos>' is drawn, after ``max_sequence_length``
    steps, or when the model leaves no probability for any real word.

    Relies on notebook globals: ``sampling_model``, ``word2idx``,
    ``idx2word``, ``max_sequence_length`` and ``LATENT_DIM``.

    Returns:
        str: the sampled words joined by single spaces ('<eos>' is excluded).
    """
    # initial inputs: the start token plus zeroed hidden/cell states
    np_input = np.array([[word2idx['<sos>']]])
    h = np.zeros((1, LATENT_DIM))
    c = np.zeros((1, LATENT_DIM))

    # so we know when to quit
    eos = word2idx['<eos>']

    # store the output here
    output_sentence = []

    for _ in range(max_sequence_length):
        # verbose=0 keeps Keras from printing a progress bar on every step
        o, h, c = sampling_model.predict([np_input, h, c], verbose=0)

        # Work on a float64 copy: the model's output array is left untouched
        # and the renormalisation below is done in full precision.
        probs = np.array(o[0, 0], dtype=np.float64)

        # Index 0 is the padding slot, never a real word: exclude it and
        # renormalise the remaining probabilities.
        probs[0] = 0
        total = probs.sum()
        if total == 0:
            # All probability mass was on padding; nothing left to sample.
            break
        probs /= total

        idx = np.random.choice(len(probs), p=probs)
        if idx == eos:
            break

        # accumulate output
        output_sentence.append(idx2word.get(idx, '<WTF %s>' % idx))

        # make the next input into model
        np_input[0, 0] = idx

    return ' '.join(output_sentence)

In [162]:
# Generate and print ten independently sampled lines.
for _ in range(10):
    print(sample_line())

now not left's no more the bones his abode.
but to-night so like the more for the novelty,
then know from them as concerned for a sudden movement toward the
sitting or three?'
my mother left them so, they're to be.' for adventurously.' here
i'll tell you happen to reckon the bones.'
and tell with one for voices.' of youthful faces.
nor granny's, surely. call i left all halted too,
there'd been all over me john said my lofty can accept 'it's
but if in me in some publisher
