In [1]:
import numpy as np
import os
import sys

In [2]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [3]:
MAX_SEQUENCE_LENGTH = 100
MAX_VOCAB_SIZE = 20000
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 500
LATENT_DIM = 25

In [4]:
input_texts = []
target_texts = []

for line in open('robert_frost.txt'):
    line = line.rstrip()
    if not line:
        continue

    input_line = '<sos> ' + line
    target_line = line + ' <eos>'

    input_texts.append(input_line)
    target_texts.append(target_line)


In [5]:
all_lines = input_texts + target_texts

In [6]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')

In [7]:
tokenizer.fit_on_texts(all_lines)

In [8]:
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

In [9]:
len(target_sequences)

1436

In [10]:
max_sequence_length_from_data = max(len(s) for s in input_sequences)

In [11]:
max_sequence_length_from_data

12

In [12]:
word2idx = tokenizer.word_index
len(word2idx)

3056

In [13]:
max_sequence_length = min(MAX_SEQUENCE_LENGTH, max_sequence_length_from_data)

In [14]:
max_sequence_length

12

In [15]:
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

In [16]:
input_sequences.shape

(1436, 12)

In [17]:
target_sequences.shape

(1436, 12)

In [18]:
word2vec={}

with open(os.path.join("glove.6B.50d.txt")) as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec


In [19]:
len(word2vec)

400000

In [20]:
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)

In [21]:
num_words

3057

In [22]:
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i < MAX_VOCAB_SIZE:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [23]:
one_hot_targets = np.zeros(
    (len(input_sequences), max_sequence_length, num_words))
for i, target_sequence in enumerate(target_sequences):
    for t, word in enumerate(target_sequence):
        if word > 0:
            one_hot_targets[i, t, word] = 1

In [24]:
print(f'len(input_sequences): {len(input_sequences)}')
print(f'max_sequence_length: {max_sequence_length}')
print(f'num_words: {num_words}')
print(f'one_hot_targets.shape: {one_hot_targets.shape}')

len(input_sequences): 1436
max_sequence_length: 12
num_words: 3057
one_hot_targets.shape: (1436, 12, 3057)


In [25]:
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix]
    # tranaible = False
)

In [26]:
input1 = Input(shape=(max_sequence_length,))
initial_h = Input(shape=(LATENT_DIM,))
initial_c = Input(shape=(LATENT_DIM))
x = embedding_layer(input1)
lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
x,_,_ = lstm(x, initial_state=[initial_h, initial_c])
dense = Dense(num_words, activation='softmax')
output = dense(x)

In [27]:
model = Model([input1, initial_h, initial_c], output)

In [28]:
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
    # trainable = False 
)

In [29]:
z = np.zeros((len(input_sequences),LATENT_DIM))

In [30]:
r = model.fit(
    [input_sequences, z, z],
    one_hot_targets,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
    batch_size=BATCH_SIZE
)

46 - val_loss: 4.9225 - val_accuracy: 0.1007
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 394/500
Epoch 395/500
Epoch 396/500
Epoch 397/500
Epoch 398/500
Epoch 399/500
Epoch 400/500
Epoch 401/500
Epoch 402/500
Epoch 403/500
Epoch 404/500
Epoch 405/500
Epoch 406/500
Epoch 407/500
Epoch 408/500
Epoch 409/500
Epoch 410/500
Epoch 411/500
Epoch 412/500
Epoch 413/500
Epoch 414/500
Epoch 415/500
Epoch 416/500
Epoch 417/500
Epoch 418/500
Epoch 419/500
Epoch 420/500
Epoch 421/500
Epoch 422/500
Epoch 423/500
Epoch 424/500
Epoch 425/500
Epoch 426/500
Epoch 427/500
Epoch 428/500
Epo

In [38]:
input2 = Input(shape=(1,))
x = embedding_layer(input2)
x, h, c = lstm(x, initial_state=[initial_h, initial_c])
output2 = dense(x)
sampling_model = Model([input2, initial_h, initial_c], [output2, h, c])

In [39]:
idx2word = {v:k for k,v in word2idx.items()}

In [48]:
def sample_line():
    # initial inputs
    np_input = np.array([[word2idx['<sos>']]])
    h = np.zeros((1, LATENT_DIM))
    c = np.zeros((1, LATENT_DIM))
    print(f'np_input: {np_input}')

    # so we know when to quit
    eos = word2idx['<eos>']

    # store the output here
    output_sentence = []

    for _ in range(max_sequence_length):
        o, h, c = sampling_model.predict([np_input, h, c])

        print("o:", o.shape)
        # idx = np.argmax(o[0,0])
        probs = o[0, 0]
        if np.argmax(probs) == 0:
            print("wtf")
        probs[0] = 0
        probs /= probs.sum()
        idx = np.random.choice(len(probs), p=probs)
        if idx == eos:
            break

        # accuulate output
        output_sentence.append(idx2word.get(idx, '<WTF %s>' % idx))

        # make the next input into model
        np_input[0, 0] = idx

    return ' '.join(output_sentence)


In [49]:
sample_line()

np_input: [[1]]
o: (1, 1, 3057)
o: (1, 1, 3057)
o: (1, 1, 3057)
o: (1, 1, 3057)
o: (1, 1, 3057)
o: (1, 1, 3057)
o: (1, 1, 3057)
o: (1, 1, 3057)
o: (1, 1, 3057)


'that seize rushes to go dig buried light'

In [None]:
while True:
    for _ in range(4):
        print(sample_line())

    ans = input("---generate another? [Y/n]---")
    if ans and ans[0].lower().startswith('n'):
        break
