In [None]:
from keras.models import Sequential
from keras.layers.core import Activation, Dense, Dropout
from keras.optimizers import RMSprop
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.callbacks import LambdaCallback
from sklearn.model_selection import train_test_split

import re
import nltk
import string
import os, codecs
import collections
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def analysis_text(text):
    """Tokenize lines of text and tally the word-like tokens.

    Each line is stripped, cut at its first tab (only the leading field is
    used), lower-cased and tokenized with ``nltk.word_tokenize``. Tokens that
    do not consist solely of word characters (e.g. punctuation) are dropped.

    Args:
        text: iterable of strings, one entry per line.

    Returns:
        A ``(word_freqs, num_lines, word_seqs)`` tuple: a ``Counter`` of the
        kept tokens, the number of lines consumed, and the flat list of kept
        tokens in reading order.
    """
    word_freqs = collections.Counter()
    word_seqs = []
    num_lines = 0

    for num_lines, line in enumerate(text, start=1):
        first_field = line.strip().split("\t")[0]
        kept = [
            token
            for token in nltk.word_tokenize(first_field.lower())
            if re.match(r'^\w+$', token)
        ]
        # Both containers receive the tokens in the same order, so the
        # counter's insertion order follows first appearance in the text.
        word_seqs.extend(kept)
        word_freqs.update(kept)

    return word_freqs, num_lines, word_seqs

In [None]:
# Read the whole corpus as a list of lines; the file handle is only needed
# for the read itself, so tokenization happens after it is closed.
with codecs.open("./tinyshakespear.txt", mode="r", encoding="utf-8") as f:
    text = f.readlines()

word_freqs, lines, word_seqs = analysis_text(text)

In [None]:
# Vocabulary lookups in both directions; a word's index is its position in
# the iteration order of `word_freqs`.
index_to_word = dict(enumerate(word_freqs))
word_to_index = {word: idx for idx, word in index_to_word.items()}

In [None]:
step = 3
MAX_LEN = 10

sentences, next_words = [], []
demo_step = 0

# Slide a MAX_LEN-word window across the corpus, moving `step` words at a
# time; the word immediately after each window is its prediction target.
for start in range(0, len(word_seqs) - MAX_LEN, step):
    stop = start + MAX_LEN
    seq = word_seqs[start:stop]
    next_word = word_seqs[stop]

    sentences.append(seq)
    next_words.append(next_word)

    # Echo only the first ten (window, target) pairs as a sanity check.
    if demo_step >= 10:
        continue
    print(seq)
    print(next_word)
    demo_step += 1

In [None]:
# Vocabulary size and number of training windows; both are used as array
# dimensions by the encoding step.
num_sentences = len(sentences)
num_words = len(word_freqs)

print('words count ', num_words)
print('sentences count ', num_sentences)

In [None]:
# One-hot encode the training data:
#   X[i, j, k] is True when the j-th word of window i has vocabulary index k
#   y[i, k]    is True when the word following window i has vocabulary index k
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((num_sentences, MAX_LEN, num_words), dtype=bool)
y = np.zeros((num_sentences, num_words), dtype=bool)

for i, sentence in enumerate(sentences):
    for j, word in enumerate(sentence):
        X[i, j, word_to_index[word]] = 1
    y[i, word_to_index[next_words[i]]] = 1

In [None]:
# Training configuration.
NUM_EPOCHS = 10
BATCH_SIZE = 32

# Layer sizes. NUM_HIDDEN is the LSTM unit count; NUM_EMBEDDING is not
# referenced by any of the visible cells.
NUM_HIDDEN = 128
NUM_EMBEDDING = 128

Optimizer = RMSprop(lr=1e-2)

In [None]:
# A single LSTM layer reads a (MAX_LEN, num_words) one-hot window; a dense
# layer followed by softmax yields one probability per vocabulary word.
model = Sequential([
    LSTM(NUM_HIDDEN, input_shape=(MAX_LEN, num_words)),
    Dense(num_words),
    Activation('softmax'),
])

model.summary()

In [None]:
# Targets are one-hot rows, hence categorical cross-entropy.
model.compile(
    optimizer=Optimizer,
    loss='categorical_crossentropy',
    metrics=["accuracy"],
)

In [None]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    The probabilities are raised to the power ``1 / temperature`` and
    renormalized, then a single index is drawn with
    ``np.random.multinomial``. Temperatures below 1 sharpen the distribution,
    temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: rescaling factor. Values ``<= 0`` select the most
            probable index deterministically (the limit of the rescaling as
            the temperature approaches zero) instead of dividing by zero.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        return np.argmax(preds)

    # A probability of exactly 0 maps to a logit of -inf, which is the
    # intended result (exp(-inf) == 0), so the divide warning is silenced.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift so the largest logit is 0. The normalized result is unchanged,
    # but at least one exp() term equals 1, so a very small temperature can
    # no longer underflow every term to 0 and produce 0/0 = NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
def on_epoch_end(epoch, logs):
    """Print text generated by the current model at the end of an epoch.

    A random MAX_LEN-word seed is taken from ``word_seqs``. For each sampling
    temperature ("diversity"), 400 words are generated one at a time: the
    current window is one-hot encoded, the model predicts a distribution over
    the vocabulary, ``sample`` draws the next word, and the window slides
    forward by one word.

    Args:
        epoch: epoch index supplied by the callback machinery.
        logs: metrics dict supplied by the callback machinery (unused).
    """
    print('Generating text after Epoch: %d' % epoch)

    random_idx = np.random.randint(0, len(word_seqs)-MAX_LEN-1)
    seed_words = word_seqs[random_idx:random_idx+MAX_LEN]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('diversity:', diversity)

        # The window is kept as a list of vocabulary words rather than a
        # joined string that is re-tokenized every step: re-tokenizing can
        # split text differently from the original pass, producing tokens
        # missing from word_to_index or more than MAX_LEN tokens.
        window = list(seed_words)
        generated = list(seed_words)
        # str.join is used because the `string.join` helper does not exist
        # in Python 3.
        print('Generating with seed: "' + " ".join(window) + '"')

        for _ in range(400):
            x = np.zeros((1, MAX_LEN, num_words))
            for j, word in enumerate(window):
                x[0, j, word_to_index[word]] = 1

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = index_to_word[next_index]

            generated.append(next_word)
            # Drop the oldest word and append the new one.
            window = window[1:] + [next_word]
        print(" ".join(generated))

# Hook the generator into training so it runs after every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [None]:
# Train on the one-hot windows; sample text is printed after each epoch by
# the callback.
model.fit(
    X,
    y,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[print_callback],
)