Implementing Character-Level LSTM text generation

In [33]:
import numpy as np
import keras
from keras.utils import get_file
from keras.layers import LSTM, Dense
from keras.models import Model, Sequential
import random
import sys

In [17]:
path = get_file(
  "nietzsche.txt",
  origin = 'https://s3.amazonaws.com/text-datasets/nietzsche.txt'
)

In [18]:
text = open(path).read().lower()

In [19]:
print(f"Corpus Length: {len(text)}")

Corpus Length: 600893


In [20]:
# extract sequences of 60 characters
maxlen = 60 

# New sequence for every 3 characters
step = 3

# Holds extracted sequences
sentences = []
next_chars = []

In [21]:
for i in range(0, len(text) - maxlen, step):
  sentences.append(text[i:i + maxlen])
  next_chars.append(text[i + maxlen])

In [22]:
print(f"Number of Sequences: {len(sentences)}")

Number of Sequences: 200278


In [23]:
# list of unique characters found in text
chars = sorted(list(set(text)))

In [24]:
print(f"Unique Chars: {chars}")

Unique Chars: ['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'æ', 'é', 'ë']


In [25]:
# create a dictionary mapping to individual characters
char_dict = dict((char, chars.index(char)) for char in chars)

In [26]:
# Vectorization

x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

In [27]:
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_dict[char]]  = 1
    y[i, char_dict[next_chars[i]]] = 1

In [30]:
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars), activation="softmax")
])

In [31]:
model.compile(
    loss="categorical_crossentropy",
    optimizer="rmsprop"
)

In [32]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    preds = np.exp(preds)
    preds = preds / np.sum(preds)
    probas = np.random.multinomial(1, preds, 1)

    return np.argmax(probas)

In [36]:
for epoch in range(1, 60):
    print('epoch', epoch)
    model.fit(x, y, batch_size=128, epochs=1)
    start_index = np.random.randint(0, len(text) - maxlen - 1)
    generated_text = text[start_index: start_index + maxlen]
    print(f"--- Generating with seed: {generated_text}")

    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print("----- temperature:", temperature)
        sys.stdout.write(generated_text)

        for i in range(400):
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_dict[char]] = 1
            
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)

epoch 1
--- Generating with seed: equently to the clumsiness of naturalists, who can hardly
to
----- temperature: 0.2
equently to the clumsiness of naturalists, who can hardly
to expenter and sour and the soun the some the semprest of the man and the sould and the suphers of the sour and the man the some the such the such and the sour the sering and the sould which a present of the seling and and man the soul the semprest of the sering and the soul the seling and and the sence the precest of the such the sould and the san in the sence the sempress of the sunce the sould a----- temperature: 0.5
d the san in the sence the sempress of the sunce the sould and some to the menery and sore in the free the belle, in the more and is the canared in the rigice, to the somerte somentien and
rust frotery and the the serking and gounally and the sankery the dastenten, in the hamse
sely, and se proun and mase he foin has it is hat and and the ally and the semprect of the pristion the sear and will th

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x000002DF6F163820>
Traceback (most recent call last):
  File "c:\Users\USER\anaconda3\lib\weakref.py", line 371, in remove
    self = selfref()
KeyboardInterrupt: 


KeyboardInterrupt: 