In [22]:
import sys

import numpy
from keras.callbacks import ModelCheckpoint, TensorBoard, Callback
from keras.layers import Dense, Dropout, Input, LSTM
from keras.models import Sequential
from keras.utils import to_categorical

In [24]:
# Read the whole corpus and lower-case it.
# The context manager closes the file handle as soon as the text is read,
# instead of leaving it open until garbage collection.
filename = "./data/pg11.txt"
with open(filename, encoding="utf-8") as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()

In [25]:
# Vocabulary: every distinct character of the corpus, sorted so that the
# character <-> integer id assignment is deterministic for a given text.
chars = sorted(set(raw_text))
char_to_int = {symbol: idx for idx, symbol in enumerate(chars)}
int_to_char = {idx: symbol for idx, symbol in enumerate(chars)}

# Corpus length in characters and number of distinct characters.
n_chars = len(raw_text)
n_vocab = len(chars)
print(f"Total Characters: {n_chars}, Total Vocab: {n_vocab}")

Total Characters: 144678, Total Vocab: 51


In [26]:
# Build (input window, next character) training pairs with a stride of 1.
# Each input is `seq_length` consecutive character ids; its target is the id
# of the character that immediately follows the window.
seq_length = 100
encoded_text = [char_to_int[char] for char in raw_text]
dataX = [
    encoded_text[begin : begin + seq_length]
    for begin in range(n_chars - seq_length)
]
# The target of the window starting at `begin` is the id at `begin + seq_length`,
# i.e. the encoded text shifted left by one window length.
dataY = encoded_text[seq_length:]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  144578


In [27]:
# Arrange the integer windows as [samples, time steps, features] and divide
# by the vocabulary size, which maps the ids (0 .. n_vocab - 1) into [0, 1).
X = numpy.reshape(dataX, (n_patterns, seq_length, 1)) / float(n_vocab)
# One-hot encode the target ids. No class count is passed, so the width of
# `y` is whatever to_categorical derives from the labels themselves.
y = to_categorical(dataY)

In [None]:
class TokenGenerationCallback(Callback):
    """Print the model's next-character prediction for a few samples after each epoch.

    Args:
        x_test: Model-ready inputs that are passed to ``self.model.predict``.
        dataX: Integer-encoded sequences, indexed in step with ``x_test``;
            they are only decoded for display.
        int_to_char: Mapping from integer id to character.
    """

    def __init__(self, x_test, dataX, int_to_char):
        super().__init__()
        self.x_test = x_test
        self.dataX = dataX
        self.int_to_char = int_to_char

    def on_epoch_end(self, epoch, logs=None):
        """Predict on ``x_test`` and print ``<input text> -> <predicted char>`` per sample."""
        # verbose=0 keeps predict's own progress bar out of the training log.
        y_pred = self.model.predict(self.x_test, verbose=0)
        for i in range(len(self.x_test)):
            # Ids missing from the mapping are rendered as an empty string.
            x_str = "".join(self.int_to_char.get(x, "") for x in self.dataX[i])
            # The highest-scoring class is taken as the predicted character id.
            y_pred_token = self.int_to_char[int(numpy.argmax(y_pred[i]))]
            print(f"{x_str} -> {y_pred_token}")

In [28]:
# LSTM -> Dropout -> softmax over the one-hot target classes.
# The input shape (time steps, features) is declared with an explicit Input
# layer rather than the `input_shape=` argument on the first layer, which
# Keras reports as deprecated for Sequential models.
model = Sequential(
    [
        Input(shape=(X.shape[1], X.shape[2])),
        LSTM(256),
        Dropout(0.2),
        Dense(y.shape[1], activation="softmax"),
    ]
)

  super().__init__(**kwargs)


In [None]:
# Train with categorical cross-entropy against the one-hot targets.
model.compile(optimizer="adam", loss="categorical_crossentropy")

# Write a checkpoint only when the training loss reaches a new minimum;
# the epoch number and loss value are embedded in the file name.
filepath = "./models/weights-improvement-{epoch:02d}-{loss:.4f}.keras"
checkpoint = ModelCheckpoint(
    filepath,
    monitor="loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
tensorboard_callback = TensorBoard(log_dir="./logs")
# After every epoch, print predictions for the last ten training windows.
tgc = TokenGenerationCallback(X[-10:], dataX[-10:], int_to_char)
callbacks_list = [checkpoint, tensorboard_callback, tgc]
h = model.fit(X, y, batch_size=128, epochs=20, callbacks=callbacks_list)

Epoch 1/20
[1m 526/1130[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m3:48[0m 377ms/step - loss: 3.1452

KeyboardInterrupt: 

In [29]:
# Restore weights from a previously saved checkpoint, then compile the model
# with the same loss and optimizer that were used for training.
filename = "../labs/models/weights-improvement-20-2.0046.keras"
model.load_weights(filename)
model.compile(optimizer="adam", loss="categorical_crossentropy")

In [30]:
# Pick a random training window as the seed. randint's upper bound is
# exclusive, so len(dataX) makes every window (including the last) eligible.
start = numpy.random.randint(0, len(dataX))
# Work on a copy: appending to dataX[start] itself would permanently lengthen
# that training sample and make dataX ragged for any later re-run.
pattern = list(dataX[start])
print("Seed:")
print('"', "".join([int_to_char[value] for value in pattern]), '"')
# generate characters
for i in range(1000):
    # Same preprocessing as the training inputs: [1, time steps, 1], scaled by n_vocab.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the highest-scoring class.
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: append the new id and drop the oldest one.
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

Seed:
" nstantly jumped up, and began bowing to the king, the queen,
the royal children, and everybody else. "
 and the goupdon  alice was aoother worcd in the woods. 
“hh toere oo toe tooe if the soote toen ” said the monk turtle, “wou din ”ou calen tieer what i whsl yhu  the mase whuh the mortle ”ou would toen ”

“ho  i voolk y said the monk turtle, “io would to toe kore that it would be i kene thet io tae to the kooh th

KeyboardInterrupt: 