# Imports

In [37]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import sys

# Process Text

In [13]:
# Location of the raw training corpus (a relative path, so it is resolved
# against the current working directory).
FILEPATH = "data/prince.txt"

# Read the whole corpus into memory as a single string. The encoding is given
# explicitly so decoding does not depend on the platform default.
with open(FILEPATH, mode="r", encoding="utf8") as file:
    text = file.read()


def tokenize_words(text_input):
    """Return text input tokenized, lowercase, with English stopwords removed.

    Args:
        text_input: The raw text to process.

    Returns:
        A single string holding the surviving tokens in their original order,
        joined by one space. Empty when no token survives.
    """
    # Load the stopword list once and keep it in a set. Requesting it from the
    # corpus reader inside the filter rebuilt the list for every token and
    # then scanned it linearly.
    english_stopwords = set(stopwords.words('english'))

    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text_input.lower())

    return " ".join(token for token in tokens if token not in english_stopwords)


# Normalised corpus: lowercase tokens joined by single spaces, stopwords dropped.
processed = tokenize_words(text_input=text)

# Convert Text to Numeric Form

In [10]:
# Vocabulary: every distinct character of the processed corpus, in sorted order.
chars = sorted(set(processed))
# A character's integer code is its position in the sorted vocabulary.
char_to_num = {char: code for code, char in enumerate(chars)}

In [15]:
# Corpus length in characters and vocabulary size.
input_len, char_len = len(processed), len(chars)
print("Total number of characters:", input_len)
print("Total unique characters:", char_len)

Total number of characters: 149578
Total unique characters: 39


# Create Usable Dataset

In [19]:
# Number of consecutive characters in each input window.
seq_length = 100
# Containers for the encoded input windows and their target characters.
x_data, y_data = [], []

In [20]:
# Build the training pairs from scratch on every run. Appending to lists that
# were created in a different cell duplicates every pattern when this cell is
# re-executed on its own, so the lists are rebound here instead.
encoded = [char_to_num[char] for char in processed]

# One window per start offset. The range stops early enough that a following
# character always exists to serve as the target.
window_starts = range(input_len - seq_length)
# Input: seq_length consecutive character codes (each window is its own list).
x_data = [encoded[start:start + seq_length] for start in window_starts]
# Target: the code of the character immediately after each window.
y_data = [encoded[start + seq_length] for start in window_starts]

In [22]:
# Number of (input window, next character) training examples.
patterns = len(x_data)
print("Total Patterns:", patterns)

Total Patterns: 149478


In [24]:
# Arrange the windows as (patterns, seq_length, 1), i.e. one value per step,
# then divide the integer codes by the vocabulary size so every input value
# lies in [0, 1).
X = np.array(x_data).reshape(patterns, seq_length, 1) / float(char_len)

In [28]:
# One-hot encode the targets. The width is pinned to the vocabulary size; left
# to infer it, to_categorical uses max(y_data) + 1, so the number of output
# classes would depend on which characters happen to occur as targets.
y = np_utils.to_categorical(y_data, num_classes=char_len)

In [29]:
# Three stacked LSTM layers, each followed by 20% dropout, ending in a softmax
# with one unit per column of the one-hot targets. The first two LSTMs return
# their full output sequence so the next recurrent layer receives one vector
# per time step; the last LSTM returns only its final output.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [30]:
# Cross-entropy against the one-hot targets, optimised with Adam.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [31]:
# Checkpoint to disk only when the training loss reaches a new minimum, so the
# file always holds the best epoch seen so far.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [33]:
# Train for four epochs; the checkpoint callback runs alongside training.
model.fit(
    X,
    y,
    epochs=4,
    batch_size=256,
    callbacks=desired_callbacks,
)

Epoch 3: loss improved from 2.81819 to 2.66285, saving model to model_weights_saved.hdf5
Epoch 4/4
Epoch 4: loss improved from 2.66285 to 2.54558, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x19d590c7c70>

Epoch 1/4
Epoch 1: loss improved from 2.54558 to 2.42420, saving model to model_weights_saved.hdf5
Epoch 2/4
Epoch 2: loss improved from 2.42420 to 2.32655, saving model to model_weights_saved.hdf5
Epoch 3/4
Epoch 3: loss improved from 2.32655 to 2.24492, saving model to model_weights_saved.hdf5
Epoch 4/4
Epoch 4: loss improved from 2.24492 to 2.17611, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x19d63065f30>

In [34]:
# Restore the checkpointed weights before generating text, then recompile
# with the same loss and optimizer used for training.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [35]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [47]:
# Choose a random training window as the generation seed. randint's upper
# bound is exclusive, so len(x_data) (not len(x_data) - 1) is required for the
# last window to be selectable.
start = np.random.randint(0, len(x_data))
# Work on a copy: the generation step appends to the seed, and aliasing the
# stored window would modify x_data in place.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" nxiety lest romans call ruberto king naples would drive germans city bring back pope nearer friend c "


In [49]:
# Generate 100 characters: predict the next character from the current
# window, emit it, then slide the window forward by one position.
for _ in range(100):
    # Same preprocessing as the training inputs: one sample, one value per
    # step, scaled by the vocabulary size.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(char_len)
    prediction = model.predict(x, verbose=0)
    # Greedy choice: always take the most probable class. Cast to a plain int
    # so the window keeps holding the same element type as the encoded data.
    index = int(np.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Build a new window rather than appending to the existing list: on the
    # first pass `pattern` can still be the very list stored in x_data, and an
    # in-place append would leave that training window one element too long.
    pattern = pattern[1:] + [index]

season castruccio season castruccio season castruccio season castruccio season castruccio season cas