# **Shakespeare-like Text Generation with LSTM.**

**Importing packages.**

In [None]:
# numpy + nltk toolkit.
# Texts taken from Project Gutenberg; files: Macbeth, Complete Works of Shakespeare
import numpy
import sys
import io

from keras.models import Sequential
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

print('packages imported')

**Tokenizer initialization.**

In [None]:
# read input as file
# file = open("../input/macbeth/Macbeth.txt", 'r')
#file = io.open("../input/macbeth/Macbeth.txt", mode="r", encoding="utf-8")
#file.read()
#file1 = open("../input/macbeth/Macbeth.txt", mode="r", encoding="utf-8")
#file2 = [line.rstrip('\n') for line in file1]
#file3 = [open(file1).read() for file1 in text_files]

# Load the whole text into memory as a single string; the context manager
# closes the file handle as soon as the read completes.
with open("../input/macbeth/Macbeth.txt", mode="r", encoding="utf-8") as file4:
    file = file4.read()

def tokenize_words(input):
    """Lowercase text, split it into word tokens and drop English stop words.

    Args:
        input: Raw text to clean. The name shadows the builtin but is kept
            unchanged so existing keyword callers keep working.

    Returns:
        A single string made of the remaining tokens joined by one space.
    """
    # r'\w+' keeps runs of word characters, so punctuation and whitespace
    # never make it into the token list.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input.lower())

    # Fetch the stop-word list once and keep it in a set: the previous
    # version called stopwords.words() for every token and then scanned the
    # returned list linearly, which is needlessly slow on a full play.
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in stop_words)

# Clean the raw text, then derive the character vocabulary from the result.
processed_inputs = tokenize_words(file)
# Sorting makes the character -> index assignment reproducible between runs.
chars = sorted(set(processed_inputs))
char_to_num = {symbol: position for position, symbol in enumerate(chars)}

**Vocab stats**

In [None]:
# Size of the distinct-character vocabulary and of the cleaned text.
vocab_len = len(chars)
input_len = len(processed_inputs)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

**Sequence Length**

In [None]:
# Number of consecutive characters in each input window.
seq_length = 100
# Accumulators for the encoded input windows and their target characters.
x_data, y_data = [], []

In [None]:
# Translate the whole text to integer codes once, so that every window below
# is a plain list slice instead of a fresh per-character lookup.
encoded_text = [char_to_num[symbol] for symbol in processed_inputs]

# Slide a window of seq_length characters over the text one position at a
# time; the range stops at the last window that still has a following
# character to use as its target.
for window_start in range(input_len - seq_length):
    window_end = window_start + seq_length
    # Input: seq_length consecutive encoded characters.
    x_data.append(encoded_text[window_start:window_end])
    # Target: the encoded character immediately after the window.
    y_data.append(encoded_text[window_end])

**Total patterns**

In [None]:
# One training pattern per input window collected above.
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

In [None]:
# Arrange the windows as (n_patterns, seq_length, 1): one value per time step.
X = numpy.array(x_data).reshape(n_patterns, seq_length, 1)
# Scale the integer character codes down by the vocabulary size.
X = X / float(vocab_len)

In [None]:
# One-hot encode the integer targets; the width of this matrix is what the
# final Dense layer of the model uses as its number of output units.
y = np_utils.to_categorical(y_data)

In [None]:
# Three stacked LSTM layers, each followed by dropout, ending in a softmax
# over the one-hot target width. The first two LSTMs return full sequences so
# the next recurrent layer receives one vector per time step.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [None]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Write the weights to disk whenever the training loss reaches a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [None]:
# Train for a few epochs; the checkpoint callback saves weights along the way.
model.fit(
    X,
    y,
    epochs=4,
    batch_size=256,
    callbacks=desired_callbacks,
)

In [None]:
# Reload the weights from the checkpoint file written during training, then
# recompile with the same loss and optimizer as before.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Inverse of char_to_num: maps each integer code back to its character.
num_to_char = dict(enumerate(chars))

In [None]:
# Pick a random training window to seed generation. randint's upper bound is
# exclusive, so passing len(x_data) makes every pattern (including the last
# one) selectable and also works when only a single pattern exists.
start = numpy.random.randint(0, len(x_data))
# Work on a copy so that extending the seed during generation can never
# alter the training data stored in x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

In [None]:
# Generate 1000 characters, feeding every prediction back in as new input.
for i in range(1000):
    # Shape and scale the current window the same way as the training data.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Greedy choice of the most likely next character; cast to a plain int
    # so the window stays a list of ordinary Python integers.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Slide the window forward by building a NEW list. Appending in place
    # would also modify the x_data row that `pattern` initially refers to,
    # leaving the training data with one over-long row.
    pattern = pattern[1:] + [index]