In [None]:
import numpy as np
import re
from keras.models import Sequential
from keras.layers import LSTM, Dense 
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from pickle import dump
from pickle import load 
from keras.models import load_model
import pickle

In [None]:
def load_raw(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The full contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which a manual open()/close() pair does not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [None]:
# Path of the corpus, and its full contents read into one string
file1 = "shakespeare.txt"
raw_text = load_raw(filename=file1)

In [None]:
# Remove the header of the first poem (a run of spaces followed by "1")
process_text = re.sub("                   1", "", raw_text)
# Collapse every run of spaces into a single space
process_text = re.sub(" +", " ", process_text)
# Remove the headers of the remaining poems:
# split on poem breaks (three newlines plus the character that follows),
# on single newlines, and on any remaining digits.
# The pattern is a raw string because "\d" in a normal string literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning from Python 3.12).
# The regular expression itself is unchanged.
# NOTE(review): runs of spaces were already collapsed above, so the first
# alternative (the one containing a long run of spaces) can no longer match.
process_text = re.split(r"\n\n\n                   .|\n\n\n.|\n|\d", process_text)
# Remove any empty strings that resulted from the split
process_text = list(filter(None, process_text))

# Check result
print(process_text)

In [None]:
# Glue the kept text fragments back together into one long string
process_string = "".join(process_text)
# Break the string into single characters, skipping any empty entries
char_text = [ch for ch in process_string if ch]

# Sanity check: total number of characters
print(len(char_text))

In [None]:
# Slice the character stream into overlapping windows of 41 characters:
# 40 input characters followed by the 1 character to predict.
# Consecutive windows are shifted by one character.
length = 40 + 1
# A stream of N characters holds N - length + 1 full windows; the "+ 1" makes
# sure the last window, the one ending on the final character, is included.
# If the stream is shorter than one window the range is empty.
# Each window is stored as a single string.
sequences = [
    ''.join(char_text[start:start + length])
    for start in range(len(char_text) - length + 1)
]
# Print the number of sequences constructed
print(len(sequences))

In [None]:
# Distinct characters of the corpus, in sorted order; they form the vocabulary
unique_chars = sorted(set(process_string))
# Give every character the integer of its position in the sorted list
mapping = {char: index for index, char in enumerate(unique_chars)}
# Vocabulary size = number of distinct characters
vocab_size = len(unique_chars)
# Inspect the resulting character -> integer table
print(mapping)

In [None]:
# OPTIONAL
# Persist the processed characters and the character mapping for later access.
# Both files are written with pickle (the first one too, despite its .txt name).
for pickle_path, payload in (
    ("process_shakespeare_final.txt", char_text),
    ("mapping.pkl", mapping),
):
    with open(pickle_path, "wb") as fp:
        pickle.dump(payload, fp)

In [None]:
# Translate every window from characters to their mapped integers
num_char = [[mapping[symbol] for symbol in window] for window in sequences]
# The first encoded window should still hold 41 entries
print(len(num_char[0]))

In [None]:
# Turn the encoded windows into an array, then separate inputs from labels
num_char = np.array(num_char)
# Inputs: every column except the last one
X = num_char[:, :-1]
# Labels: the last column
y = num_char[:, -1]
# Show the shape of both arrays, one per line
print(X.shape, y.shape, sep="\n")

In [None]:
# One-hot encode integers.
# Both arrays are derived from num_char (the 2-D integer array built in the
# split step) rather than from X and y themselves, so re-running this cell
# never one-hot encodes data that is already encoded.
X = np.array([
    to_categorical(row, num_classes=len(mapping))
    for row in num_char[:, :-1]
])
y = to_categorical(num_char[:, -1], num_classes=len(mapping))
# See what sizes our matrices are
print(X.shape)
print(y.shape)

In [None]:
# OPTIONAL
# Persist the ready-to-run X and y arrays for later access.
# The files are written with pickle, even though they carry an .h5 extension.
# Note: file size usually too large to upload to Google Colab directly
for pickle_path, payload in (
    ("X_training_shakespeare_final.h5", X),
    ("y_training_shakespeare_final.h5", y),
):
    with open(pickle_path, "wb") as fp:
        pickle.dump(payload, fp)

In [None]:
# Basic LSTM model (Model 3 + Model 4 architecture)
# Note: see report for more details on implementations related to model number
# One LSTM layer with 120 units, fed windows shaped like a single sample of X
# One softmax Dense layer with vocab_size outputs, one per character
model = Sequential()
model.add(LSTM(120, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line to the output.
model.summary()

In [None]:
# Training setup for Model 3: a single run of 40 epochs
# Adam optimizer, categorical cross-entropy loss, accuracy reported as metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Mini-batches of 8 samples, progress shown per epoch
model.fit(x=X, y=y, batch_size=8, epochs=40, verbose=1)