In [None]:
import numpy as np
import re
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from pickle import dump
from pickle import load 
from keras.models import load_model
import pickle

In [None]:
# Load the ready-to-run Shakespeare training data.
# NOTE: despite the ".h5" extension these files are read with pickle, not HDF5.
# SECURITY: pickle can execute arbitrary code while loading; only open files
# that were produced by this project.
# `with` guarantees each file handle is closed, even if unpickling fails.
with open("X_training_shakespeare_final.h5", "rb") as fp:
    X = load(fp)
with open("y_training_shakespeare_final.h5", "rb") as fp:
    y = load(fp)

In [None]:
def load_raw(filename, encoding=None):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the text file to read.
        encoding: Optional text encoding passed to ``open``. The default
            (``None``) keeps the platform default, matching the previous
            behaviour of this function.

    Returns:
        The full file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the file even when read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [None]:
# Read the raw Spenser sonnets from disk; the following cells clean this text
# in a way similar to the Shakespeare data.
file2 = "spenser.txt"
raw_text2 = load_raw(filename=file2)

In [None]:
# Step 1: collapse every run of spaces into a single space.
process_text2 = re.sub(pattern=' +', repl=' ', string=raw_text2)
# Step 2: drop general headers, i.e. a single line surrounded by blank lines.
process_text2 = re.sub(pattern='\n\n.+\n\n', repl='', string=process_text2)
# Step 3: drop the first header ("I" followed by a blank line).
process_text2 = re.sub(pattern='I\n\n', repl='', string=process_text2)
# Step 4: break the text into lines (a newline, optionally followed by an
# eight-space indent), discarding the empty strings the split leaves behind.
process_text2 = [
    piece
    for piece in re.split(pattern="\n        |\n", string=process_text2)
    if piece
]
# Inspect the cleaned lines
print(process_text2)

In [None]:
# Glue the cleaned lines together (no separator) and explode the result into
# a list of single characters, then report how many characters we have.
process_string2 = "".join(process_text2)
char_text2 = [ch for ch in process_string2 if ch]
print(len(char_text2))

In [None]:
# Get all unique characters for creating a character embedding.
# Iterating over a string never yields '', so there is no empty entry to
# strip; the previous unconditional remove('') raised ValueError.
unique_chars2 = sorted(set(process_string2))
# Map each character to an integer (its position in sorted order)
mapping2 = {c: i for i, c in enumerate(unique_chars2)}
vocab_size2 = len(unique_chars2)
# See our mapping
print(mapping2)

In [None]:
# Restore the pickled Shakespeare character mapping; its number of entries
# is the vocabulary size.
with open("mapping.pkl", "rb") as mapping_file:
    mapping = load(mapping_file)
vocab_size = len(mapping)

In [None]:
# NOTE: we decided to use only the Shakespeare mapping for both data sets.
# Here we find the characters that occur in the Spenser sonnets but have no
# entry in that mapping, and count how often each one appears, to confirm
# that removing them is not significant.
# The comparison uses the keys of the loaded `mapping`; the previous code
# referenced `unique_chars`, which is never defined in this notebook.
diff = sorted(set(unique_chars2) - set(mapping))
print(diff)
# Occurrence count for every extra character (previously hard-coded as &, X, Q)
print({extra: char_text2.count(extra) for extra in diff})

In [None]:
# Remove every character that has no entry in the Shakespeare mapping and
# make sure none is left in the text.
# Filtering against `mapping` replaces the former hard-coded list.remove()
# calls, which removed a fixed number of '&', 'X' and 'Q' and raised
# ValueError when the cell was run a second time or the counts differed.
chars_before = len(char_text2)
char_text2 = [ch for ch in char_text2 if ch in mapping]
print(chars_before - len(char_text2), "characters removed")
# Same checks as before: each of these counts must now be 0
print(char_text2.count('&'))
print(char_text2.count('X'))
print(char_text2.count('Q'))
assert all(ch in mapping for ch in char_text2), "unmapped characters remain"

In [None]:
# Redo mapping to verify.
# Build the vocabulary from `char_text2` (the character list after removal),
# not from `process_string2`: that string is never modified, so recomputing
# from it would show the unwanted characters again and verify nothing.
# No remove('') is needed: single characters are never the empty string, and
# the unconditional call raised ValueError.
unique_chars2 = sorted(set(char_text2))
# Map each character to an integer (its position in sorted order)
mapping2 = {c: i for i, c in enumerate(unique_chars2)}
vocab_size2 = len(unique_chars2)
# See our mapping
print(mapping2)

In [None]:
# Also parse into sequences of 40 (training) + 1 (prediction) characters.
length2 = 40 + 1
# Slide a window of `length2` characters over the text one position at a
# time. A text of N characters holds N - length2 + 1 such windows; the old
# loop `range(length2, len(char_text2))` stopped one short and never used the
# final character. A text shorter than one window yields an empty list.
sequences2 = [
    ''.join(char_text2[start:start + length2])
    for start in range(len(char_text2) - length2 + 1)
]
print(len(sequences2))

In [None]:
# Translate every sequence into a list of integers by looking each of its
# characters up in the Shakespeare mapping, then show the length of the
# first encoded sequence.
num_char2 = [[mapping[ch] for ch in text] for text in sequences2]
print(len(num_char2[0]))

In [None]:
# Turn the encoded sequences into an array, then separate inputs from
# targets: every column except the last goes to X2, the last column to y2.
num_char2 = np.array(num_char2)
X2 = num_char2[:, :-1]
y2 = num_char2[:, -1]
print(X2.shape, y2.shape, sep="\n")

In [None]:
# One-hot encode inputs and targets against the Shakespeare vocabulary.
# to_categorical accepts a multi-dimensional integer array and appends the
# class axis, so a single call replaces the per-row Python loop and the extra
# full-size copy that np.array() made of the resulting list.
X2 = to_categorical(X2, num_classes=len(mapping))
y2 = to_categorical(y2, num_classes=len(mapping))
print(X2.shape)
print(y2.shape)

In [None]:
# Persist the cleaned character list so later runs can reload it.
# The list is written with pickle (binary), even though the file is named .txt.
with open("process_spenser.txt", "wb") as chars_file:
    dump(char_text2, chars_file)

In [None]:
# Persist the ready-to-run Spenser inputs (X2) and targets (y2).
# Both arrays are written with pickle; the .h5 names are only labels.
with open("X_training_spenser.h5", "wb") as inputs_file:
    dump(X2, inputs_file)
with open("y_training_spenser.h5", "wb") as targets_file:
    dump(y2, targets_file)

In [None]:
# Build the combined training set: Spenser samples first, Shakespeare
# samples after them, joined along the sample (first) axis.
X_train = np.concatenate([X2, X], axis=0)
print(X_train.shape)
y_train = np.concatenate([y2, y], axis=0)
print(y_train.shape)

In [None]:
# We continue training directly from Model 3
model = load_model("model3.h5")
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Model 4 training round: categorical cross-entropy loss, RMSprop optimizer,
# accuracy metric; one run of this cell trains for 5 epochs, batch size 64.
# NOTE(review): the original notes read "saved model every 5 epochs" and
# "total 20 epochs", but this cell neither saves the model nor trains for more
# than 5 epochs -- presumably it was re-run and the model saved by hand; confirm.
model.compile(
    optimizer='RMSprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
)