In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# read in the text, transforming everything to lower case
text = open('datasets/holmes.txt').read().lower()
print('original text has ' + str(len(text)) + ' characters')

original text has 581881 characters


In [5]:

text[:2000]

"ï»¿project gutenberg's the adventures of sherlock holmes, by arthur conan doyle\n\nthis ebook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  you may copy it, give it away or\nre-use it under the terms of the project gutenberg license included\nwith this ebook or online at www.gutenberg.net\n\n\ntitle: the adventures of sherlock holmes\n\nauthor: arthur conan doyle\n\nposting date: april 18, 2011 [ebook #1661]\nfirst posted: november 29, 2002\n\nlanguage: english\n\n\n*** start of this project gutenberg ebook the adventures of sherlock holmes ***\n\n\n\n\nproduced by an anonymous project gutenberg volunteer and jose menendez\n\n\n\n\n\n\n\n\n\nthe adventures of sherlock holmes\n\nby\n\nsir arthur conan doyle\n\n\n\n   i. a scandal in bohemia\n  ii. the red-headed league\n iii. a case of identity\n  iv. the boscombe valley mystery\n   v. the five orange pips\n  vi. the man with the twisted lip\n vii. the adventure of the blue carbuncle\nviii. 

In [6]:
#removing symbols
text = text[1302:]
text = text.replace('\n',' ')    # replacing '\n' with '' simply removes the sequence
text = text.replace('\r',' ')

In [7]:
text[:1000]

" his eyes she eclipses and predominates the whole of her sex. it was not that he felt any emotion akin to love for irene adler. all emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. he was, i take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position. he never spoke of the softer passions, save with a gibe and a sneer. they were admirable things for the observer--excellent for drawing the veil from men's motives and actions. but for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results. grit in a sensitive instrument, or a crack in one of his own high-power lenses, would not be more disturbing than a strong emotion in a nature such as his. and yet there was but one woman to him, and that woman was the late irene

In [8]:
import string

allowed_chars = string.ascii_lowercase + ' ' + '!' + ',' + '.' + ':' + ';' + '?'

# remove as many non-english characters and character sequences as you can 
for char in text:
    if char not in allowed_chars:
        text = text.replace(char, ' ')

# shorten any extra dead space created above
text = text.replace('  ',' ')

In [9]:
text[:1000]

' his eyes she eclipses and predominates the whole of her sex. it was not that he felt any emotion akin to love for irene adler. all emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. he was, i take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position. he never spoke of the softer passions, save with a gibe and a sneer. they were admirable things for the observer excellent for drawing the veil from men s motives and actions. but for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results. grit in a sensitive instrument, or a crack in one of his own high power lenses, would not be more disturbing than a strong emotion in a nature such as his. and yet there was but one woman to him, and that woman was the late irene 

In [11]:
chars = sorted(list(set(text)))

print ("total number of characters " +  str(len(text)) )
print ("unique characters= " +  str(len(chars)) )

total number of characters 573688
unique characters= 33


In [12]:
#we are using sliding windows concept here
def window_transform_text(text,window_size,step_size):
    # containers for input/output pairs
    inputs = []
    outputs = []
    ctr = 0
    
    # Goes from window_size until the end, and pick previous characters
    for i in range(window_size, len(text), step_size):
        inputs.append(text[ctr:i])
        outputs.append(text[i])
        ctr = ctr + step_size
    
    return inputs,outputs

In [13]:
window_size = 100
step_size = 5
inputs, outputs = window_transform_text(text,window_size,step_size)

In [17]:
chars = sorted(list(set(text)))
print("unique characters are")
print(chars)

unique characters are
[' ', '!', ',', '.', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [18]:
chars_to_indices = dict((c, i) for i, c in enumerate(chars))  # map each unique character to unique integer

indices_to_chars = dict((i, c) for i, c in enumerate(chars))  # map each unique integer back to unique character

In [19]:
chars_to_indices

{' ': 0,
 '!': 1,
 ',': 2,
 '.': 3,
 ':': 4,
 ';': 5,
 '?': 6,
 'a': 7,
 'b': 8,
 'c': 9,
 'd': 10,
 'e': 11,
 'f': 12,
 'g': 13,
 'h': 14,
 'i': 15,
 'j': 16,
 'k': 17,
 'l': 18,
 'm': 19,
 'n': 20,
 'o': 21,
 'p': 22,
 'q': 23,
 'r': 24,
 's': 25,
 't': 26,
 'u': 27,
 'v': 28,
 'w': 29,
 'x': 30,
 'y': 31,
 'z': 32}

In [20]:
# transform character-based input/output into equivalent numerical numbers
def encode_io_pairs(text,window_size,step_size):
    # number of unique chars
    chars = sorted(list(set(text)))
    num_chars = len(chars)
    
    # cut up text into character input/output pairs
    inputs, outputs = window_transform_text(text,window_size,step_size)
    
    # create empty vessels for one-hot encoded input/output
    X = np.zeros((len(inputs), window_size, num_chars), dtype=np.bool)
    y = np.zeros((len(inputs), num_chars), dtype=np.bool)
    
    # loop over inputs/outputs and tranform and store in X/y
    for i, sentence in enumerate(inputs):
        for t, char in enumerate(sentence):
            X[i, t, chars_to_indices[char]] = 1
        y[i, chars_to_indices[outputs[i]]] = 1
        
    return X,y

In [21]:
window_size = 100
step_size = 5
X,y = encode_io_pairs(text,window_size,step_size)

In [22]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
#from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from tensorflow.keras import optimizers
#import keras
import random

# TODO build the required RNN model: a single LSTM hidden layer with softmax activation, categorical_crossentropy loss 
model = Sequential()
model.add(LSTM(200, input_shape=(window_size, 33)))
model.add(Dense(33, activation='softmax'))

# initialize optimizer
optimizer = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

# compile model --> make sure initialized optimizer and callbacks - as defined above - are used
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [23]:
Xsmall = X[:10000,:,:]
ysmall = y[:10000,:]

In [24]:
# train the model
model.fit(Xsmall, ysmall, batch_size=500, epochs=40,verbose = 1)

# save weights
model.save_weights('model_weights/best_RNN_small_textdata_weights.hdf5')

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [25]:
# function that uses trained model to predict a desired number of future characters
def predict_next_chars(model,input_chars,num_to_predict):     
    # create output
    predicted_chars = ''
    for i in range(num_to_predict):
        # convert this round's predicted characters to numerical input    
        x_test = np.zeros((1, window_size, len(chars)))
        for t, char in enumerate(input_chars):
            x_test[0, t, chars_to_indices[char]] = 1.

        # make this round's prediction
        test_predict = model.predict(x_test,verbose = 0)[0]

        # translate numerical prediction back to characters
        r = np.argmax(test_predict)                           # predict class of each test input
        d = indices_to_chars[r] 

        # update predicted_chars and input
        predicted_chars+=d
        input_chars+=d
        input_chars = input_chars[1:]
    return predicted_chars

In [26]:
start_inds = [0, 500, 1000]

# load in weights
model.load_weights('model_weights/best_RNN_small_textdata_weights.hdf5')
for s in start_inds:
    start_index = s
    input_chars = text[start_index: start_index + window_size]

    # use the prediction function
    predict_input = predict_next_chars(model,input_chars,num_to_predict = 100)

    # print out input characters
    print('------------------')
    input_line = 'input chars = ' + '\n' +  input_chars + '"' + '\n'
    print(input_line)

    # print out predicted characters
    line = 'predicted chars = ' + '\n' +  predict_input + '"' + '\n'
    print(line)

------------------
input chars = 
 his eyes she eclipses and predominates the whole of her sex. it was not that he felt any emotion ak"

predicted chars = 
 of the cout in the couthe soull and betton the southe sout and he sout in the couthe soull an the t"

------------------
input chars = 
rver excellent for drawing the veil from men s motives and actions. but for the trained reasoner to "

predicted chars = 
he cout the the the mare and whith a sould and herise to the the the was he soull and the couthe sou"

------------------
input chars = 
dler, of dubious and questionable memory. i had seen little of holmes lately. my marriage had drifte"

predicted chars = 
d on the couthe southe sout and her inderome the coust and he was the there in the mast of the couth"



In [27]:
Xlarge = X[:100000,:,:]
ylarge = y[:100000,:]

# TODO: fit to our larger dataset
model.fit(Xlarge, ylarge, batch_size=500, epochs=30,verbose = 1)

# save weights
model.save_weights('model_weights/best_RNN_large_textdata_weights.hdf5')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [28]:
start_inds = [0, 500, 1000]

# save output
f = open('text_gen_output/RNN_large_textdata_output.txt', 'w')  # create an output file to write too

# load weights
model.load_weights('model_weights/best_RNN_large_textdata_weights.hdf5')
for s in start_inds:
    start_index = s
    input_chars = text[start_index: start_index + window_size]

    # use the prediction function
    predict_input = predict_next_chars(model,input_chars,num_to_predict = 100)

    # print out input characters
    line = '-------------------' + '\n'
    print(line)
    f.write(line)

    input_line = 'input chars = ' + '\n' +  input_chars + '"' + '\n'
    print(input_line)
    f.write(input_line)

    # print out predicted characters
    predict_line = 'predicted chars = ' + '\n' +  predict_input + '"' + '\n'
    print(predict_line)
    f.write(predict_line)
f.close()

-------------------

input chars = 
 his eyes she eclipses and predominates the whole of her sex. it was not that he felt any emotion ak"

predicted chars = 
ing to his eyes with a little street which i sen the lany of the windows and for the read in the str"

-------------------

input chars = 
rver excellent for drawing the veil from men s motives and actions. but for the trained reasoner to "

predicted chars = 
arvestered with a recume court of and of the glinds as if the past the street, and i shall be the fa"

-------------------

input chars = 
dler, of dubious and questionable memory. i had seen little of holmes lately. my marriage had drifte"

predicted chars = 
d to sut is my off comp.  it is a sigat of cayse and some to think that the came hadd an ond of the "

