In [6]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import random
import sys
import io
import requests
import re


In [11]:

# Load the training corpus from disk and normalise it to lower case.
filename = "Text.txt"
# A context manager guarantees the file handle is closed once the text is read.
# NOTE(review): no encoding is passed, so the platform default is used -- confirm
# the file's actual encoding and pass it explicitly if it differs.
with open(filename) as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()

# Preview the first 1000 characters as a sanity check of what was loaded.
print(raw_text[0:1000])

the project gutenberg ebook of treasure island, by robert louis stevenson

this ebook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  you may copy it, give it away or
re-use it under the terms of the project gutenberg license included
with this ebook or online at www.gutenberg.net


title: treasure island

author: robert louis stevenson

illustrator: milo winter

release date: january 12, 2009 [ebook #27780]

language: english


*** start of this project gutenberg ebook treasure island ***




produced by juliet sutherland, stephen blundell and the
online distributed proofreading team at http://www.pgdp.net









 the illustrated children's library


         _treasure island_

       robert louis stevenson

          _illustrated by_
            milo winter


           [illustration]


           gramercy books
              new york




 foreword copyright © 1986 by random house value publishing
 color illustrations by milo winter copyrig

In [8]:
# Lower-case the corpus and delete every character outside the 7-bit ASCII range.
non_ascii_pattern = re.compile(r'[^\x00-\x7f]')
processed_text = non_ascii_pattern.sub(r'', raw_text.lower())


In [9]:
# Vocabulary: the distinct characters of the processed text in sorted order,
# plus a lookup from each character to its position in that ordering.
chars = sorted(set(processed_text))
char_to_int = {char: index for index, char in enumerate(chars)}

In [10]:
# Record the corpus length and the vocabulary size, then report both.
n_chars, n_vocab = len(processed_text), len(chars)
print( "Total Characters: ", n_chars)
print( "Total Vocab: ", n_vocab)

Total Characters:  389308
Total Vocab:  59


In [12]:
# Display the character vocabulary held in `chars`.
print(chars)

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [13]:
# prepare the dataset of input to output pairs encoded as integers
# Each input is a window of `seq_length` consecutive characters; its target is
# the single character that immediately follows the window.
seq_length = 100
# Encode the whole corpus once instead of re-encoding every character for each
# of the ~seq_length windows it belongs to.
encoded_text = [char_to_int[char] for char in processed_text]
# Slicing produces a fresh list per window, so the windows do not share storage.
dataX = [encoded_text[i:i + seq_length] for i in range(0, n_chars - seq_length)]
# The target for window i is the character at position i + seq_length.
dataY = encoded_text[seq_length:n_chars]
n_patterns = len(dataX)
print( "Total Patterns: ", n_patterns)


Total Patterns:  389208


In [14]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize: divide the integer codes by the vocabulary size
X = X / float(n_vocab)
# one hot encode the output variable
# num_classes is pinned to the vocabulary size; without it the width of y is
# inferred from the largest label present in dataY, which would be too small
# if the highest-indexed character never occurs as a target.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [15]:
# Network: one 256-unit LSTM layer, 20% dropout, then a softmax layer with one
# output per column of y. The input shape is taken from X (time steps, features).
n_timesteps, n_features = X.shape[1], X.shape[2]
model = Sequential([
    LSTM(256, input_shape=(n_timesteps, n_features)),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')


In [16]:

# Checkpoint callback: monitors the training loss and saves a checkpoint only
# when that loss reaches a new minimum. The epoch number and loss value are
# embedded in the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [17]:
# Train for 5 epochs in mini-batches of 128, checkpointing through callbacks_list.
model.fit(
    X,
    y,
    epochs=5,
    batch_size=128,
    callbacks=callbacks_list,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x273dbcdcb70>

In [21]:
# load the network weights
# Restore weights from a checkpoint file written during training, then
# recompile the model with the same loss and optimizer used when it was defined.
# NOTE(review): the file name embeds the epoch number and loss of one specific
# training run; a fresh run will presumably produce different loss values, so
# this name must be updated to match a file that actually exists -- confirm.
# Alternative checkpoint kept for reference (apparently from a longer run):
#filename = "weights-improvement-19-2.0368.hdf5"
filename = "weights-improvement-04-2.4635.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')


In [22]:
# Reverse lookup: position in the sorted vocabulary -> character.
int_to_char = dict(enumerate(chars))

In [23]:
# pick a random seed
# numpy.random.randint excludes its upper bound, so passing len(dataX) makes
# every pattern (including the last one) eligible.
start = numpy.random.randint(0, len(dataX))
# Work on a copy: the loop below appends to `pattern`, and aliasing the list
# stored in dataX would leave that training pattern one element too long.
pattern = list(dataX[start])
print( "Seed:")
print( "\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    # Shape the current window as a single sample and scale it the same way
    # the training inputs were scaled.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window forward: add the predicted character, drop the oldest.
    pattern.append(index)
    pattern = pattern[1:]
print( "\nDone.")

Seed:
"  and well; he was the oldest of our party by a score
of years; and now, sullen, old, serviceable ser "
 the sore to the tooee of the sooee and the sooee to the tooee th tee soone to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the tooee th tee soine to the 