In [1]:
import codecs
import glob
import os

import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read the corpus as UTF-8 (the 'utf-8-sig' codec drops a leading byte-order
# mark if one is present) and convert it to lowercase.
filename = "wonderland.txt"

with codecs.open(filename, mode='rb', encoding='utf-8-sig') as text_file:
    raw_text = text_file.read().lower()

In [3]:
import string
from string import punctuation

# Delete every ASCII punctuation character in a single pass. str.translate
# with a deletion table replaces the per-character Python filter + join and
# yields the same string.
raw_text = raw_text.translate(str.maketrans("", "", string.punctuation))
# Preview only the beginning of the cleaned text; echoing the whole corpus
# floods the notebook output.
raw_text[:500]



In [4]:
# Vocabulary: the distinct characters of the text in sorted order, plus a
# lookup from each character to its position in that ordering.
chars = sorted(set(raw_text))
char_to_int = {symbol: position for position, symbol in enumerate(chars)}

In [5]:
# Display the sorted list of distinct characters found in the text.
chars

['\n',
 ' ',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [6]:
# Corpus length in characters and vocabulary size (number of distinct characters).
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  134787
Total Vocab:  28


In [7]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
# Encode the whole text once. Each training window is then a plain list slice,
# instead of looking every character up again for each window it appears in.
encoded_text = [char_to_int[char] for char in raw_text]
# One pattern per start offset that still leaves a following character to
# predict; the range is empty when the text is not longer than seq_length.
window_starts = range(0, n_chars - seq_length, 1)
# dataX[i] holds seq_length consecutive character codes (a fresh list per
# window); dataY[i] is the code of the character that follows that window.
dataX = [encoded_text[i:i + seq_length] for i in window_starts]
dataY = [encoded_text[i + seq_length] for i in window_starts]
n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

Total Patterns:  134687


In [8]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize: scale the integer codes by the vocabulary size
X = X / float(n_vocab)
# one hot encode the output variable.
# num_classes is passed explicitly so the one-hot width always equals the
# vocabulary size; otherwise it is inferred from the largest value in dataY,
# which would be too narrow if the highest-indexed character never occurs
# as a target.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [9]:
# Two stacked 256-unit LSTM layers with 10% dropout in between, followed by a
# softmax layer with one output per one-hot target column.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.1),
    LSTM(256),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [10]:
# Checkpoint callback: write the weights to a file named after the epoch and
# loss whenever the monitored training loss reaches a new minimum.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [11]:
# Train for 100 epochs in batches of 1024 samples, with the callbacks in
# callbacks_list attached.
model.fit(
    X,
    y,
    batch_size=1024,
    epochs=100,
    callbacks=callbacks_list,
)

Epoch 1/100

Epoch 00001: loss improved from inf to 2.88651, saving model to weights-improvement-01-2.8865.hdf5
Epoch 2/100

Epoch 00002: loss improved from 2.88651 to 2.83998, saving model to weights-improvement-02-2.8400.hdf5
Epoch 3/100

Epoch 00003: loss improved from 2.83998 to 2.73718, saving model to weights-improvement-03-2.7372.hdf5
Epoch 4/100

Epoch 00004: loss improved from 2.73718 to 2.58947, saving model to weights-improvement-04-2.5895.hdf5
Epoch 5/100

Epoch 00005: loss improved from 2.58947 to 2.42867, saving model to weights-improvement-05-2.4287.hdf5
Epoch 6/100

Epoch 00006: loss improved from 2.42867 to 2.28771, saving model to weights-improvement-06-2.2877.hdf5
Epoch 7/100

Epoch 00007: loss improved from 2.28771 to 2.18276, saving model to weights-improvement-07-2.1828.hdf5
Epoch 8/100

Epoch 00008: loss improved from 2.18276 to 2.09400, saving model to weights-improvement-08-2.0940.hdf5
Epoch 9/100

Epoch 00009: loss improved from 2.09400 to 2.02059, saving mode


Epoch 00043: loss improved from 1.17382 to 1.15462, saving model to weights-improvement-43-1.1546.hdf5
Epoch 44/100

Epoch 00044: loss improved from 1.15462 to 1.14194, saving model to weights-improvement-44-1.1419.hdf5
Epoch 45/100

Epoch 00045: loss improved from 1.14194 to 1.12819, saving model to weights-improvement-45-1.1282.hdf5
Epoch 46/100

Epoch 00046: loss improved from 1.12819 to 1.11393, saving model to weights-improvement-46-1.1139.hdf5
Epoch 47/100

Epoch 00047: loss improved from 1.11393 to 1.09876, saving model to weights-improvement-47-1.0988.hdf5
Epoch 48/100

Epoch 00048: loss improved from 1.09876 to 1.08300, saving model to weights-improvement-48-1.0830.hdf5
Epoch 49/100

Epoch 00049: loss improved from 1.08300 to 1.06901, saving model to weights-improvement-49-1.0690.hdf5
Epoch 50/100

Epoch 00050: loss improved from 1.06901 to 1.05681, saving model to weights-improvement-50-1.0568.hdf5
Epoch 51/100

Epoch 00051: loss improved from 1.05681 to 1.04210, saving mode


Epoch 00086: loss improved from 0.66263 to 0.65880, saving model to weights-improvement-86-0.6588.hdf5
Epoch 87/100

Epoch 00087: loss improved from 0.65880 to 0.64254, saving model to weights-improvement-87-0.6425.hdf5
Epoch 88/100

Epoch 00088: loss improved from 0.64254 to 0.63968, saving model to weights-improvement-88-0.6397.hdf5
Epoch 89/100

Epoch 00089: loss improved from 0.63968 to 0.63316, saving model to weights-improvement-89-0.6332.hdf5
Epoch 90/100

Epoch 00090: loss improved from 0.63316 to 0.62264, saving model to weights-improvement-90-0.6226.hdf5
Epoch 91/100

Epoch 00091: loss improved from 0.62264 to 0.61227, saving model to weights-improvement-91-0.6123.hdf5
Epoch 92/100

Epoch 00092: loss improved from 0.61227 to 0.60756, saving model to weights-improvement-92-0.6076.hdf5
Epoch 93/100

Epoch 00093: loss improved from 0.60756 to 0.59814, saving model to weights-improvement-93-0.5981.hdf5
Epoch 94/100

Epoch 00094: loss improved from 0.59814 to 0.59485, saving mode

<keras.callbacks.History at 0x7fc4c0a16710>

In [12]:
# load the network weights
filename = "weights-improvement-100-0.5518.hdf5"
if not os.path.exists(filename):
    # The epoch and loss embedded in the checkpoint name come from one
    # particular training run, so that exact file is missing after any other
    # run. Fall back to the most recently written checkpoint instead.
    saved_checkpoints = glob.glob("weights-improvement-*.hdf5")
    if not saved_checkpoints:
        raise FileNotFoundError(
            "No 'weights-improvement-*.hdf5' checkpoint found; "
            "run the training cell first."
        )
    filename = max(saved_checkpoints, key=os.path.getmtime)
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [13]:
# Reverse lookup: position in the sorted vocabulary -> character.
int_to_char = dict(enumerate(chars))

In [16]:
import sys

# pick a random seed
# randint's upper bound is exclusive, so len(dataX) makes every pattern,
# including the last one, eligible.
start = numpy.random.randint(0, len(dataX))
# Work on a copy: the window is extended while generating, and extending the
# list stored in dataX would leave that training pattern with extra elements.
pattern = list(dataX[start])
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(500):
    # Shape the current window like the training input and scale it the same way.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: take the most probable next character.
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: drop the oldest character and append the new one.
    pattern = pattern[1:] + [index]
print ("\nDone.")

Seed:
"  that will be
the best plan
it sounded an excellent plan no doubt and very neatly and simply
arrange "
d the only difficulty was that she could not help it said the hatter with a sigh its all madbit
and was goleiri cle pearlng and put here it taid the king that so it has herself and was goldire by the others ar srrn as all that said the hatter would be thlif vo begin with another diggir with the nther the only difficulty the king that she had to gind her arm ann that got see she hame
the queen tasted and wery snon hamping fow to herself in a doean cursay that affine it is all the gryphon replied 
Done.
