In [0]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
 

Using TensorFlow backend.


In [0]:
# source text: the four-line "Jack and Jill" rhyme, used as the whole corpus in this notebook.
# Each verse line ends with an escaped \n AND a real line break, and continuation lines are
# tab-indented, so the string holds extra whitespace between verses.
# NOTE(review): presumably harmless because the Tokenizer fitted on it drops tabs/newlines
# with its default filters -- confirm if the filters are ever customised.
data = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """


In [0]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	"""Greedily extend ``seed_text`` with ``n_words`` predicted words.

	At every step the text generated so far is integer-encoded, pre-padded /
	truncated to ``max_length`` tokens, and fed to ``model``; the most
	probable vocabulary index is mapped back to its word and appended.

	Args:
		model: object exposing ``predict(x, verbose=0)`` that returns an array
			of per-index scores with the vocabulary on the last axis.
		tokenizer: object exposing ``texts_to_sequences`` and ``word_index``
			(word -> integer index).
		max_length: number of tokens the model expects as input.
		seed_text: starting text; returned unchanged when ``n_words`` is 0.
		n_words: how many words to append.

	Returns:
		``seed_text`` followed by the generated words, separated by single
		spaces. If a predicted index has no word in ``word_index`` an empty
		word is appended (i.e. only the separating space).
	"""
	# invert the vocabulary once instead of scanning it for every generated word
	index_to_word = {index: word for word, index in tokenizer.word_index.items()}
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict_classes() no longer exists in recent Keras releases; taking the
		# argmax of predict() gives the same class index and works on old and new versions
		probabilities = model.predict(encoded, verbose=0)
		predicted_index = int(probabilities.argmax(axis=-1)[0])
		# map predicted word index to word ('' when the index is not in the vocabulary)
		out_word = index_to_word.get(predicted_index, '')
		# append to input
		in_text += ' ' + out_word
	return in_text

In [0]:
# fit a tokenizer on the corpus, then turn the corpus into one flat list of word ids
tokenizer = Tokenizer()
corpus = [data]
tokenizer.fit_on_texts(corpus)
encoded_texts = tokenizer.texts_to_sequences(corpus)
encoded = encoded_texts[0]

# vocabulary size is one more than the number of distinct words
# (word ids presumably start at 1, leaving 0 free for padding -- confirm in the Tokenizer docs)
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)


Vocabulary Size: 22


In [0]:
# slide a 3-token window over the encoded text: 2 context words -> 1 target word
sequences = [encoded[start:start + 3] for start in range(len(encoded) - 2)]
print('Total Sequences: %d' % len(sequences))


Total Sequences: 23


In [0]:
# left-pad every window to the length of the longest one
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, padding='pre', maxlen=max_length)
print('Max Sequence Length: %d' % max_length)


Max Sequence Length: 3


In [0]:
# separate each padded window into its context (all but the last id) and its target (the last id)
sequences = array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]
# one-hot encode the target over the whole vocabulary
y = to_categorical(y, num_classes=vocab_size)


In [0]:
# define model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential()
# every input is a window of max_length-1 word ids, each embedded into 10 dimensions
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
# one output unit per vocabulary index
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line after the table
model.summary()


W0716 22:01:53.910820  4440 deprecation_wrapper.py:119] From C:\Users\RAHUL\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\backend\tensorflow_backend.py:63: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0716 22:01:53.948847  4440 deprecation_wrapper.py:119] From C:\Users\RAHUL\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\backend\tensorflow_backend.py:492: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0716 22:01:53.952849  4440 deprecation_wrapper.py:119] From C:\Users\RAHUL\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\backend\tensorflow_backend.py:3630: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0716 22:01:54.071583  4440 deprecation.py:506] From C:\Users\RAHUL\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\backend\tensorflow_backend.py:1242: calling reduce_sum_v1 (from tensorflow.python.op

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 10)             220       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
# configure training: Adam optimizer, categorical cross-entropy loss, accuracy reported per epoch
model.compile(
	optimizer='adam',
	loss='categorical_crossentropy',
	metrics=['accuracy'],
)
# train for 500 epochs on the full data set, logging one line per epoch
model.fit(x=X, y=y, epochs=500, verbose=2)


W0716 22:01:54.469086  4440 deprecation_wrapper.py:119] From C:\Users\RAHUL\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\optimizers.py:711: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0716 22:01:54.539178  4440 deprecation_wrapper.py:119] From C:\Users\RAHUL\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\backend\tensorflow_backend.py:2884: The name tf.log is deprecated. Please use tf.math.log instead.

W0716 22:01:54.761772  4440 deprecation.py:323] From C:\Users\RAHUL\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0716 22:01:56.124823  4440 deprecation_wrapper.py:119] From C:\Users\RAHUL\AppData\Local\Programs\Python\Python37\lib

Epoch 1/500
 - 1s - loss: 3.0919 - acc: 0.0000e+00
Epoch 2/500
 - 0s - loss: 3.0912 - acc: 0.0000e+00
Epoch 3/500
 - 0s - loss: 3.0904 - acc: 0.0435
Epoch 4/500
 - 0s - loss: 3.0896 - acc: 0.0870
Epoch 5/500
 - 0s - loss: 3.0887 - acc: 0.0870
Epoch 6/500
 - 0s - loss: 3.0878 - acc: 0.0870
Epoch 7/500
 - 0s - loss: 3.0869 - acc: 0.0870
Epoch 8/500
 - 0s - loss: 3.0860 - acc: 0.0870
Epoch 9/500
 - 0s - loss: 3.0851 - acc: 0.0870
Epoch 10/500
 - 0s - loss: 3.0842 - acc: 0.0870
Epoch 11/500
 - 0s - loss: 3.0832 - acc: 0.0870
Epoch 12/500
 - 0s - loss: 3.0823 - acc: 0.0870
Epoch 13/500
 - 0s - loss: 3.0813 - acc: 0.0870
Epoch 14/500
 - 0s - loss: 3.0803 - acc: 0.0870
Epoch 15/500
 - 0s - loss: 3.0792 - acc: 0.0870
Epoch 16/500
 - 0s - loss: 3.0782 - acc: 0.0870
Epoch 17/500
 - 0s - loss: 3.0771 - acc: 0.0870
Epoch 18/500
 - 0s - loss: 3.0760 - acc: 0.0870
Epoch 19/500
 - 0s - loss: 3.0748 - acc: 0.0870
Epoch 20/500
 - 0s - loss: 3.0736 - acc: 0.0870
Epoch 21/500
 - 0s - loss: 3.0724 - acc: 

Epoch 171/500
 - 0s - loss: 1.0878 - acc: 0.8696
Epoch 172/500
 - 0s - loss: 1.0706 - acc: 0.8696
Epoch 173/500
 - 0s - loss: 1.0533 - acc: 0.8696
Epoch 174/500
 - 0s - loss: 1.0361 - acc: 0.8696
Epoch 175/500
 - 0s - loss: 1.0190 - acc: 0.8696
Epoch 176/500
 - 0s - loss: 1.0019 - acc: 0.8696
Epoch 177/500
 - 0s - loss: 0.9849 - acc: 0.8696
Epoch 178/500
 - 0s - loss: 0.9680 - acc: 0.8696
Epoch 179/500
 - 0s - loss: 0.9511 - acc: 0.8696
Epoch 180/500
 - 0s - loss: 0.9344 - acc: 0.8696
Epoch 181/500
 - 0s - loss: 0.9177 - acc: 0.8696
Epoch 182/500
 - 0s - loss: 0.9012 - acc: 0.8696
Epoch 183/500
 - 0s - loss: 0.8847 - acc: 0.8696
Epoch 184/500
 - 0s - loss: 0.8684 - acc: 0.8696
Epoch 185/500
 - 0s - loss: 0.8522 - acc: 0.8696
Epoch 186/500
 - 0s - loss: 0.8362 - acc: 0.8696
Epoch 187/500
 - 0s - loss: 0.8204 - acc: 0.8696
Epoch 188/500
 - 0s - loss: 0.8047 - acc: 0.8696
Epoch 189/500
 - 0s - loss: 0.7892 - acc: 0.8696
Epoch 190/500
 - 0s - loss: 0.7740 - acc: 0.8696
Epoch 191/500
 - 0s 

Epoch 339/500
 - 0s - loss: 0.0994 - acc: 0.9565
Epoch 340/500
 - 0s - loss: 0.0990 - acc: 0.9565
Epoch 341/500
 - 0s - loss: 0.0985 - acc: 0.9565
Epoch 342/500
 - 0s - loss: 0.0981 - acc: 0.9565
Epoch 343/500
 - 0s - loss: 0.0977 - acc: 0.9565
Epoch 344/500
 - 0s - loss: 0.0973 - acc: 0.9565
Epoch 345/500
 - 0s - loss: 0.0969 - acc: 0.9565
Epoch 346/500
 - 0s - loss: 0.0965 - acc: 0.9565
Epoch 347/500
 - 0s - loss: 0.0961 - acc: 0.9565
Epoch 348/500
 - 0s - loss: 0.0957 - acc: 0.9565
Epoch 349/500
 - 0s - loss: 0.0953 - acc: 0.9565
Epoch 350/500
 - 0s - loss: 0.0949 - acc: 0.9565
Epoch 351/500
 - 0s - loss: 0.0946 - acc: 0.9565
Epoch 352/500
 - 0s - loss: 0.0942 - acc: 0.9565
Epoch 353/500
 - 0s - loss: 0.0938 - acc: 0.9565
Epoch 354/500
 - 0s - loss: 0.0935 - acc: 0.9565
Epoch 355/500
 - 0s - loss: 0.0932 - acc: 0.9565
Epoch 356/500
 - 0s - loss: 0.0928 - acc: 0.9565
Epoch 357/500
 - 0s - loss: 0.0925 - acc: 0.9565
Epoch 358/500
 - 0s - loss: 0.0922 - acc: 0.9565
Epoch 359/500
 - 0s 

<keras.callbacks.History at 0x7460e1cc88>

In [0]:
# evaluate model: continue a few two-word prompts and print each generated line
prompts = [
	('Jack and', 5),
	('And Jill', 3),
	('fell down', 5),
	('pail of', 5),
]
for seed_text, n_words in prompts:
	print(generate_seq(model, tokenizer, max_length-1, seed_text, n_words))

Jack and jill went up the hill
And Jill went up the
fell down and broke his crown and
pail of water jack fell down and
