In [3]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [4]:
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` by up to ``n_words`` predicted words.

    On every step the text generated so far is integer-encoded with
    ``tokenizer``, passed through ``pad_sequences`` with ``maxlen=max_length``
    and fed to ``model``. The highest-scoring vocabulary index is mapped back
    to its word and appended to the text.

    Args:
        model: Trained model whose ``predict`` returns one row of per-word
            scores for each input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (a word -> integer index mapping).
        max_length: Number of context tokens handed to the model.
        seed_text: Text to start generating from.
        n_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, each preceded by a
        single space. Fewer than ``n_words`` words are appended if the model
        predicts an index that has no word in the vocabulary.
    """
    # Invert the vocabulary once instead of scanning it on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # `predict_classes` is not available on current Keras models, so take
        # the argmax of the predicted scores for the single input row instead
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(scores.argmax(axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(predicted_index)
        if out_word is None:
            # No word carries this index, so the input (and therefore the
            # prediction) would never change again: stop instead of
            # appending bare spaces.
            break
        # append to input
        in_text += ' ' + out_word
    return in_text

In [5]:
# Toy training corpus: nursery-rhyme lines mixed with a few free-form
# sentences. Every line ends with a literal "\n" escape followed by a real
# line break, and the tabs / trailing spaces are part of the string, so the
# text must not be re-indented.
data = """ Jack and Jill went up the hill\n
They're heading in this direction. What are we going to do?  We'll be sent to the spice mines of Kessel or smashed into who knows what!\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n 
		After Demo need a validation \n      
		Did you hear that?  They've shut down the main reactor We'll be destroyed for sure This is madness!\n 
		Did he hear that?  He will protest against this.\n 
		Did she hear that?  She will have it published for sure. This is madness!\n 
        """


In [6]:
# Fit the tokenizer on the corpus, then turn the corpus into one flat list
# of integer word indices (a single text goes in, a single sequence comes out).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
(encoded,) = tokenizer.texts_to_sequences([data])


In [7]:
# sanity-check the integer encoding produced by the cell above; the tokenizer
# is fitted there, so it is not re-fitted here
assert len(encoded) > 0, 'corpus encoded to an empty sequence'
assert all(1 <= index <= len(tokenizer.word_index) for index in encoded), \
    'encoded word index outside the fitted vocabulary'

In [8]:
# size of the vocabulary: one slot per known word plus one extra slot
# (presumably for index 0, which no word is mapped to -- confirm)
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % (vocab_size,))

Vocabulary Size: 67


In [9]:
# encode 2 words -> 1 word
sequences = list()
for i in range(2, len(encoded)):
    sequence = encoded[i-2:i+1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 95


In [10]:
# pad sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 3


In [11]:
# display the padded sequences; each row is later split into its inputs
# (all but the last column) and its target (the last column)
sequences

array([[ 8,  2,  9],
       [ 2,  9, 24],
       [ 9, 24, 25],
       [24, 25,  3],
       [25,  3, 26],
       [ 3, 26, 27],
       [26, 27, 28],
       [27, 28, 29],
       [28, 29,  1],
       [29,  1, 30],
       [ 1, 30, 10],
       [30, 10, 31],
       [10, 31, 32],
       [31, 32, 33],
       [32, 33,  4],
       [33,  4, 34],
       [ 4, 34, 11],
       [34, 11, 12],
       [11, 12, 35],
       [12, 35,  4],
       [35,  4,  3],
       [ 4,  3, 36],
       [ 3, 36, 37],
       [36, 37, 13],
       [37, 13, 38],
       [13, 38, 39],
       [38, 39, 40],
       [39, 40, 41],
       [40, 41, 42],
       [41, 42, 43],
       [42, 43, 10],
       [43, 10,  4],
       [10,  4, 44],
       [ 4, 44, 14],
       [44, 14, 45],
       [14, 45, 13],
       [45, 13, 46],
       [13, 46,  8],
       [46,  8, 47],
       [ 8, 47, 15],
       [47, 15,  2],
       [15,  2, 48],
       [ 2, 48, 49],
       [48, 49, 50],
       [49, 50,  2],
       [50,  2,  9],
       [ 2,  9, 51],
       [ 9, 5

In [13]:
# split each row into its inputs (every column but the last) and its target
# (the last column), then one-hot encode the targets over the vocabulary
sequences = array(sequences)
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [14]:
# display the input part of every row (all columns except the last)
sequences[:, :-1]

array([[ 8,  2],
       [ 2,  9],
       [ 9, 24],
       [24, 25],
       [25,  3],
       [ 3, 26],
       [26, 27],
       [27, 28],
       [28, 29],
       [29,  1],
       [ 1, 30],
       [30, 10],
       [10, 31],
       [31, 32],
       [32, 33],
       [33,  4],
       [ 4, 34],
       [34, 11],
       [11, 12],
       [12, 35],
       [35,  4],
       [ 4,  3],
       [ 3, 36],
       [36, 37],
       [37, 13],
       [13, 38],
       [38, 39],
       [39, 40],
       [40, 41],
       [41, 42],
       [42, 43],
       [43, 10],
       [10,  4],
       [ 4, 44],
       [44, 14],
       [14, 45],
       [45, 13],
       [13, 46],
       [46,  8],
       [ 8, 47],
       [47, 15],
       [15,  2],
       [ 2, 48],
       [48, 49],
       [49, 50],
       [50,  2],
       [ 2,  9],
       [ 9, 51],
       [51, 52],
       [52, 16],
       [16, 16],
       [16, 53],
       [53, 54],
       [54, 14],
       [14, 55],
       [55,  5],
       [ 5, 56],
       [56,  6],
       [ 6,  7

In [15]:
# display the target part of every row (the last column), before one-hot encoding
sequences[:, -1]

array([ 9, 24, 25,  3, 26, 27, 28, 29,  1, 30, 10, 31, 32, 33,  4, 34, 11,
       12, 35,  4,  3, 36, 37, 13, 38, 39, 40, 41, 42, 43, 10,  4, 44, 14,
       45, 13, 46,  8, 47, 15,  2, 48, 49, 50,  2,  9, 51, 52, 16, 16, 53,
       54, 14, 55,  5, 56,  6,  7, 57, 58, 15,  3, 59, 60, 11, 12, 61, 17,
       18,  1, 19, 20,  5, 21,  6,  7, 21, 22, 62, 63,  1,  5, 23,  6,  7,
       23, 22, 64, 65, 66, 17, 18,  1, 19, 20], dtype=int32)

In [17]:
# define model: 10-dimensional word embedding -> LSTM with 50 units ->
# softmax over the whole vocabulary
model = Sequential()
# the network sees max_length-1 tokens; the last token of each sequence is the target
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line to the output)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 10)             670       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 67)                3417      
Total params: 16,287
Trainable params: 16,287
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network; the History object returned by fit() is kept in a variable so
# it can be inspected later and its repr is not echoed as the cell's output
history = model.fit(X, y, epochs=500, verbose=2)

Epoch 1/500
3/3 - 2s - loss: 4.2059 - accuracy: 0.0000e+00
Epoch 2/500
3/3 - 0s - loss: 4.2040 - accuracy: 0.0105
Epoch 3/500
3/3 - 0s - loss: 4.2026 - accuracy: 0.0316
Epoch 4/500
3/3 - 0s - loss: 4.2012 - accuracy: 0.0421
Epoch 5/500
3/3 - 0s - loss: 4.1998 - accuracy: 0.0526
Epoch 6/500
3/3 - 0s - loss: 4.1985 - accuracy: 0.0632
Epoch 7/500
3/3 - 0s - loss: 4.1970 - accuracy: 0.0632
Epoch 8/500
3/3 - 0s - loss: 4.1953 - accuracy: 0.0632
Epoch 9/500
3/3 - 0s - loss: 4.1936 - accuracy: 0.0632
Epoch 10/500
3/3 - 0s - loss: 4.1919 - accuracy: 0.0632
Epoch 11/500
3/3 - 0s - loss: 4.1902 - accuracy: 0.0737
Epoch 12/500
3/3 - 0s - loss: 4.1881 - accuracy: 0.0737
Epoch 13/500
3/3 - 0s - loss: 4.1861 - accuracy: 0.0737
Epoch 14/500
3/3 - 0s - loss: 4.1838 - accuracy: 0.0737
Epoch 15/500
3/3 - 0s - loss: 4.1811 - accuracy: 0.0737
Epoch 16/500
3/3 - 0s - loss: 4.1785 - accuracy: 0.0737
Epoch 17/500
3/3 - 0s - loss: 4.1755 - accuracy: 0.0842
Epoch 18/500
3/3 - 0s - loss: 4.1721 - accuracy: 0.08

<tensorflow.python.keras.callbacks.History at 0x7f93cd4aee10>

In [29]:
# extend the one-word seed with 5 generated words
print(generate_seq(model, tokenizer, max_length - 1, seed_text='destroyed', n_words=5))

destroyed for sure this is madness


In [27]:
# extend the seed with 10 generated words
print(generate_seq(model, tokenizer, max_length - 1, seed_text='Did you hear that', n_words=10))

Did you hear that she will have it published for sure this is madness


In [23]:
# extend the seed with 10 generated words
print(generate_seq(model, tokenizer, max_length - 1, seed_text='Jack fell down', n_words=10))

Jack fell down and broke his crown and jill came tumbling after after


In [22]:
# extend the seed with 3 generated words; the seed carries no trailing space
# because generate_seq already inserts one space before every appended word
print(generate_seq(model, tokenizer, max_length-1, 'After Demo', 3))

After Demo  need a validation
