# Model 3: Two-Words-In, One-Word-Out Sequence

## Step 01: Import Libraries

In [1]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from array import *
from numpy import array
import warnings
warnings.filterwarnings("ignore")

## Step 02: Input Text

In [2]:
# source text
# The training corpus is a single nursery rhyme held in one string.
# Each line ends with an explicit "\n" escape followed by a real line break,
# so the string contains blank lines between the verses.
# NOTE(review): presumably harmless because the tokenizer splits on whitespace — confirm.
data = """ Hey diddle diddle\n
The cat and the fiddle\n
The cow jumped over the moon\n
The little dog laughed to see such sport\n
And the dish ran away with the spoon\n """

## Step 03: Preprocessing of Data

In [3]:
# integer encode sequences of words
# Fit the tokenizer on the rhyme so every distinct word gets an integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# texts_to_sequences returns one list per input text; only one text is passed,
# so element 0 is the whole rhyme as a flat list of word indices.
encoded = tokenizer.texts_to_sequences([data])[0]

## Step 04: Data Analysis

In [4]:
# retrieve vocabulary size
# One more than the number of distinct words.
# NOTE(review): presumably because Keras word indices start at 1 and index 0 is
# reserved for padding — confirm against the Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1
print( "Vocabulary Size: %d" % vocab_size)

Vocabulary Size: 23


In [5]:

# encode 2 words -> 1 word
# Slide a window of three consecutive tokens over the encoded text; each window
# holds two context words followed by the word that comes next.
sequences = [encoded[start:start + 3] for start in range(len(encoded) - 2)]
print("Total Sequences: %d" % len(sequences))

Total Sequences: 28


In [6]:
# pad sequences
# Left-pad every sequence up to the length of the longest one.
max_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, padding="pre", maxlen=max_length)
print("Max Sequence Length: %d" % max_length)

Max Sequence Length: 3


## Step 05: Split into input and output

In [7]:
# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]


## Step 06: Encoding of Data

In [8]:
# Convert the integer target indices into categorical vectors with vocab_size classes,
# the target format used with the categorical_crossentropy loss the model is compiled with.
# NOTE: this rebinds y, so re-running this cell without first re-running the
# split cell would encode the already-encoded targets a second time.
y = to_categorical(y, num_classes=vocab_size)

## Step 07: Define the Model and Train It

In [9]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Extend ``seed_text`` with ``n_words`` greedily predicted words.

    On each step the text generated so far is re-encoded, padded/truncated to
    ``max_length`` tokens, fed to ``model``, and the highest-scoring word is
    appended.

    Args:
        model: Fitted model exposing ``predict(batch, verbose=0)`` that returns
            one row of per-word scores for each input sequence.
        tokenizer: Fitted tokenizer exposing ``word_index`` (word -> index) and
            ``texts_to_sequences``.
        max_length: Number of input tokens the model expects.
        seed_text: Text to start from.
        n_words: Number of words to generate.

    Returns:
        ``seed_text`` followed by the generated words, space separated.
    """
    # Imported locally so this cell stays runnable without touching the import cell.
    from numpy import argmax

    # Invert word -> index once instead of scanning the vocabulary per generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding="pre")
        # predict_classes() was removed from Keras; take the arg-max of the
        # predicted scores instead (one row, because one sequence was passed).
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(argmax(scores, axis=-1)[0])
        # map predicted word index to word; an index with no word (e.g. 0)
        # falls back to a single space, as before
        out_word = index_to_word.get(predicted_index, " ")
        # append to input
        in_text += " " + out_word
    return in_text


# define the model
def define_model(vocab_size, max_length, embedding_dim=10, lstm_units=50):
    """Build, compile and summarise the next-word prediction network.

    The network is Embedding -> LSTM -> softmax Dense over the vocabulary.

    Args:
        vocab_size: Number of classes / embedding rows.
        max_length: Length of a full training sequence including the target
            word; the model input is therefore ``max_length - 1`` tokens.
        embedding_dim: Size of each word embedding vector (default 10).
        lstm_units: Number of LSTM units (default 50).

    Returns:
        The compiled model. Its summary is printed as a side effect.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length - 1))
    model.add(LSTM(lstm_units))
    model.add(Dense(vocab_size, activation="softmax"))
    # compile network
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    # summarize defined model
    model.summary()
    # Optional architecture diagram, left disabled as in the original notebook:
    #plot_model(model, to_file= "model3.png" , show_shapes=True)
    return model

# define model
# Build the network for the vocabulary and sequence length computed above.
model = define_model(vocab_size, max_length)
# fit network
# Trains on the full set of (X, y) pairs for 500 epochs; verbose=2 prints one line per epoch.
# No batch size or validation data is passed, so the reported accuracy is training accuracy.
model.fit(X, y, epochs=500, verbose=2)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2, 10)             230       
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense (Dense)                (None, 23)                1173      
Total params: 13,603
Trainable params: 13,603
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
1/1 - 3s - loss: 3.1356 - accuracy: 0.0714
Epoch 2/500
1/1 - 0s - loss: 3.1340 - accuracy: 0.2143
Epoch 3/500
1/1 - 0s - loss: 3.1324 - accuracy: 0.2857
Epoch 4/500
1/1 - 0s - loss: 3.1307 - accuracy: 0.2500
Epoch 5/500
1/1 - 0s - loss: 3.1291 - accuracy: 0.2500
Epoch 6/500
1/1 - 0s - loss: 3.1274 - accuracy: 0.2500
Epoch 7/500
1/1 - 0s - loss: 3.1257 - accuracy: 0.25

Epoch 134/500
1/1 - 0s - loss: 1.9997 - accuracy: 0.4286
Epoch 135/500
1/1 - 0s - loss: 1.9861 - accuracy: 0.4286
Epoch 136/500
1/1 - 0s - loss: 1.9725 - accuracy: 0.4643
Epoch 137/500
1/1 - 0s - loss: 1.9590 - accuracy: 0.4643
Epoch 138/500
1/1 - 0s - loss: 1.9454 - accuracy: 0.4643
Epoch 139/500
1/1 - 0s - loss: 1.9318 - accuracy: 0.4643
Epoch 140/500
1/1 - 0s - loss: 1.9182 - accuracy: 0.4643
Epoch 141/500
1/1 - 0s - loss: 1.9046 - accuracy: 0.4643
Epoch 142/500
1/1 - 0s - loss: 1.8910 - accuracy: 0.4643
Epoch 143/500
1/1 - 0s - loss: 1.8775 - accuracy: 0.4643
Epoch 144/500
1/1 - 0s - loss: 1.8639 - accuracy: 0.4643
Epoch 145/500
1/1 - 0s - loss: 1.8504 - accuracy: 0.4643
Epoch 146/500
1/1 - 0s - loss: 1.8368 - accuracy: 0.4643
Epoch 147/500
1/1 - 0s - loss: 1.8233 - accuracy: 0.4643
Epoch 148/500
1/1 - 0s - loss: 1.8098 - accuracy: 0.4286
Epoch 149/500
1/1 - 0s - loss: 1.7963 - accuracy: 0.4286
Epoch 150/500
1/1 - 0s - loss: 1.7827 - accuracy: 0.4286
Epoch 151/500
1/1 - 0s - loss: 

Epoch 278/500
1/1 - 0s - loss: 0.3740 - accuracy: 0.9643
Epoch 279/500
1/1 - 0s - loss: 0.3685 - accuracy: 0.9643
Epoch 280/500
1/1 - 0s - loss: 0.3631 - accuracy: 0.9643
Epoch 281/500
1/1 - 0s - loss: 0.3578 - accuracy: 0.9643
Epoch 282/500
1/1 - 0s - loss: 0.3526 - accuracy: 0.9643
Epoch 283/500
1/1 - 0s - loss: 0.3475 - accuracy: 0.9643
Epoch 284/500
1/1 - 0s - loss: 0.3425 - accuracy: 0.9643
Epoch 285/500
1/1 - 0s - loss: 0.3376 - accuracy: 0.9643
Epoch 286/500
1/1 - 0s - loss: 0.3327 - accuracy: 0.9643
Epoch 287/500
1/1 - 0s - loss: 0.3280 - accuracy: 0.9643
Epoch 288/500
1/1 - 0s - loss: 0.3233 - accuracy: 0.9643
Epoch 289/500
1/1 - 0s - loss: 0.3187 - accuracy: 0.9643
Epoch 290/500
1/1 - 0s - loss: 0.3142 - accuracy: 0.9643
Epoch 291/500
1/1 - 0s - loss: 0.3098 - accuracy: 0.9643
Epoch 292/500
1/1 - 0s - loss: 0.3054 - accuracy: 0.9643
Epoch 293/500
1/1 - 0s - loss: 0.3012 - accuracy: 0.9643
Epoch 294/500
1/1 - 0s - loss: 0.2970 - accuracy: 0.9643
Epoch 295/500
1/1 - 0s - loss: 

Epoch 422/500
1/1 - 0s - loss: 0.0924 - accuracy: 0.9643
Epoch 423/500
1/1 - 0s - loss: 0.0920 - accuracy: 0.9643
Epoch 424/500
1/1 - 0s - loss: 0.0916 - accuracy: 0.9643
Epoch 425/500
1/1 - 0s - loss: 0.0912 - accuracy: 0.9643
Epoch 426/500
1/1 - 0s - loss: 0.0908 - accuracy: 0.9643
Epoch 427/500
1/1 - 0s - loss: 0.0904 - accuracy: 0.9643
Epoch 428/500
1/1 - 0s - loss: 0.0900 - accuracy: 0.9643
Epoch 429/500
1/1 - 0s - loss: 0.0897 - accuracy: 0.9643
Epoch 430/500
1/1 - 0s - loss: 0.0893 - accuracy: 0.9643
Epoch 431/500
1/1 - 0s - loss: 0.0889 - accuracy: 0.9643
Epoch 432/500
1/1 - 0s - loss: 0.0886 - accuracy: 0.9643
Epoch 433/500
1/1 - 0s - loss: 0.0882 - accuracy: 0.9643
Epoch 434/500
1/1 - 0s - loss: 0.0879 - accuracy: 0.9643
Epoch 435/500
1/1 - 0s - loss: 0.0876 - accuracy: 0.9643
Epoch 436/500
1/1 - 0s - loss: 0.0872 - accuracy: 0.9643
Epoch 437/500
1/1 - 0s - loss: 0.0869 - accuracy: 0.9643
Epoch 438/500
1/1 - 0s - loss: 0.0866 - accuracy: 0.9643
Epoch 439/500
1/1 - 0s - loss: 

<tensorflow.python.keras.callbacks.History at 0x1d324d59198>

## Step 08: Evaluation of the Model

In [10]:
# evaluate model
# Continue each two-word seed phrase with a fixed number of predicted words.
# The model consumes max_length-1 tokens because the last position of each
# training sequence was the target word.
for seed_text, n_words in [("Hey diddle", 5), ("the cat", 3), ("the cow", 5), ("the little", 5)]:
    print(generate_seq(model, tokenizer, max_length-1, seed_text, n_words))

Hey diddle diddle the cat and the
the cat and the dish
the cow jumped over the moon the
the little dog laughed to see such
