# Model 2: Line-by-Line Sequence

## Step 01: Import Libraries

In [21]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from array import *
from numpy import array
import warnings
warnings.filterwarnings("ignore")

## Step 02: Input Text

In [22]:
# source text: the nursery rhyme that serves as the whole training corpus.
# Every verse ends with an explicit "\n" escape AND a physical line break, so
# the string contains an empty line between verses; splitting on "\n" later
# therefore also yields empty (and whitespace-only) entries.
data = """ Hey diddle diddle\n
The cat and the fiddle\n
The cow jumped over the moon\n
The little dog laughed to see such sport\n
And the dish ran away with the spoon\n """

## Step 03: Preprocessing of Data

In [23]:
# prepare the tokenizer on the source text
# The whole rhyme is passed as a single document (a one-element list); the
# fitted tokenizer is reused below for vocabulary size, encoding and decoding.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

## Step 04: Data Analysis

In [24]:
# determine the vocabulary size
# One more than the number of distinct words: presumably because Keras word
# indices start at 1 and index 0 is left for padding (pad_sequences pads with
# 0 by default) — confirm against the Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1
print( "Vocabulary Size: %d" % vocab_size)

Vocabulary Size: 23


In [25]:

# create line-based sequences
# Each line of the rhyme is encoded on its own and contributes every prefix of
# length 2..len(line); no training sample ever crosses a line boundary.
# Lines that encode to fewer than two tokens (e.g. the empty ones) add nothing.
sequences = [
    tokens[:stop]
    for tokens in (tokenizer.texts_to_sequences([text])[0] for text in data.split("\n"))
    for stop in range(2, len(tokens) + 1)
]
print( "Total Sequences: %d" % len(sequences))

Total Sequences: 25


In [26]:
# pad input sequences
# The longest prefix sets the common width; shorter prefixes are left-padded
# so that the word to predict always sits in the final column.
max_length = len(max(sequences, key=len))
sequences = pad_sequences(sequences, maxlen=max_length, padding= "pre" )
print( "Max Sequence Length: %d" % max_length)

Max Sequence Length: 8


## Step 05: Split into input and output

In [27]:
# split into input and output elements
sequences = array(sequences)
# every column except the last is the model input (the preceding words) ...
X = sequences[:, :-1]
# ... and the last column is the word to be predicted
y = sequences[:, -1]


## Step 06: Encoding of data

In [28]:
# One-hot encode the target word indices so they match the softmax output
# (one column per vocabulary index).
# NOTE(review): this rebinds `y` from itself, so the cell is not idempotent —
# running it a second time without re-running the split cell would encode the
# already-encoded array again.
y = to_categorical(y, num_classes=vocab_size)

## Step 07: Define and Train the Model

In [29]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Extend ``seed_text`` with ``n_words`` greedily predicted words.

    Args:
        model: trained model; ``model.predict`` is expected to return an
            array with one row of per-index scores for each input row
            (assumed to be a NumPy array, as Keras models return).
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: number of tokens the model takes as input; the running
            text is pre-padded (and, if longer, truncated) to this length.
        seed_text: text to start from.
        n_words: how many words to append.

    Returns:
        ``seed_text`` followed by the generated words, each preceded by a
        single space.
    """
    # Build the index -> word table once instead of scanning word_index on
    # every generation step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding="pre")
        # Sequential.predict_classes is no longer available in current
        # TensorFlow/Keras; take the arg-max of the predicted scores instead.
        scores = model.predict(encoded, verbose=0)
        yhat = int(scores.argmax(axis=-1)[0])
        # An index with no word (e.g. 0) falls back to " ", exactly as the
        # original lookup loop did when nothing matched.
        out_word = index_to_word.get(yhat, " ")
        # append to input
        in_text += " " + out_word
    return in_text


# define the model
def define_model(vocab_size, max_length):
    """Build, compile and summarise the next-word prediction network.

    Args:
        vocab_size: size of the embedding table and of the softmax output.
        max_length: length of the padded sequences *including* the target
            word; the network input is therefore ``max_length - 1`` tokens.

    Returns:
        The compiled ``Sequential`` model (its summary is printed as a side
        effect).
    """
    network = Sequential([
        # 10-dimensional word embeddings over the input tokens
        Embedding(vocab_size, 10, input_length=max_length-1),
        # single recurrent layer with 50 units
        LSTM(50),
        # one output per vocabulary index
        Dense(vocab_size, activation= "softmax" ),
    ])
    # compile network
    network.compile(
        loss= "categorical_crossentropy" ,
        optimizer= "adam" ,
        metrics=[ "accuracy" ],
    )
    # summarize defined model
    network.summary()
    # optional diagram: plot_model(network, to_file= "model2.png" , show_shapes=True)
    return network

# define model
# The full padded length is passed here; define_model itself subtracts one to
# get the input length, because the last column was split off as the target.
model = define_model(vocab_size, max_length)
# fit network
# 500 epochs over the prepared samples; verbose=2 prints one line per epoch.
# Left as the last expression, so the returned History object is displayed.
model.fit(X, y, epochs=500, verbose=2)


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 7, 10)             230       
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_2 (Dense)              (None, 23)                1173      
Total params: 13,603
Trainable params: 13,603
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
1/1 - 2s - loss: 3.1353 - accuracy: 0.0400
Epoch 2/500
1/1 - 0s - loss: 3.1335 - accuracy: 0.0000e+00
Epoch 3/500
1/1 - 0s - loss: 3.1318 - accuracy: 0.1600
Epoch 4/500
1/1 - 0s - loss: 3.1300 - accuracy: 0.1600
Epoch 5/500
1/1 - 0s - loss: 3.1282 - accuracy: 0.1600
Epoch 6/500
1/1 - 0s - loss: 3.1263 - accuracy: 0.1600
Epoch 7/500
1/1 - 0s - loss: 3.1244 - accuracy

Epoch 134/500
1/1 - 0s - loss: 1.3858 - accuracy: 0.6400
Epoch 135/500
1/1 - 0s - loss: 1.3715 - accuracy: 0.6400
Epoch 136/500
1/1 - 0s - loss: 1.3580 - accuracy: 0.6800
Epoch 137/500
1/1 - 0s - loss: 1.3456 - accuracy: 0.6400
Epoch 138/500
1/1 - 0s - loss: 1.3321 - accuracy: 0.6800
Epoch 139/500
1/1 - 0s - loss: 1.3189 - accuracy: 0.6800
Epoch 140/500
1/1 - 0s - loss: 1.3070 - accuracy: 0.6400
Epoch 141/500
1/1 - 0s - loss: 1.2947 - accuracy: 0.6800
Epoch 142/500
1/1 - 0s - loss: 1.2817 - accuracy: 0.6800
Epoch 143/500
1/1 - 0s - loss: 1.2699 - accuracy: 0.6400
Epoch 144/500
1/1 - 0s - loss: 1.2587 - accuracy: 0.7200
Epoch 145/500
1/1 - 0s - loss: 1.2467 - accuracy: 0.6400
Epoch 146/500
1/1 - 0s - loss: 1.2347 - accuracy: 0.7200
Epoch 147/500
1/1 - 0s - loss: 1.2235 - accuracy: 0.7200
Epoch 148/500
1/1 - 0s - loss: 1.2126 - accuracy: 0.6400
Epoch 149/500
1/1 - 0s - loss: 1.2016 - accuracy: 0.7200
Epoch 150/500
1/1 - 0s - loss: 1.1903 - accuracy: 0.7200
Epoch 151/500
1/1 - 0s - loss: 

Epoch 278/500
1/1 - 0s - loss: 0.3827 - accuracy: 0.9200
Epoch 279/500
1/1 - 0s - loss: 0.3795 - accuracy: 0.9200
Epoch 280/500
1/1 - 0s - loss: 0.3763 - accuracy: 0.9200
Epoch 281/500
1/1 - 0s - loss: 0.3732 - accuracy: 0.9200
Epoch 282/500
1/1 - 0s - loss: 0.3702 - accuracy: 0.9200
Epoch 283/500
1/1 - 0s - loss: 0.3671 - accuracy: 0.9200
Epoch 284/500
1/1 - 0s - loss: 0.3641 - accuracy: 0.9200
Epoch 285/500
1/1 - 0s - loss: 0.3612 - accuracy: 0.9200
Epoch 286/500
1/1 - 0s - loss: 0.3582 - accuracy: 0.9200
Epoch 287/500
1/1 - 0s - loss: 0.3553 - accuracy: 0.9200
Epoch 288/500
1/1 - 0s - loss: 0.3525 - accuracy: 0.9200
Epoch 289/500
1/1 - 0s - loss: 0.3497 - accuracy: 0.9200
Epoch 290/500
1/1 - 0s - loss: 0.3470 - accuracy: 0.9200
Epoch 291/500
1/1 - 0s - loss: 0.3442 - accuracy: 0.9200
Epoch 292/500
1/1 - 0s - loss: 0.3415 - accuracy: 0.9200
Epoch 293/500
1/1 - 0s - loss: 0.3389 - accuracy: 0.9200
Epoch 294/500
1/1 - 0s - loss: 0.3363 - accuracy: 0.9200
Epoch 295/500
1/1 - 0s - loss: 

Epoch 422/500
1/1 - 0s - loss: 0.1861 - accuracy: 0.9200
Epoch 423/500
1/1 - 0s - loss: 0.1856 - accuracy: 0.9200
Epoch 424/500
1/1 - 0s - loss: 0.1852 - accuracy: 0.9200
Epoch 425/500
1/1 - 0s - loss: 0.1848 - accuracy: 0.9200
Epoch 426/500
1/1 - 0s - loss: 0.1844 - accuracy: 0.9200
Epoch 427/500
1/1 - 0s - loss: 0.1840 - accuracy: 0.9200
Epoch 428/500
1/1 - 0s - loss: 0.1836 - accuracy: 0.9200
Epoch 429/500
1/1 - 0s - loss: 0.1832 - accuracy: 0.9200
Epoch 430/500
1/1 - 0s - loss: 0.1828 - accuracy: 0.9200
Epoch 431/500
1/1 - 0s - loss: 0.1825 - accuracy: 0.9200
Epoch 432/500
1/1 - 0s - loss: 0.1821 - accuracy: 0.9200
Epoch 433/500
1/1 - 0s - loss: 0.1817 - accuracy: 0.9200
Epoch 434/500
1/1 - 0s - loss: 0.1813 - accuracy: 0.9200
Epoch 435/500
1/1 - 0s - loss: 0.1810 - accuracy: 0.9200
Epoch 436/500
1/1 - 0s - loss: 0.1806 - accuracy: 0.9200
Epoch 437/500
1/1 - 0s - loss: 0.1803 - accuracy: 0.9200
Epoch 438/500
1/1 - 0s - loss: 0.1799 - accuracy: 0.9200
Epoch 439/500
1/1 - 0s - loss: 

<tensorflow.python.keras.callbacks.History at 0x185ff68da58>

## Step 08: Evaluation of the Model

In [30]:
# Generate four words from each seed. The model input is one token shorter
# than the padded training sequences, hence max_length-1.
for seed in ("diddle", "cow"):
    print(generate_seq(model, tokenizer, max_length-1, seed, 4))

diddle diddle diddle diddle diddle
cow cat the over the
