In [70]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Input, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [45]:
# source text: the four-line "Jack and Jill" rhyme; this single string is the whole corpus.
# NOTE: every line ends with an explicit "\n" escape *and* a physical line break, so the
# string contains an empty line after each verse, plus a leading and a trailing space.
data = """ Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n """

**Model 1: One-Word-In, One-Word-Out Sequences**

In [66]:
def generate_seq(model, tokenizer, seed_text, n_words):
    """Generate text one word at a time with a one-word-in / one-word-out model.

    Args:
        model: object exposing ``predict(batch, verbose=0)``; it is called with an
            integer array of shape ``(1, 1)`` and is expected to return one row of
            per-word scores (assumed shape ``(1, vocab_size)`` -- confirm for
            non-Keras models).
        tokenizer: object exposing ``texts_to_sequences`` and ``word_index``.
        seed_text: starting text. Only its last known word conditions the first
            prediction, because the model consumes a single word.
        n_words: maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single spaces.
        Generation stops early when the current context contains no word known
        to the tokenizer (for example an out-of-vocabulary seed).
    """
    in_text, result = seed_text, seed_text
    # Build the id -> word lookup once instead of scanning word_index per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    # generate fixed no. of words
    for _ in range(n_words):
        # encode text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        if not encoded:
            # Nothing the model can condition on; predicting on an empty batch
            # would fail, so return what has been generated so far.
            break
        # The model takes exactly one word: feed the last token as a batch of
        # one sample with one timestep, whatever the length of the seed.
        encoded = np.array(encoded[-1:]).reshape(1, 1)
        yhat = model.predict(encoded, verbose = 0)
        # Reduce to a plain int so the lookup never depends on array truthiness.
        yhat = int(np.argmax(yhat, axis = -1)[0])
        # Ids without a word (e.g. 0) map to '' exactly as before.
        out_word = index_to_word.get(yhat, '')
        in_text, result = out_word, result + ' ' + out_word
    return result

In [60]:
def define_model(vocab_size):
    """Build, compile and summarise the one-word-in / one-word-out network.

    Args:
        vocab_size: number of distinct word ids; used both as the Embedding
            input dimension and as the width of the softmax output layer.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side effect.
    """
    model = Sequential([
        Input(shape = (1,)),            # a single word id per sample
        Embedding(vocab_size, 10),      # 10-dimensional word vectors
        LSTM(50),
        Dense(vocab_size, activation = 'softmax'),
    ])
    model.compile(
        optimizer = 'adam',
        loss = 'categorical_crossentropy',
        metrics = ['accuracy'],
    )
    model.summary()
    return model

In [61]:
# integer encode text: fit the vocabulary on the corpus, then map the corpus to word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# One input text in -> one id list out; unpack that single list.
(encoded,) = tokenizer.texts_to_sequences([data])

In [62]:
# Show the integer-encoded corpus produced by the tokenizer (word ids in reading order).
print(encoded)

[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]


In [63]:
# One slot more than the number of known words, since the printed ids start at 1.
vocab_size = 1 + len(tokenizer.word_index)

# Slide a window of width two over the token stream: (current word, next word).
sequences = np.array([encoded[i - 1 : i + 1] for i in range(1, len(encoded))])
print(sequences)

[[ 2  1]
 [ 1  3]
 [ 3  4]
 [ 4  5]
 [ 5  6]
 [ 6  7]
 [ 7  8]
 [ 8  9]
 [ 9 10]
 [10 11]
 [11 12]
 [12 13]
 [13  2]
 [ 2 14]
 [14 15]
 [15  1]
 [ 1 16]
 [16 17]
 [17 18]
 [18  1]
 [ 1  3]
 [ 3 19]
 [19 20]
 [20 21]]


In [67]:
# Column 0 is the input word id; column 1 is the word that follows it.
x = sequences[:, 0]
# One-hot targets to match the categorical cross-entropy loss.
y = to_categorical(sequences[:, 1], num_classes = vocab_size)

model = define_model(vocab_size)
model.fit(x, y, epochs = 500, verbose = 2)

Epoch 1/500
1/1 - 2s - 2s/step - accuracy: 0.0833 - loss: 3.0910
Epoch 2/500
1/1 - 0s - 37ms/step - accuracy: 0.0833 - loss: 3.0902
Epoch 3/500
1/1 - 0s - 38ms/step - accuracy: 0.1667 - loss: 3.0894
Epoch 4/500
1/1 - 0s - 32ms/step - accuracy: 0.2083 - loss: 3.0885
Epoch 5/500
1/1 - 0s - 32ms/step - accuracy: 0.2083 - loss: 3.0877
Epoch 6/500
1/1 - 0s - 36ms/step - accuracy: 0.2083 - loss: 3.0869
Epoch 7/500
1/1 - 0s - 31ms/step - accuracy: 0.2083 - loss: 3.0861
Epoch 8/500
1/1 - 0s - 33ms/step - accuracy: 0.2083 - loss: 3.0852
Epoch 9/500
1/1 - 0s - 36ms/step - accuracy: 0.2083 - loss: 3.0844
Epoch 10/500
1/1 - 0s - 31ms/step - accuracy: 0.2083 - loss: 3.0835
Epoch 11/500
1/1 - 0s - 35ms/step - accuracy: 0.2083 - loss: 3.0826
Epoch 12/500
1/1 - 0s - 32ms/step - accuracy: 0.2083 - loss: 3.0817
Epoch 13/500
1/1 - 0s - 37ms/step - accuracy: 0.2083 - loss: 3.0808
Epoch 14/500
1/1 - 0s - 29ms/step - accuracy: 0.2083 - loss: 3.0799
Epoch 15/500
1/1 - 0s - 33ms/step - accuracy: 0.2083 - loss

<keras.src.callbacks.history.History at 0x1dbf0b32a90>

In [68]:
# Seed with a single word and let the model append six more.
print(generate_seq(model, tokenizer, seed_text='Jack', n_words=6))

Jack and jill went up the hill


**Model 2: Line-by-Line Sequence**

In [90]:
def generate_seq_2(model, tokenizer, max_length, seed_text, n_words):
    """Generate text with a model that reads a left-padded window of word ids.

    Args:
        model: object exposing ``predict(batch, verbose=0)``; called with one
            padded row and expected to return one row of per-word scores
            (assumed shape ``(1, vocab_size)`` -- confirm for non-Keras models).
        tokenizer: object exposing ``texts_to_sequences`` and ``word_index``.
        max_length: width the encoded text is padded/truncated to before
            prediction; pass the model's input length.
        seed_text: starting text.
        n_words: number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single spaces.
    """
    in_text = seed_text
    # Build the id -> word lookup once instead of scanning word_index per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    # generate fixed no. of words
    for _ in range(n_words):
        # encode text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # The whole running text is re-encoded each step and fitted to the
        # model's input width (zeros are added on the left).
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        yhat = model.predict(encoded, verbose = 0)
        # Reduce to a plain int so the lookup never depends on array truthiness.
        yhat = int(np.argmax(yhat, axis = -1)[0])
        # Ids without a word (e.g. the padding id 0) map to '' exactly as before.
        out_word = index_to_word.get(yhat, '')
        in_text += ' ' + out_word
    return in_text

In [74]:
def define_model_2(vocab_size, max_length):
    """Build, compile and summarise a next-word model over fixed-width contexts.

    Args:
        vocab_size: number of distinct word ids; used as the Embedding input
            dimension and as the width of the softmax output layer.
        max_length: length of a full training row, i.e. the context words plus
            the one target word. The network input is therefore
            ``max_length - 1`` ids wide.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side effect.
    """
    # max_length counts the target word too, so the input is one id shorter.
    context_width = max_length - 1
    model = Sequential([
        Input(shape = (context_width,)),
        Embedding(vocab_size, 10),
        LSTM(50),
        Dense(vocab_size, activation = 'softmax'),
    ])
    model.compile(
        optimizer = 'adam',
        loss = 'categorical_crossentropy',
        metrics = ['accuracy'],
    )
    model.summary()
    return model

In [75]:
# Fit a fresh tokenizer on the corpus for the line-by-line model.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# One slot more than the number of known words.
vocab_size = 1 + len(tokenizer.word_index)

In [84]:
# create line-based sequences: for every line, emit each prefix of length >= 2
# (all words so far as context, last word as the target).
sequences = list()
for line in data.split('\n'):
    # Use a dedicated name for the per-line ids so the module-level `encoded`
    # (the full-corpus encoding read by other cells) is not overwritten.
    line_ids = tokenizer.texts_to_sequences([line])[0]
    # Lines with fewer than two known words (including blank ones) yield nothing.
    for i in range(1, len(line_ids)):
        sequences.append(line_ids[:i+1])
print(sequences)

[[2, 1], [2, 1, 3], [2, 1, 3, 4], [2, 1, 3, 4, 5], [2, 1, 3, 4, 5, 6], [2, 1, 3, 4, 5, 6, 7], [8, 9], [8, 9, 10], [8, 9, 10, 11], [8, 9, 10, 11, 12], [8, 9, 10, 11, 12, 13], [2, 14], [2, 14, 15], [2, 14, 15, 1], [2, 14, 15, 1, 16], [2, 14, 15, 1, 16, 17], [2, 14, 15, 1, 16, 17, 18], [1, 3], [1, 3, 19], [1, 3, 19, 20], [1, 3, 19, 20, 21]]


In [88]:
# The longest prefix sets the row width (context words + one target word).
max_length = max(len(seq) for seq in sequences)
# Left-pad shorter prefixes with zeros so every row has the same width.
sequences = np.array(pad_sequences(sequences, maxlen=max_length, padding='pre'))

# Last column is the target word; everything before it is the padded context.
x = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

model = define_model_2(vocab_size, max_length)
model.fit(x, y, epochs=500, verbose=2)

Epoch 1/500
1/1 - 4s - 4s/step - accuracy: 0.0476 - loss: 3.0910
Epoch 2/500
1/1 - 0s - 96ms/step - accuracy: 0.0952 - loss: 3.0893
Epoch 3/500
1/1 - 0s - 55ms/step - accuracy: 0.1429 - loss: 3.0877
Epoch 4/500
1/1 - 0s - 59ms/step - accuracy: 0.1429 - loss: 3.0861
Epoch 5/500
1/1 - 0s - 59ms/step - accuracy: 0.0952 - loss: 3.0845
Epoch 6/500
1/1 - 0s - 53ms/step - accuracy: 0.0952 - loss: 3.0828
Epoch 7/500
1/1 - 0s - 50ms/step - accuracy: 0.0952 - loss: 3.0811
Epoch 8/500
1/1 - 0s - 44ms/step - accuracy: 0.0952 - loss: 3.0793
Epoch 9/500
1/1 - 0s - 53ms/step - accuracy: 0.0952 - loss: 3.0775
Epoch 10/500
1/1 - 0s - 50ms/step - accuracy: 0.0952 - loss: 3.0756
Epoch 11/500
1/1 - 0s - 44ms/step - accuracy: 0.0952 - loss: 3.0737
Epoch 12/500
1/1 - 0s - 49ms/step - accuracy: 0.0952 - loss: 3.0716
Epoch 13/500
1/1 - 0s - 43ms/step - accuracy: 0.0952 - loss: 3.0694
Epoch 14/500
1/1 - 0s - 49ms/step - accuracy: 0.0952 - loss: 3.0670
Epoch 15/500
1/1 - 0s - 47ms/step - accuracy: 0.0952 - loss

<keras.src.callbacks.history.History at 0x1dbf371a410>

In [92]:
# evaluate model: continue each seed word by four words
# (max_length-1 is the model's input width, since max_length includes the target)
for seed in ('Jack', 'Jill'):
    print(generate_seq_2(model, tokenizer, max_length-1, seed, 4))

Jack fell down and broke
Jill jill came tumbling after


**Model 3: Two-Words-In, One-Word-Out Sequence**

In [93]:
# Fit a fresh tokenizer and encode the whole corpus as one stream of word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# One input text in -> one id list out; unpack that single list.
(encoded,) = tokenizer.texts_to_sequences([data])
# One slot more than the number of known words.
vocab_size = 1 + len(tokenizer.word_index)

In [94]:
# encode 2 words -> 1 word: slide a window of three ids over the token stream
# (two context words followed by the word to predict)
sequences = [encoded[i - 2 : i + 1] for i in range(2, len(encoded))]
print(sequences)

[[2, 1, 3], [1, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, 8], [7, 8, 9], [8, 9, 10], [9, 10, 11], [10, 11, 12], [11, 12, 13], [12, 13, 2], [13, 2, 14], [2, 14, 15], [14, 15, 1], [15, 1, 16], [1, 16, 17], [16, 17, 18], [17, 18, 1], [18, 1, 3], [1, 3, 19], [3, 19, 20], [19, 20, 21]]


In [96]:
# Row width = context words + one target word.
max_length = max(len(seq) for seq in sequences)
# Fit every row to that width and convert the list of lists to an array.
sequences = np.array(pad_sequences(sequences, maxlen=max_length, padding='pre'))

# Last column is the target word; the columns before it are the context.
x = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

model = define_model_2(vocab_size, max_length)
model.fit(x, y, epochs=500, verbose=2)

Epoch 1/500
1/1 - 5s - 5s/step - accuracy: 0.0000e+00 - loss: 3.0915
Epoch 2/500
1/1 - 0s - 46ms/step - accuracy: 0.0435 - loss: 3.0906
Epoch 3/500
1/1 - 0s - 44ms/step - accuracy: 0.0870 - loss: 3.0897
Epoch 4/500
1/1 - 0s - 38ms/step - accuracy: 0.0870 - loss: 3.0888
Epoch 5/500
1/1 - 0s - 44ms/step - accuracy: 0.0870 - loss: 3.0878
Epoch 6/500
1/1 - 0s - 42ms/step - accuracy: 0.0870 - loss: 3.0869
Epoch 7/500
1/1 - 0s - 32ms/step - accuracy: 0.0870 - loss: 3.0860
Epoch 8/500
1/1 - 0s - 37ms/step - accuracy: 0.0870 - loss: 3.0850
Epoch 9/500
1/1 - 0s - 37ms/step - accuracy: 0.0870 - loss: 3.0840
Epoch 10/500
1/1 - 0s - 34ms/step - accuracy: 0.0870 - loss: 3.0830
Epoch 11/500
1/1 - 0s - 41ms/step - accuracy: 0.0870 - loss: 3.0820
Epoch 12/500
1/1 - 0s - 42ms/step - accuracy: 0.0870 - loss: 3.0810
Epoch 13/500
1/1 - 0s - 38ms/step - accuracy: 0.0870 - loss: 3.0800
Epoch 14/500
1/1 - 0s - 42ms/step - accuracy: 0.0870 - loss: 3.0789
Epoch 15/500
1/1 - 0s - 35ms/step - accuracy: 0.0870 - 

<keras.src.callbacks.history.History at 0x1dbf610c7d0>

In [97]:
# evaluate model: (two-word seed, number of words to generate)
# (max_length-1 is the model's input width, since max_length includes the target)
for seed, n_words in (
    ('Jack and', 5),
    ('And Jill', 3),
    ('fell down', 5),
    ('pail of', 5),
):
    print(generate_seq_2(model, tokenizer, max_length-1, seed, n_words))

Jack and jill came tumbling after after
And Jill came tumbling after
fell down and broke his crown and
pail of water jack fell down and
