In [2]:
from keras.preprocessing.text import Tokenizer
from numpy import array
from keras.utils import to_categorical
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential
import numpy as np
from keras.preprocessing.sequence import pad_sequences

In [38]:
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Extend ``seed_text`` with up to ``n_words`` words predicted by ``model``.

    On each step the text produced so far is encoded with ``tokenizer``,
    left-padded to ``max_length`` tokens and passed to ``model.predict``; the
    word whose index receives the highest score is appended to the text.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-index scores for each input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and the
            ``index_word`` reverse mapping.
        max_length: Number of tokens fed to the model on each step.
        seed_text: Text to start from.
        n_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Fewer than ``n_words`` words are appended when the model
        predicts an index that has no word attached.
    """
    in_text = seed_text
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pad_sequences already returns an ndarray, so no extra np.array copy.
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        yhat = model.predict(encoded, verbose=0)

        # Single-row batch: reduce the argmax result to a plain Python int so
        # it can be used directly as a dictionary key.
        pred_class = int(np.argmax(yhat, axis=-1)[0])

        # Direct reverse lookup instead of scanning word_index on every step.
        out_word = tokenizer.index_word.get(pred_class, '')
        if not out_word:
            # No word for this index (presumably the padding index). Appending
            # nothing leaves the encoded input unchanged, so further steps
            # would only repeat this prediction and add stray spaces.
            break
        in_text += ' ' + out_word
    return in_text

In [4]:
def define_model(vocab_size, max_length, embedding_dim=10, lstm_units=50):
    """Build and compile the next-word prediction network.

    The network is Embedding -> LSTM -> Dense(softmax) and is compiled with
    categorical cross-entropy loss, the Adam optimizer and an accuracy metric.

    Args:
        vocab_size: Size of the token index space; used both as the Embedding
            input dimension and as the width of the softmax output.
        max_length: Length of a full training sequence *including* its final
            target token. The network itself consumes ``max_length - 1``
            tokens.
        embedding_dim: Width of each embedding vector (default 10).
        lstm_units: Number of units in the LSTM layer (default 50).

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    model = Sequential()
    # Inputs are the sequence without its last token, hence max_length - 1.
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length - 1))
    model.add(LSTM(lstm_units))
    # One output per vocabulary index, matching the one-hot encoded targets.
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    return model

In [5]:
# Toy training corpus: four lines of a nursery rhyme.
# The first three lines end in two newline characters, so splitting on '\n'
# produces an empty entry after every line of text.
# NOTE(review): "pill" and "trembling" differ from the commonly quoted wording
# ("pail", "tumbling") - confirm whether that is intentional. The text is kept
# verbatim here because the recorded outputs depend on this vocabulary.
data = (
    "Jack and Jill went up the hill \n\n"
    "To fetch a pill of water\n\n"
    "Jack fell down and broke his crown\n\n"
    "And Jill came trembling after\n"
)

In [39]:
# Learn the word -> index mapping from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# One more than the number of known words (presumably to leave room for the
# padding index 0 - confirm against the Tokenizer documentation).
vocab_size = 1 + len(tokenizer.word_index)

# For each line, collect every prefix that holds at least two tokens:
# the leading tokens become the input and the last token the target.
# Lines that encode to fewer than two tokens contribute nothing.
sequences = []
for line in data.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

# Left-pad every prefix to the length of the longest one.
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

# Inputs are all columns but the last; the last column is the word to predict,
# one-hot encoded over the vocabulary.
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

model = define_model(vocab_size, max_length)
model.fit(X, y, epochs=500, verbose=2)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 6, 10)             220       
                                                                 
 lstm_1 (LSTM)               (None, 50)                12200     
                                                                 
 dense_1 (Dense)             (None, 22)                1122      
                                                                 
Total params: 13542 (52.90 KB)
Trainable params: 13542 (52.90 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/500
1/1 - 2s - loss: 3.0890 - accuracy: 0.1905 - 2s/epoch - 2s/step
Epoch 2/500
1/1 - 0s - loss: 3.0870 - accuracy: 0.1429 - 10ms/epoch - 10ms/step
Epoch 3/500
1/1 - 0s - loss: 3.0850 - accuracy: 0.1429 - 17ms/epoch - 17ms/step
Epoch 4/500
1/1 - 0s - loss: 3.0830 - accuracy

<keras.src.callbacks.History at 0x1aedc8a01f0>

In [40]:
# Training inputs drop the last (target) column, so the model is fed
# max_length - 1 tokens.
generate_seq(model, tokenizer, max_length - 1, seed_text="Jack", n_words=4)

'Jack fell down and broke'

In [41]:
# Same call with a different seed word; input length is again max_length - 1
# because the final position of each training sequence is the target.
generate_seq(model, tokenizer, max_length - 1, seed_text="Jill", n_words=4)

'Jill jill came trembling after'