In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [1]:
text = """It is not the strength of the body but the strength of the spirit. 
 It is useless to meet revenge with revenge it will heal nothing. 
 Even the smallest person can change the course of history.
 All we have to decide is what to do with the time that is given us. 
 The burned hand teaches best. After that, advice about fire goes to the heart."""

In [15]:
# Split text into an array of words 
words = text.split()
print(words)
vocab_size = x = len(set(words))
# Make sentences of 4 words each, moving one word at a time
sentences = []
for i in range(4, len(words)):
  sentences.append(' '.join(words[i-4:i]))
print(sentences)
# Instantiate a Tokenizer, then fit it on the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
# Turn sentences into a sequence of numbers
sequences = tokenizer.texts_to_sequences(sentences)
print("Sentences: \n {} \n Sequences: \n {}".format(sentences[:5],sequences[:5]))
vocab_size = len(tokenizer.word_index) + 1



['It', 'is', 'not', 'the', 'strength', 'of', 'the', 'body', 'but', 'the', 'strength', 'of', 'the', 'spirit.', 'It', 'is', 'useless', 'to', 'meet', 'revenge', 'with', 'revenge', 'it', 'will', 'heal', 'nothing.', 'Even', 'the', 'smallest', 'person', 'can', 'change', 'the', 'course', 'of', 'history.', 'All', 'we', 'have', 'to', 'decide', 'is', 'what', 'to', 'do', 'with', 'the', 'time', 'that', 'is', 'given', 'us.', 'The', 'burned', 'hand', 'teaches', 'best.', 'After', 'that,', 'advice', 'about', 'fire', 'goes', 'to', 'the', 'heart.']
['It is not the', 'is not the strength', 'not the strength of', 'the strength of the', 'strength of the body', 'of the body but', 'the body but the', 'body but the strength', 'but the strength of', 'the strength of the', 'strength of the spirit.', 'of the spirit. It', 'the spirit. It is', 'spirit. It is useless', 'It is useless to', 'is useless to meet', 'useless to meet revenge', 'to meet revenge with', 'meet revenge with revenge', 'revenge with revenge it',

In [35]:
# print(sequences)
X = np.array([seq[:-1] for seq in sequences])  # Input: first 3 words
y = np.array([seq[-1] for seq in sequences])  # Target: 4th word
# print(y)

# One-hot encode the target
y = np.eye(vocab_size)[y]
# print(vocab_size)
# print(y[1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# print('data', X_train, y_train)

In [36]:
# Import the Embedding, LSTM and Dense layer
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

model = Sequential()

# Add an Embedding layer with the right parameters
model.add(Embedding(input_dim = vocab_size, input_length = 3, output_dim = 8, ))

# Add a 32 unit LSTM layer
model.add(LSTM(32))

# Add a hidden Dense layer of 32 units and an output layer of vocab_size with softmax
model.add(Dense(32, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
model.summary()


# Compile your model using categorical_crossentropy loss
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])



In [38]:
model.fit(X_train, y_train, epochs=200, verbose=2, validation_data=(X_test, y_test))

Epoch 1/200
2/2 - 0s - 79ms/step - accuracy: 0.1429 - loss: 3.8193 - val_accuracy: 0.1538 - val_loss: 3.8585
Epoch 2/200
2/2 - 0s - 57ms/step - accuracy: 0.1429 - loss: 3.8149 - val_accuracy: 0.1538 - val_loss: 3.8598
Epoch 3/200
2/2 - 0s - 54ms/step - accuracy: 0.1429 - loss: 3.8102 - val_accuracy: 0.1538 - val_loss: 3.8612
Epoch 4/200
2/2 - 0s - 55ms/step - accuracy: 0.1429 - loss: 3.8053 - val_accuracy: 0.1538 - val_loss: 3.8627
Epoch 5/200
2/2 - 0s - 60ms/step - accuracy: 0.1429 - loss: 3.7999 - val_accuracy: 0.1538 - val_loss: 3.8644
Epoch 6/200
2/2 - 0s - 56ms/step - accuracy: 0.1429 - loss: 3.7939 - val_accuracy: 0.1538 - val_loss: 3.8663
Epoch 7/200
2/2 - 0s - 50ms/step - accuracy: 0.1429 - loss: 3.7880 - val_accuracy: 0.1538 - val_loss: 3.8685
Epoch 8/200
2/2 - 0s - 58ms/step - accuracy: 0.1429 - loss: 3.7806 - val_accuracy: 0.1538 - val_loss: 3.8712
Epoch 9/200
2/2 - 0s - 58ms/step - accuracy: 0.1429 - loss: 3.7733 - val_accuracy: 0.1538 - val_loss: 3.8742
Epoch 10/200
2/2 - 

<keras.src.callbacks.history.History at 0x2964e31f3e0>

In [39]:
def predict_text(test_text, model = model):
  if len(test_text.split()) != 3:
    print('Text input should be 3 words!')
    return False
  
  # Turn the test_text into a sequence of numbers
  test_seq = tokenizer.texts_to_sequences([test_text])
  test_seq = np.array(test_seq)
  
  # Use the model passed as a parameter to predict the next word
  pred = model.predict(test_seq).argmax(axis = 1)[0]
  
  # Return the word that maps to the prediction
  return tokenizer.index_word[pred]

In [40]:
predict_text('meet revenge with')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 266ms/step


'revenge'

In [41]:
predict_text('the course of')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


'history'

In [42]:
predict_text('strength of the')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step


'body'