In [1]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from pathlib import Path


In [5]:

# Plain-text training data, read one record per line.
text_file = Path("Text Document.txt")

# Cap on how many non-empty records are kept for training.
MAX_RECORDS = 4500
corpus = []

with text_file.open("r", encoding="utf-8") as source:
    for raw_line in source:
        # Drop every character that cannot be encoded as ASCII, then trim
        # leading/trailing whitespace (including the newline).
        ascii_text = raw_line.encode("ascii", "ignore").decode("ascii").strip()
        if not ascii_text:
            # Nothing left after cleaning: skip without counting it.
            continue
        corpus.append(ascii_text)
        if len(corpus) >= MAX_RECORDS:
            break

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit the word -> integer-id vocabulary on every record of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one; used below as the embedding input size and the
# number of output classes (presumably because id 0 is kept for padding —
# confirm against the Keras Tokenizer documentation).
total_words = 1 + len(tokenizer.word_index)

# For each record, emit every prefix of its token ids that is at least two
# tokens long: the last id of a prefix is the word to predict, the ids before
# it are the context.
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(token_ids) + 1)
]


In [9]:
# The tokenizer fit and the n-gram construction already ran in the preceding
# cell. This cell used to repeat that code verbatim (same import, a second
# Tokenizer fit, a second pass over the corpus) and rebuilt identical objects.
# Instead of redoing the work, validate the state the following cells rely on.

# total_words sizes both the embedding table and the softmax layer, so it must
# match the vocabulary of the tokenizer that produced the sequences.
if total_words != len(tokenizer.word_index) + 1:
    raise ValueError(
        "total_words is out of sync with the fitted tokenizer; "
        "re-run the tokenization cell"
    )

# A record only contributes n-grams when it has at least two tokens. With no
# n-grams at all, the padding cell would fail on max() of an empty sequence,
# so fail here with a clearer message.
if not input_sequences:
    raise ValueError(
        "No n-gram sequences were generated; every record in the corpus "
        "has fewer than two tokens"
    )


In [11]:
# The longest n-gram fixes the common width every sequence is padded to.
maximum_len = len(max(input_sequences, key=len))
padded_sequences = pad_sequences(input_sequences, maxlen=maximum_len, padding="pre")

# Padding is added on the left, so the final column always holds the token to
# predict and all earlier columns form the model input.
X_train = padded_sequences[:, :-1]
# One-hot encode the target ids over the full vocabulary.
y_train = to_categorical(padded_sequences[:, -1], num_classes=total_words)


In [13]:
# Next-word classifier: token ids -> 128-d embeddings -> LSTM(128) -> softmax
# over the whole vocabulary.
model = Sequential(
    [
        Embedding(total_words, 128),
        LSTM(128),
        Dense(total_words, activation="softmax"),
    ]
)

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.fit(X_train, y_train, epochs=30, verbose=1)


Epoch 1/30
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 174ms/step - accuracy: 0.0581 - loss: 7.0613
Epoch 2/30
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 165ms/step - accuracy: 0.0781 - loss: 6.3739
Epoch 3/30
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 161ms/step - accuracy: 0.0973 - loss: 5.9669
Epoch 4/30
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 173ms/step - accuracy: 0.1188 - loss: 5.5949
Epoch 5/30
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 196ms/step - accuracy: 0.1433 - loss: 5.2294
Epoch 6/30
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 164ms/step - accuracy: 0.1638 - loss: 4.8824
Epoch 7/30
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 165ms/step - accuracy: 0.1852 - loss: 4.5514
Epoch 8/30
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 163ms/step - accuracy: 0.2122 - loss: 4.2362
Epoch 9/

<keras.src.callbacks.history.History at 0x16777f7e8d0>

In [35]:
def predict_word(model, tokenizer, seed_text, max_length, num_words=1):
    """Extend ``seed_text`` with the model's most likely next word(s).

    Args:
        model: Object with ``predict(batch, verbose=0)`` returning one row of
            per-word scores for the single input sequence.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and the
            ``index_word`` reverse mapping (as the Keras ``Tokenizer`` does).
        seed_text: Text to continue.
        max_length: Length of the longest training sequence including its
            target token; the model input is padded to ``max_length - 1``.
        num_words: How many words to append, each predicted from the text
            extended so far. Defaults to 1.

    Returns:
        ``seed_text`` followed by the predicted word(s), separated by single
        spaces. If a predicted index has no word in the vocabulary (for
        example the padding id 0), generation stops and the text built so far
        is returned; the seed text comes back unchanged when that happens on
        the first word.
    """
    for _ in range(num_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        # Left-pad to the width the model was trained on.
        model_input = pad_sequences([token_ids], maxlen=max_length - 1, padding="pre")
        predicted = model.predict(model_input, verbose=0)
        # Only one sequence is predicted, so the flattened argmax is the index
        # of the best-scoring word for that sequence.
        predicted_word_index = int(np.argmax(predicted))

        # Direct reverse lookup instead of scanning every word_index entry.
        next_word = tokenizer.index_word.get(predicted_word_index)
        if next_word is None:
            # No vocabulary entry for this index: stop rather than return None.
            break
        seed_text += " " + next_word

    return seed_text

# Demo: ask the trained model for the single most likely word after the phrase.
example_phrase = "I am so"
print(
    predict_word(
        model=model,
        tokenizer=tokenizer,
        seed_text=example_phrase,
        max_length=maximum_len,
    )
)


I am so sorry
