In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import regex as re

# Load the training corpus; every line of the file is treated as one sentence
with open("data2.txt", "r", encoding='utf-8') as file:
    data = file.read()

# Build the word -> integer vocabulary from the whole corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Encode all lines in a single call, then expand each encoded line into every
# prefix of length >= 2: [w1, w2], [w1, w2, w3], ...
# The last token of each prefix is later split off as the word to predict.
input_sequences = []
for tokenized_sentence in tokenizer.texts_to_sequences(data.split('\n')):
    for i in range(1, len(tokenized_sentence)):
        input_sequences.append(tokenized_sentence[:i + 1])

# Without this guard an empty or one-word-per-line corpus only surfaces as
# "max() arg is an empty sequence", which says nothing about the real cause.
if not input_sequences:
    raise ValueError(
        "data2.txt contains no line with at least two words; "
        "there is nothing to train on"
    )

max_len = max(len(sequence) for sequence in input_sequences)

# Left-pad with zeros so the word to predict is always the final column
padded_input_sequence = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# Features: everything except the last token; label: the last token
x = padded_input_sequence[:, :-1]
y = padded_input_sequence[:, -1]

# One-hot encode the labels. The class count is the vocabulary size plus one
# because pad_sequences fills with 0, which is therefore not a word index.
vocab_size = len(tokenizer.word_index) + 1
y = to_categorical(y, num_classes=vocab_size)

# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling repeat across "Restart & Run All" (hardware-level nondeterminism
# may still cause small differences).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Number of output classes: one per vocabulary word plus the padding value 0
vocab_size = len(tokenizer.word_index) + 1

# Define the model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential([
    # Each input row holds max_len - 1 token ids (the last token is the label)
    Embedding(vocab_size, 100, input_length=max_len - 1),
    LSTM(150),
    # The number of units must match the width of the one-hot labels
    Dense(vocab_size, activation='softmax'),
])

# The labels in `y` are one-hot vectors, so 'categorical_crossentropy' is the
# matching loss ('sparse_categorical_crossentropy' would expect integer labels)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


# Train the model
model.fit(x, y, epochs=10)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 19, 100)           277900    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 2779)              419629    
                                                                 
Total params: 848129 (3.24 MB)
Trainable params: 848129 (3.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1bf9c569a10>

In [2]:
import tkinter as tk

def predict_words():
    """Show the model's most likely next words for the text in the entry box.

    Reads the current text from ``input_word_entry``, encodes it with
    ``tokenizer``, left-pads it to ``max_len - 1`` tokens and runs one forward
    pass through ``model``. The best-scoring vocabulary words are then written
    to ``prediction_labels`` in descending order of score, one per label.

    Indices that have no vocabulary word (such as 0, the value
    ``pad_sequences`` fills with) are skipped. Any label that cannot be filled
    is blanked, so a suggestion from an earlier click never stays on screen.
    """
    text = input_word_entry.get()

    # The input is the same for every suggestion slot, so encode and predict
    # once instead of once per label.
    token_text = tokenizer.texts_to_sequences([text])[0]
    padded_token_text = pad_sequences([token_text], maxlen=max_len - 1, padding="pre")
    # verbose=0 keeps the Keras progress bar out of the console on each click
    predictions_for_input = model.predict(padded_token_text, verbose=0)

    # Vocabulary indices ordered from highest to lowest score
    ranked_indices = predictions_for_input[0].argsort()[::-1]

    # Collect as many real words as there are labels to fill
    index_to_word = tokenizer.index_word
    suggestions = []
    for candidate in ranked_indices:
        word = index_to_word.get(int(candidate))
        if word:
            suggestions.append(word)
            if len(suggestions) == len(prediction_labels):
                break

    for slot, label in enumerate(prediction_labels):
        shown = suggestions[slot] if slot < len(suggestions) else ""
        label.config(text=shown, fg='black')  # Predictions are drawn in black

# Main application window
root = tk.Tk()
root.title("Word Prediction")
root.configure(bg='lightblue')

# Caption above the text entry
input_word_label = tk.Label(
    root,
    text="Input Word:",
    font=("Palatino", 24, "bold"),
    fg="blue",
)
input_word_label.pack()

# Text entry whose content predict_words reads
input_word_entry = tk.Entry(
    root,
    font=("Palatino", 22),
    bg="lightgray",
    width=60,
)
input_word_entry.pack()

# Button that triggers a prediction
predict_button = tk.Button(
    root,
    text="Predict",
    command=predict_words,
    font=("Palatino", 18, "bold"),
    bg='black',
    fg='white',
)
predict_button.pack()

# Three initially empty labels, stacked below the button, that predict_words
# fills with its suggestions
prediction_labels = [
    tk.Label(root, text="", font=("Palatino", 24, "italic"), fg='darkblue')
    for _ in range(3)
]
for label in prediction_labels:
    label.pack()

# Hand control to Tk; this call blocks until the window is closed
root.mainloop()




