In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import regex as re
import tkinter as tk
from spellchecker import SpellChecker


# Read the whole training corpus into memory
with open("data2.txt", "r", encoding='utf-8') as file:
    data = file.read()

# Learn the word -> integer vocabulary from the full corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Expand every line into its growing prefixes (first 2 tokens, first 3, ...),
# so each prefix yields one "context -> next word" training sample.
input_sequences = []
for sentence in data.split('\n'):
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        tokenized_sentence[:end] for end in range(2, len(tokenized_sentence) + 1)
    )

# Left-pad everything to the longest prefix so the target word is always
# the last column of the padded matrix.
max_len = max(len(sequence) for sequence in input_sequences)
padded_input_sequence = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# Inputs are all columns but the last; the label is the last column
x = padded_input_sequence[:, :-1]
y = padded_input_sequence[:, -1]

# One-hot encode the labels (class count is the vocabulary size plus one)
vocab_size = len(tokenizer.word_index) + 1
y = to_categorical(y, num_classes=vocab_size)

# Define the model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_len - 1),
    LSTM(150),
    Dense(vocab_size, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the model
model.fit(x, y, epochs=10)

# Function to correct typos
def correct_typos(text):
    """Return ``text`` with every whitespace-separated word spell-corrected.

    Args:
        text: The raw string typed by the user.

    Returns:
        A single string with the corrected words joined by one space. Words
        for which the spell checker has no suggestion are kept unchanged.
    """
    spell = SpellChecker()
    # correction() may return None when no candidate exists; joining a None
    # would raise TypeError, so fall back to the word as typed.
    corrected_words = [spell.correction(word) or word for word in text.split()]
    return ' '.join(corrected_words)

# GUI prediction function with typo correction
def predict_words():
    """Spell-correct the entry text and show the top next-word candidates.

    Reads the text from ``input_word_entry``, replaces it with the
    spell-corrected version, runs the model once on that text and writes the
    most probable next words (best first) into ``prediction_labels``.
    Labels with no candidate are cleared.
    """
    input_word = input_word_entry.get()
    corrected_word = correct_typos(input_word)
    input_word_entry.delete(0, tk.END)  # Clear the input field
    input_word_entry.insert(0, corrected_word)  # Update input field with corrected word

    num_predictions = 3

    # The input does not change between candidates, so one forward pass is
    # enough; repeating it would only yield the same arg-max word each time.
    token_text = tokenizer.texts_to_sequences([corrected_word])[0]
    padded_token_text = pad_sequences([token_text], maxlen=max_len - 1, padding="pre")
    probabilities = model.predict(padded_token_text, verbose=0)[0]

    # Walk the classes from most to least probable and keep the first few
    # that map to a real word. Index 0 has no entry in index_word, so a
    # direct lookup could raise KeyError.
    top_words = []
    for class_index in np.argsort(probabilities)[::-1]:
        candidate = tokenizer.index_word.get(int(class_index))
        if candidate:
            top_words.append(candidate)
        if len(top_words) == num_predictions:
            break

    for slot, target_label in enumerate(prediction_labels[:num_predictions]):
        shown_text = top_words[slot] if slot < len(top_words) else ""
        target_label.config(text=shown_text, fg='black')  # Update prediction labels


# Setup tkinter GUI
root = tk.Tk()
root.title("Word Prediction")
root.configure(bg='lightblue')

# Build the widgets first (in top-to-bottom display order) ...
input_word_label = tk.Label(root, text="Input Word:", font=("Palatino", 24, "bold"), fg="blue")
input_word_entry = tk.Entry(root, font=("Palatino", 22), bg="lightgray", width=60)
predict_button = tk.Button(
    root,
    text="Predict",
    command=predict_words,
    font=("Palatino", 18, "bold"),
    bg='black',
    fg='white',
)
prediction_labels = [
    tk.Label(root, text="", font=("Palatino", 24, "italic"), fg='darkblue')
    for _ in range(3)
]

# ... then stack them vertically in that same order
for widget in (input_word_label, input_word_entry, predict_button, *prediction_labels):
    widget.pack()

# Blocks until the window is closed
root.mainloop()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 19, 100)           277900    
                                                                 
 lstm_1 (LSTM)               (None, 150)               150600    
                                                                 
 dense_1 (Dense)             (None, 2779)              419629    
                                                                 
Total params: 848129 (3.24 MB)
Trainable params: 848129 (3.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [5]:
from tensorflow.keras.layers import SimpleRNN

# Define the RNN model: same architecture as the LSTM variant, but with a
# SimpleRNN recurrent layer instead.
vocab_size = len(tokenizer.word_index) + 1
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_len - 1),
    SimpleRNN(150),  # Using SimpleRNN instead of LSTM
    Dense(vocab_size, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the model
model.fit(x, y, epochs=10)

# Function to correct typos
def correct_typos(text):
    """Return ``text`` with every whitespace-separated word spell-corrected.

    Args:
        text: The raw string typed by the user.

    Returns:
        A single string with the corrected words joined by one space. Words
        for which the spell checker has no suggestion are kept unchanged.
    """
    spell = SpellChecker()
    # correction() may return None when no candidate exists; joining a None
    # would raise TypeError, so fall back to the word as typed.
    corrected_words = [spell.correction(word) or word for word in text.split()]
    return ' '.join(corrected_words)

# GUI prediction function with typo correction
def predict_words():
    """Spell-correct the entry text and show the top next-word candidates.

    Reads the text from ``input_word_entry``, replaces it with the
    spell-corrected version, runs the model once on that text and writes the
    most probable next words (best first) into ``prediction_labels``.
    Labels with no candidate are cleared.
    """
    input_word = input_word_entry.get()
    corrected_word = correct_typos(input_word)
    input_word_entry.delete(0, tk.END)  # Clear the input field
    input_word_entry.insert(0, corrected_word)  # Update input field with corrected word

    num_predictions = 3

    # The input does not change between candidates, so one forward pass is
    # enough; repeating it would only yield the same arg-max word each time.
    token_text = tokenizer.texts_to_sequences([corrected_word])[0]
    padded_token_text = pad_sequences([token_text], maxlen=max_len - 1, padding="pre")
    probabilities = model.predict(padded_token_text, verbose=0)[0]

    # Walk the classes from most to least probable and keep the first few
    # that map to a real word. Index 0 has no entry in index_word, so a
    # direct lookup could raise KeyError.
    top_words = []
    for class_index in np.argsort(probabilities)[::-1]:
        candidate = tokenizer.index_word.get(int(class_index))
        if candidate:
            top_words.append(candidate)
        if len(top_words) == num_predictions:
            break

    for slot, target_label in enumerate(prediction_labels[:num_predictions]):
        shown_text = top_words[slot] if slot < len(top_words) else ""
        target_label.config(text=shown_text, fg='black')  # Update prediction labels


# Setup tkinter GUI
root = tk.Tk()
root.title("Word Prediction")
root.configure(bg='lightblue')

# Build the widgets first (in top-to-bottom display order) ...
input_word_label = tk.Label(root, text="Input Word:", font=("Palatino", 24, "bold"), fg="blue")
input_word_entry = tk.Entry(root, font=("Palatino", 22), bg="lightgray", width=60)
predict_button = tk.Button(
    root,
    text="Predict",
    command=predict_words,
    font=("Palatino", 18, "bold"),
    bg='black',
    fg='white',
)
prediction_labels = [
    tk.Label(root, text="", font=("Palatino", 24, "italic"), fg='darkblue')
    for _ in range(3)
]

# ... then stack them vertically in that same order
for widget in (input_word_label, input_word_entry, predict_button, *prediction_labels):
    widget.pack()

# Blocks until the window is closed
root.mainloop()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 19, 100)           277900    
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 150)               37650     
                                                                 
 dense_4 (Dense)             (None, 2779)              419629    
                                                                 
Total params: 735179 (2.80 MB)
Trainable params: 735179 (2.80 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [6]:
pip install transformers


Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.38.1-py3-none-any.whl.metadata (131 kB)
     ---------------------------------------- 0.0/131.1 kB ? eta -:--:--
     --- ------------------------------------ 10.2/131.1 kB ? eta -:--:--
     ----------- ------------------------- 41.0/131.1 kB 487.6 kB/s eta 0:00:01
     -------------------------------------- 131.1/131.1 kB 1.1 MB/s eta 0:00:00
Collecting filelock (from transformers)
  Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.2-cp311-none-win_amd64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.2-cp311-none-win_amd64.whl.metadata (3.9 kB)
Collecting tqdm>=4.27 (from tran

In [7]:
import numpy as np
import regex as re
import tkinter as tk
from spellchecker import SpellChecker
from transformers import BertTokenizer, TFBertForMaskedLM

# Function to correct typos
def correct_typos(text):
    """Return ``text`` with every whitespace-separated word spell-corrected.

    Args:
        text: The raw string typed by the user.

    Returns:
        A single string with the corrected words joined by one space. Words
        for which the spell checker has no suggestion are kept unchanged.
    """
    spell = SpellChecker()
    # correction() may return None when no candidate exists; joining a None
    # would raise TypeError, so fall back to the word as typed.
    corrected_words = [spell.correction(word) or word for word in text.split()]
    return ' '.join(corrected_words)

# Both the tokenizer and the masked-language model come from the same
# pre-trained checkpoint.
bert_checkpoint = 'bert-base-uncased'

# Load pre-trained model tokenizer
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

# Load pre-trained model
model = TFBertForMaskedLM.from_pretrained(bert_checkpoint)

# Function to predict the next word using BERT
def predict_next_word(text):
    """Return the three most likely tokens for the masked position in ``text``.

    Args:
        text: The prompt. If it does not already contain the tokenizer's mask
            token, one is appended so the model predicts the word that follows.

    Returns:
        A list of three token strings, most probable first.
    """
    # Guarantee there is a mask to fill in; callers may or may not add one.
    if tokenizer.mask_token not in text:
        text = f"{text} {tokenizer.mask_token}"

    # encode() wraps the text in the special start/end tokens. The last id is
    # therefore the end-of-sequence marker, not a word, and must not be
    # overwritten with a mask.
    input_ids = tokenizer.encode(text, return_tensors='tf')

    # First element of the model output holds the per-position vocabulary scores
    predictions = model(input_ids)[0]

    # Position of the (first) mask token within the single encoded sequence
    masked_index = np.where(input_ids.numpy()[0] == tokenizer.mask_token_id)[0][0]

    # argsort is ascending: take the last three ids and flip to best-first
    mask_scores = predictions[0, masked_index, :].numpy()
    top_token_ids = np.argsort(mask_scores)[-3:][::-1]

    return tokenizer.convert_ids_to_tokens(top_token_ids.tolist())

# GUI prediction function with typo correction
def predict_words():
    """Spell-correct the entry text and display BERT's next-word suggestions.

    The corrected text replaces the entry contents, a mask token is appended
    to it, and each word returned by ``predict_next_word`` is written into the
    prediction label at the same position.
    """
    typed_text = input_word_entry.get()
    cleaned_text = correct_typos(typed_text)

    # Swap the entry contents for the spell-corrected version
    input_word_entry.delete(0, tk.END)
    input_word_entry.insert(0, cleaned_text)

    # Mask token at the end marks the position the model should fill in
    masked_prompt = f"{cleaned_text} {tokenizer.mask_token}"

    for slot, candidate in enumerate(predict_next_word(masked_prompt)):
        prediction_labels[slot].config(text=candidate, fg='black')

# Setup tkinter GUI
root = tk.Tk()
root.title("Word Prediction with BERT")
root.configure(bg='lightblue')

# Build the widgets first (in top-to-bottom display order) ...
input_word_label = tk.Label(root, text="Input Word:", font=("Palatino", 24, "bold"), fg="blue")
input_word_entry = tk.Entry(root, font=("Palatino", 22), bg="lightgray", width=60)
predict_button = tk.Button(
    root,
    text="Predict",
    command=predict_words,
    font=("Palatino", 18, "bold"),
    bg='black',
    fg='white',
)
prediction_labels = [
    tk.Label(root, text="", font=("Palatino", 24, "italic"), fg='darkblue')
    for _ in range(3)
]

# ... then stack them vertically in that same order
for widget in (input_word_label, input_word_entry, predict_button, *prediction_labels):
    widget.pack()

# Blocks until the window is closed
root.mainloop()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
