Step 1: Install Required Libraries

In [None]:
!pip install tensorflow keras




Step 2: Load and Preprocess Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load dataset
# The spreadsheet must provide a 'word' column and a 'label' column.
file_path = '/content/data-spell-checker.xlsx'
data = pd.read_excel(file_path)

# Preprocess data
data = data.dropna()  # Remove rows that contain a missing value
words = data['word'].values
labels = data['label'].values  # 1 for correct, 0 for incorrect

# Tokenize the words
# char_level=True treats every character as its own token.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(words)
sequences = tokenizer.texts_to_sequences(words)

# Pad sequences
# Every word is padded at the end ('post') up to the length of the longest word.
max_len = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = to_categorical(labels, num_classes=2)  # Convert labels to one-hot encoding

# Split data
# Stratify on the raw labels so the training and validation sets keep the same
# correct/incorrect ratio as the full dataset; a purely random split can skew
# that ratio and make the validation accuracy misleading.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=labels
)


Step 3: Define and Train the LSTM Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the LSTM model
# Character ids -> 50-d embeddings -> LSTM summary vector -> dense head.
model = Sequential([
    # input_dim is vocabulary size + 1 because the tokenizer never assigns
    # index 0, which is the value used for padding.
    # The deprecated `input_length` argument is intentionally omitted: Keras 3
    # ignores it with a warning, and the sequence length is inferred from the
    # data passed to fit().
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50),
    LSTM(128, return_sequences=False),  # keep only the final state
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')  # Binary classification: correct (1) or incorrect (0)
])

# Compile the model
# categorical_crossentropy matches the one-hot encoded labels.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=10,  # Adjust epochs for better performance
    verbose=2
)

# Save the model
# The inference cell further down reloads the model from this exact path.
model.save('/content/sinhala_spell_checker.h5')


Epoch 1/10




2553/2553 - 140s - 55ms/step - accuracy: 0.7464 - loss: 0.5339 - val_accuracy: 0.7696 - val_loss: 0.5006
Epoch 2/10
2553/2553 - 142s - 56ms/step - accuracy: 0.7715 - loss: 0.4922 - val_accuracy: 0.7800 - val_loss: 0.4879
Epoch 3/10
2553/2553 - 153s - 60ms/step - accuracy: 0.8357 - loss: 0.3920 - val_accuracy: 0.8833 - val_loss: 0.2880
Epoch 4/10
2553/2553 - 200s - 78ms/step - accuracy: 0.8881 - loss: 0.2711 - val_accuracy: 0.8981 - val_loss: 0.2464
Epoch 5/10
2553/2553 - 136s - 53ms/step - accuracy: 0.9062 - loss: 0.2285 - val_accuracy: 0.9113 - val_loss: 0.2153
Epoch 6/10
2553/2553 - 144s - 56ms/step - accuracy: 0.9213 - loss: 0.1969 - val_accuracy: 0.9247 - val_loss: 0.1926
Epoch 7/10
2553/2553 - 141s - 55ms/step - accuracy: 0.9309 - loss: 0.1750 - val_accuracy: 0.9328 - val_loss: 0.1725
Epoch 8/10
2553/2553 - 140s - 55ms/step - accuracy: 0.9385 - loss: 0.1583 - val_accuracy: 0.9389 - val_loss: 0.1608
Epoch 9/10
2553/2553 - 144s - 57ms/step - accuracy: 0.9445 - loss: 0.1443 - val_acc



Step 4: Display Output

In [None]:
from difflib import get_close_matches

# Function to predict if a word is correct or incorrect
def predict_word(word):
    """Classify a single word with the trained model.

    Args:
        word: The word to classify.

    Returns:
        Index of the highest-scoring class: 0 = incorrect, 1 = correct.
    """
    # Encode with the training tokenizer and pad to the training length so the
    # input has the same layout the model was fitted on.
    seq = tokenizer.texts_to_sequences([word])
    padded_seq = pad_sequences(seq, maxlen=max_len, padding='post')
    # verbose=0: this function runs once per word, and the default verbosity
    # prints a progress bar for every single call.
    pred = model.predict(padded_seq, verbose=0)
    return np.argmax(pred)  # 0 = Incorrect, 1 = Correct

# Function to correct misspelled words
def auto_correct(word, correct_words):
    """Return `word` itself, or the closest known-correct replacement.

    Args:
        word: The word to check.
        correct_words: Sequence of correctly spelled words to search for a
            replacement.

    Returns:
        `word` unchanged when the classifier labels it correct (class 1) or
        when no candidate reaches the 0.7 similarity cutoff; otherwise the
        single best match from `correct_words`.
    """
    model_accepts_word = predict_word(word) == 1
    if not model_accepts_word:
        suggestions = get_close_matches(word, correct_words, n=1, cutoff=0.7)
        if suggestions:
            return suggestions[0]
    return word

# Correct a sentence and display output in the desired format
def process_sentence(sentence, sample_number, correct_words):
    """Spell-check one sentence and print a short report.

    Args:
        sentence: Whitespace-separated text to check.
        sample_number: Number shown in the report header.
        correct_words: Sequence of correctly spelled words used as the source
            of replacements.

    Prints the original sentence, the list of words that were replaced, and
    the corrected sentence followed by a blank line. Returns None.
    """
    original_tokens = sentence.split()

    # Correct every token in order, then collect the ones that were changed.
    corrected_words = [auto_correct(token, correct_words) for token in original_tokens]
    misspelled_words = [
        before
        for before, after in zip(original_tokens, corrected_words)
        if after != before
    ]
    corrected_sentence = ' '.join(corrected_words)

    # Display the output
    print(f"Sample Sentence {sample_number}:")
    print(f"Original Sentence: {sentence}")
    print(f"Misspelled Words: {misspelled_words}")
    print(f"Corrected Sentence: {corrected_sentence}\n")

# Reference vocabulary: every dataset entry whose label is 1 (correct)
correct_words = [
    candidate
    for candidate, flag in zip(words, labels)
    if flag == 1
]

# Sample sentences to run through the checker
sentences = [
    "අම්මා යුහුෂුලුව අවදිවෙනවා",
    "උකුෂ්ෂා සාර්ථඛව සුනඛයකු පස්සේ එළවනවා",
    "සමකාළීන වෙඩික්කාරයා වෙඩිතියනවා",
    "මුරඛාරයා සැළකිළිමත්ව වීදිය පසුකරනවා",
    "ණාවිකයා සම්මත තාක්සණය නෞඛා පැදවීමට භාවිතා කරනවා",
]

# Print one report per sentence, numbering the samples from 1
for i, sentence in enumerate(sentences, 1):
    process_sentence(sentence, i, correct_words)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Sample Sentence 1:
Original Sentence: අම්මා යුහුෂුලුව අවදිවෙනවා
Misspelled Words: ['යුහුෂුලුව']
Corrected Sentence: අම්මා යුහුසුලුව අවදිවෙනවා

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Sample Sentence 2:
Original Sentence: උකුෂ්ෂා සාර්ථඛව සුනඛයකු පස්සේ එළවනවා
Misspelled Words: ['උකුෂ්ෂා', 'සාර්ථඛව']
Corrected Sentence: උකුස්සා සාර්ථකව සුනඛයකු පස්සේ එළවනවා

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━



---



In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.8.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.1 (from gradio)
  Downloading gradio_client-1.5.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.19-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [None]:
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from difflib import get_close_matches

# Restore the classifier written by the training step
model = load_model('/content/sinhala_spell_checker.h5')

# Re-read the dataset: it supplies the list of correct words and the text the
# tokenizer is fitted on
file_path = '/content/data-spell-checker.xlsx'
data = pd.read_excel(file_path).dropna()
words = data['word'].values
labels = data['label'].values
correct_words = [entry for entry, flag in zip(words, labels) if flag == 1]

# Rebuild the character-level tokenizer from the same words as in training so
# that character ids and the padded length match what the model expects
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(words)
max_len = max(map(len, tokenizer.texts_to_sequences(words)))


def predict_word(word):
    """Classify a single word with the loaded model.

    Args:
        word: The word to classify.

    Returns:
        Index of the highest-scoring class: 0 = incorrect, 1 = correct.
    """
    # Encode with the rebuilt tokenizer and pad to max_len so the input has
    # the layout the model expects.
    seq = tokenizer.texts_to_sequences([word])
    padded_seq = pad_sequences(seq, maxlen=max_len, padding='post')
    # verbose=0: this function runs once per word of every request, and the
    # default verbosity prints a progress bar for every single call.
    pred = model.predict(padded_seq, verbose=0)
    return np.argmax(pred)  # 0 = Incorrect, 1 = Correct


def auto_correct(word, correct_words):
    """Return `word` itself, or the closest known-correct replacement.

    Args:
        word: The word to check.
        correct_words: Sequence of correctly spelled words to search for a
            replacement.

    Returns:
        `word` unchanged when the classifier labels it correct (class 1) or
        when no candidate reaches the 0.7 similarity cutoff; otherwise the
        single best match from `correct_words`.
    """
    needs_fix = predict_word(word) != 1
    if needs_fix:
        # At most one suggestion is requested; fall back to the input word
        # when the list comes back empty.
        best_matches = get_close_matches(word, correct_words, n=1, cutoff=0.7)
        return next(iter(best_matches), word)
    return word


def correct_sentence(sentence):
    """Spell-check every whitespace-separated word of `sentence`.

    Args:
        sentence: Text entered by the user.

    Returns:
        The corrected words joined with single spaces.
    """
    # Each token is checked against the module-level `correct_words` list.
    return " ".join(
        auto_correct(token, correct_words) for token in sentence.split()
    )

# Two-line text box for the sentence to be checked
_sentence_box = gr.Textbox(lines=2, placeholder="Enter Sinhala text here...")

# Web UI: the submitted text goes through correct_sentence and the corrected
# text is shown as plain text
iface = gr.Interface(
    fn=correct_sentence,
    inputs=_sentence_box,
    outputs="text",
    title="Sinhala Spell Checker",
    description="Enter Sinhala text to correct spelling mistakes.",
)

# Start the app
iface.launch()



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://964405f2cb425ea9f5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


