Step 1: Install Required Libraries

In [8]:
!pip install tensorflow keras




Step 2: Load and Preprocess Dataset

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load dataset
file_path = '/content/data-spell-checker.xlsx'
data = pd.read_excel(file_path)

# Preprocess data
data = data.dropna()  # Remove missing values
words = data['word'].values
labels = data['label'].values  # 1 for correct, 0 for incorrect

# Tokenize the words
tokenizer = Tokenizer(char_level=True)  # Tokenize at character level
tokenizer.fit_on_texts(words)
sequences = tokenizer.texts_to_sequences(words)

# Pad sequences
max_len = max(len(seq) for seq in sequences)  # Set max length to the longest word
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = to_categorical(labels, num_classes=2)  # Convert labels to one-hot encoding

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


Step 3: Define and Train the LSTM Model

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the LSTM model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=max_len),
    LSTM(128, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')  # Binary classification: correct (1) or incorrect (0)
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=10,  # Adjust epochs for better performance
    verbose=2
)

# Save the model
model.save('/content/sinhala_spell_checker.h5')


Epoch 1/10




2553/2553 - 178s - 70ms/step - accuracy: 0.7614 - loss: 0.5179 - val_accuracy: 0.8374 - val_loss: 0.3868
Epoch 2/10
2553/2553 - 204s - 80ms/step - accuracy: 0.8619 - loss: 0.3589 - val_accuracy: 0.8767 - val_loss: 0.3339
Epoch 3/10
2553/2553 - 205s - 80ms/step - accuracy: 0.8789 - loss: 0.3104 - val_accuracy: 0.8859 - val_loss: 0.2843
Epoch 4/10
2553/2553 - 174s - 68ms/step - accuracy: 0.8917 - loss: 0.2674 - val_accuracy: 0.9028 - val_loss: 0.2381
Epoch 5/10
2553/2553 - 200s - 78ms/step - accuracy: 0.9120 - loss: 0.2175 - val_accuracy: 0.9172 - val_loss: 0.2131
Epoch 6/10
2553/2553 - 174s - 68ms/step - accuracy: 0.9264 - loss: 0.1846 - val_accuracy: 0.9300 - val_loss: 0.1781
Epoch 7/10
2553/2553 - 248s - 97ms/step - accuracy: 0.9357 - loss: 0.1630 - val_accuracy: 0.9362 - val_loss: 0.1638
Epoch 8/10
2553/2553 - 213s - 83ms/step - accuracy: 0.9432 - loss: 0.1465 - val_accuracy: 0.9411 - val_loss: 0.1559
Epoch 9/10
2553/2553 - 197s - 77ms/step - accuracy: 0.9497 - loss: 0.1319 - val_acc



Step 4: Display Output

In [17]:
from difflib import get_close_matches

# Function to predict if a word is correct or incorrect
def predict_word(word):
    seq = tokenizer.texts_to_sequences([word])
    padded_seq = pad_sequences(seq, maxlen=max_len, padding='post')
    pred = model.predict(padded_seq)
    return np.argmax(pred)  # 0 = Incorrect, 1 = Correct

# Function to correct misspelled words
def auto_correct(word, correct_words):
    if predict_word(word) == 1:
        return word  # Word is correct
    close_matches = get_close_matches(word, correct_words, n=1, cutoff=0.7)
    return close_matches[0] if close_matches else word

# Correct a sentence and display output in the desired format
def process_sentence(sentence, sample_number, correct_words):
    words_in_sentence = sentence.split()
    misspelled_words = []
    corrected_words = []

    # Process each word in the sentence
    for word in words_in_sentence:
        corrected_word = auto_correct(word, correct_words)
        corrected_words.append(corrected_word)
        if corrected_word != word:  # If the word is corrected
            misspelled_words.append(word)

    # Display the output
    print(f"Sample Sentence {sample_number}:")
    print(f"Original Sentence: {sentence}")
    print(f"Misspelled Words: {misspelled_words}")
    print(f"Corrected Sentence: {' '.join(corrected_words)}\n")

# Test the function with multiple sentences
correct_words = [word for word, label in zip(words, labels) if label == 1]
sentences = [
    "අම්මා යුහුෂුලුව අවදිවෙනවා",
    "උකුෂ්ෂා සාර්ථඛව සුනඛයකු පස්සේ එළවනවා",
    "සමකාළීන වෙඩික්කාරයා වෙඩිතියනවා",
    "මුරඛාරයා සැළකිළිමත්ව වීදිය පසුකරනවා",
    "ණාවිකයා සම්මත තාක්සණය නෞඛා පැදවීමට භාවිතා කරනවා",
]

for i, sentence in enumerate(sentences, start=1):
    process_sentence(sentence, sample_number=i, correct_words=correct_words)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Sample Sentence 1:
Original Sentence: අම්මා යුහුෂුලුව අවදිවෙනවා
Misspelled Words: ['යුහුෂුලුව']
Corrected Sentence: අම්මා යුහුසුලුව අවදිවෙනවා

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Sample Sentence 2:
Original Sentence: උකුෂ්ෂා සාර්ථඛව සුනඛයකු පස්සේ එළවනවා
Misspelled Words: ['උකුෂ්ෂා', 'සාර්ථඛව']
Corrected Sentence: උකුස්සා සාර්ථකව සුනඛයකු පස්සේ එළවනවා

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━