**Unzip the dataset**

In [1]:
from zipfile import ZipFile

with ZipFile("Sinhala_dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("Sinhala_dataset")

**Setting Up the Dataset**

In [12]:
import os
import re
from collections import Counter

# Directory containing your .txt files
data_dir = 'Sinhala_dataset/Sinhala_dataset'

# List to store all sentences and words
sentences = []
word_list = []

# Define a pattern to match only valid Sinhala words
sinhala_word_pattern = r'[\u0D80-\u0DFF]+'

# Function to clean and tokenize text
def clean_and_tokenize(text):
    # Remove any unwanted characters, leaving only Sinhala words
    clean_sentences = []
    raw_sentences = re.split(r'(?<=[.?!])\s+', text)

    for sentence in raw_sentences:
        # Find all Sinhala words in the sentence
        words = re.findall(sinhala_word_pattern, sentence)
        if words:  # Avoid empty sentences
            clean_sentences.append(" ".join(words))
            word_list.extend(words)  # Add words to the global list

    return clean_sentences

# Process each .txt file in the dataset directory
for filename in os.listdir(data_dir):
    if filename.endswith(".txt"):
        file_path = os.path.join(data_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            file_sentences = clean_and_tokenize(text)
            sentences.extend(file_sentences)  # Add file sentences to global list

# Create dictionary by counting unique words
word_count = Counter(word_list)
unique_words = sorted(word_count.keys())  # Sorted list of unique words for spelling checker

# Save the unique words to a dictionary file for later use
with open("sinhala_dictionary.txt", "w", encoding="utf-8") as f:
    for word in unique_words:
        f.write(f"{word}\n")

# Output
print(f"Total sentences: {len(sentences)}")
print(f"Total unique words: {len(unique_words)}")


Total sentences: 17796
Total unique words: 36896


**Generating Sample Pairs of Misspelled and Correct Words**

In [14]:
import random

# Load dictionary of unique words
with open("sinhala_dictionary.txt", "r", encoding="utf-8") as f:
    unique_words = [line.strip() for line in f]

# Generate misspelled-correct pairs
misspelled_words = []
correct_words = []

# Simulate misspellings by altering one character in each word
for word in unique_words[:10000]:
    if len(word) > 1:
        misspelled = list(word)
        rand_index = random.randint(0, len(misspelled) - 1)
        misspelled[rand_index] = random.choice(list("අආඇඈඉඊඋඌඑඒඔඕකගජටඩතදනපබමයරලවශසහෆ"))
        misspelled = ''.join(misspelled)
        misspelled_words.append(misspelled)
        correct_words.append(word)


In [23]:
misspelled_words[:10]

['අර',
 'අංල',
 'අංකඑ',
 'ඩංකයකින්',
 'අජකයට',
 'අංකඋර',
 'අංකසර',
 'අංත',
 'අංගණඇට',
 'අදගනය']

**Proceeding with Tokenization and Model Training**

In [17]:
!pip install tensorflow



In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Tokenize the words at the character level
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(misspelled_words + correct_words)
sequences = tokenizer.texts_to_sequences(misspelled_words)

# Pad sequences to ensure consistent input length
max_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_length, padding='post')
y = tokenizer.texts_to_sequences(correct_words)
y = pad_sequences(y, maxlen=max_length, padding='post')

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model architecture
# Get the vocabulary size
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64, return_sequences=True), # Return sequences for each time step
    # Add a Dense layer to output probabilities for each character in the vocabulary
    Dense(vocab_size, activation='softmax') # Output layer with vocab_size units and softmax activation
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10




[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 64ms/step - accuracy: 0.6270 - loss: 1.9538 - val_accuracy: 0.7035 - val_loss: 1.1432
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 64ms/step - accuracy: 0.7242 - loss: 1.0167 - val_accuracy: 0.8635 - val_loss: 0.5998
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 69ms/step - accuracy: 0.8867 - loss: 0.5177 - val_accuracy: 0.9227 - val_loss: 0.3697
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 63ms/step - accuracy: 0.9292 - loss: 0.3554 - val_accuracy: 0.9390 - val_loss: 0.3092
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 67ms/step - accuracy: 0.9411 - loss: 0.3050 - val_accuracy: 0.9444 - val_loss: 0.2773
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 62ms/step - accuracy: 0.9459 - loss: 0.2711 - val_accuracy: 0.9471 - val_loss: 0.2598
Epoch 7/10
[1m250/250[0m 

<keras.src.callbacks.history.History at 0x7b1cabb3eec0>

**Saving the Model for Future Use**

In [22]:
model.save('spelling_checker_model.h5')



In [None]:
model.load('spelling_checker_model.h5')