Step 1: Load Necessary Libraries

In [1]:
!pip install pandas python-Levenshtein
import pandas as pd
import Levenshtein
import numpy as np


Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

Step 2: Load Dataset


In [2]:
# Load the dataset
# The path points at /content, so it presumably expects the workbook to have
# been uploaded to a Colab session — TODO confirm before running elsewhere.
dataset_path = "/content/data-spell-checker.xlsx"
# With no sheet_name argument, read_excel loads the first sheet.
data = pd.read_excel(dataset_path)

# Preview the data
# head() prints the first five rows; later cells read the `word` and `label`
# columns of this frame.
print(data.head())


        word  label
0  අභිචෝදකයා      1
1      අංකනය      1
2       අංකන      1
3       අංකය      1
4  අංකාන්තරය      1


Step 3: Create a Correction Function

In [3]:
# Function to get the closest correct word using Levenshtein distance
def correct_word(input_word, correct_word_list):
    """Return the entry of ``correct_word_list`` closest to ``input_word``.

    Closeness is the Levenshtein (edit) distance. When several entries share
    the smallest distance the earliest one wins, and ``input_word`` itself is
    returned when ``correct_word_list`` is empty.
    """
    def edits_from_input(candidate):
        return Levenshtein.distance(input_word, candidate)

    # min() keeps the first minimal element, which matches a strict "<" scan.
    return min(correct_word_list, key=edits_from_input, default=input_word)


Step 4: Split the Dataset


In [4]:
# Rows whose label is 1 supply the list of correctly spelled words
correct_words = data.loc[data['label'] == 1, 'word'].tolist()

# Rows whose label is 0 supply the list of misspelled words
incorrect_words = data.loc[data['label'] == 0, 'word'].tolist()


Step 5: Correct the Misspelled Words

In [5]:
# Correcting misspelled words and displaying outputs
def correct_sentences(sentences):
    """Print each sentence with its misspelled words and a corrected version.

    A sentence is split on whitespace. Every token that is not found in the
    module-level ``correct_words`` list is reported as misspelled and replaced
    by the result of ``correct_word``; tokens that are found are kept as they
    are. Nothing is returned: the results are printed.

    Args:
        sentences: Iterable of sentence strings to check.
    """
    # Build the lookup set once per call: testing membership against the list
    # directly would rescan the whole vocabulary for every token.
    known_words = set(correct_words)

    for i, sentence in enumerate(sentences):
        print(f"\nSample Sentence {i+1}:")
        print(f"Original Sentence: {sentence}")

        misspelled_words = []
        corrected_words = []

        # Check and correct words
        for word in sentence.split():
            if word in known_words:
                corrected_words.append(word)
            else:
                misspelled_words.append(word)
                corrected_words.append(correct_word(word, correct_words))

        # Display results
        print(f"Misspelled Words: {misspelled_words}")
        corrected_sentence = " ".join(corrected_words)
        print(f"Corrected Sentence: {corrected_sentence}")



In [8]:
# Test with custom sentences
# Each entry is one whitespace-separated Sinhala sentence; correct_sentences
# splits it into words and checks every word against correct_words.
test_sentences = [
    "අම්මා යුහුෂුලුව අවදිවෙනවා",
    "උකුෂ්ෂා සාර්ථඛව සුනඛයකු පස්සේ එළවනවා",
    "ණාවිකයා සම්මත තාක්සණය නෞඛා පැදවීමට භාවිතා කරනවා",

]

# Prints the original sentence, the misspelled words and the corrected
# sentence for every entry in test_sentences.
correct_sentences(test_sentences)



Sample Sentence 1:
Original Sentence: අම්මා යුහුෂුලුව අවදිවෙනවා
Misspelled Words: ['යුහුෂුලුව']
Corrected Sentence: අම්මා යුහුසුලුව අවදිවෙනවා

Sample Sentence 2:
Original Sentence: උකුෂ්ෂා සාර්ථඛව සුනඛයකු පස්සේ එළවනවා
Misspelled Words: ['උකුෂ්ෂා', 'සාර්ථඛව']
Corrected Sentence: උකුස්සා සාර්ථකව සුනඛයකු පස්සේ එළවනවා

Sample Sentence 3:
Original Sentence: ණාවිකයා සම්මත තාක්සණය නෞඛා පැදවීමට භාවිතා කරනවා
Misspelled Words: ['ණාවිකයා', 'තාක්සණය', 'නෞඛා', 'පැදවීමට', 'භාවිතා']
Corrected Sentence: නාවිකයා සම්මත තාක්ෂණය නෞකා පැදවීම භාවිතය කරනවා




---
## To enhance the Sinhala Spelling Corrector further, the improved approach below ranks dictionary words by **edit distance** and reports the closest ones as correction suggestions for each misspelled word. The closest suggestion is applied automatically and the remaining suggestions are displayed alongside it, which helps identify and correct errors more reliably and can improve the accuracy of the Sinhala text correction.




---
Create a Function to Provide Suggestions


In [16]:
# Function to provide top N suggestions for a misspelled word
def get_suggestions(input_word, correct_word_list, top_n=3):
    """Return up to ``top_n`` words closest to ``input_word``.

    Candidates are ranked by Levenshtein (edit) distance, smallest first;
    words at the same distance keep their order from ``correct_word_list``.

    Args:
        input_word: The word to find suggestions for.
        correct_word_list: Iterable of candidate words.
        top_n: Maximum number of suggestions to return.

    Returns:
        A list of at most ``top_n`` words. It is empty when ``top_n`` is not
        positive or when ``correct_word_list`` has no entries.
    """
    import heapq  # local import so this cell does not rely on another cell

    # A negative count used as a slice bound would return almost the whole
    # vocabulary, so non-positive requests yield no suggestions instead.
    if top_n <= 0:
        return []

    def edit_distance(candidate):
        return Levenshtein.distance(input_word, candidate)

    # nsmallest is documented as equivalent to sorted(..., key=key)[:n], but
    # it avoids sorting the full vocabulary when only a few words are needed.
    return heapq.nsmallest(top_n, correct_word_list, key=edit_distance)


Correct Sentences with Suggestions

In [17]:
# Correct sentences and provide suggestions
def correct_sentence_with_suggestions(sentences, correct_word_list, top_n=3):
    """Print suggestions and a corrected version for each sentence.

    A sentence is split on whitespace. Every token that is not found in
    ``correct_word_list`` is reported as misspelled, its suggestions from
    ``get_suggestions`` are printed, and the first suggestion replaces it in
    the corrected sentence. Nothing is returned: the results are printed.

    Args:
        sentences: Iterable of sentence strings to check.
        correct_word_list: Collection of correctly spelled words.
        top_n: Maximum number of suggestions to show per misspelled word.
    """
    # Build the lookup set once per call: testing membership against the list
    # directly would rescan the whole vocabulary for every token.
    known_words = set(correct_word_list)

    for i, sentence in enumerate(sentences):
        print(f"\nSample Sentence {i+1}:")
        print(f"Original Sentence: {sentence}")

        misspelled_words = []
        corrected_words = []

        # Check for misspelled words
        for word in sentence.split():
            if word in known_words:
                corrected_words.append(word)
                continue

            misspelled_words.append(word)
            suggestions = get_suggestions(word, correct_word_list, top_n)
            print(f"Word '{word}' is misspelled. Suggestions: {suggestions}")

            # Automatically pick the closest suggestion. When there is none
            # (empty vocabulary or top_n of 0) keep the word as written
            # instead of failing on suggestions[0].
            corrected_words.append(suggestions[0] if suggestions else word)

        # Display results
        print(f"Misspelled Words: {misspelled_words}")
        corrected_sentence = " ".join(corrected_words)
        print(f"Corrected Sentence: {corrected_sentence}")


In [29]:
# Extract correct words from dataset
# correct_words is rebuilt from `data` here, so this cell does not depend on
# an earlier assignment of the same name.
correct_words = data[data['label'] == 1]['word'].tolist()

# Example sentences to test the implementation
# Each entry is one whitespace-separated Sinhala sentence.
test_sentences = [
    "උකුෂ්ෂා සාර්ථඛව සුනඛයකු පස්සේ එළවනවා",
    "ණාවිකයා සම්මත තාක්සණය නෞඛා පැදවීමට භාවිතා කරනවා",
    "මුරඛාරයා සැළකිළිමත්ව වීදිය පසුකරනවා",
    "අම්මා යුහුෂුලුව අවදිවෙනවා",
    "සමකාළීන වෙඩික්කාරයා වෙඩිතියනවා",
    "වෙඩික්කාරයා වෙඩිතියනවා",
    "අම්මා අවදිවෙනවා"
]

# Run the correction function
# Up to three suggestions are requested for every word that is not in
# correct_words.
correct_sentence_with_suggestions(test_sentences, correct_words, top_n=3)



Sample Sentence 1:
Original Sentence: උකුෂ්ෂා සාර්ථඛව සුනඛයකු පස්සේ එළවනවා
Word 'උකුෂ්ෂා' is misspelled. Suggestions: ['උකුස්සා', 'උකුණා', 'කුක්කා']
Word 'සාර්ථඛව' is misspelled. Suggestions: ['සාර්ථකව', 'සාර්ථක', 'සාර්ථය']
Misspelled Words: ['උකුෂ්ෂා', 'සාර්ථඛව']
Corrected Sentence: උකුස්සා සාර්ථකව සුනඛයකු පස්සේ එළවනවා

Sample Sentence 2:
Original Sentence: ණාවිකයා සම්මත තාක්සණය නෞඛා පැදවීමට භාවිතා කරනවා
Word 'ණාවිකයා' is misspelled. Suggestions: ['නාවිකයා', 'ජාතිකයා', 'පාදිකයා']
Word 'තාක්සණය' is misspelled. Suggestions: ['තාක්ෂණය', 'තක්ෂණය', 'අක්සනය']
Word 'නෞඛා' is misspelled. Suggestions: ['නෞකා', 'නයා', 'නා']
Word 'පැදවීමට' is misspelled. Suggestions: ['පැදවීම', 'පැටවීම', 'පැනවීම']
Word 'භාවිතා' is misspelled. Suggestions: ['භාවිතය', 'භාවිතාව', 'අභාවිතය']
Misspelled Words: ['ණාවිකයා', 'තාක්සණය', 'නෞඛා', 'පැදවීමට', 'භාවිතා']
Corrected Sentence: නාවිකයා සම්මත තාක්ෂණය නෞකා පැදවීම භාවිතය කරනවා

Sample Sentence 3:
Original Sentence: මුරඛාරයා සැළකිළිමත්ව වීදිය පසුකරනවා
Word 'මුරඛාරයා'

In [30]:
def calculate_accuracy(test_sentences, correct_words):
    """Return the percentage of words in ``test_sentences`` found in ``correct_words``.

    Each sentence is split on whitespace and every resulting token is
    counted. The sentences are checked exactly as given, so the result is the
    share of input words already present in the vocabulary; no correction is
    applied before counting.

    Args:
        test_sentences: Iterable of sentence strings.
        correct_words: Collection of correctly spelled words.

    Returns:
        A float between 0 and 100. ``0.0`` is returned when the sentences
        contain no words at all, instead of dividing by zero.
    """
    # Build the lookup set once: list membership would rescan the whole
    # vocabulary for every token.
    known_words = set(correct_words)

    correct_count = 0
    total_words = 0
    for sentence in test_sentences:
        words = sentence.split()
        total_words += len(words)
        correct_count += sum(word in known_words for word in words)

    if total_words == 0:
        return 0.0
    return (correct_count / total_words) * 100

# Example usage (assuming correct_words and test_sentences are defined)
# calculate_accuracy is given the uncorrected test_sentences, so the value
# printed here reflects those sentences as written.
accuracy = calculate_accuracy(test_sentences, correct_words)
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 57.69%
