In [1]:
from google.colab import drive
drive.mount('/content/drive')

import re

def load_dictionary(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        # Read and clean the dictionary, assume each word is on a new line
        dictionary = set(word.strip().lower() for word in file.readlines())
    return dictionary

# Load the Sinhala dictionary
dictionary = load_dictionary('/content/drive/MyDrive/Al_Spelling and grammer checker/sinhala_dictionary_new.txt')

Mounted at /content/drive


In [2]:
# Function to calculate edit distance between two words considering letters and positions
def edit_distance(word1, word2):
    m, n = len(word1), len(word2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0:
                dp[i][j] = j
            elif j == 0:
                dp[i][j] = i
            elif word1[i-1] == word2[j-1]:
                dp[i][j] = dp[i-1][j-1]
            else:
                dp[i][j] = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])

    return dp[m][n]

# Enhanced candidate generation considering letter positions
def generate_candidates(word, dictionary):
    candidates = []
    for dict_word in dictionary:
        # Calculate the edit distance for each word in the dictionary
        dist = edit_distance(word, dict_word)
        if dist <= 1:  # You can adjust the threshold based on your needs
            candidates.append((dict_word, dist))
    return candidates

# Example usage of candidate generation
misspelled_word = "පාසට"  # Replace with your input word
candidates = generate_candidates(misspelled_word, dictionary)
print(f"Candidates for the word '{misspelled_word}': {candidates}")


Candidates for the word 'පාසට': [('පාසා', 1), ('පාට', 1), ('පාසලට', 1), ('පාටට', 1), ('පසට', 1), ('පාසල', 1)]


In [3]:
from collections import Counter

# Score candidates based on their edit distance and frequency
def score_candidates(candidates, word_frequency):
    scored_candidates = []
    for candidate, dist in candidates:
        score = word_frequency.get(candidate, 0) - dist  # Higher frequency and lower edit distance score higher
        scored_candidates.append((candidate, score))
    scored_candidates.sort(key=lambda x: x[1], reverse=True)  # Sort candidates by score
    return scored_candidates

# Example usage of scoring candidates
word_frequency = Counter(dictionary)  # Frequency of words in the dictionary
scored_candidates = score_candidates(candidates, word_frequency)
print(f"Scored candidates: {scored_candidates}")


Scored candidates: [('පාසා', 0), ('පාට', 0), ('පාසලට', 0), ('පාටට', 0), ('පසට', 0), ('පාසල', 0)]


In [4]:
# Function to correct spelling considering all letters and positions
def correct_spelling(word, dictionary, word_frequency):
    if word in dictionary:  # If the word is in the dictionary, return it unchanged
        return word
    candidates = generate_candidates(word, dictionary)
    if not candidates:
        return word  # No correction found, return the original word
    scored_candidates = score_candidates(candidates, word_frequency)
    # Return the highest scoring candidate
    return scored_candidates[0][0]

# Example usage of spelling correction
corrected_word = correct_spelling(misspelled_word, dictionary, word_frequency)
print(f"Corrected word: {corrected_word}")


Corrected word: පාසා


In [5]:
# Function to check spelling for a whole input text
def spell_checker(input_text, dictionary, word_frequency):
    words = input_text.split()  # Split the text into individual words
    corrected_words = [correct_spelling(word, dictionary, word_frequency) for word in words]
    return ' '.join(corrected_words)

# Example usage for correcting an entire sentence
input_text = "ගුරැවරුන් පාසට වෙත පැමිණෙයි"
corrected_text = spell_checker(input_text, dictionary, word_frequency)
print(f"Corrected text: {corrected_text}")


Corrected text: ගුරුවරුන් පාසා වෙත පැමිණෙයි


In [6]:
misspelled_texts = [
    "ගුරැවරුන් පාසට වෙත පැමිනෙයි",
    "ඔයා හොද අයෙක්",
    "අද කුමුදු ගමනක ගියා",
    "ළමයි සෙලලම් කර කර හිටයා.",
    "වැස්ස නිසා අපිට එළියට යන්න බරි වණා"
]

for text in misspelled_texts:
    corrected_text = spell_checker(text, dictionary, word_frequency)
    print(f"\nOriginal: {text}, \nCorrected: {corrected_text}")



Original: ගුරැවරුන් පාසට වෙත පැමිනෙයි, 
Corrected: ගුරුවරුන් පාසා වෙත පැමිණෙයි

Original: ඔයා හොද අයෙක්, 
Corrected: ඔයා හොර අයෙක්

Original: අද කුමුදු ගමනක ගියා, 
Corrected: අද කුමුදු ගමනට ගියා

Original: ළමයි සෙලලම් කර කර හිටයා., 
Corrected: ළමයිට සෙල්ලම් කර කර හිටයා.

Original: වැස්ස නිසා අපිට එළියට යන්න බරි වණා, 
Corrected: වැස්ස නිසා අපිට එළියට යන්න බැරි වගා
