In [1]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [2]:
from google.colab import drive
import pandas as pd

# Mount Google Drive at /content/drive so that files stored there
# (e.g. under /content/drive/MyDrive) can be opened from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
from typing import List, Tuple
from Levenshtein import distance as edit_distance
from google.colab import drive

class SinhalaSpellChecker:
    """Dictionary-based spell checker for Sinhala text.

    Words are looked up in a word list loaded from a text file (one word per
    line). A Sinhala word that is not in the list is replaced by the most
    similar dictionary word, where similarity is the normalised edit distance
    ``1 - distance / max(len(a), len(b))``.
    """

    # A Sinhala token starts with a character from the Sinhala Unicode block
    # (U+0D80-U+0DFF) and may additionally contain ZERO WIDTH NON-JOINER
    # (U+200C) and ZERO WIDTH JOINER (U+200D). The joiner occurs inside
    # conjunct forms such as "ප්‍ර"; a pattern limited to the Sinhala block
    # classifies those words as non-Sinhala and they are never corrected.
    _SINHALA_WORD = re.compile(r"[\u0D80-\u0DFF][\u0D80-\u0DFF\u200C\u200D]*")

    def __init__(self, dictionary_path: str):
        """Create a checker backed by the word list stored at ``dictionary_path``."""
        self.dictionary = self._load_dictionary(dictionary_path)

    def _load_dictionary(self, path: str) -> set:
        """Read the word list at ``path`` into a set of words.

        Every non-blank line contributes one entry, with surrounding
        whitespace removed. If the file does not exist, an error message is
        printed and an empty set is returned instead of raising.
        """
        try:
            # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order
            # mark, which would otherwise become part of the first word.
            with open(path, 'r', encoding='utf-8-sig') as f:
                stripped_lines = (line.strip() for line in f)
                return {entry for entry in stripped_lines if entry}
        except FileNotFoundError:
            print(f"Error: Dictionary file '{path}' not found.")
            return set()

    def suggest_corrections(self, word: str, max_suggestions: int = 5,
                            min_similarity: float = 0.6) -> List[Tuple[str, float]]:
        """Return dictionary words similar to ``word``, best match first.

        Args:
            word: The word to find replacements for.
            max_suggestions: Maximum number of suggestions to return. Values
                of zero or less yield no suggestions for unknown words.
            min_similarity: Candidates must score strictly above this
                similarity (between 0 and 1) to be suggested.

        Returns:
            A list of ``(dictionary_word, similarity)`` tuples sorted by
            descending similarity, ties broken alphabetically. If ``word`` is
            itself in the dictionary the result is ``[(word, 1.0)]``. If the
            dictionary is empty an error is printed and ``[]`` is returned.
        """
        if not self.dictionary:
            print("Error: Dictionary is empty. Please load a valid dictionary.")
            return []

        if word in self.dictionary:
            return [(word, 1.0)]

        word_len = len(word)
        suggestions = []
        for dict_word in self.dictionary:
            longest = max(word_len, len(dict_word))
            # The edit distance is never smaller than the difference in
            # length, so this value is an upper bound on the similarity.
            # Skip the distance computation when even the bound cannot pass.
            if 1 - abs(word_len - len(dict_word)) / longest <= min_similarity:
                continue
            similarity = 1 - edit_distance(word, dict_word) / longest
            if similarity > min_similarity:
                suggestions.append((dict_word, similarity))

        # Break ties alphabetically so the ranking does not depend on the
        # iteration order of the dictionary set, which can vary between runs.
        suggestions.sort(key=lambda item: (-item[1], item[0]))
        # Clamp so a negative limit means "none" rather than slicing from the end.
        return suggestions[:max(max_suggestions, 0)]

    def correct_sentence(self, sentence: str) -> str:
        """Return ``sentence`` with each misspelled Sinhala word replaced.

        The sentence is split on whitespace and re-joined with single spaces.
        Tokens that are not made up entirely of Sinhala characters (plus
        zero-width joiners), such as Latin words, numbers, or words with
        punctuation attached, are passed through unchanged. Sinhala words
        missing from the dictionary are replaced by the top suggestion, or
        kept as-is when there is none.
        """
        corrected_words = []
        for word in sentence.split():
            # Skip non-Sinhala words or symbols
            if not self._SINHALA_WORD.fullmatch(word):
                corrected_words.append(word)
                continue

            if word in self.dictionary:
                corrected_words.append(word)
                continue

            suggestions = self.suggest_corrections(word)
            if suggestions:
                best_suggestion, _ = suggestions[0]
                corrected_words.append(best_suggestion)
            else:
                corrected_words.append(word)  # Keep the original if no suggestions
        return ' '.join(corrected_words)


# Path to the dictionary file (one word per line) on the mounted Google Drive.
dictionary_path = '/content/drive/MyDrive/extended_sinhala_dictionary.txt'
checker = SinhalaSpellChecker(dictionary_path)

# Interactive loop: read a sentence, print its corrected form, and repeat
# until the user types 'exit', interrupts the cell, or input is exhausted.
while True:
    try:
        user_input = input("Enter a Sinhala sentence (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or reaching end-of-input would otherwise
        # terminate the loop with a traceback instead of a clean exit.
        print("Exiting the Sinhala Spell Checker. Goodbye!")
        break

    if user_input.lower() == "exit":
        print("Exiting the Sinhala Spell Checker. Goodbye!")
        break

    if not user_input:
        # Nothing to correct; ask again rather than printing an empty result.
        continue

    corrected_sentence = checker.correct_sentence(user_input)
    print(f"Corrected Sentence: {corrected_sentence}")


Corrected Sentence: ඔහු ඉස්සරහා කෝටියක් ගියේය
Corrected Sentence: ඔහු ඉස්සරහා කුටියේ ගියේය
Corrected Sentence: අග්නදිග දූපතේ අක්කර වැඩිදියුණු කළ
Corrected Sentence: අග්නදිග දූපතේ අක්කර වැඩිදියුණු කළා
Corrected Sentence: ඔහුගේ අංගය අඩ පමණක් තිබුණි
Corrected Sentence: අක්කා අගනගරය නැරඹීමේ සතුටක් වූවා
Corrected Sentence: අක්කා අගනගරය නැරඹීමේ සතුටක් වූවා
Corrected Sentence: අක්කා අගනගරය නැරඹීමේ සතුටක් වූවා
Corrected Sentence: අක්කා අගනගරය නැරඹීමේ සතුටක් වූවා
Corrected Sentence: අනුරාධපුර අද විශේෂ ප්‍රධන දිනක්
