In [11]:
import pandas as pd

# Load Excel dataset
file_path = "data-spell-checker.xlsx"
data = pd.read_excel(file_path)

# Extract correct words
dictionary = data[data['label'] == 1]['word'].tolist()
print(f"Loaded {len(dictionary)} correct Sinhala words.")


Loaded 67260 correct Sinhala words.


In [12]:
# Load Sinhala stop words
with open('stop words.txt', 'r', encoding='utf-8') as file:
    stopwords = set(file.read().splitlines())


In [13]:
import re

def preprocess_text(text, stopwords):
    # Use regex to remove non-alphanumeric characters except spaces
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (keeping spaces)
    
    # Tokenize the text into words
    words = cleaned_text.split()
    
    # Remove stop words
    filtered_words = [word for word in words if word not in stopwords]
    return filtered_words

In [14]:
def sinhala_stemmer(word):
    suffixes = ['ින්', 'ට', 'ව', 'ගේ', 'යන්', 'නවා']  # Add more relevant suffixes
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

In [15]:
def detect_spelling_errors(words, dictionary):
    # Find words not in the dictionary
    errors = [word for word in words if word not in dictionary]
    return errors


In [16]:
from fuzzywuzzy import process

def sinhala_soundex(word):
    phonetic_map = {
        'ක': '1', 'ඛ': '1',
        'ච': '2', 'ජ': '2', 'ඡ': '2', 'ඣ': '2',
        'ට': '3', 'ඩ': '3', 'ඨ': '3', 'ඪ': '3',
        'ත': '4', 'ථ': '4',
        'බ': '5', 'ඵ': '5', 'භ': '5',
        'ශ': '7', 'ෂ': '7', 'ස': '7',
        'ග': '8', 'ඝ': '8', 'ඟ': '8'
    }
    first_letter = word[0]
    soundex_code = [first_letter]
    for char in word[1:]:
        if char in phonetic_map:
            code = phonetic_map[char]
            if soundex_code[-1] != code:
                soundex_code.append(code)
    while len(soundex_code) < 4:
        soundex_code.append('0')  # Pad with zeros
    return ''.join(soundex_code[:4])



In [17]:
def suggest_corrections(errors, dictionary, limit=3, threshold=80):
    suggestions = {}
    for error in errors:
        matches = process.extract(error, dictionary, limit=limit)
        # Filter suggestions based on minimum similarity threshold
        filtered_matches = [match[0] for match in matches if match[1] >= threshold]
        suggestions[error] = filtered_matches
    return suggestions


In [18]:
def spell_checker(input_text, dictionary, stopwords):
    # Step 1: Preprocess input text
    words = preprocess_text(input_text, stopwords)
    # Step 2: Detect spelling errors
    errors = detect_spelling_errors(words, dictionary)
    
    if not errors:
        return "No spelling errors found!", {}

    # Step 3: Suggest corrections for detected errors
    corrections = suggest_corrections(errors, dictionary)
    return errors, corrections


In [19]:
def auto_correct(input_text, dictionary, stopwords):
    errors, corrections = spell_checker(input_text, dictionary, stopwords)
    words = input_text.split()

    # Replace each word with the top suggestion if available
    corrected_words = [
        corrections.get(word, [word])[0]  # If word is found in corrections, replace it
        for word in words
    ]
    
    return " ".join(corrected_words)


In [20]:
input_text = "කේක"

# Run the spell checker
errors, corrections = spell_checker(input_text, dictionary, stopwords)
print("Errors:", errors)
print("Suggestions:", corrections)

# Auto-correct the text
corrected_text = auto_correct(input_text, dictionary, stopwords)
print("Corrected Text:", corrected_text)


Errors: ['කක']
Suggestions: {'කක': ['අධිකකම', 'අවංකකම', 'එකක්වත්']}
Corrected Text: කේක


In [21]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import fuzz, process
from typing import List, Dict, Tuple

class AdvancedSinhalaSpellChecker:
    def __init__(self, dictionary_path: str, stopwords_path: str):
        # Load dictionary with more sophisticated preprocessing
        self.data = pd.read_excel(dictionary_path)
        self.dictionary = self._preprocess_dictionary()
        
        # Load stopwords
        with open(stopwords_path, 'r', encoding='utf-8') as file:
            self.stopwords = set(file.read().splitlines())
        
        # Advanced phonetic mapping
        self.phonetic_mapping = self._create_advanced_phonetic_mapping()
        
        # Suffix rules for more comprehensive stemming
        self.suffix_rules = [
            'ආගම', 'ගෙන', 'යෙහි', 'යේ', 'ට', 'ම', 
            'යන', 'ක', 'වා', 'ලා', 'ල', 'න', 'හි'
        ]
    
    def _preprocess_dictionary(self) -> List[str]:
      
        # Assuming 'word' column contains correct words and 'label' column indicates correctness
        correct_words = self.data[self.data['label'] == 1]['word']
        
        # Remove duplicates, convert to lowercase, remove special characters
        processed_words = set(
            re.sub(r'[^\u0D80-\u0DFF]', '', word.lower()) 
            for word in correct_words
        )
        
        return list(processed_words)
    
    def _create_advanced_phonetic_mapping(self) -> Dict[str, str]:
       
        return {
            # Consonant groups with similar sounds
            'ක': 'k', 'ඛ': 'k', 'ගෑ': 'g', 'ඝ': 'g',
            'ච': 'c', 'ජ': 'j', 'ඣ': 'j',
            'ට': 't', 'ඩ': 'd', 'ඨ': 't', 'ඪ': 'd',
            'ත': 't', 'ද': 'd', 'ධ': 'd',
            'ප': 'p', 'බ': 'b', 'භ': 'b',
            'ම': 'm', 'න': 'n', 'ණ': 'n',
            'ල': 'l', 'ළ': 'l',
            'ර': 'r', 'ඍ': 'r',
            'ව': 'v', 'ශ': 's', 'ෂ': 's', 'ස': 's', 
            'හ': 'h'
        }
    
    def _advanced_stemmer(self, word: str) -> str:
      
        original_word = word
        for suffix in self.suffix_rules:
            if word.endswith(suffix):
                word = word[:-len(suffix)]
                break
        
        # If no suffix removed and word is too short, return original
        return word if len(word) > 2 else original_word
    
    def _phonetic_key(self, word: str) -> str:
     
        phonetic_key = ''
        for char in word:
            phonetic_key += self.phonetic_mapping.get(char, char)
        return phonetic_key
    
    def find_corrections(self, word: str, limit: int = 5, threshold: int = 70) -> List[Tuple[str, int]]:
    
        if word in self.stopwords or word in self.dictionary:
            return [(word, 100)]
        
        # Stemming
        stemmed_word = self._advanced_stemmer(word)
        
        # Multiple similarity strategies
        candidates = []
        for dict_word in self.dictionary:
            # Phonetic similarity
            phonetic_similarity = fuzz.ratio(
                self._phonetic_key(stemmed_word), 
                self._phonetic_key(dict_word)
            )
            
            # String-based similarity
            string_similarity = fuzz.ratio(word, dict_word)
            
            # Levenshtein distance
            edit_similarity = fuzz.token_sort_ratio(word, dict_word)
            
            # Combined weighted similarity
            combined_score = (
                0.4 * phonetic_similarity + 
                0.3 * string_similarity + 
                0.3 * edit_similarity
            )
            
            candidates.append((dict_word, combined_score))
        
        # Sort and filter candidates
        candidates.sort(key=lambda x: x[1], reverse=True)
        return [
            (candidate, score) 
            for candidate, score in candidates 
            if score >= threshold
        ][:limit]
    
    def spell_check(self, text: str) -> Dict[str, List[Tuple[str, int]]]:
       
        # Preprocess text
        words = re.findall(r'\S+', text)
        
        # Spelling error detection and correction
        spelling_errors = {}
        for word in words:
            if word not in self.dictionary and word not in self.stopwords:
                corrections = self.find_corrections(word)
                if corrections:
                    spelling_errors[word] = corrections
        
        return spelling_errors
    
    def auto_correct(self, text: str) -> str:
      
        errors = self.spell_check(text)
        corrected_words = []
        
        for word in text.split():
            if word in errors:
                # Take the first (best) suggestion
                corrected_words.append(errors[word][0][0])
            else:
                corrected_words.append(word)
        
        return ' '.join(corrected_words)

# Example Usage
def main():
    # Initialize spell checker
    spell_checker = AdvancedSinhalaSpellChecker(
        dictionary_path='data-spell-checker.xlsx',
        stopwords_path='stop words.txt'
    )
    
    # Test input
    test_text = "අදරය"
    
    # Spell check
    spelling_errors = spell_checker.spell_check(test_text)
    print("Spelling Errors:", spelling_errors)
    
    # Auto-correction
    corrected_text = spell_checker.auto_correct(test_text)
    print("Corrected Text:", corrected_text)

if __name__ == "__main__":
    main()

Spelling Errors: {'අදරය': [('දරය', 86.0), ('අරය', 86.0), ('අධරය', 85.0), ('අන්දරය', 80.0), ('අනාදරය', 80.0)]}
Corrected Text: දරය


In [None]:
!pip install sinling

ERROR: Could not install packages due to an OSError: Could not find a suitable TLS CA certificate bundle, invalid path: C:\Program Files\PostgreSQL\16\ssl\certs\ca-bundle.crt



In [24]:
from sinling import SinhalaTokenizer, POSTagger

tokenizer = SinhalaTokenizer()
tagger = POSTagger()

document = "මම පොත කියවායි"
tokenized_sentences = [tokenizer.tokenize(f'{ss}.') for ss in tokenizer.split_sentences(document)]
pos_tags = tagger.predict(tokenized_sentences)
print(pos_tags)

[[('මම', 'PRP'), ('පොත', 'NNC'), ('කියවායි', 'VFM'), ('.', 'FS')]]


In [27]:
import difflib

class SinhalaSpellChecker:
    def __init__(self, dictionary_path: str, stopwords_path: str, suffixes_path: str):
        # Load dictionary
        self.data = pd.read_excel(dictionary_path)
        self.dictionary = self._preprocess_dictionary()
        
        # Load stopwords
        with open(stopwords_path, 'r', encoding='utf-8') as file:
            self.stopwords = set(file.read().splitlines())
        
        # Load suffixes from suffixes_list.txt
        with open(suffixes_path, 'r', encoding='utf-8') as file:
            self.suffix_rules = file.read().splitlines()
        
        # Advanced phonetic mapping
        self.phonetic_mapping = self._create_advanced_phonetic_mapping()
        
        # Prefix and suffix variations
        self.prefix_variations = {
            'අ': ['ආ', 'අ'],
            'අද': ['ආද', 'අද'],
            'අන': ['ආන', 'අන']
        }
    
    def _preprocess_dictionary(self) -> List[str]:
        """
        Advanced dictionary preprocessing
        """
        correct_words = self.data[self.data['label'] == 1]['word']
        
        # Remove duplicates, convert to lowercase, remove special characters
        processed_words = set(
            re.sub(r'[^\u0D80-\u0DFF]', '', word.lower()) 
            for word in correct_words
        )
        
        return list(processed_words)
    
    def _create_advanced_phonetic_mapping(self) -> Dict[str, str]:
        """
        Comprehensive phonetic mapping for Sinhala characters
        """
        return {
            # Consonant groups with similar sounds
            'ක': 'k', 'ඛ': 'k', 'ගෑ': 'g', 'ඝ': 'g',
            'ච': 'c', 'ජ': 'j', 'ඣ': 'j',
            'ට': 't', 'ඩ': 'd', 'ඨ': 't', 'ඪ': 'd',
            'ත': 't', 'ද': 'd', 'ධ': 'd',
            'ප': 'p', 'බ': 'b', 'භ': 'b',
            'ම': 'm', 'න': 'n', 'ණ': 'n',
            'ල': 'l', 'ළ': 'l',
            'ර': 'r', 'ඍ': 'r',
            'ව': 'v', 'ශ': 's', 'ෂ': 's', 'ස': 's', 
            'හ': 'h'
        }
    
    def _advanced_stemmer(self, word: str) -> str:
        """
        Advanced stemming with multiple suffix removal strategies
        """
        original_word = word
        for suffix in self.suffix_rules:
            if word.endswith(suffix):
                word = word[:-len(suffix)]
                break
        
        # If no suffix removed and word is too short, return original
        return word if len(word) > 2 else original_word
    
    def _phonetic_key(self, word: str) -> str:
        """
        Generate advanced phonetic key
        """
        phonetic_key = ''
        for char in word:
            phonetic_key += self.phonetic_mapping.get(char, char)
        return phonetic_key
    
    def _generate_prefix_variations(self, word: str) -> List[str]:
        """
        Generate potential prefix variations of a word
        """
        variations = [word]
        
        for prefix, alternates in self.prefix_variations.items():
            if word.startswith(prefix):
                for alt_prefix in alternates:
                    if prefix != alt_prefix:
                        variation = alt_prefix + word[len(prefix):]
                        variations.append(variation)
        
        return variations
    
    def find_corrections(self, word: str, limit: int = 5, threshold: int = 70) -> List[Tuple[str, int]]:
        """
        Enhanced correction finding with prefix variations and multiple similarity metrics
        """
        # Check if word is already correct
        if word in self.stopwords or word in self.dictionary:
            return [(word, 100)]
        
        # Generate prefix variations to check
        word_variations = self._generate_prefix_variations(word)
        
        # Comprehensive similarity calculation
        candidates = []
        for dict_word in self.dictionary:
            for variation in word_variations:
                # Stem both variation and dictionary word
                stemmed_variation = self._advanced_stemmer(variation)
                stemmed_dict_word = self._advanced_stemmer(dict_word)
                
                # Multiple similarity metrics
                phonetic_similarity = fuzz.ratio(
                    self._phonetic_key(stemmed_variation), 
                    self._phonetic_key(stemmed_dict_word)
                )
                
                string_similarity = fuzz.ratio(stemmed_variation, stemmed_dict_word)
                edit_similarity = fuzz.token_sort_ratio(stemmed_variation, stemmed_dict_word)
                
                # Sequence matcher for more nuanced similarity
                seq_matcher = difflib.SequenceMatcher(None, stemmed_variation, stemmed_dict_word)
                sequence_similarity = seq_matcher.ratio() * 100
                
                # Prefix similarity
                prefix_similarity = fuzz.ratio(variation[:3], dict_word[:3]) * 0.5
                
                # Combined weighted similarity
                combined_score = (
                    0.25 * phonetic_similarity + 
                    0.2 * string_similarity + 
                    0.15 * edit_similarity +
                    0.25 * sequence_similarity +
                    0.15 * prefix_similarity
                )
                
                candidates.append((dict_word, combined_score))
        
        # Sort, filter, and limit candidates
        candidates = sorted(candidates, key=lambda x: x[1], reverse=True)
        unique_candidates = []
        seen = set()
        for candidate, score in candidates:
            if candidate not in seen and score >= threshold:
                unique_candidates.append((candidate, score))
                seen.add(candidate)
                if len(unique_candidates) == limit:
                    break
        
        return unique_candidates
    
    def spell_check(self, text: str) -> Dict[str, List[Tuple[str, int]]]:
        """
        Comprehensive spell checking
        """
        words = re.findall(r'\S+', text)
        
        spelling_errors = {}
        for word in words:
            if word not in self.dictionary and word not in self.stopwords:
                corrections = self.find_corrections(word)
                if corrections:
                    spelling_errors[word] = corrections
        
        return spelling_errors
    
    def auto_correct(self, text: str) -> str:
        """
        Automatically correct text using best suggestions
        """
        errors = self.spell_check(text)
        corrected_words = []
        
        for word in text.split():
            if word in errors:
                corrected_words.append(errors[word][0][0])  # Take the first suggestion
            else:
                corrected_words.append(word)
        
        return ' '.join(corrected_words)

# Example Usage
def main():
    # Initialize spell checker with suffix list file path
    spell_checker = SinhalaSpellChecker(
        dictionary_path='data-spell-checker.xlsx',
        stopwords_path='stop words.txt',
        suffixes_path='suffixes_list.txt'
    )
    
    test_texts = [
        "මම පොත කියවායි",
        "ගෘහෂ්ථ",
        "අදරය"
    ]
    
    for text in test_texts:
        print("\n--- Spell Check for:", text)
        
        # Find spelling errors
        errors = spell_checker.spell_check(text)
        print("Spelling Errors:", errors)
        
        # Auto-correction
        corrected_text = spell_checker.auto_correct(text)
        print("Auto-corrected Text:", corrected_text)

if __name__ == "__main__":
    main()



--- Spell Check for: මම පොත කියවායි
Spelling Errors: {'කියවායි': [('කියවනවා', 84.77222222222221), ('කියවීම', 78.5), ('කියවෙනවා', 78.5), ('කියවනය', 76.85), ('කියමු', 72.9)]}
Auto-corrected Text: මම පොත කියවනවා

--- Spell Check for: ගෘහෂ්ථ
Spelling Errors: {'ගෘහෂ්ථ': [('ගෘහස්ථ', 82.38333333333333)]}
Auto-corrected Text: ගෘහස්ථ

--- Spell Check for: අදරය
Spelling Errors: {'අදරය': [('ආදරය', 92.5), ('අර', 74.0), ('ආර', 74.0), ('දර', 74.0), ('අද', 74.0)]}
Auto-corrected Text: ආදරය


In [28]:
class SinhalaSpellChecker:
    def __init__(self, dictionary_path: str, stopwords_path: str, suffixes_path: str, stem_dictionary_path: str):
        # Load dictionary
        self.data = pd.read_excel(dictionary_path)
        self.dictionary = self._preprocess_dictionary()
        
        # Load stopwords
        with open(stopwords_path, 'r', encoding='utf-8') as file:
            self.stopwords = set(file.read().splitlines())
        
        # Load suffixes from suffixes_list.txt
        with open(suffixes_path, 'r', encoding='utf-8') as file:
            self.suffix_rules = file.read().splitlines()
        
        # Load stem dictionary
        self.stem_dictionary = self._load_stem_dictionary(stem_dictionary_path)
        
        # Advanced phonetic mapping
        self.phonetic_mapping = self._create_advanced_phonetic_mapping()
        
        # Prefix and suffix variations
        self.prefix_variations = {
            'අ': ['ආ', 'අ'],
            'අද': ['ආද', 'අද'],
            'අන': ['ආන', 'අන']
        }
    
    def _load_stem_dictionary(self, stem_dictionary_path: str) -> Dict[str, str]:
        """
        Load stem dictionary from a file, where each line contains a word variation and its stem.
        """
        stem_dict = {}
        with open(stem_dictionary_path, 'r', encoding='utf-8') as file:
            for line in file:
                word, stem = line.strip().split('\t')
                stem_dict[word] = stem
        return stem_dict
    
    def _preprocess_dictionary(self) -> List[str]:
        """
        Advanced dictionary preprocessing
        """
        correct_words = self.data[self.data['label'] == 1]['word']
        
        # Remove duplicates, convert to lowercase, remove special characters
        processed_words = set(
            re.sub(r'[^\u0D80-\u0DFF]', '', word.lower()) 
            for word in correct_words
        )
        
        return list(processed_words)
    
    def _create_advanced_phonetic_mapping(self) -> Dict[str, str]:
        """
        Comprehensive phonetic mapping for Sinhala characters
        """
        return {
            # Consonant groups with similar sounds
            'ක': 'k', 'ඛ': 'k', 'ගෑ': 'g', 'ඝ': 'g',
            'ච': 'c', 'ජ': 'j', 'ඣ': 'j',
            'ට': 't', 'ඩ': 'd', 'ඨ': 't', 'ඪ': 'd',
            'ත': 't', 'ද': 'd', 'ධ': 'd',
            'ප': 'p', 'බ': 'b', 'භ': 'b',
            'ම': 'm', 'න': 'n', 'ණ': 'n',
            'ල': 'l', 'ළ': 'l',
            'ර': 'r', 'ඍ': 'r',
            'ව': 'v', 'ශ': 's', 'ෂ': 's', 'ස': 's', 
            'හ': 'h'
        }
    
    def _advanced_stemmer(self, word: str) -> str:
        """
        Advanced stemming with multiple suffix removal strategies and stem dictionary
        """
        # First, check if the word exists in the stem dictionary
        if word in self.stem_dictionary:
            return self.stem_dictionary[word]
        
        # If not, apply suffix removal rules
        original_word = word
        for suffix in self.suffix_rules:
            if word.endswith(suffix):
                word = word[:-len(suffix)]
                break
        
        # If no suffix removed and word is too short, return original
        return word if len(word) > 2 else original_word
    
    def _phonetic_key(self, word: str) -> str:
        """
        Generate advanced phonetic key
        """
        phonetic_key = ''
        for char in word:
            phonetic_key += self.phonetic_mapping.get(char, char)
        return phonetic_key
    
    def _generate_prefix_variations(self, word: str) -> List[str]:
        """
        Generate potential prefix variations of a word
        """
        variations = [word]
        
        for prefix, alternates in self.prefix_variations.items():
            if word.startswith(prefix):
                for alt_prefix in alternates:
                    if prefix != alt_prefix:
                        variation = alt_prefix + word[len(prefix):]
                        variations.append(variation)
        
        return variations
    
    def find_corrections(self, word: str, limit: int = 5, threshold: int = 70) -> List[Tuple[str, int]]:
        """
        Enhanced correction finding with prefix variations and multiple similarity metrics
        """
        # Check if word is already correct
        if word in self.stopwords or word in self.dictionary:
            return [(word, 100)]
        
        # Generate prefix variations to check
        word_variations = self._generate_prefix_variations(word)
        
        # Comprehensive similarity calculation
        candidates = []
        for dict_word in self.dictionary:
            for variation in word_variations:
                # Stem both variation and dictionary word
                stemmed_variation = self._advanced_stemmer(variation)
                stemmed_dict_word = self._advanced_stemmer(dict_word)
                
                # Multiple similarity metrics
                phonetic_similarity = fuzz.ratio(
                    self._phonetic_key(stemmed_variation), 
                    self._phonetic_key(stemmed_dict_word)
                )
                
                string_similarity = fuzz.ratio(stemmed_variation, stemmed_dict_word)
                edit_similarity = fuzz.token_sort_ratio(stemmed_variation, stemmed_dict_word)
                
                # Sequence matcher for more nuanced similarity
                seq_matcher = difflib.SequenceMatcher(None, stemmed_variation, stemmed_dict_word)
                sequence_similarity = seq_matcher.ratio() * 100
                
                # Prefix similarity
                prefix_similarity = fuzz.ratio(variation[:3], dict_word[:3]) * 0.5
                
                # Combined weighted similarity
                combined_score = (
                    0.25 * phonetic_similarity + 
                    0.2 * string_similarity + 
                    0.15 * edit_similarity +
                    0.25 * sequence_similarity +
                    0.15 * prefix_similarity
                )
                
                candidates.append((dict_word, combined_score))
        
        # Sort, filter, and limit candidates
        candidates = sorted(candidates, key=lambda x: x[1], reverse=True)
        unique_candidates = []
        seen = set()
        for candidate, score in candidates:
            if candidate not in seen and score >= threshold:
                unique_candidates.append((candidate, score))
                seen.add(candidate)
                if len(unique_candidates) == limit:
                    break
        
        return unique_candidates
    
    def spell_check(self, text: str) -> Dict[str, List[Tuple[str, int]]]:
        """
        Comprehensive spell checking
        """
        words = re.findall(r'\S+', text)
        
        spelling_errors = {}
        for word in words:
            if word not in self.dictionary and word not in self.stopwords:
                corrections = self.find_corrections(word)
                if corrections:
                    spelling_errors[word] = corrections
        
        return spelling_errors
    
    def auto_correct(self, text: str) -> str:
        """
        Automatically correct text using best suggestions
        """
        errors = self.spell_check(text)
        corrected_words = []
        
        for word in text.split():
            if word in errors:
                corrected_words.append(errors[word][0][0])  # Take the first suggestion
            else:
                corrected_words.append(word)
        
        return ' '.join(corrected_words)

# Example Usage
def main():
    # Initialize spell checker with suffix list and stem dictionary file paths
    spell_checker = SinhalaSpellChecker(
        dictionary_path='data-spell-checker.xlsx',
        stopwords_path='stop words.txt',
        suffixes_path='suffixes_list.txt',
        stem_dictionary_path='stem_dictionary.txt'
    )
    
    test_texts = [
        "මම පොත කියවායි"
    ]
    
    for text in test_texts:
        print("\n--- Spell Check for:", text)
        
        # Find spelling errors
        errors = spell_checker.spell_check(text)
        print("Spelling Errors:", errors)
        
        # Auto-correction
        corrected_text = spell_checker.auto_correct(text)
        print("Auto-corrected Text:", corrected_text)

if __name__ == "__main__":
    main()



--- Spell Check for: මම පොත කියවායි
Spelling Errors: {'කියවායි': [('කියවනවා', 84.77222222222221), ('කියවීම', 78.5), ('කියවනය', 76.85), ('කියමු', 72.9), ('කියනවා', 72.9)]}
Auto-corrected Text: මම පොත කියවනවා


In [30]:
from sinling import SinhalaTokenizer, POSTagger

class SinhalaGrammarChecker:
    def __init__(self):
        self.tokenizer = SinhalaTokenizer()
        self.tagger = POSTagger()

    def is_sov_order(self, pos_tags):
        """
        Check if the sentence follows SOV order based on POS tags.
        """
        if len(pos_tags) < 3:
            return False  # Sentence too short to be SOV
        
        subject_tag, object_tag, verb_tag = pos_tags[0][1], pos_tags[1][1], pos_tags[2][1]
        
        # SOV structure: S -> PRP, O -> NNC, V -> V* (verbs starting with 'V')
        return subject_tag == 'PRP' and object_tag == 'NNC' and verb_tag.startswith('V')

    def check_grammar(self, sentence):
        """
        Check grammar rules for a given sentence.
        """
        tokenized_sentences = [self.tokenizer.tokenize(f'{ss}.') for ss in self.tokenizer.split_sentences(sentence)]
        pos_tags = self.tagger.predict(tokenized_sentences)
        
        if not pos_tags or not pos_tags[0]:
            return "Unable to analyze the sentence."
        
        tokens = tokenized_sentences[0]
        tags = pos_tags[0]
        
        # Ensure the sentence follows SOV structure
        if not self.is_sov_order(tags):
            return "Sentence does not follow SOV order."
        
        # Extract Subject, Verb, and Object
        subject = tokens[0]
        verb = tokens[-1]
        
        # Rule 1: If S = "මම", V should end with "මි"
        if subject == "මම" and not verb.endswith("මි"):
            return f"Grammar error: Verb '{verb}' should end with 'මි' when the subject is 'මම'."
        
        # Rule 2: If S = "අපි", V should end with "මු"
        if subject == "අපි" and not verb.endswith("මු"):
            return f"Grammar error: Verb '{verb}' should end with 'මු' when the subject is 'අපි'."
        
        return "The sentence is grammatically correct."

# Example Usage
def main():
    grammar_checker = SinhalaGrammarChecker()

    sentence = "මම පොත කියවායි"
    result = grammar_checker.check_grammar(sentence)
    print(result)

if __name__ == "__main__":
    main()


Grammar error: Verb '.' should end with 'මි' when the subject is 'මම'.


In [35]:
from sinling import SinhalaTokenizer, POSTagger
from typing import List, Tuple, Optional

class SinhalaGrammarChecker:
    def __init__(self):
        """Initialize the grammar checker with required tools."""
        self.tokenizer = SinhalaTokenizer()
        self.tagger = POSTagger()
        
        # Define verb endings for different subjects
        self.subject_verb_endings = {
            "මම": "මි",    # I
            "අපි": "මු",    # We
            "ඔහු": "යි",    # He
            "ඇය": "යි",    # She
            "ඔවුන්": "ති",  # They
            "ඔබ": "හි",     # You (singular)
            "ඔබලා": "හු",   # You (plural)
        }
        
    def tokenize_and_tag(self, sentence: str) -> Tuple[List[str], List[Tuple[str, str]]]:
        """
        Tokenize and POS tag the input sentence.
        
        Args:
            sentence: Input Sinhala sentence
            
        Returns:
            Tuple of (tokens, pos_tags)
        """
        try:
            # Add period if sentence doesn't end with punctuation
            if not sentence[-1] in [".", "!", "?"]:
                sentence += "."
                
            tokenized_sentences = [self.tokenizer.tokenize(ss) 
            for ss in self.tokenizer.split_sentences(sentence)]
            if not tokenized_sentences:
                raise ValueError("Empty sentence after tokenization")
                
            pos_tags = self.tagger.predict(tokenized_sentences)
            if not pos_tags or not pos_tags[0]:
                raise ValueError("Failed to generate POS tags")
                
            return tokenized_sentences[0], pos_tags[0]
            
        except Exception as e:
            raise ValueError(f"Error in tokenization/tagging: {str(e)}")

    def is_sov_order(self, tokens: List[str], pos_tags: List[Tuple[str, str]]) -> bool:
        """
        Check if the sentence follows Subject-Object-Verb order.
        
        Args:
            tokens: List of tokenized words
            pos_tags: List of POS tags for each token
            
        Returns:
            Boolean indicating if sentence follows SOV order
        """
        if len(pos_tags) < 3:
            return False

        # Get basic components
        subject_pos = pos_tags[0][1]
        verb_pos = pos_tags[-1][1]
        
        # Check if there's an object between subject and verb
        has_object = False
        for tag in pos_tags[1:-1]:
            if tag[1] in ['NNC', 'NNP', 'PRP']:  # Common noun, proper noun, or pronoun
                has_object = True
                break
                
        return (subject_pos in ['PRP', 'NNP', 'NNC'] and  # Subject is pronoun or noun
                verb_pos.startswith('V') and  # Last word is verb
                has_object)  # Has object between S and V

    def check_subject_verb_agreement(self, subject: str, verb: str) -> Optional[str]:
        """
        Check if the verb ending agrees with the subject.
        
        Args:
            subject: Subject word
            verb: Verb word
            
        Returns:
            Correction suggestion if there's an error, None if correct
        """
        if subject in self.subject_verb_endings:
            expected_ending = self.subject_verb_endings[subject]
            if not verb.endswith(expected_ending):
                # Generate correct verb form
                verb_root = verb[:-2] if len(verb) > 2 else verb
                correct_verb = verb_root + expected_ending
                return correct_verb
        return None

    def check_grammar(self, sentence: str) -> dict:
        """
        Check grammar rules for a given sentence and return detailed analysis.
        
        Args:
            sentence: Input Sinhala sentence
            
        Returns:
            Dictionary containing analysis results and suggestions
        """
        try:
            tokens, pos_tags = self.tokenize_and_tag(sentence)
            
            result = {
                "original": sentence,
                "is_grammatical": True,
                "errors": [],
                "suggestions": [],
                "analysis": {
                    "tokens": tokens,
                    "pos_tags": pos_tags
                }
            }
            
            # Check word order
            if not self.is_sov_order(tokens, pos_tags):
                result["is_grammatical"] = False
                result["errors"].append("Sentence does not follow Subject-Object-Verb order")
                # Suggest correction by reordering
                suggested_order = self._reorder_to_sov(tokens, pos_tags)
                if suggested_order:
                    result["suggestions"].append(f"Consider: {' '.join(suggested_order)}")
            
            # Check subject-verb agreement
            subject = tokens[0]
            verb = tokens[-1]
            corrected_verb = self.check_subject_verb_agreement(subject, verb)
            
            if corrected_verb:
                result["is_grammatical"] = False
                result["errors"].append(f"Verb ending doesn't agree with subject '{subject}'")
                # Create corrected sentence
                corrected_tokens = tokens[:-1] + [corrected_verb]
                result["suggestions"].append(f"Consider: {' '.join(corrected_tokens)}")
            
            return result
            
        except ValueError as e:
            return {
                "original": sentence,
                "is_grammatical": False,
                "errors": [str(e)],
                "suggestions": [],
                "analysis": None
            }

    def _reorder_to_sov(self, tokens: List[str], pos_tags: List[Tuple[str, str]]) -> Optional[List[str]]:
        """
        Attempt to reorder tokens to follow SOV order.
        
        Args:
            tokens: List of tokens
            pos_tags: List of POS tags
            
        Returns:
            Reordered list of tokens if possible, None if not
        """
        # Find subject, object, and verb candidates
        subject_idx = None
        object_idx = None
        verb_idx = None
        
        for i, (token, tag) in enumerate(zip(tokens, pos_tags)):
            if tag[1] in ['PRP', 'NNP', 'NNC'] and subject_idx is None:
                subject_idx = i
            elif tag[1] in ['NNC', 'NNP', 'PRP'] and subject_idx is not None and object_idx is None:
                object_idx = i
            elif tag[1].startswith('V'):
                verb_idx = i
        
        if all(x is not None for x in [subject_idx, object_idx, verb_idx]):
            # Reorder maintaining other words' relative positions
            reordered = []
            # Add subject
            reordered.append(tokens[subject_idx])
            # Add object
            reordered.append(tokens[object_idx])
            # Add any intervening words
            for i, token in enumerate(tokens):
                if i not in [subject_idx, object_idx, verb_idx]:
                    reordered.append(token)
            # Add verb at end
            reordered.append(tokens[verb_idx])
            return reordered
        return None

def main():
    """Example usage of the grammar checker."""
    checker = SinhalaGrammarChecker()
    
    # Test sentences
    test_sentences = [
        "අපි පොත කියවම",
        "මම පාඩම කරයි",
        "ඔහු සිංහල ඉගෙනගනී",
    ]
    
    for sentence in test_sentences:
        print("\nAnalyzing:", sentence)
        result = checker.check_grammar(sentence)
        
        if result["is_grammatical"]:
            print("✓ Grammatically correct")
        else:
            print("✗ Grammar errors found:")
            for error in result["errors"]:
                print(f"  - {error}")
            print("\nSuggestions:")
            for suggestion in result["suggestions"]:
                print(f"  - {suggestion}")

if __name__ == "__main__":
    main()



Analyzing: අපි පොත කියවම
✗ Grammar errors found:
  - Sentence does not follow Subject-Object-Verb order
  - Verb ending doesn't agree with subject 'අපි'

Suggestions:
  - Consider: අපි පොත කියමු

Analyzing: මම පාඩම කරයි
✗ Grammar errors found:
  - Verb ending doesn't agree with subject 'මම'

Suggestions:
  - Consider: මම පාඩම කරමි

Analyzing: ඔහු සිංහල ඉගෙනගනී
✗ Grammar errors found:
  - Sentence does not follow Subject-Object-Verb order
  - Verb ending doesn't agree with subject 'ඔහු'

Suggestions:
  - Consider: ඔහු සිංහල ඉගෙනගයි
