In [None]:
# Correctly spelled Sinhala words. Every entry maps to itself, so the
# dictionary is built from a plain word list; insertion order is kept because
# nearest-match lookups break ties in favour of the earliest entry.
standard_dictionary = {
    word: word
    for word in (
        "මම",
        "ඔබ",
        "අරලිය",
        "ගඟ",
        "ගස්",
        "මල්",
        "මලක්",
        "අපි",
        "අද",
        "ආදරය",
        "සුන්දර",
        "විදුහල",
        "දින",
        "ගුරුවරුන්",
        "පෙරදිග",
        "විදුහලට",
        "මට",
        "යනවා",
        "කරති",
    )
}

# Regional variants and common typing mistakes, mapped to the form that
# should be emitted in their place.
regional_variations = {
    "ගුරැවරුන්": "ගුරුවරුන්",
    "ගඟට": "ගඟට",
    "අපේ": "අපේ",
    "මලක්ට": "මලක්",
    "අපිටා": "අපිට",
    "විදුහලටා": "විදුහලට",
}


In [None]:
from indicnlp.tokenize import indic_tokenize

def tokenize_sinhala_text(paragraph):
    """Split ``paragraph`` into tokens.

    Delegates to ``indic_tokenize.trivial_tokenize`` with the Sinhala
    language code (``"si"``) and returns its result unchanged.
    """
    tokens = indic_tokenize.trivial_tokenize(paragraph, lang="si")
    return tokens


In [None]:
import Levenshtein

def correct_spelling(word):
    """Return the best-known spelling for a single token.

    Lookup order:

    1. A key of ``standard_dictionary`` is already correct and is returned
       unchanged.
    2. A key of ``regional_variations`` is replaced by its mapped form.
    3. A token with no alphabetic character (punctuation, digits, empty
       string) is returned unchanged and is not reported as unknown.
    4. Anything else is printed as an unknown word, then replaced by the
       nearest ``standard_dictionary`` key when the Levenshtein distance is
       within the threshold (1 for tokens of up to 4 code points, otherwise
       2). Ties go to the earliest dictionary entry.

    Args:
        word: The token to check.

    Returns:
        The corrected token, or ``word`` itself when no correction applies.
    """
    if word in standard_dictionary:
        return word  # Already correct
    if word in regional_variations:
        return regional_variations[word]  # Regional variation

    # Punctuation and numbers are not spelling mistakes; logging them only
    # adds noise such as "Unknown Word: .".
    if not any(ch.isalpha() for ch in word):
        return word

    # Log unknown words
    print(f"Unknown Word: {word}")

    # min() raises ValueError on an empty iterable, so bail out early.
    if not standard_dictionary:
        return word

    # len() counts code points, so Sinhala vowel signs count towards length.
    threshold = 1 if len(word) <= 4 else 2

    # Score every candidate once and keep the score with the winner instead
    # of recomputing the distance afterwards.
    best_distance, closest_match = min(
        (
            (Levenshtein.distance(word, candidate), candidate)
            for candidate in standard_dictionary
        ),
        key=lambda scored: scored[0],
    )
    if best_distance <= threshold:
        return closest_match

    # Return original word if no match found
    return word


In [None]:
def advanced_grammar_checker(paragraph):
    """Run three heuristic grammar rules and collect their messages.

    The paragraph is split on whitespace and checked, in this order, for:

    * adjacent identical words (reported with the index of the first one),
    * the raw paragraph ending in the letter 'අ',
    * a paragraph of more than two words that starts with one of the
      pronouns "මම", "ඔබ", "අපි" and whose second word is neither "යන්න"
      nor "යමු".

    Args:
        paragraph: Text to check.

    Returns:
        A list of message strings; empty when no rule fires.
    """
    subject_pronouns = ("මම", "ඔබ", "අපි")
    accepted_second_words = ("යන්න", "යමු")
    tokens = paragraph.split()

    # Rule 1: pair each word with its successor and flag identical pairs.
    findings = [
        f"Repeated word: '{current}' at position {index}"
        for index, (current, following) in enumerate(zip(tokens, tokens[1:]))
        if current == following
    ]

    # Rule 2: checked on the raw paragraph, not on the split words.
    if paragraph.endswith("අ"):
        findings.append("Suggestion: Sentence should not end with 'අ'.")

    # Rule 3: only applies when there are at least three words.
    if len(tokens) > 2:
        subject, second = tokens[0], tokens[1]
        if subject in subject_pronouns and second not in accepted_second_words:
            findings.append(
                f"Check sentence structure: Subject '{subject}' might need verb alignment."
            )

    return findings


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Load IndicBERT model and tokenizer.
# keep_accents=True is required for Indic scripts: with the default
# (False) the ALBERT-style tokenizer strips combining marks while
# normalising, which removes Sinhala vowel signs from every word.
# NOTE(review): the IndicBERT model card does not appear to list Sinhala
# among its training languages - confirm this checkpoint is suitable.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert", keep_accents=True)
model = AutoModelForMaskedLM.from_pretrained("ai4bharat/indic-bert")

def contextual_correction(sentence):
    """Replace out-of-dictionary words with the masked-LM's top prediction.

    The sentence is split on whitespace. Every word that contains at least
    one alphabetic character and is not a key of ``standard_dictionary`` is
    replaced by the tokenizer's mask token, the masked sentence is run
    through the model once, and each masked word is replaced by the
    highest-scoring vocabulary piece at its own mask position. All other
    words are copied from the input verbatim, so known words are never
    altered by tokenizer normalisation.

    Args:
        sentence: Whitespace-separated text to correct.

    Returns:
        The words joined by single spaces, with unknown words replaced
        where the model produced a usable prediction.
    """
    words = sentence.split()

    # Compare whole words with the dictionary: its keys are words, whereas
    # the tokenizer's sub-word pieces would almost never match a key.
    # Punctuation-only tokens are left alone rather than "corrected".
    unknown_positions = [
        position
        for position, word in enumerate(words)
        if word not in standard_dictionary and any(ch.isalpha() for ch in word)
    ]
    if not unknown_positions:
        # Nothing to predict, so skip the model call entirely.
        return " ".join(words)

    to_mask = set(unknown_positions)
    masked_sentence = " ".join(
        tokenizer.mask_token if position in to_mask else word
        for position, word in enumerate(words)
    )
    inputs = tokenizer(masked_sentence, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    # Collect every mask position; list.index() would only find the first.
    mask_token_positions = [
        token_position
        for token_position, token_id in enumerate(inputs["input_ids"][0].tolist())
        if token_id == tokenizer.mask_token_id
    ]

    # If the masks cannot be paired one-to-one with the unknown words
    # (e.g. the input itself contained the mask string), leave the text
    # untouched instead of filling the wrong slots.
    if len(mask_token_positions) != len(unknown_positions):
        return " ".join(words)

    for word_position, token_position in zip(unknown_positions, mask_token_positions):
        predicted_token_id = torch.argmax(logits[0, token_position]).item()
        prediction = tokenizer.decode(
            [predicted_token_id], skip_special_tokens=True
        ).strip()
        # Keep the original word when the model predicts nothing usable.
        if prediction:
            words[word_position] = prediction

    return " ".join(words)


In [None]:
def correction_pipeline(paragraph):
    """Run tokenisation, spelling, grammar and contextual steps in sequence.

    The paragraph is tokenised with ``tokenize_sinhala_text``, each token is
    passed through ``correct_spelling``, and the results are joined with
    single spaces. That spell-checked text is given first to
    ``advanced_grammar_checker`` and then to ``contextual_correction``.

    Args:
        paragraph: Raw input text.

    Returns:
        A ``(final_paragraph, grammar_suggestions)`` tuple: the output of
        the contextual step and the list of grammar messages produced from
        the spell-checked text.
    """
    # Tokenise and spell-check token by token, then rebuild the paragraph.
    spell_checked = ' '.join(
        correct_spelling(token) for token in tokenize_sinhala_text(paragraph)
    )

    # Grammar rules run on the spell-checked text, before contextual rewriting.
    suggestions = advanced_grammar_checker(spell_checked)

    return contextual_correction(spell_checked), suggestions


In [None]:
# Sample input: the second sentence contains a known typing variant.
input_paragraph = "අපි විදුහලට යනවා. ගුරැවරුන් මට ආදරය කරති."

# Run the full correction pipeline on the sample.
corrected_paragraph, grammar_suggestions = correction_pipeline(input_paragraph)

# Report the original text, the corrected text and any grammar messages.
print("Original Paragraph:", input_paragraph, sep="\n")
print("\nCorrected Paragraph:", corrected_paragraph, sep="\n")
print("\nGrammar Suggestions:")
for suggestion in grammar_suggestions:
    print(f"- {suggestion}")


Unknown Word: .
Unknown Word: .
Original Paragraph:
අපි විදුහලට යනවා. ගුරැවරුන් මට ආදරය කරති.

Corrected Paragraph:
අප වදහලට යනව . ගරවරන මට ආදරය කරත .

Grammar Suggestions:
- Check sentence structure: Subject 'අපි' might need verb alignment.
