In [46]:
import json

# RuleExtractor class (already provided by you)
class RuleExtractor:
    def __init__(self, correct_sentences, incorrect_sentences):
        self.correct_sentences = correct_sentences
        self.incorrect_sentences = incorrect_sentences

    def extract_rules(self):
        rules = {
            "subject_verb_order": [],
            "pronoun_ending_match": [],
            "verb_conjugation": []  # New category
        }

        for correct, incorrect in zip(self.correct_sentences, self.incorrect_sentences):
            if correct != incorrect:
                correct_words = correct.split()
                incorrect_words = incorrect.split()

                # Subject-Verb order rule
                if sorted(correct_words) == sorted(incorrect_words):
                    rules["subject_verb_order"].append({
                        "incorrect": incorrect,
                        "correct": correct
                    })

                # Pronoun-ending match rule
                if len(correct_words) > 1 and len(incorrect_words) > 1:
                    pronoun = correct_words[0]
                    correct_ending = correct_words[-1][-2:]
                    incorrect_ending = incorrect_words[-1][-2:]
                    if pronoun in ["මම", "අපි", "ඔහු"] and correct_ending != incorrect_ending:
                        rules["pronoun_ending_match"].append({
                            "start": pronoun,
                            "end": correct_ending
                        })

                # Verb conjugation rule
                if len(correct_words) == len(incorrect_words):
                    for cw, iw in zip(correct_words, incorrect_words):
                        if cw != iw and cw.endswith("යි") and iw.endswith("ය"):
                            rules["verb_conjugation"].append({
                                "incorrect": iw,
                                "correct": cw
                            })

        # Save rules to a JSON file
        with open(".extracted_rules.json", "w", encoding="utf-8") as file:
            json.dump(rules, file, ensure_ascii=False, indent=4)

        return rules


# Load sentences from files
with open('incorrectSentences.txt', 'r', encoding='utf-8') as f:
    incorrect_sentences = f.read().splitlines()

with open('correctSentences.txt', 'r', encoding='utf-8') as f:
    correct_sentences = f.read().splitlines()

# Initialize RuleExtractor
extractor = RuleExtractor(correct_sentences, incorrect_sentences)

# Extract rules
rules = extractor.extract_rules()
print("Extracted Rules:")
print(json.dumps(rules, ensure_ascii=False, indent=4))

# Example: Apply extracted rules
def apply_rules(sentence, rules):
    # Apply subject-verb order rules
    for rule in rules["subject_verb_order"]:
        if sentence == rule["incorrect"]:
            return rule["correct"]

    # Apply pronoun-ending match rules
    words = sentence.split()
    if len(words) > 1:
        pronoun = words[0]
        if pronoun in ["මම", "අපි", "ඔහු"]:
            correct_end = next(
                (r["end"] for r in rules["pronoun_ending_match"] if r["start"] == pronoun),
                None
            )
            if correct_end and not words[-1].endswith(correct_end):
                words[-1] = words[-1][:-2] + correct_end
                return " ".join(words)

    # Apply verb conjugation rules
    for rule in rules["verb_conjugation"]:
        if rule["incorrect"] in sentence:
            return sentence.replace(rule["incorrect"], rule["correct"])

    return sentence


# Example usage
example_sentence = "අපි පොතක් කියවම ."
corrected_sentence = apply_rules(example_sentence, rules)
print("Original Sentence:", example_sentence)
print("Corrected Sentence:", corrected_sentence)




Extracted Rules:
{
    "subject_verb_order": [
        {
            "incorrect": "පාසැල් නුඹ  යන්නෙහි",
            "correct": "නුඹ පාසැල් යන්නෙහි"
        },
        {
            "incorrect": "ඔබ සැම ගුරුතුමාට  කරන්නෙහු ආචාර",
            "correct": "ඔබ සැම ගුරුතුමාට ආචාර කරන්නෙහු"
        },
        {
            "incorrect": "ගුරුතුමා  උගන්වයි පාඩම",
            "correct": "ගුරුතුමා පාඩම උගන්වයි"
        },
        {
            "incorrect": "ගුරුවරු  උගන්වති පාඩම්",
            "correct": "ගුරුවරු පාඩම් උගන්වති"
        },
        {
            "incorrect": "පාසැල් දරුවා  ගියේය",
            "correct": "දරුවා පාසැල් ගියේය"
        },
        {
            "incorrect": "දරුවෝ  ගියෝය පාසැල්",
            "correct": "දරුවෝ පාසැල් ගියෝය"
        },
        {
            "incorrect": "දරුවෝ  ගියහ පාසැල්",
            "correct": "දරුවෝ පාසැල්  ගියහ"
        },
        {
            "incorrect": "පාසැල් දරුවා  යයි ",
            "correct": "දරුවා පාසැල් යයි "
        },
        {
      

In [5]:
import json

# RuleExtractor class (already provided by you)
class RuleExtractor:
    def __init__(self, correct_sentences, incorrect_sentences):
        self.correct_sentences = correct_sentences
        self.incorrect_sentences = incorrect_sentences

    def extract_rules(self):
        rules = {
            "subject_verb_order": [],
            "pronoun_ending_match": [],
            "verb_conjugation": []  # New category
        }

        for correct, incorrect in zip(self.correct_sentences, self.incorrect_sentences):
            if correct != incorrect:
                correct_words = correct.split()
                incorrect_words = incorrect.split()

                # Subject-Verb order rule
                if sorted(correct_words) == sorted(incorrect_words):
                    rules["subject_verb_order"].append({
                        "incorrect": incorrect,
                        "correct": correct
                    })

                # Pronoun-ending match rule
                if len(correct_words) > 1 and len(incorrect_words) > 1:
                    pronoun = correct_words[0]
                    correct_ending = correct_words[-1][-2:]
                    incorrect_ending = incorrect_words[-1][-2:]
                    if pronoun in ["මම", "අපි", "ඔහු"] and correct_ending != incorrect_ending:
                        rules["pronoun_ending_match"].append({
                            "start": pronoun,
                            "end": correct_ending
                        })

                # Verb conjugation rule
                if len(correct_words) == len(incorrect_words):
                    for cw, iw in zip(correct_words, incorrect_words):
                        if cw != iw and cw.endswith("යි") and iw.endswith("ය"):
                            rules["verb_conjugation"].append({
                                "incorrect": iw,
                                "correct": cw
                            })

        # Save rules to a JSON file
        with open(".extracted_rules.json", "w", encoding="utf-8") as file:
            json.dump(rules, file, ensure_ascii=False, indent=4)

        return rules


# Load sentences from files
with open('incorrectSentences.txt', 'r', encoding='utf-8') as f:
    incorrect_sentences = f.read().splitlines()

with open('correctSentences.txt', 'r', encoding='utf-8') as f:
    correct_sentences = f.read().splitlines()

# Initialize RuleExtractor
extractor = RuleExtractor(correct_sentences, incorrect_sentences)

# Extract rules
rules = extractor.extract_rules()
print("Extracted Rules:")
print(json.dumps(rules, ensure_ascii=False, indent=4))

# Example: Apply extracted rules
def apply_rules(sentence, rules):
    # Apply subject-verb order rules
    for rule in rules["subject_verb_order"]:
        if sentence == rule["incorrect"]:
            return rule["correct"]

    # Apply pronoun-ending match rules
    words = sentence.split()
    if len(words) > 1:
        pronoun = words[0]
        if pronoun in ["මම", "අපි", "ඔහු"]:
            correct_end = next(
                (r["end"] for r in rules["pronoun_ending_match"] if r["start"] == pronoun),
                None
            )
            if correct_end and not words[-1].endswith(correct_end):
                words[-1] = words[-1][:-2] + correct_end
                return " ".join(words)

    # Apply verb conjugation rules
    for rule in rules["verb_conjugation"]:
        if rule["incorrect"] in sentence:
            return sentence.replace(rule["incorrect"], rule["correct"])

    return sentence
def calculate_accuracy(correct_sentences, incorrect_sentences, rules):
    """
    Calculate the accuracy of rule-based correction.
    Compares the corrected sentences to the correct ones.
    """
    total_sentences = len(correct_sentences)
    correct_count = 0

    for correct, incorrect in zip(correct_sentences, incorrect_sentences):
        corrected_sentence = apply_rules(incorrect, rules)
        if corrected_sentence == correct:
            correct_count += 1

    accuracy = (correct_count / total_sentences) * 100
    return accuracy


# Calculate accuracy
accuracy = calculate_accuracy(correct_sentences, incorrect_sentences, rules)
print(f"Accuracy of the rule-based correction system: {accuracy:.2f}%")

# Example usage
example_sentence = "පාසැල් දරුවා  ගියේය"
corrected_sentence = apply_rules(example_sentence, rules)
print("Original Sentence:", example_sentence)
print("Corrected Sentence:", corrected_sentence)




Extracted Rules:
{
    "subject_verb_order": [
        {
            "incorrect": "පාසැල් නුඹ  යන්නෙහි",
            "correct": "නුඹ පාසැල් යන්නෙහි"
        },
        {
            "incorrect": "ඔබ සැම ගුරුතුමාට  කරන්නෙහු ආචාර",
            "correct": "ඔබ සැම ගුරුතුමාට ආචාර කරන්නෙහු"
        },
        {
            "incorrect": "ගුරුතුමා  උගන්වයි පාඩම",
            "correct": "ගුරුතුමා පාඩම උගන්වයි"
        },
        {
            "incorrect": "ගුරුවරු  උගන්වති පාඩම්",
            "correct": "ගුරුවරු පාඩම් උගන්වති"
        },
        {
            "incorrect": "පාසැල් දරුවා  ගියේය",
            "correct": "දරුවා පාසැල් ගියේය"
        },
        {
            "incorrect": "දරුවෝ  ගියෝය පාසැල්",
            "correct": "දරුවෝ පාසැල් ගියෝය"
        },
        {
            "incorrect": "දරුවෝ  ගියහ පාසැල්",
            "correct": "දරුවෝ පාසැල්  ගියහ"
        },
        {
            "incorrect": "පාසැල් දරුවා  යයි ",
            "correct": "දරුවා පාසැල් යයි "
        },
        {
      

In [37]:
import re
import difflib

def preprocess_text(text):
    """
    Preprocess the given text by removing unwanted characters and normalizing it.
    This function keeps only Sinhala characters and spaces.
    """
    text = re.sub(r'[^\u0D80-\u0DFF\s]', '', text)
    text = text.strip().lower()
    return text

def tokenize(text):
    """
    Tokenize the text by splitting it into individual words based on spaces.
    """
    return text.split()

def load_data(sentences_path, dictionary_path):
    """
    Load sentences with their labels and the Sinhala dictionary.
    """
    sentences = []
    with open(sentences_path, 'r', encoding='utf-8') as f:
        for line in f:
            label, sentence = line.strip().split(' ', 1)
            sentences.append((int(label), sentence))

    with open(dictionary_path, 'r', encoding='utf-8') as f:
        dictionary = set(f.read().splitlines())

    return sentences, dictionary

def spell_check(sentence, dictionary):
    """
    Identify misspelled words in a sentence.
    """
    words = tokenize(preprocess_text(sentence))
    misspelled = [word for word in words if word not in dictionary]
    return misspelled

def auto_correct(sentence, dictionary):
    """
    Automatically correct misspelled words using the closest match in the dictionary.
    """
    words = tokenize(preprocess_text(sentence))
    corrected_words = []
    for word in words:
        if word in dictionary:
            corrected_words.append(word)
        else:
            closest_matches = difflib.get_close_matches(word, dictionary, n=1)
            corrected_words.append(closest_matches[0] if closest_matches else word)
    return ' '.join(corrected_words)

def basic_grammar_check(sentence):
    """
    Perform a basic grammar check on a sentence.
    - Checks for repeated words.
    - Checks for missing punctuation.
    """
    grammar_issues = []
    words = tokenize(preprocess_text(sentence))

    for i in range(len(words) - 1):
        if words[i] == words[i + 1]:
            grammar_issues.append(f"Repeated word: '{words[i]}'")

    if not re.match(r'.*[.!?]$', sentence.strip()):
        grammar_issues.append("The sentence does not end with proper punctuation (., !, ?).")

    return grammar_issues

def auto_correct_grammar(sentence):
    """
    Automatically correct basic grammar issues:
    - Removes repeated words.
    - Adds missing punctuation at the end.
    """
    words = tokenize(preprocess_text(sentence))
    corrected_words = []

    for i in range(len(words)):
        if i == 0 or words[i] != words[i - 1]:
            corrected_words.append(words[i])

    corrected_text = ' '.join(corrected_words)

    if not re.match(r'.*[.!?]$', corrected_text.strip()):
        corrected_text += '.'

    return corrected_text

def evaluate_input(paragraph, dictionary):
    """
    Evaluate the input paragraph by correcting spelling and grammar in each sentence.
    Returns the corrected paragraph and the accuracy of the corrections.
    """
    sentences = paragraph.split('.')  # Split paragraph into sentences
    corrected_sentences = []
    total_sentences = len(sentences)
    corrected_count = 0  # To track how many sentences were corrected

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # Preprocess the original sentence for comparison later
        original_clean = preprocess_text(sentence)

        # Correct spelling
        corrected_spelling = auto_correct(sentence, dictionary)

        # Correct grammar
        corrected_grammar = auto_correct_grammar(corrected_spelling)

        corrected_clean = preprocess_text(corrected_grammar)

        # Check if any correction was made
        if original_clean != corrected_clean:
            corrected_count += 1

        corrected_sentences.append(corrected_grammar.strip())

    # Combine corrected sentences into a paragraph
    corrected_paragraph = '. '.join(corrected_sentences) + '.'

    # Calculate accuracy: (number of corrected sentences / total sentences) * 100
    accuracy = (corrected_count / total_sentences) * 100 if total_sentences > 0 else 0

    return corrected_paragraph, accuracy

def main():
    # Paths to data files
    dictionary_path = "sinhala_dictionary.txt"  # Dictionary file

    # Load dictionary
    with open(dictionary_path, 'r', encoding='utf-8') as f:
        dictionary = set(f.read().splitlines())

    # Example input for testing
    input_paragraph = input("Enter a paragraph to check spelling and grammar (separate sentences with '.'): ")

    # Evaluate the input paragraph
    corrected_paragraph, accuracy = evaluate_input(input_paragraph, dictionary)

    # Output the results
    print("\nCorrected Paragraph (Spelling & Grammar Fixed):")
    print(corrected_paragraph)
    print(f"\nAccuracy of the spell and grammar checker: {accuracy:.2f}%")

if __name__ == "__main__":
    main()


Enter a paragraph to check spelling and grammar (separate sentences with '.'): මම දවස්පරා පාඩම කරම. බලලා යහන මත නිදසටී. පූසා හයයෙන් කෑගහවා. ගසෙහි ඇත  කොළ. මම රම පීසා කවා. දප්තිමත්ව හිරු දිලසෙන. අප ආහාර සකසා ගත්තෙය.ඔහු පාසල් යති.. මම ආහාර පිළියෙළ කරමු .ඔහු පාසල් යති. ඔහු පාසල් යයි .

Corrected Paragraph (Spelling & Grammar Fixed):
මම විස්තර පාඩම කරමු.. බල්ලා යහන මත නිදාසිටී.. පූසා හයියෙන් කෑගැහුවා.. ගසෙහි ඇත කොළ.. මම රෑම පීසා කෑවා.. දීප්තිමත්ව හිරු දිලිසෙනවා.. අප ආහාර සේවකයා ගත්තේය.. ඔහු පාසල් යති.. මම ආහාර පිළියෙළ කරමු.. ඔහු පාසල් යති.. ඔහු පාසල් යයි..

Accuracy of the spell and grammar checker: 53.85%


In [43]:
import re
import difflib

def preprocess_text(text):
    """
    Preprocess the given text by removing unwanted characters and normalizing it.
    This function keeps only Sinhala characters and spaces.
    """
    text = re.sub(r'[^\u0D80-\u0DFF\s]', '', text)
    text = text.strip().lower()
    return text

def tokenize(text):
    """
    Tokenize the text by splitting it into individual words based on spaces.
    """
    return text.split()

def load_data(sentences_path, dictionary_path):
    """
    Load sentences with their labels and the Sinhala dictionary.
    """
    sentences = []
    with open(sentences_path, 'r', encoding='utf-8') as f:
        for line in f:
            label, sentence = line.strip().split(' ', 1)
            sentences.append((int(label), sentence))

    with open(dictionary_path, 'r', encoding='utf-8') as f:
        dictionary = set(f.read().splitlines())

    return sentences, dictionary

def spell_check(sentence, dictionary):
    """
    Identify misspelled words in a sentence.
    """
    words = tokenize(preprocess_text(sentence))
    misspelled = [word for word in words if word not in dictionary]
    return misspelled

def auto_correct(sentence, dictionary):
    """
    Automatically correct misspelled words using the closest match in the dictionary.
    """
    words = tokenize(preprocess_text(sentence))
    corrected_words = []
    for word in words:
        if word in dictionary:
            corrected_words.append(word)
        else:
            closest_matches = difflib.get_close_matches(word, dictionary, n=1)
            corrected_words.append(closest_matches[0] if closest_matches else word)
    return ' '.join(corrected_words)

def basic_grammar_check(sentence):
    """
    Perform a basic grammar check on a sentence.
    - Checks for repeated words.
    - Checks for missing punctuation.
    """
    grammar_issues = []
    words = tokenize(preprocess_text(sentence))

    for i in range(len(words) - 1):
        if words[i] == words[i + 1]:
            grammar_issues.append(f"Repeated word: '{words[i]}'")

    if not re.match(r'.*[.!?]$', sentence.strip()):
        grammar_issues.append("The sentence does not end with proper punctuation (., !, ?).")

    return grammar_issues

def auto_correct_grammar(sentence):
    """
    Automatically correct basic grammar issues:
    - Removes repeated words.
    - Adds missing punctuation at the end.
    """
    words = tokenize(preprocess_text(sentence))
    corrected_words = []

    for i in range(len(words)):
        if i == 0 or words[i] != words[i - 1]:
            corrected_words.append(words[i])

    corrected_text = ' '.join(corrected_words)

    if not re.match(r'.*[.!?]$', corrected_text.strip()):
        corrected_text += '.'

    return corrected_text

def evaluate(sentences, dictionary):
    """
    Evaluate the spell and grammar checker on the dataset.
    Calculates the accuracy based on correcting incorrect sentences (label = 1).
    """
    correct_count = 0
    total = len(sentences)

    for label, sentence in sentences:
        if label == 0:  # Correct sentence, skip evaluation
            correct_count += 1
            continue

        # Process incorrect sentences
        corrected_spelling = auto_correct(sentence, dictionary)
        corrected_grammar = auto_correct_grammar(corrected_spelling)

        # A simple assumption: Corrected sentence should match expected correct sentences in the dataset
        if corrected_grammar.strip() == preprocess_text(sentence).strip():
            correct_count += 1

    accuracy = (correct_count / total) * 100
    return accuracy

def main():
    # Paths to data files
    sentences_path = "sentences.txt"  # Dataset file
    dictionary_path = "sinhala_dictionary.txt"  # Dictionary file

    # Load data and dictionary
    sentences, dictionary = load_data(sentences_path, dictionary_path)

    # Evaluate the model
    accuracy = evaluate(sentences, dictionary)
    print(f"Accuracy of the spell and grammar checker: {accuracy:.2f}%")

    # Example input for testing
    input_sentence = input("Enter a sentence to check spelling and grammar: ")

    # Check spelling
    misspelled_words = spell_check(input_sentence, dictionary)
    if misspelled_words:
        print("Misspelled words found:", ", ".join(misspelled_words))
    else:
        print("No spelling errors found.")

    # Auto-correct spelling
    corrected_spelling = auto_correct(input_sentence, dictionary)
    print("Corrected Sentence (Spelling Fixed):")
    print(corrected_spelling)

    # Auto-correct grammar
    corrected_grammar = auto_correct_grammar(corrected_spelling)
    print("Corrected Sentence (Grammar Fixed):")
    print(corrected_grammar)

if __name__ == "__main__":
    main()


Accuracy of the spell and grammar checker: 82.73%
Enter a sentence to check spelling and grammar: දරුවෝ  ගියහ පාසැල්
No spelling errors found.
Corrected Sentence (Spelling Fixed):
දරුවෝ ගියහ පාසැල්
Corrected Sentence (Grammar Fixed):
දරුවෝ ගියහ පාසැල්.


In [18]:
import re
import difflib

def preprocess_text(text):
    """
    Preprocess the given text by removing unwanted characters and normalizing it.
    This function keeps only Sinhala characters and spaces.
    """
    text = re.sub(r'[^\u0D80-\u0DFF\s]', '', text)
    text = text.strip().lower()
    return text

def tokenize(text):
    """
    Tokenize the text by splitting it into individual words based on spaces.
    """
    return text.split()

def load_data(sentences_path, dictionary_path):
    """
    Load sentences with their labels and the Sinhala dictionary.
    """
    sentences = []
    with open(sentences_path, 'r', encoding='utf-8') as f:
        for line in f:
            label, sentence = line.strip().split(' ', 1)
            sentences.append((int(label), sentence))

    with open(dictionary_path, 'r', encoding='utf-8') as f:
        dictionary = set(f.read().splitlines())

    return sentences, dictionary

def spell_check(sentence, dictionary):
    """
    Identify misspelled words in a sentence.
    """
    words = tokenize(preprocess_text(sentence))
    misspelled = [word for word in words if word not in dictionary]
    return misspelled

def auto_correct(sentence, dictionary):
    """
    Automatically correct misspelled words using the closest match in the dictionary.
    """
    words = tokenize(preprocess_text(sentence))
    corrected_words = []
    for word in words:
        if word in dictionary:
            corrected_words.append(word)
        else:
            closest_matches = difflib.get_close_matches(word, dictionary, n=1)
            corrected_words.append(closest_matches[0] if closest_matches else word)
    return ' '.join(corrected_words)

def basic_grammar_check(sentence):
    """
    Perform a basic grammar check on a sentence.
    - Checks for repeated words.
    - Checks for missing punctuation.
    """
    grammar_issues = []
    words = tokenize(preprocess_text(sentence))

    for i in range(len(words) - 1):
        if words[i] == words[i + 1]:
            grammar_issues.append(f"Repeated word: '{words[i]}'")

    if not re.match(r'.*[.!?]$', sentence.strip()):
        grammar_issues.append("The sentence does not end with proper punctuation (., !, ?).")

    return grammar_issues

def auto_correct_grammar(sentence):
    """
    Automatically correct basic grammar issues:
    - Removes repeated words.
    - Adds missing punctuation at the end.
    """
    words = tokenize(preprocess_text(sentence))
    corrected_words = []

    for i in range(len(words)):
        if i == 0 or words[i] != words[i - 1]:
            corrected_words.append(words[i])

    corrected_text = ' '.join(corrected_words)

    if not re.match(r'.*[.!?]$', corrected_text.strip()):
        corrected_text += '.'

    return corrected_text

def process_paragraph(paragraph, dictionary):
    """
    Processes a paragraph by correcting spelling and grammar for each sentence.
    """
    sentences = paragraph.split('.')
    corrected_sentences = []

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # Correct spelling
        corrected_spelling = auto_correct(sentence, dictionary)

        # Correct grammar
        corrected_grammar = auto_correct_grammar(corrected_spelling)

        corrected_sentences.append(corrected_grammar.strip())

    # Combine sentences back into a paragraph
    corrected_paragraph = '. '.join(corrected_sentences) + '.'
    return corrected_paragraph

def evaluate(sentences, dictionary):
    """
    Evaluate the accuracy of the spell and grammar checker.
    """
    correct_count = 0
    total = len(sentences)

    for label, sentence in sentences:
        # Correct the sentence
        corrected_paragraph = process_paragraph(sentence, dictionary)

        # Preprocess for comparison
        corrected_clean = preprocess_text(corrected_paragraph)
        original_clean = preprocess_text(sentence)

        # Check if correction is successful
        if label == 0 and corrected_clean == original_clean:  # Already correct
            correct_count += 1
        elif label == 1 and corrected_clean != original_clean:  # Corrected incorrect sentence
            correct_count += 1

    accuracy = (correct_count / total) * 100
    return accuracy

def main():
    # Paths to data files
    sentences_path = "sentences.txt"  # Labeled dataset file
    dictionary_path = "sinhala_dictionary.txt"  # Dictionary file

    # Load data and dictionary
    sentences, dictionary = load_data(sentences_path, dictionary_path)

    # Evaluate accuracy
    accuracy = evaluate(sentences, dictionary)
    print(f"Accuracy of the spell and grammar checker: {accuracy:.2f}%")

    # Example input for testing
    input_paragraph = input("Enter a paragraph to check spelling and grammar: ")

    # Process paragraph
    corrected_paragraph = process_paragraph(input_paragraph, dictionary)

    print("\nCorrected Paragraph:")
    print(corrected_paragraph)

if __name__ == "__main__":
    main()


Accuracy of the spell and grammar checker: 16.47%
Enter a paragraph to check spelling and grammar: දරුවෝ පාසැල් ගියෝ. උසිකාව බණ අසත. උපසකයා බණ ඇසුවේය. ගුරුවරයා සිසුන් කැඳවයි. මම අලුයම පිබිදෙය. ගස මල්වලින් පිර තිබේ.

Corrected Paragraph:
දරුවෝ පාසැල් ගියෝය.. උපාසිකාවෝ බණ අසති.. උපාසකයා බණ ඇසුවේය.. ගුරුවරයා සිසුන් කැඳවයි.. මම අලුයම පිබිදෙයි.. ගස මල්වලින් පිරී තිබේ..


In [14]:
import re
import difflib
import json


class RuleExtractor:
    def __init__(self, correct_sentences, incorrect_sentences):
        self.correct_sentences = correct_sentences
        self.incorrect_sentences = incorrect_sentences

    def extract_rules(self):
        rules = {
            "subject_verb_order": [],
            "pronoun_ending_match": [],
            "verb_conjugation": []  # New category
        }

        for correct, incorrect in zip(self.correct_sentences, self.incorrect_sentences):
            if correct != incorrect:
                correct_words = correct.split()
                incorrect_words = incorrect.split()

                # Subject-Verb order rule
                if sorted(correct_words) == sorted(incorrect_words):
                    rules["subject_verb_order"].append({
                        "incorrect": incorrect,
                        "correct": correct
                    })

                # Pronoun-ending match rule
                if len(correct_words) > 1 and len(incorrect_words) > 1:
                    pronoun = correct_words[0]
                    correct_ending = correct_words[-1][-2:]
                    incorrect_ending = incorrect_words[-1][-2:]
                    if pronoun in ["මම", "අපි", "ඔහු"] and correct_ending != incorrect_ending:
                        rules["pronoun_ending_match"].append({
                            "start": pronoun,
                            "end": correct_ending
                        })

                # Verb conjugation rule
                if len(correct_words) == len(incorrect_words):
                    for cw, iw in zip(correct_words, incorrect_words):
                        if cw != iw and cw.endswith("යි") and iw.endswith("ය"):
                            rules["verb_conjugation"].append({
                                "incorrect": iw,
                                "correct": cw
                            })

        # Save rules to a JSON file
        with open(".extracted_rules.json", "w", encoding="utf-8") as file:
            json.dump(rules, file, ensure_ascii=False, indent=4)

        return rules


def preprocess_text(text):
    """
    Preprocess the given text by removing unwanted characters and normalizing it.
    """
    text = re.sub(r'[^\u0D80-\u0DFF\s]', '', text)
    text = text.strip().lower()
    return text


def tokenize(text):
    """
    Tokenize the text by splitting it into individual words based on spaces.
    """
    return text.split()


def spell_check(sentence, dictionary):
    """
    Identify misspelled words in a sentence.
    """
    words = tokenize(preprocess_text(sentence))
    misspelled = [word for word in words if word not in dictionary]
    return misspelled


def auto_correct(sentence, dictionary):
    """
    Automatically correct misspelled words using the closest match in the dictionary.
    """
    words = tokenize(preprocess_text(sentence))
    corrected_words = []
    for word in words:
        if word in dictionary:
            corrected_words.append(word)
        else:
            closest_matches = difflib.get_close_matches(word, dictionary, n=1)
            corrected_words.append(closest_matches[0] if closest_matches else word)
    return ' '.join(corrected_words)


def apply_rules(sentence, rules, dictionary):
    """
    Apply spelling corrections first, followed by rule-based grammar corrections.
    """
    # Step 1: Correct spelling
    corrected_sentence = auto_correct(sentence, dictionary)

    # Step 2: Apply subject-verb order rules
    for rule in rules["subject_verb_order"]:
        if corrected_sentence == rule["incorrect"]:
            corrected_sentence = rule["correct"]

    # Step 3: Apply pronoun-ending match rules
    words = corrected_sentence.split()
    if len(words) > 1:
        pronoun = words[0]
        if pronoun in ["මම", "අපි", "ඔහු"]:
            correct_end = next(
                (r["end"] for r in rules["pronoun_ending_match"] if r["start"] == pronoun),
                None
            )
            if correct_end and not words[-1].endswith(correct_end):
                words[-1] = words[-1][:-2] + correct_end
                corrected_sentence = " ".join(words)

    # Step 4: Apply verb conjugation rules
    for rule in rules["verb_conjugation"]:
        if rule["incorrect"] in corrected_sentence:
            corrected_sentence = corrected_sentence.replace(rule["incorrect"], rule["correct"])

    return corrected_sentence


def calculate_accuracy(correct_sentences, incorrect_sentences, rules, dictionary):
    """
    Calculate the accuracy of rule-based correction.
    Compares the corrected sentences to the correct ones.
    """
    total_sentences = len(correct_sentences)
    correct_count = 0

    for correct, incorrect in zip(correct_sentences, incorrect_sentences):
        corrected_sentence = apply_rules(incorrect, rules, dictionary)
        if corrected_sentence == correct:
            correct_count += 1

    accuracy = (correct_count / total_sentences) * 100
    return accuracy


# Load sentences and dictionary from files
with open('incorrectSentences.txt', 'r', encoding='utf-8') as f:
    incorrect_sentences = f.read().splitlines()

with open('correctSentences.txt', 'r', encoding='utf-8') as f:
    correct_sentences = f.read().splitlines()

with open('sinhala_dictionary.txt', 'r', encoding='utf-8') as f:
    dictionary = set(f.read().splitlines())

# Initialize RuleExtractor
extractor = RuleExtractor(correct_sentences, incorrect_sentences)

# Extract rules
rules = extractor.extract_rules()
print("Extracted Rules:")
print(json.dumps(rules, ensure_ascii=False, indent=4))

# Calculate accuracy
accuracy = calculate_accuracy(correct_sentences, incorrect_sentences, rules, dictionary)
print(f"Accuracy of the rule-based correction system: {accuracy:.2f}%")

# Example usage
example_sentence = "  පාසැල දරුව ගියේය"
corrected_sentence = apply_rules(example_sentence, rules, dictionary)
print("Original Sentence:", example_sentence)
print("Corrected Sentence:", corrected_sentence)


Extracted Rules:
{
    "subject_verb_order": [
        {
            "incorrect": "පාසැල් නුඹ  යන්නෙහි",
            "correct": "නුඹ පාසැල් යන්නෙහි"
        },
        {
            "incorrect": "ඔබ සැම ගුරුතුමාට  කරන්නෙහු ආචාර",
            "correct": "ඔබ සැම ගුරුතුමාට ආචාර කරන්නෙහු"
        },
        {
            "incorrect": "ගුරුතුමා  උගන්වයි පාඩම",
            "correct": "ගුරුතුමා පාඩම උගන්වයි"
        },
        {
            "incorrect": "ගුරුවරු  උගන්වති පාඩම්",
            "correct": "ගුරුවරු පාඩම් උගන්වති"
        },
        {
            "incorrect": "පාසැල් දරුවා  ගියේය",
            "correct": "දරුවා පාසැල් ගියේය"
        },
        {
            "incorrect": "දරුවෝ  ගියෝය පාසැල්",
            "correct": "දරුවෝ පාසැල් ගියෝය"
        },
        {
            "incorrect": "දරුවෝ  ගියහ පාසැල්",
            "correct": "දරුවෝ පාසැල්  ගියහ"
        },
        {
            "incorrect": "පාසැල් දරුවා  යයි ",
            "correct": "දරුවා පාසැල් යයි "
        },
        {
      