Rule-based grammar error correction as a baseline model.

In [2]:
# nepali_gec_rule_based_extended.py

import re
from difflib import get_close_matches

# -----------------------
# 1. Load Nepali dictionary
# -----------------------
# Read the reference word list: one entry per line, surrounding whitespace
# removed, blank lines ignored. The result is the vocabulary spell_check()
# compares tokens against.
with open("../data/dictionary.txt", encoding="utf-8") as f:
    stripped_lines = (raw_line.strip() for raw_line in f)
    correct_words = {entry for entry in stripped_lines if entry}

# -----------------------
# 2. POS tag sets
# -----------------------
# POS tags that route a token to verb_rules / noun_rules / adj_rules inside
# fix_words(). A rule table is only consulted when the token's tag is in the
# matching set, so these names must agree with what the POS tagger emits.
verb_pos_tags = {"VBF", "VBX", "VBI", "VBO", "VBKO", "VBNE"}
# NOTE(review): "PLAI" and "PKO" look like postposition tags rather than noun
# tags -- confirm against the tagger's tag set.
noun_pos_tags = {"NN", "NNP", "PLAI", "PKO"}   # expand with your tagger
adj_pos_tags = {"JJ", "JJD", "JJM"}

# -----------------------
# 3. Spell-check
# -----------------------
def spell_check(word):
    """Return the dictionary spelling for *word*.

    A word already present in ``correct_words`` is returned unchanged.
    Otherwise the single closest dictionary entry (similarity cutoff 0.7)
    is returned; if nothing is close enough, *word* itself comes back.
    """
    if word in correct_words:
        return word

    nearest = get_close_matches(word, correct_words, n=1, cutoff=0.7)
    return nearest[0] if nearest else word

# -----------------------
# 4. Postposition rules
# -----------------------
# Regex pattern -> replacement text, applied in insertion order by
# fix_postpositions(). Every pattern ends in \b and has no leading anchor,
# so it can match at the end of a longer token.
# NOTE(review): only the first rule changes the text; the other three map a
# form to itself and therefore have no effect -- presumably placeholders.
# NOTE(review): \b is defined in terms of \w; the pattern ending in a vowel
# sign ("ले") may not match at a word end the way the others do -- verify
# before giving that rule a real replacement.
postposition_rules = {
    r"लई\b": "लाई",
    r"ले\b": "ले",
    r"बाट\b": "बाट",
    r"संग\b": "संग",
}


def fix_postpositions(sentence):
    """Apply every postposition rewrite rule to *sentence*.

    Each pattern in ``postposition_rules`` is substituted in turn over the
    whole string; the output of one rule is the input of the next.

    Returns the rewritten sentence.
    """
    rewritten = sentence
    for rule in postposition_rules:
        rewritten = re.compile(rule).sub(postposition_rules[rule], rewritten)
    return rewritten

# -----------------------
# 5. Verb rules
# -----------------------
# Whole-token verb rewrites: observed form -> replacement.
# fix_words() consults this table only for tokens whose POS tag is in
# verb_pos_tags, and only on an exact match of the full token.
verb_rules = {
    "जाऊँछु": "जाँछु",
    "जाऊछ": "जाँछ",
    # NOTE(review): the next three entries rewrite a "-छु" ending to "-छ"
    # for every matching token; the lookup does not look at the sentence's
    # subject -- confirm this is the intended correction.
    "खेल्छु": "खेल्छ",
    "पढ्छु": "पढ्छ",
    "खान्छु": "खान्छ",
    # NOTE(review): this target looks like an adjective rather than a verb
    # form -- confirm it belongs in the verb table.
    "रमाइ": "रमाइलो",
    # Identity mapping: leaves the token unchanged.
    "गयो": "गयो"
}

# -----------------------
# 6. Noun rules
# -----------------------
# Whole-token noun rewrites: observed form -> replacement.
# fix_words() consults this table only for tokens whose POS tag is in
# noun_pos_tags, and only on an exact match of the full token.
noun_rules = {
    # NOTE(review): this entry swaps one word for a different word rather
    # than repairing a spelling -- confirm it is wanted in a GEC baseline.
    "स्कुल": "विद्यालय",
    # The remaining entries drop a trailing "म" from the token.
    "बजारम": "बजार",
    "साथीहरूम": "साथीहरू"
}

# -----------------------
# 7. Adjective rules
# -----------------------
# Whole-token adjective rewrites: each entry splits a fused
# adjective + "छ" token into two space-separated words.
# fix_words() consults this table only for tokens whose POS tag is in
# adj_pos_tags. Because the replacement contains a space, a later
# str.split() on the joined sentence yields two tokens for it.
adj_rules = {
    "रमाइलोछ": "रमाइलो छ",
    "राम्रोछ": "राम्रो छ",
    "ठूलोछ": "ठूलो छ"
}

# -----------------------
# 8. Fix verbs, nouns, adjectives
# -----------------------
def fix_words(pos_tagged_tokens):
    """Apply the verb / noun / adjective rewrite tables to tagged tokens.

    Args:
        pos_tagged_tokens: sequence of dicts; the first key of each dict is
            taken as the word and the first value as its POS tag.

    Returns:
        The resulting words joined by single spaces. A word is replaced
        only when its tag belongs to a tag set AND the word is a key of the
        corresponding rule table; the tables are tried in the order verb,
        noun, adjective, and the first hit wins.
    """
    rule_tables = (
        (verb_pos_tags, verb_rules),
        (noun_pos_tags, noun_rules),
        (adj_pos_tags, adj_rules),
    )

    output = []
    for token in pos_tagged_tokens:
        # Only the first entry of each token dict is used.
        surface = tuple(token.keys())[0]
        tag = tuple(token.values())[0]

        replacement = surface
        for tag_set, rules in rule_tables:
            if tag in tag_set and surface in rules:
                replacement = rules[surface]
                break
        output.append(replacement)

    return " ".join(output)

# -----------------------
# 9. Example POS tagger (replace with your trained model)
# -----------------------
def my_pos_tagger(sentence):
    """Stand-in POS tagger; swap in a trained model.

    Two sentences have hand-written tags. Any other input is split on
    whitespace and every token is tagged "NN".

    Returns a list holding one tagged sentence, itself a list of
    single-entry ``{word: tag}`` dicts, e.g.
    ``[[{'ऊ': 'PP'}, {'आज': 'RBO'}, {'बजार': 'NN'}, {'गयो': 'VBF'}, {'।': 'YF'}]]``.
    """
    # Built per call so callers always receive fresh list/dict objects.
    canned = (
        (
            "ऊ आज स्कुल गयो ।",
            (("ऊ", "PP"), ("आज", "RBO"), ("स्कुल", "NN"), ("गयो", "VBF"), ("।", "YF")),
        ),
        (
            "म रमाइलोछ किताब पढ्छु",
            (("म", "PP"), ("रमाइलोछ", "JJ"), ("किताब", "NN"), ("पढ्छु", "VBI")),
        ),
    )

    for known_sentence, tagged in canned:
        if sentence == known_sentence:
            return [[{token: tag} for token, tag in tagged]]

    # Fallback: treat every whitespace-separated token as a noun.
    return [[{token: "NN"} for token in sentence.split()]]

# -----------------------
# 10. Rule-based correction pipeline
# -----------------------
def rule_based_correct(sentence):
    """Run the full rule-based correction pipeline on one sentence.

    Stages, in order:
      1. regex postposition fixes on the raw string,
      2. POS tagging (only the first tagged sentence is used),
      3. POS-gated verb / noun / adjective rewrites,
      4. per-token dictionary spell-check.

    Returns the corrected sentence with tokens joined by single spaces.
    """
    with_postpositions = fix_postpositions(sentence)
    first_sentence_tags = my_pos_tagger(with_postpositions)[0]
    rule_fixed = fix_words(first_sentence_tags)
    return " ".join(spell_check(token) for token in rule_fixed.split())

# -----------------------
# 11. Test the baseline
# -----------------------
if __name__ == "__main__":
    # Small demo: run the baseline over a few sample sentences and print
    # each input next to its corrected form.
    demo_inputs = (
        "म बजार जाऊँछु",
        "तिमी स्कूल जाऊछ",
        "ऊ रमाइलोछ किताब पढ्छु",
        "हामी घर जाऔछ",
        "म साथीहरूम भेट्छु",
        "ऊ आज विद्यालय गयो ।",
    )

    print("Extended Rule-based Nepali GEC Baseline\n")
    for original in demo_inputs:
        # Correct first, so nothing is printed for a sentence that fails.
        fixed = rule_based_correct(original)
        print(f"Original: {original}", f"Corrected: {fixed}\n", sep="\n")


Extended Rule-based Nepali GEC Baseline

Original: म बजार जाऊँछु
Corrected: म बजार जाऊँ

Original: तिमी स्कूल जाऊछ
Corrected: तिमी स्कूल जाऊ

Original: ऊ रमाइलोछ किताब पढ्छु
Corrected: ऊ रमाइलो किताब पढ्छु

Original: हामी घर जाऔछ
Corrected: हामी घर जाऔं

Original: म साथीहरूम भेट्छु
Corrected: म साथीहरू भेट्छु

Original: ऊ आज विद्यालय गयो ।
Corrected: ऊ आज विद्यालय गयो ।

