1.Rule-Based Approach with AI

In [8]:
import chardet

with open('/content/test_dataset.txt', 'rb') as file:
    raw_data = file.read()
    result = chardet.detect(raw_data)
    print(result)


{'encoding': 'UTF-16', 'confidence': 1.0, 'language': ''}


In [3]:
# Load test data with UTF-16 encoding
test_data_path = '/content/test_dataset.txt'

with open(test_data_path, 'r', encoding='utf-16') as file:
    sentences = [line.strip().split(" ", 1) for line in file.readlines()]

# Process the sentences and load into DataFrame
import pandas as pd

df = pd.DataFrame(sentences, columns=["label", "sentence"])
df["label"] = df["label"].astype(int)

# Check the first few rows of the dataframe to ensure it's loaded correctly
df.head()


Unnamed: 0,label,sentence
0,0,මම යති
1,0,මම යත්වා
2,0,මම යනවා
3,0,මම යනවාලා
4,0,මම යනු


2. Define Grammar Rules
We'll define rules for subject-verb agreement (singular/plural) and tense consistency.

Rule 1: Subject-Verb Agreement
In Sinhala, the verb form must agree with the subject in number (singular or plural).

For example:

Singular: "මම බත් කනවා" -> "මම බත් කමි" (Singular subject, verb must end in "මි")
Plural: "අපි බත් කනවා" -> "අපි බත් කමු" (Plural subject, verb must end in "මු")

Rule 2: Tense Consistency
Ensure the correct usage of present, past, and future tenses in the sentences.

3. Error Detection Using Rules


In [4]:
import re

# Rule 1: Subject-Verb Agreement (Singular/Plural)
def check_subject_verb_agreement(sentence):
    singular_subjects = ["මම", "ඔයා", "ඇය", "ඔහු"]
    plural_subjects = ["අපි", "ඔවුන්", "අපෙ"]

    if any(subject in sentence for subject in singular_subjects):
        if "කනවා" in sentence:
            return "Correct"  # Singular subject with correct verb form
        else:
            return "Incorrect"  # Singular subject with incorrect verb form

    if any(subject in sentence for subject in plural_subjects):
        if "කමු" in sentence:
            return "Correct"  # Plural subject with correct verb form
        else:
            return "Incorrect"  # Plural subject with incorrect verb form

    return "Unknown"  # No subject detected

# Rule 2: Tense Consistency
def check_tense(sentence):
    present_tense_keywords = r"\b(කනවා|ඉන්නවා|නේ|ඇතුලත් කරගනවා|ඇතුල් වෙන්න)\b"
    past_tense_keywords = r"\b(කියා|ගියා|පු|බලා|ඉවත්|ලියා)\b"
    future_tense_keywords = r"\b(කරනවා|යන්න|යුතු|නැතිනම්)\b"

    if re.search(present_tense_keywords, sentence):
        return "Present Tense"
    elif re.search(past_tense_keywords, sentence):
        return "Past Tense"
    elif re.search(future_tense_keywords, sentence):
        return "Future Tense"
    else:
        return "Unknown Tense"

# Test error detection with a sample sentence
test_sentence = "මම යන්නෙමු"
print("Subject-Verb Agreement:", check_subject_verb_agreement(test_sentence))
print("Tense Consistency:", check_tense(test_sentence))


Subject-Verb Agreement: Incorrect
Tense Consistency: Future Tense


Updated Grammar Checker with Suggestion

In [5]:
import re

# Rule 1: Subject-Verb Agreement
def check_subject_verb_agreement(sentence):
    singular_subjects = ["මම", "ඔයා", "ඇය", "ඔහු"]
    plural_subjects = ["අපි", "ඔවුන්", "ඔබලා", "අපෙ"]

    singular_verb_endings = ["නවා", "නී", "ඉන්නවා"]
    plural_verb_endings = ["මු", "නවා", "නී", "ඉන්නවා"]

    # Check if the subject is singular or plural
    subject = sentence.split()[0]  # The first word is the subject

    if subject in singular_subjects:
        # Check if the verb is singular
        if re.search(r"කනවා|ඉන්නවා", sentence):
            correct_verb = sentence.replace("කනවා", "කමි")  # Example: "කනවා" -> "කමි"
            return "Incorrect", correct_verb
    elif subject in plural_subjects:
        # Check if the verb is plural
        if re.search(r"කනවා", sentence):
            correct_verb = sentence.replace("කනවා", "කමු")  # Example: "කනවා" -> "කමු"
            return "Incorrect", correct_verb

    return "Correct", sentence

# Rule 2: Tense Consistency
def check_tense(sentence):
    present_tense_keywords = r"\b(කනවා|ඉන්නවා|නේ|ඇතුලත් කරගනවා|ඇතුල් වෙන්න)\b"
    past_tense_keywords = r"\b(කියා|ගියා|පු|බලා|ඉවත්|ලියා)\b"
    future_tense_keywords = r"\b(කරනවා|යන්න|යුතු|නැතිනම්)\b"

    if re.search(present_tense_keywords, sentence):
        return "Present Tense"
    elif re.search(past_tense_keywords, sentence):
        return "Past Tense"
    elif re.search(future_tense_keywords, sentence):
        return "Future Tense"
    else:
        return "Unknown Tense"

# Grammar checking function
def grammar_checker(sentence, df):
    print(f"Input sentence - : {sentence}")

    # Check the sentence using the rules
    subject_verb_status, corrected_sentence = check_subject_verb_agreement(sentence)
    tense_status = check_tense(sentence)

    # Check if the sentence is correct
    if subject_verb_status == "Correct" and "Tense" in tense_status:
        print("Sentence is Correct")
    else:
        print("Sentence is Incorrect")
        # Suggest correction based on the rule-based correction
        print(f"Correct sentence - : {corrected_sentence}")

# Example usage
test_sentence1 = "මම බත් කනවා"  # Example of a correct sentence
grammar_checker(test_sentence1, df)  # This should print "Sentence is Correct"

test_sentence2 = "අපි බත් කනවා"  # Example of an incorrect sentence
grammar_checker(test_sentence2, df)  # This should print a correction suggestion

test_sentence3 = " මම ගෙදර යමි"
grammar_checker(test_sentence3, df)

test_sentence4 = " නුබ ඔහුට පොතක් දෙන්නෙහි"
grammar_checker(test_sentence4, df)


test_sentence5 = "  ළමයා ඔහුට පොතක් දුන්නේය"
grammar_checker(test_sentence5, df)

Input sentence - : මම බත් කනවා
Sentence is Incorrect
Correct sentence - : මම බත් කමි
Input sentence - : අපි බත් කනවා
Sentence is Incorrect
Correct sentence - : අපි බත් කමු
Input sentence - :  මම ගෙදර යමි
Sentence is Correct
Input sentence - :  නුබ ඔහුට පොතක් දෙන්නෙහි
Sentence is Correct
Input sentence - :   ළමයා ඔහුට පොතක් දුන්නේය
Sentence is Correct


 Model Accuracy


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Feature extraction
def extract_features(sentence):
    features = []

    # Subject-Verb Agreement
    subject_verb_status = check_subject_verb_agreement(sentence)
    features.append(1 if subject_verb_status == "Correct" else 0)

    # Tense Consistency
    tense_status = check_tense(sentence)
    features.append(1 if "Tense" in tense_status else 0)

    return features

# Prepare feature set
X = [extract_features(sentence) for sentence in df["sentence"]]
y = df["label"]

# Split the data for training/testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 86.29%




---

