## Mount Google Drive

In [76]:
# Colab-only step: attach the user's Google Drive so the dataset file
# referenced later in the notebook is reachable under /content/drive.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Define the dataset path

In [77]:
# Location of the dataset on the mounted Drive.
# Alternative CSV dataset in the same folder (currently unused):
# dataset_path = '/content/drive/MyDrive/EC9640 - AI Project/tamil_grammar_dataset_200.csv'
# Excel workbook that is actually loaded with pd.read_excel further down.
excel_file_path = '/content/drive/MyDrive/EC9640 - AI Project/grammar_errors_dataset.xlsx'

In [78]:
##############################################################################
# Import necessary libraries
##############################################################################

import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [79]:
##############################################################################
# 1) LOAD THE DATA FROM EXCEL
##############################################################################

# Read the workbook, drop rows that repeat the same (erroneous, correct)
# sentence pair, then shuffle with a fixed seed so every run sees the same
# row order. The index is rebuilt so it runs 0..n-1 after shuffling.
df = (
    pd.read_excel(excel_file_path)
    .drop_duplicates(subset=["grammatical_error_sentence", "correct_sentence"])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print("Dataset size:", len(df))

Dataset size: 75


In [80]:
##############################################################################
# 2) BUILD TRAINING SAMPLES USING SUBJECT & VERB
##############################################################################
# Each sentence is split on whitespace. The first token of the erroneous
# sentence is taken as the subject and its last token as the (possibly wrong)
# verb; the last token of the correct sentence is the target verb.

err_inputs = []   # X: "<subject>_<verb>" strings built from erroneous sentences
cor_labels = []   # y: final token of the matching correct sentence

sentence_pairs = zip(df["grammatical_error_sentence"], df["correct_sentence"])
for raw_err, raw_cor in sentence_pairs:
    err_tokens = str(raw_err).strip().split()
    cor_tokens = str(raw_cor).strip().split()

    # A pair is usable only when both sentences have at least two tokens,
    # otherwise subject and verb would be the same token (or missing).
    if min(len(err_tokens), len(cor_tokens)) >= 2:
        err_inputs.append(f"{err_tokens[0]}_{err_tokens[-1]}")
        cor_labels.append(cor_tokens[-1])

print("Total (subject, verb) pairs:", len(err_inputs))

Total (subject, verb) pairs: 75


In [81]:
##############################################################################
# 3) GROUP RARE CORRECT VERBS AS "OTHER" (to avoid stratify errors)
##############################################################################

# Labels seen fewer than MIN_FREQ times are merged into one "OTHER" bucket so
# that a stratified split has more than one sample for most classes.
# NOTE(review): if only a single label is rare, "OTHER" itself would contain
# one sample and a stratified split would presumably still fail - confirm.
MIN_FREQ = 2
counts = Counter(cor_labels)

new_labels = [
    label if counts[label] >= MIN_FREQ else "OTHER"
    for label in cor_labels
]

print("Unique correct verbs BEFORE grouping:", len(counts))
print("Unique correct verbs AFTER  grouping:", len(set(new_labels)))

Unique correct verbs BEFORE grouping: 24
Unique correct verbs AFTER  grouping: 9


In [82]:
##############################################################################
# 4) FEATURE EXTRACTION (CHAR-LEVEL N-GRAMS)
##############################################################################

# Targets: the grouped verb labels produced by the previous step.
y = new_labels

# Bag of character 1- to 3-grams over the "<subject>_<verb>" strings, capped
# at 2000 features.
# NOTE: the vectorizer is fitted on every sample, i.e. before the train/test
# split below, so its n-gram vocabulary also reflects the test sentences.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=2000)
X_vec = vectorizer.fit_transform(err_inputs)

In [83]:
##############################################################################
# 5) TRAIN-TEST SPLIT
##############################################################################

# Hold out 20% of the samples for testing. The split is seeded for
# reproducibility and stratified on the grouped labels (the "OTHER" bucket is
# what is meant to make stratification possible).
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print("Train size:", n_train)
print("Test size :", n_test)

Train size: 60
Test size : 15


In [84]:
##############################################################################
# 6) TRAIN A CLASSIFIER
##############################################################################

# Logistic-regression classifier mapping the char n-gram features to the
# (grouped) correct verb. random_state is fixed for reproducibility and
# max_iter=1000 gives the solver an explicit iteration budget.
clf = LogisticRegression(random_state=42, max_iter=1000)
# Fit on the training split only. The bare call is the cell's last expression,
# so the notebook echoes whatever fit() returns.
clf.fit(X_train, y_train)

In [85]:
##############################################################################
# 7) EVALUATE
##############################################################################

# Score the held-out split: overall accuracy plus per-class
# precision / recall / F1 from scikit-learn's text report.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)

print("\n=== EVALUATION ===")
print("Accuracy:", test_accuracy)
print(per_class_report)


=== EVALUATION ===
Accuracy: 1.0
                 precision    recall  f1-score   support

          OTHER       1.00      1.00      1.00         3
   வாசிக்கிறான்       1.00      1.00      1.00         1
   வாசிக்கிறாய்       1.00      1.00      1.00         1
வாசிக்கிறார்கள்       1.00      1.00      1.00         2
   வாசிக்கிறாள்       1.00      1.00      1.00         2
வாசிக்கிறீர்கள்       1.00      1.00      1.00         2
   வாசிக்கிறேன்       1.00      1.00      1.00         2
   வாசிக்கிறோம்       1.00      1.00      1.00         2

       accuracy                           1.00        15
      macro avg       1.00      1.00      1.00        15
   weighted avg       1.00      1.00      1.00        15



In [86]:
##############################################################################
# 8) FUNCTION TO CORRECT NEW SENTENCES
##############################################################################

def correct_tamil_sentence(sentence: str, feature_extractor=None, model=None) -> str:
    """
    Replace the final token (assumed verb) of a sentence with the predicted verb.

    The sentence is split on whitespace. The first token is treated as the
    subject and the last token as the verb; the string "<subject>_<verb>" is
    vectorized and classified, and the predicted label replaces the last
    token. All tokens before the last one are kept unchanged.

    Parameters
    ----------
    sentence : str
        Input sentence.
    feature_extractor : optional
        Object with a ``transform(list_of_str)`` method. When omitted, the
        notebook-level ``vectorizer`` is used.
    model : optional
        Object with a ``predict(features)`` method returning a sequence of
        labels. When omitted, the notebook-level ``clf`` is used.

    Returns
    -------
    str
        The original ``sentence`` untouched when it has fewer than two
        tokens; otherwise the tokens re-joined with single spaces, with the
        last token replaced by the prediction. If the prediction is the
        catch-all label "OTHER", the original verb is kept.
    """
    tokens = sentence.strip().split()
    if len(tokens) < 2:
        # Not enough tokens for the subject/verb heuristic: no correction.
        return sentence

    # Resolve the fitted objects only after the guard above, so short inputs
    # never touch (possibly undefined) notebook globals.
    if feature_extractor is None:
        feature_extractor = vectorizer
    if model is None:
        model = clf

    subject, err_verb = tokens[0], tokens[-1]

    # Same "<subject>_<verb>" encoding that the training samples use.
    features = feature_extractor.transform([subject + "_" + err_verb])
    predicted_verb = model.predict(features)[0]

    if predicted_verb == "OTHER":
        # "OTHER" is a bucket label, not a real verb form: keep the input verb.
        predicted_verb = err_verb

    # Everything except the final token is carried over as-is.
    return " ".join(tokens[:-1] + [predicted_verb])

In [88]:

##############################################################################
# 9) DEMO
##############################################################################

# Three sentences whose final verb differs from the form the corrector
# produces for that subject in the captured output below.
demo_sentences = [
    "நான் புத்தகம் வாசிக்கிறோம்",
    "அவன் புத்தகம் வாசிக்கிறீர்கள்",
    "அவள் புத்தகம் வாசிக்கிறான்",
]

for original_sentence in demo_sentences:
    fixed_sentence = correct_tamil_sentence(original_sentence)
    print("\nOriginal  :", original_sentence)
    print("Corrected :", fixed_sentence)



Original  : நான் புத்தகம் வாசிக்கிறோம்
Corrected : நான் புத்தகம் வாசிக்கிறேன்

Original  : அவன் புத்தகம் வாசிக்கிறீர்கள்
Corrected : அவன் புத்தகம் வாசிக்கிறான்

Original  : அவள் புத்தகம் வாசிக்கிறான்
Corrected : அவள் புத்தகம் வாசிக்கிறாள்
