In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC


In [None]:
# Load the dataset
# NOTE(review): absolute Colab path — the CSV must already exist at this location.
df = pd.read_csv("/content/sample_data/tamil_grammar_dataset_200.csv")
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,id,error_type,grammatical_error_sentence,correct_sentence
0,1,Error1,நான் புத்தகம் வாசிக்கிறோம்,நான் புத்தகம் வாசிக்கிறேன்
1,2,Error1,நீ உணவு சாப்பிடுகிறேன்,நீ உணவு சாப்பிடுகிறாய்
2,3,Error1,அவன் பந்து விளையாடுகிறோம்,அவன் பந்து விளையாடுகிறான்
3,4,Error1,அவள் பாடல் எழுதுகிறார்கள்,அவள் பாடல் எழுதுகிறாள்
4,5,Error1,நாங்கள் இசை கேட்கிறேன்,நாங்கள் இசை கேட்கிறோம்


In [None]:
# Row/column count of the frame as loaded (before any deduplication).
df.shape

(200, 4)

In [None]:
# Collect every row whose (error_type, erroneous sentence, corrected sentence)
# triple occurs more than once. keep=False flags ALL members of a duplicate
# group, not only the second and later occurrences.
duplicates = df[df.duplicated(subset=["error_type", "grammatical_error_sentence", "correct_sentence"], keep=False)]
print("Duplicates:")
print(duplicates)


Duplicates:
      id error_type   grammatical_error_sentence             correct_sentence
0      1     Error1   நான் புத்தகம் வாசிக்கிறோம்   நான் புத்தகம் வாசிக்கிறேன்
1      2     Error1       நீ உணவு சாப்பிடுகிறேன்       நீ உணவு சாப்பிடுகிறாய்
2      3     Error1    அவன் பந்து விளையாடுகிறோம்    அவன் பந்து விளையாடுகிறான்
3      4     Error1    அவள் பாடல் எழுதுகிறார்கள்       அவள் பாடல் எழுதுகிறாள்
4      5     Error1       நாங்கள் இசை கேட்கிறேன்       நாங்கள் இசை கேட்கிறோம்
..   ...        ...                          ...                          ...
195  196     Error2      அவர்கள் நாளை வந்தார்கள்      அவர்கள் நாளை வருவார்கள்
196  197     Error2  நாங்கள் நேற்று சாப்பிடுவோம்  நாங்கள் நேற்று சாப்பிட்டோம்
197  198     Error2          நான் இன்று சென்றேன்        நான் இன்று செல்கிறேன்
198  199     Error2       அவன் நாளை செல்லுகிறான்         அவன் நாளை செல்லுவான்
199  200     Error2            நீ நேற்று வருவாய்            நீ நேற்று வந்தாய்

[196 rows x 4 columns]


In [None]:
# View duplicates based on specific columns
# keep=False flags every member of a duplicate group, so this view shows all
# copies rather than only the rows that will be removed below.
duplicates = df[df.duplicated(subset=["grammatical_error_sentence", "correct_sentence", "error_type"], keep=False)]
print("Duplicates:")
print(duplicates)

# Drop duplicates from the original dataframe
# NOTE: inplace=True mutates the shared `df`; every later cell sees the
# deduplicated frame, and the originally loaded rows are no longer available.
df.drop_duplicates(subset=["grammatical_error_sentence", "correct_sentence", "error_type"], inplace=True)
print("Dataset size after removing duplicates:", len(df))


Duplicates:
      id error_type   grammatical_error_sentence             correct_sentence
0      1     Error1   நான் புத்தகம் வாசிக்கிறோம்   நான் புத்தகம் வாசிக்கிறேன்
1      2     Error1       நீ உணவு சாப்பிடுகிறேன்       நீ உணவு சாப்பிடுகிறாய்
2      3     Error1    அவன் பந்து விளையாடுகிறோம்    அவன் பந்து விளையாடுகிறான்
3      4     Error1    அவள் பாடல் எழுதுகிறார்கள்       அவள் பாடல் எழுதுகிறாள்
4      5     Error1       நாங்கள் இசை கேட்கிறேன்       நாங்கள் இசை கேட்கிறோம்
..   ...        ...                          ...                          ...
195  196     Error2      அவர்கள் நாளை வந்தார்கள்      அவர்கள் நாளை வருவார்கள்
196  197     Error2  நாங்கள் நேற்று சாப்பிடுவோம்  நாங்கள் நேற்று சாப்பிட்டோம்
197  198     Error2          நான் இன்று சென்றேன்        நான் இன்று செல்கிறேன்
198  199     Error2       அவன் நாளை செல்லுகிறான்         அவன் நாளை செல்லுவான்
199  200     Error2            நீ நேற்று வருவாய்            நீ நேற்று வந்தாய்

[196 rows x 4 columns]
Dataset size after removing 

In [None]:
# Find duplicate rows based on all columns
# With no `subset`, a row only counts as a duplicate when every column matches
# (including `id`).
duplicates = df[df.duplicated(keep=False)]

# Display the duplicate rows
print("Duplicate rows:")
print(duplicates)
# Count of rows that repeat an earlier row (default keep='first').
df.duplicated().sum()

Duplicate rows:
Empty DataFrame
Columns: [id, error_type, grammatical_error_sentence, correct_sentence]
Index: []


0

In [None]:


# -----------------------------
# 1) LOAD / PREPARE DATA
# -----------------------------


# (Optional) Remove duplicates if your dataset has repeated rows
# NOTE: inplace=True mutates the shared `df` that earlier cells displayed.
df.drop_duplicates(
    subset=["grammatical_error_sentence", "correct_sentence","error_type"],
    inplace=True
)

# Shuffle the dataset so data is not in any sorted order
# frac=1 samples every row; random_state fixes the permutation so reruns match.
# reset_index(drop=True) discards the old row labels after shuffling.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("Dataset size after deduplication and shuffling:", len(df))

# Let's define the columns
# Model input: the erroneous sentence text. Target: the error category label.
X_raw = df["grammatical_error_sentence"].astype(str)
y = df["error_type"].astype(str)  # e.g. Error1, Error2, or NoError

# Optional: a cleanup function to handle quotes/punctuation
def basic_cleanup(text: str) -> str:
    """Remove every single and double quote character, then trim the ends.

    Parameters:
    text (str): Raw sentence text.

    Returns:
    str: The text without any ' or " characters and without leading or
    trailing whitespace.
    """
    unquoted = "".join(ch for ch in text if ch not in "\"'")
    return unquoted.strip()

# Apply the quote/whitespace cleanup to every sentence; this series is what gets vectorized.
X_cleaned = X_raw.apply(basic_cleanup)


Dataset size after deduplication and shuffling: 25


In [None]:
# Row/column count after deduplication and shuffling.
df.shape

(10, 4)

In [None]:

# -----------------------------
# 2) TRAIN / TEST SPLIT
# -----------------------------
# Split the raw text BEFORE fitting the vectorizer. Fitting on the full corpus
# first lets n-grams from the held-out sentences shape the vocabulary, which
# leaks test information into the features.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_cleaned,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y   # keeps class distribution consistent
)

# -----------------------------
# 3) FEATURE EXTRACTION
# -----------------------------
# For Tamil, char-level n-grams may work better than word-level
vectorizer = CountVectorizer(
    analyzer="char",
    ngram_range=(1, 3),   # unigrams, bigrams, trigrams
    max_features=2000     # limit feature size
)

# Learn the vocabulary from the training sentences only, then reuse it as-is
# for the held-out sentences.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])

# -----------------------------
# 4) BUILD & TRAIN A CLASSIFIER
# -----------------------------
clf = SVC(kernel="linear", random_state=42)

# Train on training split only
clf.fit(X_train, y_train)

# Predict on held-out test set
y_pred = clf.predict(X_test)

# Evaluate
print("\n=== TEST SET RESULTS ===")
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# -----------------------------
# 5) CROSS-VALIDATION (OPTIONAL)
# -----------------------------
# For a more robust performance estimate, do K-fold cross-validation.
# The vectorizer lives inside the pipeline so it is re-fitted on the training
# part of every fold instead of seeing the validation sentences.
cv_model = make_pipeline(
    CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=2000),
    SVC(kernel="linear", random_state=42),
)

# Stratified folds need at least one sample of each class per fold; after
# deduplication the dataset can be very small, so cap the fold count at the
# size of the rarest class.
n_folds = min(5, int(y.value_counts().min()))
if n_folds >= 2:
    scores = cross_val_score(cv_model, X_cleaned, y, cv=n_folds, scoring='accuracy')
    print("Cross-Validation Accuracy Scores:", scores)
    print("Mean CV Accuracy:", scores.mean())
else:
    scores = None
    print("Skipping cross-validation: every class needs at least 2 samples.")

# -----------------------------
# 6) DEMO CORRECTION LOGIC
# -----------------------------
# Suppose your code to fix grammar is rule-based/dictionary-based

def fix_error1(sentence: str) -> str:
    """Apply the hard-coded subject-verb agreement substitutions.

    Parameters:
    sentence (str): Sentence that may contain a known mismatched verb form.

    Returns:
    str: The sentence with each listed wrong form swapped for its replacement.
    """
    replacements = (
        ("வாசிக்கிறோம்", "வாசிக்கிறேன்"),
        ("சாப்பிடுகிறேன்", "சாப்பிடுகிறாய்"),
        # ... add more as needed
    )
    # Substitutions run in the listed order, each on the result of the previous one.
    for wrong_form, right_form in replacements:
        sentence = sentence.replace(wrong_form, right_form)
    return sentence

def fix_error2(sentence: str) -> str:
    """Apply the hard-coded tense-mismatch substitution.

    Parameters:
    sentence (str): Sentence that may contain the known wrong-tense verb form.

    Returns:
    str: The sentence with that verb form swapped for its replacement.
    """
    wrong_form, right_form = "செல்வேன்", "சென்றேன்"
    # ... add more as needed
    return sentence.replace(wrong_form, right_form)

def correct_sentence(err_sentence: str, predicted_label: str) -> str:
    """Route a sentence to the fixer that matches its predicted error label.

    Parameters:
    err_sentence (str): The sentence as written.
    predicted_label (str): Label produced by the classifier.

    Returns:
    str: Output of fix_error1 for "Error1", of fix_error2 for "Error2";
    any other label (e.g. NoError) returns the sentence unchanged.
    """
    if predicted_label not in ("Error1", "Error2"):
        return err_sentence  # e.g., NoError
    fixer = fix_error1 if predicted_label == "Error1" else fix_error2
    return fixer(err_sentence)

# Test with a new example
test_sentence = "நான் புத்தகம் வாசிக்கிறோம்"
# The sentence gets the same cleanup as the training text and is encoded with
# the already-fitted vectorizer (transform, not fit_transform).
test_vec = vectorizer.transform([basic_cleanup(test_sentence)])
# predict returns one label per input row; take the single prediction.
pred_label = clf.predict(test_vec)[0]
# The fixer receives the original (uncleaned) sentence.
corrected_output = correct_sentence(test_sentence, pred_label)

print("\nDEMO:")
print("Original Sentence:", test_sentence)
print("Predicted Error Type:", pred_label)
print("Corrected Sentence:", corrected_output)


In [None]:
# POS tagging for Tamil sentences

# Install required libraries
!pip install stanza

import stanza

# Download Tamil model for Stanza
stanza.download('ta')

# Initialize the Stanza pipeline for Tamil
nlp = stanza.Pipeline(lang='ta', processors='tokenize,pos')



Collecting stanza
  Downloading stanza-1.9.2-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.9.2-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.9.2


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ta (Tamil) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ta/resolve/v1.9.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/ta/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ta (Tamil):
| Processor | Package      |
----------------------------
| tokenize  | ttb          |
| mwt       | ttb          |
| pos       | ttb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
INFO:stanza:Done loading processors!


Word	POS
நான்	PRON
புத்தகம்	NOUN
வாசிக்கிறேன்	VERB


In [None]:
# Rule-Based Grammar Suggestions for Tamil Paragraphs

# Install required libraries
# pip install stanza

import stanza

# Download Tamil model for Stanza
stanza.download('ta')

# Initialize the Stanza pipeline for Tamil.
# The 'lemma' processor is required for `word.lemma` to be populated; with only
# 'tokenize,pos' every lemma is None, so extract_root_verb can never return a
# root form. 'mwt' is listed explicitly because Stanza loads it for Tamil anyway.
nlp = stanza.Pipeline(lang='ta', processors='tokenize,mwt,pos,lemma')

# Define grammar rules and suggestions based on subject and tense
# Each rule is a dict with:
#   subject          - pronoun text the rule applies to
#   tense            - tense label the rule applies to ("Present" here)
#   incorrect_suffix - text searched for in the sentence; despite the key name,
#                      every entry below is a complete verb form, not a bare suffix
#   correct_suffix   - replacement offered to the user (also a complete verb form)
#   message          - human-readable explanation printed with the suggestion
rules = [
    {
        "subject": "நான்",  # First-person singular
        "tense": "Present",
        "incorrect_suffix": "வாசிக்கிறோம்",
        "correct_suffix": "வாசிக்கிறேன்",
        "message": "For first-person singular in present tense, use 'வாசிக்கிறேன்' instead of 'வாசிக்கிறோம்'."
    },
    {
        "subject": "நீ",  # Second-person singular
        "tense": "Present",
        "incorrect_suffix": "சாப்பிடுகிறேன்",
        "correct_suffix": "சாப்பிடுகிறாய்",
        "message": "For second-person singular in present tense, use 'சாப்பிடுகிறாய்' instead of 'சாப்பிடுகிறேன்'."
    },
    {
        "subject": "நீ",  # Second-person singular
        "tense": "Present",
        "incorrect_suffix": "படிக்கிறோம்",
        "correct_suffix": "படிக்கிறாய்",
        "message": "For second-person singular in present tense, use 'படிக்கிறாய்' instead of 'படிக்கிறோம்'."
    }
]

# Function to split a paragraph into sentences
def split_into_sentences(paragraph):
    """Split a paragraph into sentence strings using the Stanza pipeline.

    Parameters:
    paragraph (str): Text containing one or more sentences.

    Returns:
    list: The `.text` of each sentence the pipeline segments, in order.
    """
    return [segment.text for segment in nlp(paragraph).sentences]

# Function to detect tense based on auxiliary verbs and suffixes in Tamil
def detect_tense(sentence):
    """Guess the tense of a sentence from tense markers inside its words.

    Verb and auxiliary tokens are checked first, so a noun that merely contains
    a marker (e.g. 'த்த' inside புத்தகம்) cannot decide the tense. If no verb
    carries a marker, every token is checked as a fallback. Within a pass the
    last matching word wins, and for a single word the markers are tried in
    the order present, past, future.

    Parameters:
    sentence (str): Text to analyse; it is parsed with the module-level `nlp`.

    Returns:
    str: "Present", "Past", "Future", or "Unknown" when nothing matches.
    """
    present_suffixes = ['க்கிற', 'கின்ற']
    past_suffixes = ['த்த', 'ந்த', 'ின']
    future_suffixes = ['வான்', 'வேன்', 'வோம்', 'வார்', 'வாள்']

    def classify(text):
        # Markers are matched anywhere in the word, not only at its end.
        if any(marker in text for marker in present_suffixes):
            return "Present"
        if any(marker in text for marker in past_suffixes):
            return "Past"
        if any(marker in text for marker in future_suffixes):
            return "Future"
        return None

    # A distinct loop name avoids shadowing the `sentence` parameter.
    words = [word for parsed in nlp(sentence).sentences for word in parsed.words]
    verbs = [word for word in words if word.upos in ("VERB", "AUX")]

    for candidates in (verbs, words):
        tense = "Unknown"
        for word in candidates:
            found = classify(word.text)
            if found is not None:
                tense = found
        if tense != "Unknown":
            return tense
    return "Unknown"

# Function to extract the root verb from a sentence
def extract_root_verb(sentence):
    """Return the lemma of the first verb in the first parsed sentence.

    Parameters:
    sentence (str): Text to analyse; it is parsed with the module-level `nlp`.

    Returns:
    The `lemma` attribute of the first word tagged VERB, or None when the text
    yields no sentences or no verb. The lemma itself is None unless the
    pipeline was built with a lemma processor.
    """
    doc = nlp(sentence)
    # Empty or whitespace-only input produces no sentences; indexing [0] would raise.
    if not doc.sentences:
        return None
    for word in doc.sentences[0].words:
        if word.upos == "VERB":
            return word.lemma  # Return the root form of the verb
    return None

# Function to apply grammar rules and provide corrections
def apply_grammar_rules(sentence, tense, rule_set=None):
    """Collect suggestions from every rule that matches the sentence.

    A rule matches when its tense equals `tense`, its `incorrect_suffix` text
    occurs in the sentence, and its `subject` appears as a whole word. The
    whole-word check stops a short pronoun such as 'நீ' from firing inside a
    longer one such as 'நீங்கள்'.

    Parameters:
    sentence (str): Sentence to check.
    tense (str): Tense label to compare against each rule's "tense".
    rule_set (list, optional): Rules to apply; defaults to the module-level `rules`.

    Returns:
    list: One dict per matching rule with keys "error", "suggestion" and
    "message"; empty when nothing matches.
    """
    active_rules = rules if rule_set is None else rule_set

    # Whitespace tokens with surrounding punctuation removed, for exact subject matching.
    tokens = {token.strip(".,!?;:\"'()") for token in sentence.split()}

    suggestions = []
    for rule in active_rules:
        if rule["tense"] != tense or rule["incorrect_suffix"] not in sentence:
            continue
        if rule["subject"] in tokens:
            suggestions.append({
                "error": rule["incorrect_suffix"],
                "suggestion": rule["correct_suffix"],
                "message": rule["message"]
            })
    return suggestions

# Main function to process paragraphs and provide corrections
def process_paragraph(paragraph):
    """Analyse each sentence of a paragraph and print a report for it.

    For every sentence this prints the sentence, its detected tense, its root
    verb, then either the matching grammar suggestions or a no-errors line,
    followed by a "-" separator. Nothing is returned.

    Parameters:
    paragraph (str): Text containing one or more sentences.
    """
    for current in split_into_sentences(paragraph):
        print(f"Analyzing sentence: {current}")
        tense = detect_tense(current)
        print(f"Detected tense: {tense}")
        root_verb = extract_root_verb(current)
        print(f"Root verb: {root_verb}")
        hints = apply_grammar_rules(current, tense)

        if not hints:
            print("No grammar errors detected.")
        else:
            print("Grammar Suggestions:")
            for hint in hints:
                print(f"Error: {hint['error']}")
                print(f"Suggestion: {hint['suggestion']}")
                print(f"Message: {hint['message']}")
                print()
        print("-")

# Example usage
# Three sentences: the first two pair a pronoun with a verb form that the
# `rules` list targets; the third matches no rule.
paragraph = "நான் புத்தகம் வாசிக்கிறோம். நீ பாடம் படிக்கிறோம். அவர் பாடம் எழுதினான்."
process_paragraph(paragraph)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ta (Tamil) ...
INFO:stanza:File exists: /root/stanza_resources/ta/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ta (Tamil):
| Processor | Package      |
----------------------------
| tokenize  | ttb          |
| mwt       | ttb          |
| pos       | ttb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
INFO:stanza:Done loading processors!


Analyzing sentence: நான் புத்தகம் வாசிக்கிறோம்.
Detected tense: Present
Root verb: None
Grammar Suggestions:
Error: வாசிக்கிறோம்
Suggestion: வாசிக்கிறேன்
Message: For first-person singular in present tense, use 'வாசிக்கிறேன்' instead of 'வாசிக்கிறோம்'.

-
Analyzing sentence: நீ பாடம் படிக்கிறோம்.
Detected tense: Present
Root verb: None
Grammar Suggestions:
Error: படிக்கிறோம்
Suggestion: படிக்கிறாய்
Message: For second-person singular in present tense, use 'படிக்கிறாய்' instead of 'படிக்கிறோம்'.

-
Analyzing sentence: அவர் பாடம் எழுதினான்.
Detected tense: Past
Root verb: None
No grammar errors detected.
-


In [None]:


# Function to extract the root verb from a sentence
def extract_root_verb(sentence):
    """
    Extracts the root verb (lemma) from a Tamil sentence, printing each token
    on the way for inspection.

    Parameters:
    sentence (str): The input Tamil sentence.

    Returns:
    str: The lemma of the first word tagged VERB. None when the text yields no
    sentences, no verb is found, or the pipeline did not produce a lemma.
    """
    doc = nlp(sentence)
    # Empty or whitespace-only input produces no sentences; indexing [0] would raise.
    if not doc.sentences:
        return None
    for word in doc.sentences[0].words:
        # Diagnostic dump of the token, its POS tag and its lemma.
        print("\nword: ",word)
        print("\nword.upos: ",word.upos)
        print("\nword.lemma: ",word.lemma)
        if word.upos == "VERB":
            return word.lemma  # Return the root form of the verb
    return None

# Example usage
if __name__ == "__main__":
    tamil_sentence = "நான் புத்தகம் வாசிக்கிறேன்."
    root_verb = extract_root_verb(tamil_sentence)
    # A None/empty lemma takes the else branch, so the message below is also
    # printed when a verb was tagged but carries no lemma.
    if root_verb:
        print(f"Root verb: {root_verb}")
    else:
        print("No verb detected in the sentence.")



word:  {
  "id": 1,
  "text": "நான்",
  "upos": "PRON",
  "xpos": "RpN-1SA--",
  "feats": "Animacy=Anim|Case=Nom|Gender=Com|Number=Sing|Person=1|PronType=Prs",
  "start_char": 0,
  "end_char": 4
}

word.upos:  PRON

word.lemma:  None

word:  {
  "id": 2,
  "text": "புத்தகம்",
  "upos": "NOUN",
  "xpos": "NNN-3SN--",
  "feats": "Case=Nom|Gender=Neut|Number=Sing|Person=3",
  "start_char": 5,
  "end_char": 13
}

word.upos:  NOUN

word.lemma:  None

word:  {
  "id": 3,
  "text": "வாசிக்கிறேன்",
  "upos": "VERB",
  "xpos": "Vr-P1SAAA",
  "feats": "Animacy=Anim|Gender=Com|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act",
  "start_char": 14,
  "end_char": 26
}

word.upos:  VERB

word.lemma:  None
No verb detected in the sentence.


In [None]:
# Advanced Rule-Based Grammar Corrections for Tamil Sentences



# Enhanced custom lemmatizer for Tamil verbs
def enhanced_custom_lemmatizer(verb):
    """
    Heuristic lemmatizer for Tamil verbs: cuts the word at a known compound
    indicator, then strips one known person/tense suffix from the end.

    This is plain string trimming; it applies no sandhi or stem changes, so the
    result is an approximation of the root.

    Parameters:
    verb (str): The Tamil verb.

    Returns:
    str: The verb with the trailing suffix removed, or the (possibly
    compound-trimmed) verb unchanged when no suffix matches.
    """
    suffixes = ['க்கிறேன்', 'க்கிறோம்', 'க்கிறாய்', 'க்கிறான்', 'க்கிறாள்', 'க்கிறார்கள்',
                'க்கின்றேன்', 'க்கின்றோம்', 'க்கின்றாய்', 'க்கின்றான்', 'க்கின்றாள்', 'க்கின்றார்கள்',
                'வான்', 'வேன்', 'வோம்', 'வார்', 'வாள்', 'த்தான்', 'த்தேன்', 'த்தோம்', 'த்தார்']

    # Handle reduplication and compound verbs
    compound_indicators = ['வைத்து', 'கொண்டு', 'பற்றி']

    # Keep only the part before a compound indicator.
    for indicator in compound_indicators:
        if indicator in verb:
            head = verb.split(indicator)[0]
            # When the word starts with the indicator there is nothing before
            # it; keep the word rather than reducing it to an empty string.
            if head:
                verb = head

    # Strip the longest matching ending. Slicing removes only the trailing
    # occurrence, whereas str.replace would delete the same text everywhere.
    for suffix in sorted(suffixes, key=len, reverse=True):
        if verb.endswith(suffix):
            return verb[:-len(suffix)]

    # Return the original verb if no matches
    return verb

# Function to extract the root verb from a sentence
def extract_root_verb(sentence):
    """
    Extracts the approximate root of every verb in a Tamil sentence.

    Only the first sentence the pipeline segments is inspected. Each word
    tagged VERB is reduced with enhanced_custom_lemmatizer.

    Parameters:
    sentence (str): The input Tamil sentence.

    Returns:
    list: Root strings, one per verb in order of appearance, or None when the
    text yields no sentences or no verb.
    """
    doc = nlp(sentence)
    # Empty or whitespace-only input produces no sentences; indexing [0] would raise.
    if not doc.sentences:
        return None

    root_verbs = []  # To store multiple verbs if present

    for word in doc.sentences[0].words:
        if word.upos == "VERB":
            root_verbs.append(enhanced_custom_lemmatizer(word.text))  # Use enhanced lemmatizer

    # Return the root verbs if found, else None
    return root_verbs if root_verbs else None

# Advanced grammar rules and corrections
def apply_advanced_grammar_rules(sentence):
    """
    Apply rule-based agreement and tense checks to a Tamil sentence.

    Only the first sentence the pipeline segments is inspected. For each word
    tagged VERB the following checks run:
      - 'நான்' in the sentence with a 'க்கிறோம்' verb, or 'நாங்கள்' with a
        'க்கிறேன்' verb (mutually exclusive);
      - 'நாளை' in the sentence with 'ந்த' inside the verb;
      - 'நேற்று' in the sentence with 'வான்' inside the verb.
    Suggestions are built by concatenating the trimmed root with a fixed
    suffix, so they are not guaranteed to be well-formed words.

    Parameters:
    sentence (str): The input Tamil sentence.

    Returns:
    list: Correction dicts with keys "error", "suggestion" and "message", or
    None when no rule fires or the text yields no sentences.
    """
    corrections = []
    doc = nlp(sentence)

    # Empty or whitespace-only input produces no sentences; indexing [0] would raise.
    if not doc.sentences:
        return None

    # Example advanced rules: Match subject-verb agreement, tense consistency, and compound verb accuracy
    for word in doc.sentences[0].words:
        if word.upos == "VERB":
            root_verb = enhanced_custom_lemmatizer(word.text)

            # Rule: If subject is "நான்" and verb is plural form
            if "நான்" in sentence and "க்கிறோம்" in word.text:
                corrections.append({
                    "error": word.text,
                    "suggestion": root_verb + "க்கிறேன்",
                    "message": "For first-person singular (நான்), use 'க்கிறேன்' instead of 'க்கிறோம்'."
                })

            # Rule: If subject is "நாங்கள்" and verb is singular form
            elif "நாங்கள்" in sentence and "க்கிறேன்" in word.text:
                corrections.append({
                    "error": word.text,
                    "suggestion": root_verb + "க்கிறோம்",
                    "message": "For first-person plural (நாங்கள்), use 'க்கிறோம்' instead of 'க்கிறேன்'."
                })

            # Rule: Ensure consistent tense usage
            # NOTE(review): the suggested ending is always third-person
            # masculine singular, whatever the subject of the sentence is.
            if "நாளை" in sentence and "ந்த" in word.text:
                corrections.append({
                    "error": word.text,
                    "suggestion": root_verb + "வான்",
                    "message": "For future context (நாளை), use future tense suffix 'வான்'."
                })

            if "நேற்று" in sentence and "வான்" in word.text:
                corrections.append({
                    "error": word.text,
                    "suggestion": root_verb + "ந்தான்",
                    "message": "For past context (நேற்று), use past tense suffix 'ந்தான்'."
                })

    return corrections if corrections else None

# Main function to process sentences and provide grammar corrections
def process_sentences(sentences):
    """Print a grammar report for each sentence in an iterable.

    For every sentence this prints the sentence, its root verb(s) when any are
    found, then either the grammar suggestions or a no-errors line, followed
    by a "-" separator. Nothing is returned.

    Parameters:
    sentences: Iterable of sentence strings.
    """
    for text in sentences:
        print(f"Analyzing sentence: {text}")
        verbs = extract_root_verb(text)
        if verbs:
            print(f"Root verb(s): {', '.join(verbs)}")
        findings = apply_advanced_grammar_rules(text)
        if not findings:
            print("No grammar errors detected.")
        else:
            print("Grammar Suggestions:")
            for finding in findings:
                print(f"Error: {finding['error']}")
                print(f"Suggestion: {finding['suggestion']}")
                print(f"Message: {finding['message']}")
                print()
        print("-")

# Example usage
if __name__ == "__main__":
    # Sample inputs covering the pronoun/verb agreement rules, a compound verb,
    # and sentences containing the time words நாளை and நேற்று.
    tamil_sentences = [
        "நான் புத்தகம் வாசிக்கிறோம்.",
        "நாங்கள் பாடம் எழுதுகிறேன்.",
        "அவள் பாடம் எழுதிக்கொண்டு இருக்கிறாள்.",
        "நான் நாளை பள்ளிக்குச் செல்வேன்.",
        "அவர் நேற்று படிக்கவான்."
    ]

    process_sentences(tamil_sentences)


Sentence: நான் புத்தகம் வாசிக்கிறேன்.
Root verb(s): வாசி
Sentence: அவள் பாடம் எழுதிக்கொண்டு இருக்கிறாள்.
Root verb(s): எழுதிக்
Sentence: அவர்கள் பந்தயத்தில் ஓடினார்கள்.
Root verb(s): ஓடினார்கள்
Sentence: நான் நாளை பள்ளிக்குச் செல்வேன்.
Root verb(s): பள்ளிக்குச், செல்


In [None]:
# Advanced Rule-Based Grammar Corrections for Tamil Sentences


# Enhanced custom lemmatizer for Tamil verbs
def enhanced_custom_lemmatizer(verb):
    """
    Heuristic lemmatizer for Tamil verbs: cuts the word at a known compound
    indicator, then strips one known person/tense suffix from the end.

    This is plain string trimming; it applies no sandhi or stem changes, so the
    result is an approximation of the root.

    Parameters:
    verb (str): The Tamil verb.

    Returns:
    str: The verb with the trailing suffix removed, or the (possibly
    compound-trimmed) verb unchanged when no suffix matches.
    """
    suffixes = ['க்கிறேன்', 'க்கிறோம்', 'க்கிறாய்', 'க்கிறான்', 'க்கிறாள்', 'க்கிறார்கள்',
                'க்கின்றேன்', 'க்கின்றோம்', 'க்கின்றாய்', 'க்கின்றான்', 'க்கின்றாள்', 'க்கின்றார்கள்',
                'வான்', 'வேன்', 'வோம்', 'வார்', 'வாள்', 'த்தான்', 'த்தேன்', 'த்தோம்', 'த்தார்']

    # Handle reduplication and compound verbs
    compound_indicators = ['வைத்து', 'கொண்டு', 'பற்றி']

    # Keep only the part before a compound indicator.
    for indicator in compound_indicators:
        if indicator in verb:
            head = verb.split(indicator)[0]
            # When the word starts with the indicator there is nothing before
            # it; keep the word rather than reducing it to an empty string.
            if head:
                verb = head

    # Strip the longest matching ending. Slicing removes only the trailing
    # occurrence, whereas str.replace would delete the same text everywhere.
    for suffix in sorted(suffixes, key=len, reverse=True):
        if verb.endswith(suffix):
            return verb[:-len(suffix)]

    # Return the original verb if no matches
    return verb

# Function to extract the root verb from a sentence
def extract_root_verb(sentence):
    """
    Extracts the approximate root of every verb in a Tamil sentence.

    Only the first sentence the pipeline segments is inspected. Each word
    tagged VERB is reduced with enhanced_custom_lemmatizer.

    Parameters:
    sentence (str): The input Tamil sentence.

    Returns:
    list: Root strings, one per verb in order of appearance, or None when the
    text yields no sentences or no verb.
    """
    doc = nlp(sentence)
    # Empty or whitespace-only input produces no sentences; indexing [0] would raise.
    if not doc.sentences:
        return None

    root_verbs = []  # To store multiple verbs if present

    for word in doc.sentences[0].words:
        if word.upos == "VERB":
            root_verbs.append(enhanced_custom_lemmatizer(word.text))  # Use enhanced lemmatizer

    # Return the root verbs if found, else None
    return root_verbs if root_verbs else None

# Advanced grammar rules and corrections
def apply_advanced_grammar_rules(sentence):
    """
    Apply rule-based agreement and tense checks to a Tamil sentence.

    Only the first sentence the pipeline segments is inspected. For each word
    tagged VERB the following checks run:
      - 'நான்' in the sentence with a 'க்கிறோம்' verb, or 'நாங்கள்' with a
        'க்கிறேன்' verb (mutually exclusive);
      - 'நாளை' in the sentence with 'ந்த' inside the verb;
      - 'நேற்று' in the sentence with 'வான்' inside the verb.
    Suggestions are built by concatenating the trimmed root with a fixed
    suffix, so they are not guaranteed to be well-formed words.

    Parameters:
    sentence (str): The input Tamil sentence.

    Returns:
    list: Correction dicts with keys "error", "suggestion" and "message", or
    None when no rule fires or the text yields no sentences.
    """
    corrections = []
    doc = nlp(sentence)

    # Empty or whitespace-only input produces no sentences; indexing [0] would raise.
    if not doc.sentences:
        return None

    # Example advanced rules: Match subject-verb agreement, tense consistency, and compound verb accuracy
    for word in doc.sentences[0].words:
        if word.upos == "VERB":
            root_verb = enhanced_custom_lemmatizer(word.text)

            # Rule: If subject is "நான்" and verb is plural form
            if "நான்" in sentence and "க்கிறோம்" in word.text:
                corrections.append({
                    "error": word.text,
                    "suggestion": root_verb + "க்கிறேன்",
                    "message": "For first-person singular (நான்), use 'க்கிறேன்' instead of 'க்கிறோம்'."
                })

            # Rule: If subject is "நாங்கள்" and verb is singular form
            elif "நாங்கள்" in sentence and "க்கிறேன்" in word.text:
                corrections.append({
                    "error": word.text,
                    "suggestion": root_verb + "க்கிறோம்",
                    "message": "For first-person plural (நாங்கள்), use 'க்கிறோம்' instead of 'க்கிறேன்'."
                })

            # Rule: Ensure consistent tense usage
            # NOTE(review): the suggested ending is always third-person
            # masculine singular, whatever the subject of the sentence is.
            if "நாளை" in sentence and "ந்த" in word.text:
                corrections.append({
                    "error": word.text,
                    "suggestion": root_verb + "வான்",
                    "message": "For future context (நாளை), use future tense suffix 'வான்'."
                })

            if "நேற்று" in sentence and "வான்" in word.text:
                corrections.append({
                    "error": word.text,
                    "suggestion": root_verb + "ந்தான்",
                    "message": "For past context (நேற்று), use past tense suffix 'ந்தான்'."
                })

    return corrections if corrections else None

# Main function to process sentences and provide grammar corrections
def process_sentences():
    """Read sentences interactively and print a grammar report for each.

    Sentences are read with input() until the user types DONE (any letter
    case, surrounding whitespace ignored). Blank lines are skipped because an
    empty string gives the parser no sentence to analyse. For every collected
    sentence this prints the sentence, its root verb(s) when any are found,
    then either the grammar suggestions or a no-errors line, followed by a "-"
    separator. Nothing is returned.
    """
    sentences = []
    print("Enter Tamil sentences (type 'DONE' to finish):")
    while True:
        sentence = input("Enter a sentence: ")
        if sentence.strip().upper() == 'DONE':
            break
        # An accidental empty line is ignored instead of being queued.
        if not sentence.strip():
            continue
        sentences.append(sentence)

    for sentence in sentences:
        print(f"Analyzing sentence: {sentence}")
        root_verbs = extract_root_verb(sentence)
        if root_verbs:
            print(f"Root verb(s): {', '.join(root_verbs)}")
        corrections = apply_advanced_grammar_rules(sentence)
        if corrections:
            print("Grammar Suggestions:")
            for correction in corrections:
                print(f"Error: {correction['error']}")
                print(f"Suggestion: {correction['suggestion']}")
                print(f"Message: {correction['message']}")
                print()
        else:
            print("No grammar errors detected.")
        print("-")

# Run the grammar checker
# NOTE: process_sentences() blocks on input() until the user types DONE, so
# this cell needs an interactive session.
if __name__ == "__main__":
    process_sentences()


Enter Tamil sentences (type 'DONE' to finish):
Enter a sentence: அவன் பாடசாலைக்கு நேற்று செல்வான்
Enter a sentence: Done
Analyzing sentence: அவன் பாடசாலைக்கு நேற்று செல்வான்
Root verb(s): செல்
Grammar Suggestions:
Error: செல்வான்
Suggestion: செல்ந்தான்
Message: For past context (நேற்று), use past tense suffix 'ந்தான்'.

-


In [None]:
# Advanced Rule-Based Grammar Corrections for Tamil Sentences



# Define tense ending words for each pronoun and tense
# Layout: tense name -> pronoun -> list of verb endings accepted for that
# pronoun in that tense. Dict order matters to code that returns on the first
# match: 'past' is visited before 'present' and 'future'.
# NOTE: several present endings themselves end with a past ending (for example
# 'கிறேன்' ends with 'றேன்'), so a plain endswith() scan in dict order reaches
# the past entry first.
tense_ending_words = {
    'past': {
        'நான்': ['தேன்', 'றேன்', 'னேன்', 'டேன்'],
        'நாங்கள்': ['தோம்', 'றோம்', 'னோம்', 'டோம்'],
        'நீ': ['தாய்', 'றாய்', 'னாய்', 'டாய்'],
        'நீங்கள்': ['தீர்கள்', 'றீர்கள்', 'னீர்கள்', 'டீர்கள்'],
        'அவன்': ['தான்', 'றான்', 'னான்', 'டான்'],
        'அவள்': ['தாள்', 'றாள்', 'னாள்', 'டாள்'],
        'அவர்': ['தார்', 'றார்', 'னார்', 'டார்'],
        'அவர்கள்': ['தார்கள்', 'றார்கள்', 'னார்கள்', 'டனர்', 'டார்கள்'],
        'அது': ['தது', 'றது', 'யது', 'டது'],
        'அவைகள்': ['தன', 'றன', 'டின', 'டன']
    },
    'present': {
        'நான்': ['கிறேன்', 'கின்றேன்'],
        'நாங்கள்': ['கிறோம்', 'கின்றோம்'],
        'நீ': ['கிறாய்', 'கின்றாய்'],
        'நீங்கள்': ['கிறீர்கள்', 'கின்றீர்கள்'],
        'அவன்': ['கிறான்', 'கின்றான்'],
        'அவள்': ['கிறாள்', 'கின்றாள்'],
        'அவர்': ['கிறார்', 'கின்றார்'],
        'அவர்கள்': ['கிறார்கள்', 'கின்றார்கள்'],
        'அது': ['கிறது', 'கின்றது'],
        'அவைகள்': ['கிறன', 'கின்றன']
    },
    'future': {
        'நான்': ['பேன்', 'வேன்'],
        'நாங்கள்': ['போம்', 'வோம்'],
        'நீ': ['பாய்', 'வாய்'],
        'நீங்கள்': ['பீர்கள்', 'வீர்கள்'],
        'அவன்': ['பான்', 'வான்'],
        'அவள்': ['பாள்', 'வாள்'],
        'அவர்': ['பார்', 'வார்'],
        'அவர்கள்': ['பார்கள்', 'வார்கள்'],
        # The two neuter pronouns share the same future endings.
        'அது': ['கும்', 'லும்'],
        'அவைகள்': ['கும்', 'லும்']
    }
}

# Enhanced custom lemmatizer for Tamil verbs
def enhanced_custom_lemmatizer(verb, endings=None):
    """
    Strip the longest known tense/person ending from the end of a Tamil verb.

    Taking the longest match matters because some endings are tails of others:
    present 'கிறேன்' ends with past 'றேன்', so stopping at the first match would
    leave 'கி' attached to the stem. Only the trailing occurrence is removed.
    This is plain string trimming with no sandhi handling, so the result is an
    approximation of the root.

    Parameters:
    verb (str): The Tamil verb.
    endings (dict, optional): Table shaped tense -> pronoun -> list of endings.
        Defaults to the module-level `tense_ending_words`.

    Returns:
    str: The verb without its ending, or the verb unchanged when no ending matches.
    """
    if endings is None:
        endings = tense_ending_words

    longest = ""
    for pronouns in endings.values():
        for suffixes in pronouns.values():
            for suffix in suffixes:
                if len(suffix) > len(longest) and verb.endswith(suffix):
                    longest = suffix

    return verb[:-len(longest)] if longest else verb

# Function to detect tense using tense ending words
def detect_tense_with_endings(sentence):
    """
    Detects the tense of a Tamil sentence from the ending of its first
    recognisable verb.

    Only the first sentence the pipeline segments is inspected. For each word
    tagged VERB, the tense owning the longest matching ending in
    `tense_ending_words` is chosen. The longest match is used because present
    endings such as 'கிறேன்' also end with a past ending ('றேன்'); taking the
    first match in dict order would label present-tense verbs as Past.

    Parameters:
    sentence (str): The input Tamil sentence.

    Returns:
    str: The detected tense (Present, Past, Future, or Unknown).
    """
    doc = nlp(sentence)
    # Empty or whitespace-only input produces no sentences; indexing [0] would raise.
    if not doc.sentences:
        return "Unknown"

    for word in doc.sentences[0].words:
        if word.upos != "VERB":
            continue
        verb = word.text
        best_tense, best_length = None, 0
        for tense, pronouns in tense_ending_words.items():
            for suffixes in pronouns.values():
                for suffix in suffixes:
                    # Strictly longer only: on a tie the earlier tense keeps priority.
                    if len(suffix) > best_length and verb.endswith(suffix):
                        best_tense, best_length = tense, len(suffix)
        if best_tense is not None:
            return best_tense.capitalize()
    return "Unknown"

# Advanced grammar rules and corrections using tense ending words
def apply_grammar_rules_with_tense(sentence):
    """
    Check that the verb ending agrees with the pronoun for the detected tense.

    Only the first sentence the pipeline segments is inspected. The last word
    tagged PRON and the last word tagged VERB are used. If the verb does not
    end with any ending listed for that pronoun in the detected tense, a
    correction is produced by appending the first listed ending to the trimmed
    root; this is string concatenation and may not be a well-formed word.

    Parameters:
    sentence (str): The input Tamil sentence.

    Returns:
    list: Correction dicts with keys "error", "suggestion" and "message", or
    None when nothing is flagged (including when the tense is Unknown, the
    pronoun is not in the table, or the text yields no sentences).
    """
    corrections = []
    doc = nlp(sentence)

    # Empty or whitespace-only input produces no sentences; indexing [0] would raise.
    if not doc.sentences:
        return None

    pronoun = None
    verb = None

    for word in doc.sentences[0].words:
        if word.upos == "PRON":
            pronoun = word.text
        elif word.upos == "VERB":
            verb = word.text

    if pronoun and verb:
        root_verb = enhanced_custom_lemmatizer(verb)
        detected_tense = detect_tense_with_endings(sentence)

        # "Unknown" has no entry in the table; .get avoids a KeyError and
        # simply yields no suffixes to validate against.
        endings_for_tense = tense_ending_words.get(detected_tense.lower(), {})
        valid_suffixes = endings_for_tense.get(pronoun)

        # Validate against the tense ending words for the pronoun
        if valid_suffixes and not any(verb.endswith(suffix) for suffix in valid_suffixes):
            corrections.append({
                "error": verb,
                "suggestion": f"{root_verb}{valid_suffixes[0]}",
                "message": f"For pronoun '{pronoun}' in {detected_tense} tense, use a valid verb form."
            })

    return corrections if corrections else None

# Main function to process sentences and provide grammar corrections
def process_sentences_with_tense():
    """Read sentences interactively and print a tense-aware grammar report.

    Sentences are read with input() until the user types DONE (any letter
    case, surrounding whitespace ignored). Blank lines are skipped because an
    empty string gives the parser no sentence to analyse. For every collected
    sentence this prints the sentence, its root verb(s) when any are found,
    the detected tense, then either the grammar suggestions or a no-errors
    line, followed by a "-" separator. Nothing is returned.
    """
    sentences = []
    print("Enter Tamil sentences (type 'DONE' to finish):")
    while True:
        sentence = input("Enter a sentence: ")
        if sentence.strip().upper() == 'DONE':
            break
        # An accidental empty line is ignored instead of being queued.
        if not sentence.strip():
            continue
        sentences.append(sentence)

    for sentence in sentences:
        print(f"Analyzing sentence: {sentence}")
        # extract_root_verb is not defined in this cell; it is taken from
        # whichever earlier cell defined it last.
        root_verbs = extract_root_verb(sentence)
        if root_verbs:
            print(f"Root verb(s): {', '.join(root_verbs)}")
        tense = detect_tense_with_endings(sentence)
        print(f"Detected tense: {tense}")
        corrections = apply_grammar_rules_with_tense(sentence)
        if corrections:
            print("Grammar Suggestions:")
            for correction in corrections:
                print(f"Error: {correction['error']}")
                print(f"Suggestion: {correction['suggestion']}")
                print(f"Message: {correction['message']}")
                print()
        else:
            print("No grammar errors detected.")
        print("-")

# Run the grammar checker
# NOTE: process_sentences_with_tense() blocks on input() until the user types
# DONE, so this cell needs an interactive session.
if __name__ == "__main__":
    process_sentences_with_tense()
