<a href="https://colab.research.google.com/github/Sajishvar/Spell_Grammer_Checker_Tamil/blob/main/Spell_Grammer_Checker_Tamil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SPELL CHECKER**

In [None]:
import sys
from collections import Counter

# Adjust encoding configuration for standard output in environments supporting it.
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')

def load_dictionary(file_path):
    # Load words from a dictionary file
    with open(file_path, 'r', encoding='utf-8') as file:
        dictionary = set(line.strip() for line in file)
    return dictionary

**HEURISTICS**

In [None]:
from collections import Counter

def word_length_similarity(misspelt_word, candidate_word):
    # Binary score: 1.0 if lengths match, 0.0 if they don't
    return 1.0 if len(misspelt_word) == len(candidate_word) else 0.0

def character_frequency_similarity(misspelt_word, candidate_word):
    # Count frequency of each character in both words
    misspelt_counter = Counter(misspelt_word)
    candidate_counter = Counter(candidate_word)
    # Compute sum of minimum matches for each character
    matching_count = sum(min(misspelt_counter[char], candidate_counter[char]) for char in misspelt_counter)
    # Normalize by the length of the misspelt word
    return matching_count / len(misspelt_word)

def position_similarity(misspelt_word, candidate_word):
    # Count matching characters at the same positions
    match_count = sum(1 for m_char, c_char in zip(misspelt_word, candidate_word) if m_char == c_char)

    # Normalize by the length of the shorter word to avoid penalizing due to different lengths
    return match_count / min(len(misspelt_word), len(candidate_word))

def first_letter_similarity(misspelt_word, candidate_word):
    # Check if the first letters of both words match
    return 1.0 if misspelt_word[0] == candidate_word[0] else 0.0

def levenshtein_distance(word1, word2):
    # Compute the Levenshtein distance between two words
    len_word1, len_word2 = len(word1), len(word2)
    matrix = [[0] * (len_word2 + 1) for _ in range(len_word1 + 1)]

    for i in range(len_word1 + 1):
        matrix[i][0] = i
    for j in range(len_word2 + 1):
        matrix[0][j] = j

    for i in range(1, len_word1 + 1):
        for j in range(1, len_word2 + 1):
            cost = 0 if word1[i-1] == word2[j-1] else 1
            matrix[i][j] = min(
                matrix[i-1][j] + 1,  # Deletion
                matrix[i][j-1] + 1,  # Insertion
                matrix[i-1][j-1] + cost  # Substitution
            )

    return matrix[len_word1][len_word2]

def distance_similarity(misspelt_word, candidate_word):
    # Levenshtein distance normalized by the maximum possible distance
    max_distance = max(len(misspelt_word), len(candidate_word))
    lev_distance = levenshtein_distance(misspelt_word, candidate_word)

    # If one word is a single character off due to insertion or deletion, this will be captured by Levenshtein.
    return 1 - (lev_distance / max_distance)  # Return similarity score (1 for exact match)



**SCORES**

In [None]:
def calculate_similarity_score(misspelt_word, candidate_word):
    # Weights
    weight_length = 0.2
    weight_frequency = 0.2
    weight_position = 0.2
    weight_first_letter = 0.2
    weight_distance = 0.2

    # Heuristic scores
    length_score = word_length_similarity(misspelt_word, candidate_word)
    frequency_score = character_frequency_similarity(misspelt_word, candidate_word)
    position_score = position_similarity(misspelt_word, candidate_word)
    first_letter_score = first_letter_similarity(misspelt_word, candidate_word)
    distance_score = distance_similarity(misspelt_word, candidate_word)

    # Weighted sum of the scores
    return (weight_length * length_score +
            weight_frequency * frequency_score +
            weight_position * position_score +
            weight_first_letter * first_letter_score +
            weight_distance * distance_score)

**CORRECTION SUGGESTION**

In [None]:
def suggest_corrections(misspelt_word, dictionary, threshold=0.5):
    # Calculate similarity scores for each word in the dictionary
    scored_candidates = [
        (candidate, calculate_similarity_score(misspelt_word, candidate))
        for candidate in dictionary
    ]
    # Filter candidates with scores above threshold
    scored_candidates = [(word, score) for word, score in scored_candidates if score > threshold]
    # Sort by score in descending order
    scored_candidates.sort(key=lambda x: x[1], reverse=True)

    return scored_candidates

**SENTENCE PROCCESSING**

In [None]:
def process_sentence(sentence, dictionary, threshold=0.5):
    words = sentence.split()  # Split the sentence into words
    corrections = {}  # Dictionary to store corrections for each misspelled word
    corrected_sentence = []  # List to build the corrected sentence

    for word in words:
        # Get suggestions for each word
        suggestions = suggest_corrections(word, dictionary, threshold)

        if suggestions:
            # If the top suggestion has a perfect score, the word is correct
            if suggestions[0][1] == 1.0:
                corrections[word] = "Correct"
                corrected_sentence.append(word)  # Keep the original word
            else:
                # Otherwise, save the top suggestions
                corrections[word] = [s[0] for s in suggestions[:5]]  # Top 5 suggestions
                corrected_sentence.append(suggestions[0][0])  # Use the top suggestion
        else:
            corrections[word] = "No suggestions"  # No suitable suggestions
            corrected_sentence.append(word)  # Keep the original word

    return corrections, ' '.join(corrected_sentence)

**USAGE**

In [None]:
# Example usage
dictionary_file = '/content/tamil_words.txt'  # Replace with your Tamil words file
sentence = 'நான் காற்ற பார்ப்பேன்'  # Replace with your Tamil sentence

# Load dictionary
dictionary = load_dictionary(dictionary_file)

# Process the sentence
corrections, corrected_sentence = process_sentence(sentence, dictionary)

# Print corrections for the sentence
for word, suggestion in corrections.items():
    if suggestion == "Correct":
        print(f"'{word}': Correct")
    elif suggestion == "No suggestions":
        print(f"'{word}': No suggestions found")
    else:
        print(f"'{word}': Suggestions: {', '.join(suggestion)}")

# Print the corrected sentence
print("Corrected Sentence:", corrected_sentence)

'நான்': Correct
'காற்ற': Suggestions: காற்று, கற்றா, காற்றாலை, காற்றாடி, கற்றை
'பார்ப்பேன்': Correct
Corrected Sentence: நான் காற்று பார்ப்பேன்


# **GRAMMAR CHECKER (RULE-BASED)**

**STANZA PIPELINE FOR POS TAGGING**

In [None]:
!pip install stanza

import stanza
from collections import Counter

# Load Tamil Stanza pipeline for POS tagging
stanza.download('ta')  # Download the Tamil language model if not already done
nlp = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=False)
from collections import Counter

# Load Tamil Stanza pipeline for POS tagging
stanza.download('ta')  # Download the Tamil language model if not already done
nlp = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=False)

Collecting stanza
  Downloading stanza-1.9.2-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.9.2-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.9.2


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ta (Tamil) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ta/resolve/v1.9.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/ta/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ta (Tamil):
| Processor | Package      |
----------------------------
| tokenize  | ttb          |
| mwt       | ttb          |
| pos       | ttb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
INFO:stanza:Done loading processors!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ta (Tamil) ...
INFO:stanza:File exists: /root/stanza_resources/ta/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ta (Tamil):
| Processor | Package      |
----------------------------
| tokenize  | ttb          |
| mwt       | ttb          |
| pos       | ttb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


**POS TAGGING**

In [None]:
# Process the sentence
doc = nlp(corrected_sentence)

# Print POS tags for each word
print("Word\tPOS")
for sent in doc.sentences:
    for word in sent.words:
        print(f"{word.text}\t{word.upos}")

Word	POS
நான்	PRON
காற்று	NOUN
பார்ப்பேன்	VERB


**RULE BASED GRAMMAR CHECKING**

In [None]:
def check_and_correct_grammar(sentence):
    """
    Check the grammar of a Tamil sentence, detect multiple issues, and produce a fully corrected sentence.
    """
    # Process the sentence with Stanza
    doc = nlp(sentence)
    errors = []
    corrected_words = sentence.split()  # Start with the original words

    # Extract words and POS tags
    words = []
    pos_tags = []
    for sent in doc.sentences:
        for word in sent.words:
            words.append(word.text)
            pos_tags.append(word.upos)

    # Rule 1: Subject-Object-Verb (SOV) Order
    if 'PRON' in pos_tags and 'NOUN' in pos_tags and 'VERB' in pos_tags:
        pron_index = pos_tags.index('PRON')
        noun_index = pos_tags.index('NOUN')
        verb_index = pos_tags.index('VERB')
        if not (pron_index < noun_index < verb_index):
            errors.append("Error: The sentence should follow Subject-Object-Verb (SOV) order.")
            # Correct the word order while preserving other parts of the sentence
            corrected_words = (
                [words[pron_index], words[noun_index], words[verb_index]]
                + [word for i, word in enumerate(words) if i not in (pron_index, noun_index, verb_index)]
            )

    # Rule 2: Adjective-Noun Order
    if 'ADJ' in pos_tags and 'NOUN' in pos_tags:
        for i, (tag, word) in enumerate(zip(pos_tags, words)):
            if tag == 'ADJ':
                adj_index = i
                # Look ahead for the nearest noun
                for j in range(adj_index + 1, len(pos_tags)):
                    if pos_tags[j] == 'NOUN':
                        if adj_index > j:
                            errors.append("Error: Adjectives should precede the noun they modify.")
                            # Swap the adjective and noun
                            words[adj_index], words[j] = words[j], words[adj_index]
                        break

    # Rule 3: Plural Agreement
    if 'PRON' in pos_tags and 'VERB' in pos_tags:
        pron_index = pos_tags.index('PRON')
        verb_index = pos_tags.index('VERB')
        pron_word = words[pron_index]
        verb_word = words[verb_index]
        # Check if the pronoun is plural but the verb isn't
        if pron_word.endswith("ள்") and not verb_word.endswith("ோம்"):
            errors.append("Error: Plural pronoun should match plural verb form.")
            # Attempt to correct verb form dynamically
            if "ேன்" in verb_word:
                corrected_words[verb_index] = verb_word.replace("ேன்", "ோம்")
            else:
                corrected_words[verb_index] += "ோம்"  # Add plural suffix if not present

    # Return errors and corrections
    if errors:
        corrected_sentence = " ".join(corrected_words)
        return {
            "status": "errors",
            "details": errors,
            "corrected_sentence": corrected_sentence,
        }
    else:
        return {"status": "correct", "details": "The sentence is grammatically correct."}

**EXAMPLE USAGE**

In [None]:
sentence = "பள்ளிக்கு நாங்கள் செல்வேன்"  # Incorrect Tamil sentence
result = check_and_correct_grammar(sentence)

# Display Results
if result["status"] == "correct":
    print(result["details"])
else:
    print("Grammar Errors Found:")
    for error in result["details"]:
        print(f"- {error}")
    if "corrected_sentence" in result:
        print(f"Corrected Sentence: {result['corrected_sentence']}")

Grammar Errors Found:
- Error: The sentence should follow Subject-Object-Verb (SOV) order.
- Error: Plural pronoun should match plural verb form.
Corrected Sentence: நாங்கள் பள்ளிக்கு செல்வோம்


# **GRAMMAR CHECKER (ML)**

**PRE-PROCESSING**

In [4]:
!pip install indic-nlp-library

import pandas as pd


# Load dataset
df = pd.read_csv("tamil_grammar_dataset.csv")

# Display first few rows
print(df.head())

# Remove any unwanted characters (quotes, extra spaces)
df['incorrect_sentence'] = df['incorrect_sentence'].str.replace(r'[“”]', '', regex=True).str.strip()
df['correct_sentence'] = df['correct_sentence'].str.replace(r'[“”]', '', regex=True).str.strip()


Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Downloading sphinx_argparse-0.5.2-py3-none-any.whl (12 kB)
Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl (7.7 MB)
[2K   [90m━

In [5]:
from indicnlp.tokenize import indic_tokenize

# Tokenize each sentence
df['incorrect_tokens'] = df['incorrect_sentence'].apply(lambda x: indic_tokenize.trivial_tokenize(x, lang='ta'))
df['correct_tokens'] = df['correct_sentence'].apply(lambda x: indic_tokenize.trivial_tokenize(x, lang='ta'))

print(df[['incorrect_tokens', 'correct_tokens']].head())


                          incorrect_tokens  \
0        [புத்தகம், அவன், வாசிக்கிறான், .]   
1             [நான், வாசிக்கிறேன், கதை, .]   
2          [சாப்பிடுகிறான், அவன், உணவு, .]   
3  [அழகான, வீட்டில், அவன், இருக்கிறான், .]   
4    [சிவப்பு, பள்ளி, பேருந்து, வந்தது, .]   

                              correct_tokens  
0          [அவன், புத்தகம், வாசிக்கிறான், .]  
1               [நான், கதை, வாசிக்கிறேன், .]  
2            [அவன், உணவு, சாப்பிடுகிறான், .]  
3    [அவன், அழகான, வீட்டில், இருக்கிறான், .]  
4  [சிவப்பு, பேருந்து, பள்ளிக்கு, வந்தது, .]  


In [8]:
!pip install stanza
import stanza

# Download Tamil model (run only once)
stanza.download('ta')

# Load Tamil NLP pipeline
nlp = stanza.Pipeline(lang='ta', processors='tokenize,pos')

def get_pos_tags(sentence):
    """Returns POS tags for a given Tamil sentence."""
    doc = nlp(sentence)
    return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]

# Example usage
sentence = "அவன் புத்தகம் வாசிக்கிறான்."
print(get_pos_tags(sentence))


Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.10.1


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ta (Tamil) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ta/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/ta/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ta (Tamil):
| Processor | Package      |
----------------------------
| tokenize  | ttb          |
| mwt       | ttb          |
| pos       | ttb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


[('அவன்', 'PRON'), ('புத்தகம்', 'NOUN'), ('வாசிக்கிறான்', 'VERB'), ('.', 'PUNCT')]


In [9]:
# Apply POS tagging to dataset
df['incorrect_pos'] = df['incorrect_sentence'].apply(get_pos_tags)
df['correct_pos'] = df['correct_sentence'].apply(get_pos_tags)

print(df[['incorrect_pos', 'correct_pos']].head())


                                       incorrect_pos  \
0  [(புத்தகம், NOUN), (அவன், PRON), (வாசிக்கிறான்...   
1  [(நான், PRON), (வாசிக்கிறேன், VERB), (கதை, NOU...   
2  [(சாப்பிடுகிறான், VERB), (அவன், PROPN), (உணவு,...   
3  [(அழகான, ADV), (வீட்டில், NOUN), (அவன், PRON),...   
4  [(சிவப்பு, ADJ), (பள்ளி, NOUN), (பேருந்து, NOU...   

                                         correct_pos  
0  [(அவன், PRON), (புத்தகம், NOUN), (வாசிக்கிறான்...  
1  [(நான், PRON), (கதை, NOUN), (வாசிக்கிறேன், VER...  
2  [(அவன், PRON), (உணவு, NOUN), (சாப்பிடுகிறான், ...  
3  [(அவன், PRON), (அழகான, ADV), (வீட்டில், NOUN),...  
4  [(சிவப்பு, NOUN), (பேருந்து, NOUN), (பள்ளிக்கு...  


In [10]:
def check_sov_order(pos_tags):
    """Returns True if the sentence follows SOV order."""
    try:
        subject_index = next(i for i, tag in enumerate(pos_tags) if tag[1] == 'PRON')
        object_index = next(i for i, tag in enumerate(pos_tags) if tag[1] == 'NOUN')
        verb_index = next(i for i, tag in enumerate(pos_tags) if tag[1] == 'VERB')
        return subject_index < object_index < verb_index  # SOV order
    except:
        return False

df['incorrect_sov'] = df['incorrect_pos'].apply(check_sov_order)
df['correct_sov'] = df['correct_pos'].apply(check_sov_order)

print(df[['incorrect_sov', 'correct_sov']].head())


   incorrect_sov  correct_sov
0          False         True
1          False         True
2          False         True
3          False         True
4          False        False


In [11]:
def check_adj_noun_order(pos_tags):
    """Returns True if adjectives appear before nouns."""
    try:
        adj_index = next(i for i, tag in enumerate(pos_tags) if tag[1] == 'ADJ')
        noun_index = next(i for i, tag in enumerate(pos_tags) if tag[1] == 'NOUN')
        return adj_index < noun_index
    except:
        return False

df['incorrect_adj_noun'] = df['incorrect_pos'].apply(check_adj_noun_order)
df['correct_adj_noun'] = df['correct_pos'].apply(check_adj_noun_order)

print(df[['incorrect_adj_noun', 'correct_adj_noun']].head())


   incorrect_adj_noun  correct_adj_noun
0               False             False
1               False             False
2               False             False
3               False             False
4                True             False


In [12]:
def check_plural_agreement(pos_tags):
    """Checks if plural subject matches plural verb."""
    plural_noun = any(tag[1] == 'NOUN' and 'ள்' in tag[0] for tag in pos_tags)
    plural_verb = any(tag[1] == 'VERB' and 'கிறார்கள்' in tag[0] for tag in pos_tags)
    return plural_noun == plural_verb

df['incorrect_plural'] = df['incorrect_pos'].apply(check_plural_agreement)
df['correct_plural'] = df['correct_pos'].apply(check_plural_agreement)

print(df[['incorrect_plural', 'correct_plural']].head())


   incorrect_plural  correct_plural
0              True            True
1              True            True
2              True            True
3              True            True
4             False           False


In [13]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

df['incorrect_sov'] = encoder.fit_transform(df['incorrect_sov'])
df['correct_sov'] = encoder.fit_transform(df['correct_sov'])

df['incorrect_adj_noun'] = encoder.fit_transform(df['incorrect_adj_noun'])
df['correct_adj_noun'] = encoder.fit_transform(df['correct_adj_noun'])

df['incorrect_plural'] = encoder.fit_transform(df['incorrect_plural'])
df['correct_plural'] = encoder.fit_transform(df['correct_plural'])

print(df.head())


                 incorrect_sentence                    correct_sentence  \
0       புத்தகம் அவன் வாசிக்கிறான்.         அவன் புத்தகம் வாசிக்கிறான்.   
1            நான் வாசிக்கிறேன் கதை.              நான் கதை வாசிக்கிறேன்.   
2         சாப்பிடுகிறான் அவன் உணவு.           அவன் உணவு சாப்பிடுகிறான்.   
3  அழகான வீட்டில் அவன் இருக்கிறான்.    அவன் அழகான வீட்டில் இருக்கிறான்.   
4    சிவப்பு பள்ளி பேருந்து வந்தது.  சிவப்பு பேருந்து பள்ளிக்கு வந்தது.   

                          incorrect_tokens  \
0        [புத்தகம், அவன், வாசிக்கிறான், .]   
1             [நான், வாசிக்கிறேன், கதை, .]   
2          [சாப்பிடுகிறான், அவன், உணவு, .]   
3  [அழகான, வீட்டில், அவன், இருக்கிறான், .]   
4    [சிவப்பு, பள்ளி, பேருந்து, வந்தது, .]   

                              correct_tokens  \
0          [அவன், புத்தகம், வாசிக்கிறான், .]   
1               [நான், கதை, வாசிக்கிறேன், .]   
2            [அவன், உணவு, சாப்பிடுகிறான், .]   
3    [அவன், அழகான, வீட்டில், இருக்கிறான், .]   
4  [சிவப்பு, பேருந்து, பள்ளிக்கு

In [14]:
df.to_csv("preprocessed_tamil_grammar.csv", index=False)

**Training Machine Learning Model**

In [15]:
import pandas as pd

# Load preprocessed dataset
df = pd.read_csv("preprocessed_tamil_grammar.csv")

# Display dataset structure
print(df.head())


                 incorrect_sentence                    correct_sentence  \
0       புத்தகம் அவன் வாசிக்கிறான்.         அவன் புத்தகம் வாசிக்கிறான்.   
1            நான் வாசிக்கிறேன் கதை.              நான் கதை வாசிக்கிறேன்.   
2         சாப்பிடுகிறான் அவன் உணவு.           அவன் உணவு சாப்பிடுகிறான்.   
3  அழகான வீட்டில் அவன் இருக்கிறான்.    அவன் அழகான வீட்டில் இருக்கிறான்.   
4    சிவப்பு பள்ளி பேருந்து வந்தது.  சிவப்பு பேருந்து பள்ளிக்கு வந்தது.   

                                    incorrect_tokens  \
0          ['புத்தகம்', 'அவன்', 'வாசிக்கிறான்', '.']   
1               ['நான்', 'வாசிக்கிறேன்', 'கதை', '.']   
2            ['சாப்பிடுகிறான்', 'அவன்', 'உணவு', '.']   
3  ['அழகான', 'வீட்டில்', 'அவன்', 'இருக்கிறான்', '.']   
4    ['சிவப்பு', 'பள்ளி', 'பேருந்து', 'வந்தது', '.']   

                                      correct_tokens  \
0          ['அவன்', 'புத்தகம்', 'வாசிக்கிறான்', '.']   
1               ['நான்', 'கதை', 'வாசிக்கிறேன்', '.']   
2            ['அவன்', 'உணவு', 'சாப்பிடுகிறான

In [16]:
# Features (input) - Incorrect sentence grammar features
X = df[['incorrect_sov', 'incorrect_adj_noun', 'incorrect_plural']]

# Labels (output) - Correct sentence grammar features
y = df[['correct_sov', 'correct_adj_noun', 'correct_plural']]


In [17]:
from sklearn.model_selection import train_test_split

# Split into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


Training samples: 10, Testing samples: 3


In [18]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [19]:
from sklearn.metrics import accuracy_score

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.00
