<a href="https://colab.research.google.com/github/Sajishvar/Spell_Grammer_Checker_Tamil/blob/main/Spell_Grammer_Checker_Tamil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SPELL CHECKER**

In [None]:
import sys
from collections import Counter

# Switch standard output to UTF-8 so the Tamil text printed below is encoded
# correctly. The hasattr guard skips this when sys.stdout has no
# `reconfigure` method (presumably the case for some notebook stdout
# replacements - TODO confirm for the target environment).
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')

def load_dictionary(file_path):
    """Load a word list (one word per line) into a set.

    Args:
        file_path: Path to a UTF-8 encoded text file containing one word per
            line. A leading byte-order mark is tolerated.

    Returns:
        A set of the non-empty, whitespace-stripped lines of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' decodes plain UTF-8 identically but also drops a leading BOM,
    # which would otherwise be glued onto the first word.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines must not become an empty-string entry: the similarity
        # heuristics index and divide by candidate length.
        return {word for word in stripped_lines if word}

**HEURISTICS**

In [None]:
def word_length_similarity(misspelt_word, candidate_word):
    """Return 1.0 when both words have the same length, otherwise 0.0."""
    if len(misspelt_word) != len(candidate_word):
        return 0.0
    return 1.0

def character_frequency_similarity(misspelt_word, candidate_word):
    """Score how many characters of the misspelt word also occur in the candidate.

    For every distinct character of ``misspelt_word`` the smaller of its two
    occurrence counts is taken; the sum is divided by ``len(misspelt_word)``.
    The score is therefore not symmetric in its arguments.

    Args:
        misspelt_word: The word being checked.
        candidate_word: A dictionary word to compare against.

    Returns:
        A float in [0.0, 1.0]. If ``misspelt_word`` is empty the result is 1.0
        when the candidate is empty too, and 0.0 otherwise.
    """
    # Guard the normalisation below against division by zero.
    if not misspelt_word:
        return 1.0 if not candidate_word else 0.0

    misspelt_counter = Counter(misspelt_word)
    candidate_counter = Counter(candidate_word)
    # A Counter returns 0 for characters it has not seen, so characters
    # missing from the candidate contribute nothing.
    matching_count = sum(
        min(count, candidate_counter[char])
        for char, count in misspelt_counter.items()
    )
    return matching_count / len(misspelt_word)

def position_similarity(misspelt_word, candidate_word):
    """Score the fraction of positions at which both words hold the same character.

    Only the overlapping prefix (the length of the shorter word) is compared,
    and the count is normalised by that length so a length difference alone is
    not penalised.

    Args:
        misspelt_word: The word being checked.
        candidate_word: A dictionary word to compare against.

    Returns:
        A float in [0.0, 1.0]. If either word is empty there is no position to
        compare: the result is 1.0 when both are empty and 0.0 otherwise.
    """
    shorter_length = min(len(misspelt_word), len(candidate_word))
    # Guard the normalisation below against division by zero.
    if shorter_length == 0:
        return 1.0 if misspelt_word == candidate_word else 0.0

    # zip stops at the shorter word, so only the overlap is inspected.
    match_count = sum(
        1 for m_char, c_char in zip(misspelt_word, candidate_word) if m_char == c_char
    )
    return match_count / shorter_length

def first_letter_similarity(misspelt_word, candidate_word):
    """Return 1.0 when both words start with the same character, otherwise 0.0.

    The comparison is on the first element of each string (``word[0]``).

    Args:
        misspelt_word: The word being checked.
        candidate_word: A dictionary word to compare against.

    Returns:
        1.0 or 0.0. If either word is empty the result is 1.0 only when both
        are empty.
    """
    # An empty word has no first character; avoid the IndexError from word[0].
    if not misspelt_word or not candidate_word:
        return 1.0 if misspelt_word == candidate_word else 0.0
    return 1.0 if misspelt_word[0] == candidate_word[0] else 0.0

def levenshtein_distance(word1, word2):
    """Return the Levenshtein edit distance between ``word1`` and ``word2``.

    Insertions, deletions and substitutions each cost 1. The table is filled
    row by row, keeping only the previous row and the one being built.
    """
    # Row 0: turning the empty prefix of word1 into each prefix of word2.
    previous_row = list(range(len(word2) + 1))

    for row_number, char1 in enumerate(word1, start=1):
        # Column 0: deleting the first `row_number` characters of word1.
        current_row = [row_number]
        for column, char2 in enumerate(word2, start=1):
            deletion = previous_row[column] + 1
            insertion = current_row[column - 1] + 1
            substitution = previous_row[column - 1] + (0 if char1 == char2 else 1)
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row

    return previous_row[-1]

def distance_similarity(misspelt_word, candidate_word):
    """Convert the Levenshtein distance between two words into a similarity.

    The distance is divided by the length of the longer word (the largest
    distance possible for the pair) and subtracted from 1.

    Args:
        misspelt_word: The word being checked.
        candidate_word: A dictionary word to compare against.

    Returns:
        A float in [0.0, 1.0]; 1.0 means the words are identical.
    """
    max_distance = max(len(misspelt_word), len(candidate_word))
    # Two empty words are identical; return early to avoid dividing by zero.
    if max_distance == 0:
        return 1.0
    lev_distance = levenshtein_distance(misspelt_word, candidate_word)
    return 1 - (lev_distance / max_distance)



**SCORES**

In [None]:
def calculate_similarity_score(misspelt_word, candidate_word):
    """Combine the five similarity heuristics into one weighted score.

    Every heuristic carries the same weight of 0.2. The heuristics are
    evaluated, and their weighted scores added, in the order listed below.
    """
    weighted_heuristics = (
        (0.2, word_length_similarity),
        (0.2, character_frequency_similarity),
        (0.2, position_similarity),
        (0.2, first_letter_similarity),
        (0.2, distance_similarity),
    )

    # Evaluate every heuristic before combining anything.
    weighted_scores = [
        (weight, heuristic(misspelt_word, candidate_word))
        for weight, heuristic in weighted_heuristics
    ]

    # Plain left-to-right accumulation: the order of the floating-point
    # additions is part of the result.
    total = 0.0
    for weight, score in weighted_scores:
        total = total + weight * score
    return total

**CORRECTION SUGGESTION**

In [None]:
def suggest_corrections(misspelt_word, dictionary, threshold=0.5):
    """Rank dictionary words by similarity to ``misspelt_word``.

    Returns a list of ``(word, score)`` tuples whose score is strictly greater
    than ``threshold``, ordered from the highest score to the lowest.
    """
    # Score every dictionary entry first.
    all_scores = []
    for entry in dictionary:
        all_scores.append((entry, calculate_similarity_score(misspelt_word, entry)))

    # Keep only entries scoring above the threshold.
    ranked = []
    for entry, similarity in all_scores:
        if similarity > threshold:
            ranked.append((entry, similarity))

    # Best match first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

**SENTENCE PROCESSING**

In [None]:
def process_sentence(sentence, dictionary, threshold=0.5):
    """Spell-check every whitespace-separated word of a sentence.

    Args:
        sentence: The text to check; it is split on whitespace.
        dictionary: Collection of known-correct words (a set gives constant-time
            lookups).
        threshold: Minimum similarity score, exclusive, for a dictionary word
            to be offered as a suggestion.

    Returns:
        A tuple ``(corrections, corrected_sentence)``:
        ``corrections`` maps each distinct word to ``"Correct"`` (the word is
        in the dictionary), ``"No suggestions"`` (nothing scored above the
        threshold), or a list of up to five suggested words, best first.
        ``corrected_sentence`` is the sentence re-joined with single spaces,
        each misspelt word replaced by its best suggestion when one exists.
    """
    corrections = {}  # Outcome per distinct word
    corrected_sentence = []  # Words of the corrected sentence, in order

    for word in sentence.split():
        # A word is correct exactly when it is in the dictionary. Testing
        # membership directly avoids scoring the whole dictionary, does not
        # rely on a floating-point score being exactly 1.0, and is
        # independent of the suggestion threshold.
        if word in dictionary:
            corrections[word] = "Correct"
            corrected_sentence.append(word)
            continue

        suggestions = suggest_corrections(word, dictionary, threshold)
        if suggestions:
            corrections[word] = [candidate for candidate, _ in suggestions[:5]]  # Top 5 suggestions
            corrected_sentence.append(suggestions[0][0])  # Use the top suggestion
        else:
            corrections[word] = "No suggestions"
            corrected_sentence.append(word)  # Keep the original word

    return corrections, ' '.join(corrected_sentence)

**USAGE**

In [None]:
# Example usage: spell-check one sentence against a word list stored on disk.
# Both values below are placeholders meant to be replaced by the user.
dictionary_file = '/content/tamil_words.txt'  # Replace with your Tamil words file
sentence = 'நான் காற்ற பார்ப்பேன்'  # Replace with your Tamil sentence

# Load dictionary
dictionary = load_dictionary(dictionary_file)

# Process the sentence (no threshold is passed, so the function's default applies)
corrections, corrected_sentence = process_sentence(sentence, dictionary)

# Print corrections for the sentence.
# Each value is the string "Correct", the string "No suggestions",
# or a list of suggested words.
for word, suggestion in corrections.items():
    if suggestion == "Correct":
        print(f"'{word}': Correct")
    elif suggestion == "No suggestions":
        print(f"'{word}': No suggestions found")
    else:
        print(f"'{word}': Suggestions: {', '.join(suggestion)}")

# Print the corrected sentence
print("Corrected Sentence:", corrected_sentence)

# **GRAMMAR CHECKER**

**STANZA PIPELINE FOR POS TAGGING**

In [None]:
# NOTE(review): a bare `pip install` line is not Python syntax; it presumably
# relies on the notebook front end treating it as a command - consider the
# explicit `%pip install stanza` form. TODO confirm for the target environment.
pip install stanza

import stanza
from collections import Counter

# Load Tamil Stanza pipeline for POS tagging.
# `nlp` is a module-level object used by the cells and functions below.
stanza.download('ta')  # Download the Tamil language model if not already done
# Only tokenization and POS tagging are requested; GPU use is disabled.
nlp = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=False)

**POS TAGGING**

In [None]:
# Run the Stanza pipeline on `corrected_sentence`, the variable produced by
# the spell-checker usage cell.
doc = nlp(corrected_sentence)

# Print one line per word: its text and its `upos` tag, separated by a tab.
print("Word\tPOS")
for sent in doc.sentences:
    for word in sent.words:
        print(f"{word.text}\t{word.upos}")

**RULE BASED GRAMMAR CHECKING**

In [None]:
def check_and_correct_grammar(sentence):
    """
    Check a Tamil sentence against three rule-based heuristics and propose a fix.

    The sentence is tagged with the module-level Stanza pipeline ``nlp``. Each
    rule looks only at the first word carrying the relevant POS tag:

    1. Subject-Object-Verb order: the first PRON, NOUN and VERB must appear in
       that order.
    2. Adjective-noun order: the first ADJ must not come after the first NOUN.
    3. Plural agreement: a pronoun ending in "ள்" needs a verb ending in "ோம்".

    Corrections operate on the tagger's own tokens, and the POS tags are
    reordered together with the words, so every rule sees the sentence as left
    by the previous rule.

    Args:
        sentence: The Tamil sentence to check.

    Returns:
        ``{"status": "correct", "details": <message>}`` when no rule fires
        (also for an empty or whitespace-only sentence), otherwise
        ``{"status": "errors", "details": [<message>, ...],
        "corrected_sentence": <tokens joined with single spaces>}``.
    """
    # Nothing to analyse; do not hand blank input to the tagger.
    if not sentence or not sentence.strip():
        return {"status": "correct", "details": "The sentence is grammatically correct."}

    # Process the sentence with Stanza
    doc = nlp(sentence)
    errors = []

    # Parallel lists: token text and its POS tag. The tagger's tokens (not
    # sentence.split()) are used so that positions always line up with tags.
    corrected_words = []
    tags = []
    for sent in doc.sentences:
        for word in sent.words:
            corrected_words.append(word.text)
            tags.append(word.upos)

    # Rule 1: Subject-Object-Verb Order
    if 'PRON' in tags and 'NOUN' in tags and 'VERB' in tags:
        sov_sources = [tags.index('PRON'), tags.index('NOUN'), tags.index('VERB')]
        if sov_sources != sorted(sov_sources):
            errors.append("Error: The sentence should follow Subject-Object-Verb (SOV) order.")
            # Re-seat the three words into the three positions they already
            # occupy, in SOV order; every other word stays where it is.
            moved = [(corrected_words[i], tags[i]) for i in sov_sources]
            for slot, (text, tag) in zip(sorted(sov_sources), moved):
                corrected_words[slot] = text
                tags[slot] = tag

    # Rule 2: Adjective-Noun Order
    if 'ADJ' in tags and 'NOUN' in tags:
        adj_index = tags.index('ADJ')
        noun_index = tags.index('NOUN')
        if adj_index > noun_index:
            errors.append("Error: Adjectives should precede the noun they modify.")
            # Swap the words and their tags together.
            corrected_words[adj_index], corrected_words[noun_index] = (
                corrected_words[noun_index], corrected_words[adj_index])
            tags[adj_index], tags[noun_index] = tags[noun_index], tags[adj_index]

    # Rule 3: Plural Agreement
    if 'PRON' in tags and 'VERB' in tags:
        pron_word = corrected_words[tags.index('PRON')]
        verb_index = tags.index('VERB')
        verb_word = corrected_words[verb_index]
        if pron_word.endswith("ள்") and not verb_word.endswith("ோம்"):
            errors.append("Error: Plural pronoun should match plural verb form.")
            # Rewrite only the verb ending; any other occurrence of the same
            # characters inside the verb is left untouched.
            singular_ending = "ேன்"
            if verb_word.endswith(singular_ending):
                corrected_words[verb_index] = verb_word[:-len(singular_ending)] + "ோம்"

    # Return errors and corrections
    if errors:
        return {
            "status": "errors",
            "details": errors,
            "corrected_sentence": " ".join(corrected_words),
        }
    return {"status": "correct", "details": "The sentence is grammatically correct."}


**EXAMPLE USAGE**

In [None]:
# Example: run the grammar checker on a deliberately incorrect sentence.
sentence = "பள்ளிக்கு நாங்கள் செல்வேன்"  # Incorrect Tamil sentence
result = check_and_correct_grammar(sentence)

# Display Results
# `result["details"]` is a single message when the status is "correct",
# otherwise it is iterated as a collection of error messages.
if result["status"] == "correct":
    print(result["details"])
else:
    print("Grammar Errors Found:")
    for error in result["details"]:
        print(f"- {error}")
    # Only print a correction when the result actually carries one.
    if "corrected_sentence" in result:
        print(f"Corrected Sentence: {result['corrected_sentence']}")