In [22]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
pip install fasttext


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296187 sha256=cfb38af5ea49104a6844772e9078253ad8ca921c084123bb1b8838dbb62c037f
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [36]:
!gunzip cc.ta.300.bin.gz

gzip: cc.ta.300.bin.gz: No such file or directory


In [43]:
import re
from difflib import get_close_matches
from collections import defaultdict

# Function to load words from multiple .txt files
def load_words_from_files(file_paths):
    words = set()
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    words.add(line.strip().replace(" ", ""))  # Add cleaned words
        except FileNotFoundError:
            print(f"Error: File {file_path} not found.")
    return words

# Function to generate n-grams from a word
def generate_ngrams(word, n=2):
    return [word[i:i+n] for i in range(len(word)-n+1)]

# Function to suggest corrections based on fuzzy matching and n-grams
def suggest_correction(word, word_list, ngram_dict, n=2):
    # Fuzzy matching: Get the closest matches for the word from the word list
    fuzzy_matches = get_close_matches(word, word_list, n=3, cutoff=0.7)  # Higher cutoff for relevant matches

    # If no fuzzy matches, use n-gram-based suggestions
    if not fuzzy_matches:
        word_ngrams = generate_ngrams(word, n)
        best_matches = defaultdict(int)
        for ngram in word_ngrams:
            for w in word_list:
                if ngram in generate_ngrams(w, n):
                    best_matches[w] += 1

        # Sort matches by count (the more n-grams match, the higher the score)
        best_matches = sorted(best_matches.items(), key=lambda x: x[1], reverse=True)
        fuzzy_matches = [match[0] for match in best_matches[:3]]

    return fuzzy_matches

# Function to process sentences/paragraphs
def check_paragraph_spelling(paragraph, word_list, ngram_dict):
    # Split the paragraph into words using spaces and Tamil punctuation
    words = re.findall(r'[\u0B80-\u0BFF]+', paragraph)

    word_suggestions = []
    for word in words:
        if word in word_list:
            word_suggestions.append((word, "Correct"))  # Exact match, no suggestion
        else:
            corrections = suggest_correction(word, word_list, ngram_dict)
            if corrections:
                word_suggestions.append((word, f"Suggested : {', '.join(corrections)}"))
            else:
                word_suggestions.append((word, "No suggestions available"))

    return word_suggestions

# Step 1: Define the paths of the .txt files you want to load
file_paths = [
    '/content/drive/MyDrive/AI/paragraph.txt',
    '/content/drive/MyDrive/AI/all-tamil-nouns.txt',
    '/content/drive/MyDrive/AI/noun.txt',
    '/content/drive/MyDrive/AI/verb1.txt',
    '/content/drive/MyDrive/AI/verb2.txt',
    '/content/drive/MyDrive/AI/numbers.txt',
]

# Step 2: Load the dictionary from all specified .txt files
word_list = load_words_from_files(file_paths)

# Step 3: Generate n-grams for the word list
ngram_dict = {}
for word in word_list:
    ngram_dict[word] = generate_ngrams(word)

# Step 4: Get input from the user
paragraph_to_check = input("Enter a paragraph in Tamil: ")

# Step 5: Check spelling for the entire paragraph
word_suggestions = check_paragraph_spelling(paragraph_to_check, word_list, ngram_dict)

# Step 6: Display individual word suggestions
print("\nDetailed Spell Check Results:")
for word, suggestion in word_suggestions:
    print(f"'{word}' : {suggestion}")

# Step 7: Auto-corrected paragraph based on suggestions
auto_corrected_paragraph = paragraph_to_check
for word, suggestion in word_suggestions:
    if "Suggested" in suggestion:
        correct_word = suggestion.split(':')[1].split(',')[0].strip()  # Choose the first suggestion
        auto_corrected_paragraph = auto_corrected_paragraph.replace(word, correct_word)

print("\nAuto-corrected Paragraph:")
print(auto_corrected_paragraph)


Enter a paragraph in Tamil: வணக்கம, எப்படீ இருக்கீங்க? நான் உங்களை நேசிகிறேன். இந்த சூரியன் காலைல எழும்புகும் போது அதை பாத்து எனக்கு ரொம்ப சந்தோஷம் ஆகிறது. நமக்கும் பயபட்டேண்டா தேவை.

Detailed Spell Check Results:
'வணக்கம' : Suggested : வணக்கம், வணக்கு, ஆவணக்களம்
'எப்படீ' : Suggested : எப்படி
'இருக்கீங்க' : Correct
'நான்' : Correct
'உங்களை' : Correct
'நேசிகிறேன்' : Suggested : நேசிக்கிறேன், நேசகன், தேசிகன்
'இந்த' : Correct
'சூரியன்' : Correct
'காலைல' : Suggested : காலை, காமலை, கசாலை
'எழும்புகும்' : Suggested : எழும்பும், எம்புகம், அழுப்புகம்
'போது' : Correct
'அதை' : Correct
'பாத்து' : Correct
'எனக்கு' : Correct
'ரொம்ப' : Correct
'சந்தோஷம்' : Correct
'ஆகிறது' : Correct
'நமக்கும்' : Correct
'பயபட்டேண்டா' : Suggested : ,உண்ணவேண்டிவருகிறோம்,காணவேண்டிவருகிறோம்,கேட்கவேண்டிவருகிறோம்,கொடுக்கவேண்டிவருகிறோம்,செய்யவேண்டிவருகிறோம்,அழவேண்டிவருகிறோம்,சாகவேண்டிவருகிறோம்,தின்கவேண்டிவருகிறோம்,நிற்கவேண்டிவருகிறோம்,கற்கவேண்டிவருகிறோம்,செல்லவேண்டிவருகிறோம்,பெறவேண்டிவருகிறோம்,ஓடவேண்டிவருகிறோம்,சொல்லவேண்டிவ