# **EC9640 - Artificial Intelligence Project**
### **Project Name** - Spelling corrector and grammar checker for Tamil
### **Team Member** - 2020/E/067 & 2020/E/145

## Connect the google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Download libraries

In [6]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

# **Deep Learning approach 2**

## Dataset: Token-Level Preparation

In [20]:
import pandas as pd
import re

# Hypothetical function to split Tamil sentences into tokens
def tokenize_tamil_sentence(sentence):
    # Simple approach: split by space/punctuation
    # For real-world Tamil, consider using a more robust tokenizer or Byte-Pair Encoding (IndicNLP, etc.)
    tokens = re.split(r"\\s+", sentence.strip())
    return tokens

def align_and_label(incorrect_sent, correct_sent):
    """
    Very naive approach:
    1. tokenize both
    2. if tokens differ at position i, label as 'ERROR' else 'OK'
    This won't handle cases where the number of tokens differ significantly.
    """
    inc_tokens = tokenize_tamil_sentence(incorrect_sent)
    cor_tokens = tokenize_tamil_sentence(correct_sent)

    # Pad shorter list if needed for demonstration (naive approach)
    max_len = max(len(inc_tokens), len(cor_tokens))
    inc_tokens += ["[PAD_INC]"] * (max_len - len(inc_tokens))
    cor_tokens += ["[PAD_COR]"] * (max_len - len(cor_tokens))

    labels = []
    for i in range(max_len):
        if inc_tokens[i] == "[PAD_INC]":
            # no real token in incorrect
            labels.append("OK")  # or "PAD"
        elif inc_tokens[i] != cor_tokens[i]:
            labels.append("ERROR")
        else:
            labels.append("OK")

    return inc_tokens, labels

# Suppose you have a small CSV dataset
dataset_path = '/content/drive/MyDrive/EC9640 - AI Project/tamil_grammar_dataset_200.csv'

df = pd.read_csv(dataset_path)

df.head()



Unnamed: 0,id,error_type,grammatical_error_sentence,correct_sentence
0,1,Error1,நான் புத்தகம் வாசிக்கிறோம்,நான் புத்தகம் வாசிக்கிறேன்
1,2,Error1,நீ உணவு சாப்பிடுகிறேன்,நீ உணவு சாப்பிடுகிறாய்
2,3,Error1,அவன் பந்து விளையாடுகிறோம்,அவன் பந்து விளையாடுகிறான்
3,4,Error1,அவள் பாடல் எழுதுகிறார்கள்,அவள் பாடல் எழுதுகிறாள்
4,5,Error1,நாங்கள் இசை கேட்கிறேன்,நாங்கள் இசை கேட்கிறோம்


In [22]:
all_inc_tokens = []
all_labels = []

for i, row in df.iterrows():
    inc_tokens, token_labels = align_and_label(
        row["grammatical_error_sentence"],
        row["correct_sentence"]
    )
    all_inc_tokens.append(inc_tokens)
    all_labels.append(token_labels)

# Now all_inc_tokens[i] is a list of tokens for sample i
# and all_labels[i] is the label for each token


## Convert Tokenized Data into Hugging Face Dataset

In [23]:
from datasets import Dataset

data_dict = {
    "tokens": all_inc_tokens,
    "labels": all_labels
}
raw_dataset = Dataset.from_dict(data_dict)

# We can train/val split if needed
train_val = raw_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_val["train"]
val_dataset = train_val["test"]


### Label Encoding

In [24]:
label2id = {"OK": 0, "ERROR": 1}
id2label = {0: "OK", 1: "ERROR"}


## Token Classification Setup with BERT


In [25]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

model_name = "bert-base-multilingual-cased"  # or a Tamil-specific BERT if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

num_labels = len(label2id)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenization / Alignment

In [26]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,  # important for token-level tasks
        padding="max_length",
        max_length=64
    )

    labels_batch = []
    for i, labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # map subwords to original tokens
        label_ids = []
        prev_word_id = None

        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # ignore
            elif word_id != prev_word_id:
                # first subword token of that word
                label_ids.append(label2id[labels[word_id]])
            else:
                # subsequent subword tokens, set to -100 to ignore
                label_ids.append(-100)
            prev_word_id = word_id
        labels_batch.append(label_ids)

    tokenized_inputs["labels"] = labels_batch
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)

train_dataset.set_format("torch")
val_dataset.set_format("torch")


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

## Trainer Setup (Token Classification)

In [27]:
args = TrainingArguments(
    output_dir="token_classifier",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2
)

def compute_metrics(p):
    # Usually, you'd compute token-level accuracy, F1, etc.
    # We'll do a placeholder here
    return {}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0001,2.7e-05
2,0.0,1.2e-05
3,0.0,9e-06
4,0.0,8e-06
5,0.0,7e-06
6,0.0,6e-06
7,0.0,6e-06
8,0.0,6e-06
9,0.0,6e-06
10,0.0,5e-06


TrainOutput(global_step=200, training_loss=0.0029011882578924997, metrics={'train_runtime': 2031.2927, 'train_samples_per_second': 0.788, 'train_steps_per_second': 0.098, 'total_flos': 52259351347200.0, 'train_loss': 0.0029011882578924997, 'epoch': 10.0})

## Inference: Identify Errors, Replace with [MASK]

In [28]:
model.eval()

def detect_errors_and_mask(sentence):
    # 1. tokenize by words
    tokens = tokenize_tamil_sentence(sentence)
    # 2. BERT tokenization
    encoded = tokenizer(
        tokens,
        return_tensors="pt",
        is_split_into_words=True,
        max_length=64,
        truncation=True
    )
    with torch.no_grad():
        outputs = model(**encoded)
    logits = outputs.logits  # (batch_size=1, seq_len, num_labels)
    predictions = logits.argmax(dim=-1).squeeze(0).tolist()  # pick label id per subword token

    # Map subword -> original token id
    word_ids = encoded.word_ids(batch_index=0)

    # Build final token-level label
    word_label_map = {}
    for subword_idx, label_id in zip(word_ids, predictions):
        if subword_idx is None:
            continue
        if label_id == -100:
            continue
        # If the same token spans multiple subwords, use the first subword's label
        if subword_idx not in word_label_map:
            word_label_map[subword_idx] = label_id

    # Now create a new list of tokens where ERROR tokens become [MASK]
    masked_tokens = []
    for i, tok in enumerate(tokens):
        lbl_id = word_label_map.get(i, 0)  # default to 0=OK
        if lbl_id == label2id["ERROR"]:
            masked_tokens.append("[MASK]")
        else:
            masked_tokens.append(tok)

    return masked_tokens

test_sentence = "நான் புத்தகம் வாசிக்கிறோம்"  # example incorrect
masked_toks = detect_errors_and_mask(test_sentence)
print("Masked tokens:", masked_toks)
# e.g., might see something like ["நான்", "புத்தகம்", "[MASK]"]


Masked tokens: ['[MASK]']


## Error Correction (Mask-Filling)

In [29]:
from transformers import AutoModelForMaskedLM

mask_model_name = "bert-base-multilingual-cased"  # or a Tamil-specific ML model
mask_model = AutoModelForMaskedLM.from_pretrained(mask_model_name).eval()
mask_tokenizer = AutoTokenizer.from_pretrained(mask_model_name)

def fill_masks(masked_tokens):
    """
    For each [MASK], let the model propose a top candidate.
    We'll do a simple single-mask approach for each error token.
    If multiple errors exist in the same sentence, we can handle them one by one or all at once.
    """
    final_tokens = masked_tokens[:]
    for i, tok in enumerate(final_tokens):
        if tok == "[MASK]":
            # Construct sentence
            masked_sentence = " ".join(final_tokens)
            inputs = mask_tokenizer(masked_sentence, return_tensors="pt")
            with torch.no_grad():
                outputs = mask_model(**inputs)
            logits = outputs.logits
            # Which index is [MASK] in subword space?
            mask_idx = (inputs["input_ids"] == mask_tokenizer.mask_token_id).nonzero(as_tuple=True)
            # For simplicity, assume only 1 mask at a time
            mask_idx = mask_idx[1].item()  # the position in sequence
            # Extract logits for that position
            mask_logits = logits[0, mask_idx, :]
            top_token_id = mask_logits.argmax(dim=-1).item()
            predicted_token = mask_tokenizer.decode([top_token_id]).strip()

            # Replace [MASK] with predicted token
            final_tokens[i] = predicted_token

    return " ".join(final_tokens)

corrected_sentence = fill_masks(masked_toks)
print("Final corrected sentence:", corrected_sentence)


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with anot

Final corrected sentence: .


# **Deep Learning approach - mt5/small**

## Load the dataset

In [3]:
import pandas as pd

dataset_path = '/content/drive/MyDrive/EC9640 - AI Project/tamil_grammar_dataset_200.csv'

# 1. Load the data
df = pd.read_csv(dataset_path)  # your CSV file

df.head()

Unnamed: 0,id,error_type,grammatical_error_sentence,correct_sentence
0,1,Error1,நான் புத்தகம் வாசிக்கிறோம்,நான் புத்தகம் வாசிக்கிறேன்
1,2,Error1,நீ உணவு சாப்பிடுகிறேன்,நீ உணவு சாப்பிடுகிறாய்
2,3,Error1,அவன் பந்து விளையாடுகிறோம்,அவன் பந்து விளையாடுகிறான்
3,4,Error1,அவள் பாடல் எழுதுகிறார்கள்,அவள் பாடல் எழுதுகிறாள்
4,5,Error1,நாங்கள் இசை கேட்கிறேன்,நாங்கள் இசை கேட்கிறோம்


## Split the dataset

In [5]:
from sklearn.model_selection import train_test_split

# 2. Split into train/val/test
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=42)

print("Train dataset: ",len(train_df))
print("Validation dataset: ",len(val_df))
print("Test dataset: ",len(test_df))


Train dataset:  144
Validation dataset:  26
Test dataset:  30


## Create a Hugging Face Dataset

In [7]:
from datasets import Dataset

def build_dataset(df, prefix="grammar correction: "):
    # Each row is: incorrect_sentence, correct_sentence
    data = {
        "input_text": [prefix + row["grammatical_error_sentence"] for _, row in df.iterrows()],
        "target_text": [row["correct_sentence"] for _, row in df.iterrows()]
    }
    return Dataset.from_dict(data)

train_dataset = build_dataset(train_df)
val_dataset = build_dataset(val_df)
test_dataset = build_dataset(test_df)


## Fine-tuning the Model

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
# 1. Tokenize function
def tokenize_function(batch):
    # batch["input_text"] is the incorrect sentence with prefix
    # batch["target_text"] is the correct sentence
    inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=64)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=64)
    # Return input_ids, attention_masks, and labels
    inputs["labels"] = labels["input_ids"]
    return inputs

train_dataset_tokenized = train_dataset.map(tokenize_function, batched=True)
val_dataset_tokenized = val_dataset.map(tokenize_function, batched=True)
test_dataset_tokenized = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/144 [00:00<?, ? examples/s]



Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [10]:
# 2. Data collator for seq2seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [11]:
# 3. Training arguments
training_args = TrainingArguments(
    output_dir="my_mt5_tamil_correction",
    evaluation_strategy="steps",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    optim="adamw_torch"
)



In [12]:
# 4. Define metrics (optional)
def compute_metrics(eval_pred):
    # Typically, you'd decode model outputs and compare with references
    # For grammar correction, you might compute BLEU, ROUGE, or WER.
    # Here, we'll just do a placeholder
    return {}


In [13]:
# 5. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=val_dataset_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [14]:

# 6. Train!
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss


TrainOutput(global_step=90, training_loss=39.350819227430556, metrics={'train_runtime': 2752.3528, 'train_samples_per_second': 0.262, 'train_steps_per_second': 0.033, 'total_flos': 47587472179200.0, 'train_loss': 39.350819227430556, 'epoch': 5.0})

In [15]:
# 7. Evaluate on test set
trainer.evaluate(test_dataset_tokenized)


{'eval_loss': 22.154544830322266,
 'eval_runtime': 22.7862,
 'eval_samples_per_second': 1.317,
 'eval_steps_per_second': 0.176,
 'epoch': 5.0}

In [18]:
model.save_pretrained("my_finetuned_mt5_tamil_grammar")
tokenizer.save_pretrained("my_finetuned_mt5_tamil_grammar")

# Then load your fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained("my_finetuned_mt5_tamil_grammar")
tokenizer = AutoTokenizer.from_pretrained("my_finetuned_mt5_tamil_grammar")

def correct_tamil_sentence(sentence):
    prefix = "grammar correction: "
    inputs = tokenizer(prefix + sentence, return_tensors="pt")
    output_ids = model.generate(**inputs, max_length=64, num_beams=4, early_stopping=True)
    corrected_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return corrected_text


# Example usage:
# incorrect = "நான் புத்தகம் வாசிக்கிறோம்"
incorrect = input("Enter the incorrect tamil sentence:\n")
corrected = correct_tamil_sentence(incorrect)
print("Original: ", incorrect)
print("Corrected:", corrected)


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Enter the incorrect tamil sentence:
நான் புத்தகம் வாசிக்கிறோம்
Original:  நான் புத்தகம் வாசிக்கிறோம்
Corrected: <extra_id_0> 
