In [1]:
!pip install --upgrade transformers
!pip install transformers indic-nlp-library sentencepiece datasets

# Import Necessary Libraries
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
import numpy as np
import wandb



In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


In [3]:
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use AutoModelForMaskedLM for masked language models like Albert
from transformers import AutoModelForMaskedLM # Import the correct model class
model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Prepare Dataset
# Dummy dataset with Tamil sentences
data = {
    "input": [
        "அவள் சென்றான் பள்ளி.",
        "நான் சாப்பிட்டான்.",
        "அவள் செரியான்.",
        "அவர்கள் போகும் சந்தேகமா.",
        "புத்தகம் அவன் எடுத்தேன்."
    ],
    "target": [
        "அவள் சென்றாள் பள்ளி.",
        "நான் சாப்பிட்டேன்.",
        "அவள் சரியானா.",
        "அவர்கள் போகும் சந்தேகம்.",
        "புத்தகம் அவன் எடுத்தான்."
    ]
}

In [5]:
dataset = Dataset.from_dict(data)
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']


In [6]:
# Preprocess Data for the Model
def preprocess_function(batch):
    inputs = tokenizer(batch["input"], max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    targets = tokenizer(batch["target"], max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    return {
        "input_ids": inputs["input_ids"].squeeze(),
        "attention_mask": inputs["attention_mask"].squeeze(),
        "labels": targets["input_ids"].squeeze(), # Renamed to 'labels' for clarity
        "decoder_input_ids": targets["input_ids"].squeeze()  # Add decoder_input_ids
    }

train_dataset = train_dataset.map(preprocess_function)
test_dataset = test_dataset.map(preprocess_function)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [7]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch", # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",  # Save at the end of each epoch to match evaluation
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=10,
)



In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [9]:
wandb.init(project="AI_Grammar_checker_DL_Technique")
# Step 8: Train the Model
trainer.train()

# Step 9: Evaluate the Model
results = trainer.evaluate()
print(f"Evaluation Results: {results}")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mskugasaran[0m ([33muniversityofsrilanka[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,No log,18.838339
2,No log,17.539721
3,No log,16.52067
4,No log,15.82538
5,No log,15.471945


There were missing keys in the checkpoint model loaded: ['predictions.decoder.weight', 'predictions.decoder.bias'].


Evaluation Results: {'eval_loss': 15.471944808959961, 'eval_runtime': 1.0131, 'eval_samples_per_second': 0.987, 'eval_steps_per_second': 0.987, 'epoch': 5.0}


In [13]:
# Step 10: Test Grammar Correction
def grammar_correction(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", max_length=128, truncation=True, padding="max_length")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Remove use_cache=False or set it to True if supported
    outputs = model.generate(**inputs)
    corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_sentence

In [19]:
def grammar_correction(sentence):
    # Tokenize the input
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Pass the inputs to the model
    try:
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Predict the corrected tokens
        predicted_token_ids = logits.argmax(dim=-1)
        corrected_sentence = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)
        return corrected_sentence
    except Exception as e:
        print(f"Error in grammar_correction: {e}")
        return None  # Return None in case of errors


In [20]:
# Test the grammar correction function
test_sentences = [
    "அவள் சென்றான் பள்ளி.",  # Incorrect grammar
    "நான் சாப்பிட்டான்.",   # Incorrect grammar
    "அவள் செரியான்."       # Incorrect grammar
]

# Iterate through test sentences
for sentence in test_sentences:
    print(f"Input: {sentence}")
    # Correct the grammar using the grammar_correction function
    corrected = grammar_correction(sentence)
    print(f"Corrected: {corrected}\n")  # Print the corrected sentence


Input: அவள் சென்றான் பள்ளி.
Corrected: . அவள எனகு பளள. .

Input: நான் சாப்பிட்டான்.
Corrected: , நனபபடடன. ,

Input: அவள் செரியான்.
Corrected: . அவள சரயன. .



In [21]:
# Step 11: Save the Fine-Tuned Model
model.save_pretrained("./tamil_grammar_corrector")
tokenizer.save_pretrained("./tamil_grammar_corrector")

('./tamil_grammar_corrector/tokenizer_config.json',
 './tamil_grammar_corrector/special_tokens_map.json',
 './tamil_grammar_corrector/spiece.model',
 './tamil_grammar_corrector/added_tokens.json',
 './tamil_grammar_corrector/tokenizer.json')