In [None]:
# Core stack: Hugging Face transformers + datasets, PyTorch, and SentencePiece
# (presumably required by the MT5Tokenizer loaded further down — TODO confirm).
%pip install transformers torch datasets sentencepiece


In [None]:
# NOTE(review): protobuf is presumably needed by the SentencePiece-based
# tokenizer; confirm it is still required before removing this cell.
%pip install protobuf

In [None]:
# Upgrade transformers (with its torch extras) to the newest release.
# The version is unpinned, so the installed API depends on when this cell is run.
%pip install -U "transformers[torch]"

In [None]:
# accelerate >= 0.26.0 — NOTE(review): presumably the minimum version the
# Trainer used below requires; confirm against the installed transformers.
%pip install -U "accelerate>=0.26.0"


In [None]:
# Evaluation dependencies: the `evaluate` library plus the backends for the
# sacrebleu / rouge / bertscore metrics loaded later, and TensorBoard for the
# report_to="tensorboard" logging configured in the training arguments.
%pip install sacrebleu evaluate rouge_score bert_score tensorboard

In [None]:
# ----------------------
# Model Loading (Updated to mT5)
# ----------------------
# `evaluate` is already installed by the `%pip install sacrebleu evaluate ...`
# cell above, so the redundant `!pip install evaluate` that used to sit here
# has been dropped (`!pip` may also target a different environment than the kernel).
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
import numpy as np
import evaluate
import os

# Load the mT5-small tokenizer. The model weights themselves are loaded later,
# in the Trainer cell.
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)

# mT5 has no source/target language-code mechanism (that is an mBART-style
# feature), so nothing like `src_lang` / `tgt_lang` is set on the tokenizer:
# such attributes would simply be ignored. The translation direction is
# conveyed by the "translate Tamil to Sinhala: " prefix added in preprocessing.

In [None]:
# ----------------------
# Data Loading (Unchanged)
# ----------------------
# Tab-separated files with two columns, named "source" and "target" here.
# NOTE(review): passing `column_names` presumably means the files have no
# header row — confirm, otherwise the header ends up as the first example.
dataset = load_dataset(
    "csv",
    data_files={
        "train": "/kaggle/input/mt5-data/train.tsv",
        # "validation": "/kaggle/input/mt5-data/val.tsv",
        "test": "/kaggle/input/mt5-data/test.tsv",
    },
    delimiter="\t",
    column_names=["source", "target"],
)

# Example usage: show the second row of every split that was actually loaded.
# Iterating over the loaded splits (instead of hard-coding their names) avoids
# a KeyError for "validation", which is commented out above.
for split_name, split in dataset.items():
    print(f"{split_name.capitalize()} Sample:", split[1])

In [None]:
# ----------------------
# Preprocessing (Updated for mT5)
# ----------------------
def preprocess_function(examples):
    """Tokenize one batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with "source" and "target" entries, each an
            iterable of strings.

    Returns:
        The tokenizer output for the prefixed source sentences, with an extra
        "labels" entry holding the target token ids in which every padding
        position has been replaced by -100.
    """
    # mT5 doesn't need a forced prefix like T5, but it is kept for consistency.
    prompts = []
    for sentence in examples["source"]:
        prompts.append("translate Tamil to Sinhala: " + sentence)
    references = list(examples["target"])

    # Both sides are truncated/padded to the same fixed length.
    shared_kwargs = dict(max_length=128, truncation=True, padding="max_length")
    model_inputs = tokenizer(prompts, **shared_kwargs)
    # `text_target=` replaces the older `as_target_tokenizer()` context manager.
    target_encoding = tokenizer(text_target=references, **shared_kwargs)

    # Swap padding ids for -100 in the label sequences.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for token_ids in target_encoding["input_ids"]:
        masked_labels.append([-100 if tok == pad_id else tok for tok in token_ids])

    model_inputs["labels"] = masked_labels
    return model_inputs

# Apply preprocess_function to every loaded split; batched=True means the
# function receives whole batches (column name -> list of values) per call.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
from evaluate import load
import numpy as np
import torch

# Load every evaluation metric once, up front, so compute_metrics can reuse them.
# NOTE: `bart_score` actually holds the "bertscore" metric (there is no direct
# "bartscore"; use bertscore or integrate an external lib). The name is kept
# because compute_metrics refers to it.
bleu, rouge, chrf, bart_score = [
    load(metric_id) for metric_id in ("sacrebleu", "rouge", "chrf", "bertscore")
]

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, each holding
        one stripped reference.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute translation-quality metrics for one evaluation pass.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. ``predictions`` is an
            array of generated token ids, or a tuple whose first element is
            that array; ``label_ids`` is an array of reference token ids.
            Either array may contain -100 as a padding marker.

    Returns:
        Dict with "bleu", "rougeL", "chrf", "exact_match", "token_accuracy"
        and "bertscore_f1" entries.
    """
    preds, labels = eval_preds
    # Predictions can arrive wrapped in a tuple; the token ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a real token id, and cannot be
    # decoded. Replace it in BOTH arrays (predictions can be padded with it
    # too, not just labels).
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # After this call each reference is wrapped in a one-element list.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    flat_labels = [refs[0] for refs in decoded_labels]

    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = rouge.compute(predictions=decoded_preds, references=flat_labels)
    chrf_result = chrf.compute(predictions=decoded_preds, references=decoded_labels)
    bertscore_result = bart_score.compute(predictions=decoded_preds, references=flat_labels, lang="si")

    # Exact Match: share of predictions identical to their reference.
    # Guarded so an empty evaluation set yields 0.0 instead of NaN.
    if decoded_preds:
        em = float(np.mean([p == ref for p, ref in zip(decoded_preds, flat_labels)]))
    else:
        em = 0.0

    # Token Accuracy: whitespace tokens compared position by position,
    # normalised by the number of reference tokens.
    total = correct = 0
    for pred, ref in zip(decoded_preds, flat_labels):
        pred_tokens = pred.split()
        label_tokens = ref.split()
        total += len(label_tokens)
        correct += sum(p == l for p, l in zip(pred_tokens, label_tokens))
    token_acc = correct / total if total > 0 else 0

    return {
        "bleu": bleu_result["score"],
        "rougeL": rouge_result["rougeL"],
        "chrf": chrf_result["score"],
        "exact_match": em,
        "token_accuracy": token_acc,
        "bertscore_f1": float(np.mean(bertscore_result["f1"])),
    }


In [None]:
# ----------------------
# Training Setup
# ----------------------
training_args = Seq2SeqTrainingArguments(
    output_dir="./mT5-results_ta_si",
    eval_strategy="epoch",
    # NOTE(review): a checkpoint every 10 steps is very frequent; presumably
    # deliberate so interrupted Kaggle sessions can resume — confirm.
    save_strategy="steps",
    save_steps=10,
    save_total_limit=3,
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=1e-4,
    per_device_train_batch_size=4,  # Reduced for mT5-small's larger memory footprint
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=25,
    fp16=True,
    warmup_steps=1000,
    lr_scheduler_type="cosine",
    gradient_accumulation_steps=2,
    optim="adafactor",
    report_to="tensorboard",
    metric_for_best_model="bleu",
    greater_is_better=True,
    predict_with_generate=True,
    # Without this, evaluation-time generation uses the model's default
    # max_length, which is far shorter than the 128-token labels, so
    # predictions get cut off and every metric is under-reported.
    # 128 matches the max_length used when tokenizing the targets.
    generation_max_length=128,
)


In [None]:
# ----------------------
# Trainer
# ----------------------
# Uses `training_args`, `tokenized_datasets`, `tokenizer` and `compute_metrics`
# defined in the earlier cells.
from transformers import MT5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainer

model_name = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_name)
# The collator is given the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # No validation split is loaded, so per-epoch evaluation runs on the test split.
    eval_dataset=tokenized_datasets["test"],
    # `processing_class` replaces the deprecated `tokenizer=` argument
    # (requires transformers >= 4.46; the notebook installs the latest release).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Resume from the previously exported checkpoint when it is attached as a
# Kaggle input; otherwise start a fresh run instead of failing on the missing
# path (keeps Restart & Run All working on a clean environment).
resume_checkpoint = "/kaggle/input/last-checkpoint-3"
if not os.path.isdir(resume_checkpoint):
    print(f"Checkpoint {resume_checkpoint!r} not found; training from scratch.")
    resume_checkpoint = None

trainer.train(resume_from_checkpoint=resume_checkpoint)

In [None]:
import os
import shutil
from IPython.display import FileLink

output_dir = "./mT5-results_ta_si"

# Collect (step, folder name) for every "checkpoint-<step>" directory.
# Entries that merely start with "checkpoint" but have no numeric suffix, or
# that are not directories, are skipped instead of crashing the int() conversion.
checkpoints = []
for entry in os.listdir(output_dir):
    prefix, sep, step = entry.rpartition("-")
    is_checkpoint_dir = (
        prefix == "checkpoint"
        and step.isdigit()
        and os.path.isdir(os.path.join(output_dir, entry))
    )
    if is_checkpoint_dir:
        checkpoints.append((int(step), entry))

# Fail with an explicit message rather than an IndexError on an empty list.
if not checkpoints:
    raise FileNotFoundError(f"No checkpoint-<step> folders found in {output_dir!r}")

# The highest step number is the most recent checkpoint.
latest_checkpoint = max(checkpoints)[1]
checkpoint_folder = os.path.join(output_dir, latest_checkpoint)

# Zip the latest checkpoint folder
shutil.make_archive("last_checkpoint", 'zip', checkpoint_folder)

# Provide download link
FileLink("last_checkpoint.zip")


In [None]:
import torch

# Export the trained model twice: once in Hugging Face format and once as a
# plain PyTorch state dict.
model_path = "./mT5-final-model"

# Hugging Face format, written through the trainer.
trainer.save_model(model_path)

# Raw PyTorch weights, stored inside the same directory (if needed separately).
weights_file = f"{model_path}/mt5_final_model.pt"
torch.save(model.state_dict(), weights_file)

In [None]:
# Render a download link for the raw PyTorch weights file written by the
# previous cell.
FileLink("./mT5-final-model/mt5_final_model.pt")