In [None]:
# Install PyTorch with GPU support (CUDA 11.8 version)
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

import torch
print("CUDA Available:", torch.cuda.is_available())  # Should be True
print("GPU:", torch.cuda.get_device_name(0))  # Should print Quadro RTX 4000

In [None]:
# Install (or upgrade, via -U) the libraries used by this notebook:
# transformers with its torch extras, then the dataset / training / metric /
# logging packages listed on the second line.
%pip install -U "transformers[torch]"
%pip install -U datasets accelerate evaluate sacrebleu sentencepiece protobuf rouge_score bert_score tensorboard

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, DataCollatorForSeq2Seq
from datasets import load_dataset

In [None]:
# Tokenizer-related packages. sentencepiece and protobuf are also requested by
# the earlier dependency-install cell, so only tiktoken is new here.
%pip install sentencepiece protobuf
%pip install tiktoken

In [None]:
# Pretrained multilingual mBART-50 checkpoint used as the starting point.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Language codes the tokenizer attaches to inputs (Tamil) and labels (Sinhala).
tokenizer.src_lang = "ta_IN"
tokenizer.tgt_lang = "si_LK"

# For multilingual mBART-50 the target-language code must be forced as the
# first generated token; otherwise generate() (used during evaluation with
# predict_with_generate) is free to start decoding in a different language.
model.generation_config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(
    tokenizer.tgt_lang
)

In [None]:
from datasets import load_dataset

# Read the tab-separated train/validation files through the "csv" loader.
# column_names assigns the names "source" and "target" to the two columns
# (only needed if the files don't have a header row).
dataset = load_dataset(
    "csv",
    data_files=dict(
        train="data/train-new.tsv",
        validation="data/val-new.tsv",
    ),
    delimiter="\t",
    column_names=["source", "target"],
)

# Show the second row (index 1) of each split as a quick sanity check.
train_sample = dataset["train"][1]
validation_sample = dataset["validation"][1]
print("Train Sample:", train_sample)
print("Validation Sample:", validation_sample)

In [None]:
# Remove any existing TensorFlow install and install it again from scratch.
# NOTE(review): no visible cell imports tensorflow directly - presumably this
# works around a broken install picked up by a dependency; confirm it is still needed.
%pip uninstall tensorflow -y
%pip install tensorflow

In [None]:
# Preprocess the dataset
def preprocess_function(examples, max_length=128):
    """Tokenize one batch of sentence pairs for seq2seq training.

    Args:
        examples: batch mapping with a "source" column (Tamil sentences) and a
            "target" column (Sinhala sentences), as passed by ``dataset.map``
            with ``batched=True``.
        max_length: maximum number of tokens kept per sentence on either side;
            longer sequences are truncated.

    Returns:
        The tokenizer output for the source sentences, with the token ids of
        the target sentences stored under the "labels" key.
    """
    # text_target tokenizes the targets in target-language mode and stores
    # their ids under "labels"; it replaces the deprecated
    # tokenizer.as_target_tokenizer() context manager.
    return tokenizer(
        list(examples["source"]),
        text_target=list(examples["target"]),
        max_length=max_length,
        truncation=True,
    )


# Apply preprocessing
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
# Training and metric dependencies. transformers, accelerate, sacrebleu and
# evaluate are also requested by the earlier dependency-install cell; the only
# addition here is the explicit minimum version for accelerate.
%pip install -U "transformers[torch]"
%pip install -U "accelerate>=0.26.0"

%pip install sacrebleu
%pip install evaluate

In [None]:
from transformers import TrainerCallback
import os


class SavePerEpochCallback(TrainerCallback):
    """Trainer callback that writes a model + tokenizer snapshot after each epoch.

    Snapshots are written to ``<args.output_dir>/epoch_<N>_model`` where ``N``
    is ``int(state.epoch)``.
    """

    def __init__(self, tokenizer):
        # Stored so every snapshot directory also receives the tokenizer files.
        self.tokenizer = tokenizer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Save ``kwargs["model"]`` and the stored tokenizer, then return ``control``."""
        epoch_number = int(state.epoch)
        snapshot_dir = os.path.join(args.output_dir, f"epoch_{epoch_number}_model")
        os.makedirs(snapshot_dir, exist_ok=True)

        current_model = kwargs["model"]
        current_model.save_pretrained(snapshot_dir)
        self.tokenizer.save_pretrained(snapshot_dir)
        return control


In [None]:
! pip install rouge_score

! pip install bert_score


In [None]:
from evaluate import load
import numpy as np
import torch


# Load each metric once; compute_metrics reuses these module-level objects.
bleu = load("sacrebleu")
rouge = load("rouge")
chrf = load("chrf")
# NOTE: despite the variable name, this object is the "bertscore" metric.
bart_score = load("bertscore")  # No direct "bartscore", use bertscore or integrate external lib


def postprocess_text(preds, labels):
    """Trim surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        ``(clean_preds, wrapped_labels)`` where ``clean_preds`` is a list of
        stripped strings and ``wrapped_labels`` is a list of one-element lists,
        each holding a stripped reference.
    """
    clean_preds = []
    for text in preds:
        clean_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return clean_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Score generated translations against their references.

    Args:
        eval_preds: pair ``(predictions, label_ids)`` of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        dict with the keys "bleu", "rougeL", "chrf", "exact_match",
        "token_accuracy" and "bertscore_f1".
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be swapped for the pad token before decoding. This applies to the
    # predictions as well as the labels.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # postprocess_text wraps each reference in a list (the format sacrebleu and
    # chrf are given); ROUGE and BERTScore get one plain string per prediction.
    flat_labels = [refs[0] for refs in decoded_labels]

    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    # The default ROUGE tokenizer keeps only ASCII letters and digits, which
    # removes all Sinhala text; split on whitespace instead.
    rouge_result = rouge.compute(
        predictions=decoded_preds,
        references=flat_labels,
        tokenizer=lambda text: text.split(),
    )
    chrf_result = chrf.compute(predictions=decoded_preds, references=decoded_labels)
    bertscore_result = bart_score.compute(
        predictions=decoded_preds, references=flat_labels, lang="si"
    )

    # Exact Match: share of predictions identical to their reference.
    if decoded_preds:
        em = float(np.mean([p == r for p, r in zip(decoded_preds, flat_labels)]))
    else:
        em = 0.0

    # Token Accuracy: position-wise agreement of whitespace tokens, normalised
    # by the total number of reference tokens.
    total = correct = 0
    for pred, ref in zip(decoded_preds, flat_labels):
        ref_tokens = ref.split()
        total += len(ref_tokens)
        correct += sum(p == r for p, r in zip(pred.split(), ref_tokens))
    token_acc = correct / total if total > 0 else 0.0

    return {
        "bleu": bleu_result["score"],
        "rougeL": rouge_result["rougeL"],
        "chrf": chrf_result["score"],
        "exact_match": em,
        "token_accuracy": token_acc,
        "bertscore_f1": float(np.mean(bertscore_result["f1"])),
    }


In [None]:
%pip install transformers[torch]
%pip install --upgrade accelerate
%pip install --upgrade "accelerate>=0.26.0"

In [None]:
import torch
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir="./outputs_ta_si",
    overwrite_output_dir=True,

    # Evaluate and checkpoint once per epoch. No save_steps is given because a
    # step interval is not used when saving per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,

    # After training, reload the checkpoint with the highest BLEU
    # (the "bleu" key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,

    logging_dir="./logs",
    logging_steps=5,
    report_to="tensorboard",

    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=500,
    lr_scheduler_type="linear",
    optim="adafactor",
    weight_decay=0.01,
    max_grad_norm=1.0,

    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Run generate() during evaluation so text metrics can be computed.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the notebook still runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
)


In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, DataCollatorForSeq2Seq
from transformers import EarlyStoppingCallback

import inspect

# Data collator for padding and batching
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Choose whichever keyword the
# installed Seq2SeqTrainer accepts so the cell works on both.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # SavePerEpochCallback writes a snapshot after every epoch;
    # EarlyStoppingCallback is configured with a patience of 2.
    callbacks=[SavePerEpochCallback(tokenizer), EarlyStoppingCallback(early_stopping_patience=2)],
    **{_tokenizer_kwarg: tokenizer},
)


In [None]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
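# To continue an interrupted run instead of starting over, uncomment the line
# below and point it at an existing checkpoint directory (the path shown is an
# example and differs from the output_dir configured in this notebook).
# trainer.train(resume_from_checkpoint="2.results_ta_si/checkpoint-1191")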

In [None]:
# Write the final model and the tokenizer files into the same directory.
final_model_dir = "./final_model_ta_si"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)