In [None]:
!pip install transformers datasets sentencepiece sacrebleu accelerate gdown

    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
#mBART3 with BASE
import pandas as pd
import torch
import transformers
from datasets import Dataset, DatasetDict
from transformers import MBart50TokenizerFast, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import logging
import csv
import os
from datetime import datetime

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Set up logging to file
log_file = os.path.join("./mbart_finetuned3", "training_logs.csv")
os.makedirs("./mbart_finetuned3", exist_ok=True)

# Initialize CSV log file with headers
with open(log_file, mode='w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["epoch", "step", "training_loss", "validation_loss", "learning_rate", "timestamp"])

# Custom callback for logging
class CustomLoggingCallback(transformers.TrainerCallback):
    """Trainer callback that appends one CSV row for every ``on_log`` event.

    Each row holds the current epoch, the global step, the ``loss``,
    ``eval_loss`` and ``learning_rate`` entries of the logged dict (empty when
    an entry is absent) and a wall-clock timestamp.
    """

    def __init__(self, log_file):
        # Path of the CSV file that rows are appended to.
        self.log_file = log_file

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append the metrics found in ``logs`` to the CSV file.

        Does nothing when ``logs`` is None.
        """
        if logs is None:
            return

        # Column order must match the header row written when the file is created.
        row = [
            state.epoch,
            state.global_step,
            logs.get("loss"),
            logs.get("eval_loss"),
            logs.get("learning_rate"),
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        ]

        with open(self.log_file, mode='a', newline='') as handle:
            csv.writer(handle).writerow(row)

# Load DataFrame
df = pd.read_csv('./merged_output.csv')
df1 = df.copy()


# Verify DataFrame
print("DataFrame shape:", df1.shape)
print("Sample data:\n", df1.head(5))

# Rename columns
df1 = df1.rename(columns={"Tamil": "ta", "Telugu": "te"})

# Convert to Dataset
dataset = Dataset.from_pandas(df1)
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
split_dataset = DatasetDict({"train": split_dataset["train"], "test": split_dataset["test"]})
print("Train dataset size:", len(split_dataset["train"]))
print("Test dataset size:", len(split_dataset["test"]))

# Load mBART model and tokenizer
MBART_MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_MODEL_NAME, src_lang="ta_IN", tgt_lang="te_IN")
mbart_model = AutoModelForSeq2SeqLM.from_pretrained(MBART_MODEL_NAME).to(device)

# Verify vocab sizes
mbart_vocab_size_tokenizer = len(mbart_tokenizer)
mbart_vocab_size_model = mbart_model.get_output_embeddings().weight.size(0)
print("mBART - Initial tokenizer vocab size:", mbart_vocab_size_tokenizer)
print("mBART - Initial model output vocab size:", mbart_vocab_size_model)

# Handle vocab size mismatch
if mbart_vocab_size_tokenizer != mbart_vocab_size_model:
    print(f"Warning: mBART vocab size mismatch (Tokenizer: {mbart_vocab_size_tokenizer}, Model: {mbart_vocab_size_model}). Adjusting model embeddings.")
    mbart_model.resize_token_embeddings(mbart_vocab_size_tokenizer)
    print("Post-resize model vocab size:", mbart_model.get_output_embeddings().weight.size(0))
else:
    print("mBART - Vocab sizes match, no adjustment needed.")

# Preprocessing function
def mbart_preprocess_function(examples):
    """Tokenize a batch of Tamil/Telugu sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "ta" entry (source sentences) and a
            "te" entry (target sentences), as passed by a batched ``map``.

    Returns:
        The tokenizer output for the sources with an added "labels" entry
        holding the target token ids, where padding positions are replaced
        by -100.
    """
    # `text_target` tokenizes the targets in target-language mode within the
    # same call; it replaces the deprecated `as_target_tokenizer()` context
    # manager and stores the target ids under "labels".
    model_inputs = mbart_tokenizer(
        list(examples["ta"]),
        text_target=list(examples["te"]),
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Mask padding in the labels with -100 so padded positions are expected to
    # be ignored by the loss.
    pad_id = mbart_tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in seq]
        for seq in model_inputs["labels"]
    ]
    return model_inputs

# Apply preprocessing
mbart_tokenized_datasets = split_dataset.map(
    mbart_preprocess_function,
    batched=True,
    batch_size=1000,
    remove_columns=["ta", "te"]
)
print("mBART - Tokenized train sample:", mbart_tokenized_datasets["train"][0])

# Training arguments
mbart_training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart_finetuned3",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    report_to="none",
    push_to_hub=False,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    logging_steps=500,
    save_steps=5000
)

# Data collator and trainer
mbart_data_collator = DataCollatorForSeq2Seq(mbart_tokenizer, model=mbart_model)
mbart_trainer = Seq2SeqTrainer(
    model=mbart_model,
    args=mbart_training_args,
    train_dataset=mbart_tokenized_datasets["train"],
    eval_dataset=mbart_tokenized_datasets["test"],
    tokenizer=mbart_tokenizer,
    data_collator=mbart_data_collator,
    callbacks=[CustomLoggingCallback(log_file)]
)

# Train
print("Training mBART...")
mbart_trainer.train()

# Save
mbart_trainer.save_model("./mbart_finetuned3")
mbart_tokenizer.save_pretrained("./mbart_finetuned3")

# Verify saved model
mbart_saved_model = AutoModelForSeq2SeqLM.from_pretrained("./mbart_finetuned3").to(device)
mbart_saved_tokenizer = MBart50TokenizerFast.from_pretrained("./mbart_finetuned3", src_lang="ta_IN", tgt_lang="te_IN")
print("mBART - Saved tokenizer vocab size:", len(mbart_saved_tokenizer))
print("mBART - Saved model output vocab size:", mbart_saved_model.get_output_embeddings().weight.size(0))

# Test translation with debugging
def mbart_translate_text(input_text, debug=False):
    """Translate ``input_text`` with the reloaded fine-tuned model.

    Args:
        input_text: Text to translate (tokenized with truncation at 128 tokens).
        debug: When True, print the input ids, the raw output ids and the
            decoded output including special tokens.

    Returns:
        The first generated sequence, decoded without special tokens and
        stripped of surrounding whitespace.
    """
    encoded = mbart_saved_tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True,
    )
    encoded = encoded.to(device)
    if debug:
        print("Tokenized Input IDs:", encoded["input_ids"].tolist())

    # Beam-search settings; the "te_IN" language id is passed as the forced
    # first token of the generated sequence.
    generation_options = {
        "max_length": 256,
        "min_length": 10,
        "num_beams": 5,
        "early_stopping": False,
        "length_penalty": 1.0,
        "no_repeat_ngram_size": 2,
        "forced_bos_token_id": mbart_saved_tokenizer.lang_code_to_id["te_IN"],
    }
    generated = mbart_saved_model.generate(**encoded, **generation_options)
    best_sequence = generated[0]

    if debug:
        print("Raw Output IDs:", best_sequence.tolist())
        print(
            "Decoded with special tokens:",
            mbart_saved_tokenizer.decode(best_sequence, skip_special_tokens=False),
        )

    translation = mbart_saved_tokenizer.decode(best_sequence, skip_special_tokens=True)
    return translation.strip()

# Test
input_text = "வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?"  # "Hello, how are you?"
translated_text = mbart_translate_text(input_text, debug=True)
print("mBART Translation:", translated_text)

# Plotting script for training logs
import matplotlib.pyplot as plt

def plot_training_logs(log_file, output_dir="./mbart_finetuned3"):
    """Plot training and validation loss from a CSV log and save it as a PNG.

    Args:
        log_file: Path to a CSV file with ``step``, ``training_loss`` and
            ``validation_loss`` columns.
        output_dir: Directory that receives ``training_loss_plot.png``. It is
            created if missing; the default keeps the original location.
    """
    logs = pd.read_csv(log_file)

    # Rows carry either a training or a validation loss, so plot each series
    # from the rows where its own column is populated.
    train_logs = logs[logs['training_loss'].notnull()]
    valid_logs = logs[logs['validation_loss'].notnull()]

    fig = plt.figure(figsize=(10, 6))
    try:
        if not train_logs.empty:
            plt.plot(train_logs['step'], train_logs['training_loss'], label='Training Loss', marker='o')
        if not valid_logs.empty:
            plt.plot(valid_logs['step'], valid_logs['validation_loss'], label='Validation Loss', marker='s')

        plt.xlabel('Step')
        plt.ylabel('Loss')
        plt.title('Training and Validation Loss Over Time')
        # A legend without any labelled series only triggers a warning.
        if not (train_logs.empty and valid_logs.empty):
            plt.legend()
        plt.grid(True)

        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(os.path.join(output_dir, "training_loss_plot.png"))
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

# Generate plot
plot_training_logs(log_file)
print("Training log plot saved as 'training_loss_plot.png' in the output directory.")

2025-04-26 10:57:11.510466: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-26 10:57:11.523131: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745645231.537519    9107 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745645231.542050    9107 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745645231.553443    9107 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Using device: cuda
DataFrame shape: (547567, 2)
Sample data:
                                                Tamil  \
0     அவள் பெயர் கூட அவளுக்கு ஒன்றும் நினைவில் இல்லை   
1  சமைப்பது வேகமானது இதன் விளைவாக ஊட்டச்சத்துக்கள...   
2  நாம் ஏற்கனவே செய்வதை ரசிப்பதைக் கண்டுபிடிப்பதற...   
3  இது ஒரு மேனுவல் அல்லது ஆட்டோமேட்டிக் கியர்பாக்...   
4                           இதுவும் நல்ல முயற்சிதான்   

                                              Telugu  
0               కనీసం ఆమె పేరు కూడా ఆయనకు గుర్తులేదు  
1  వంట వేగంగా ఉంటుంది తద్వారా పోషకాలు మరియు విటమి...  
2  మనం ఇప్పటికే ఆనందించేదాన్ని గుర్తించడానికి బదు...  
3  ఇది మాన్యువల్ లేదా ఆటోమేటిక్ గేర్బాక్స్తో పెట్...  
4                ఇది కూడా మంచి ఉపయోగ కరమైన ప్రయత్నమే  
Train dataset size: 492810
Test dataset size: 54757




mBART - Initial tokenizer vocab size: 250054
mBART - Initial model output vocab size: 250054
mBART - Vocab sizes match, no adjustment needed.


Map:   0%|          | 0/492810 [00:00<?, ? examples/s]



Map:   0%|          | 0/54757 [00:00<?, ? examples/s]

mBART - Tokenized train sample: {'input_ids': [250044, 2690, 3770, 63277, 235753, 8182, 15453, 483, 55963, 86322, 78611, 8285, 6149, 80334, 8182, 131846, 8182, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [250045, 60078, 1296, 483, 6, 136571, 27013, 14206, 4276, 103646, 95432, 8197, 55763, 5271, 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,1.2821,1.240659
2,1.0649,1.125419
3,0.9227,1.088127
4,0.8052,1.072729
5,0.7108,1.076244


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


mBART - Saved tokenizer vocab size: 250054
mBART - Saved model output vocab size: 250054
Tokenized Input IDs: [[250044, 190574, 4, 37305, 29947, 128251, 73952, 32, 2]]
Raw Output IDs: [2, 250045, 9327, 3071, 1886, 89838, 4, 22735, 24722, 91064, 32, 2]
Decoded with special tokens: </s>te_IN నమస్కారం, మీరు ఎలా ఉన్నారు?</s>
mBART Translation: నమస్కారం, మీరు ఎలా ఉన్నారు?
Training log plot saved as 'training_loss_plot.png' in the output directory.


In [None]:
import torch
from transformers import MBart50TokenizerFast, AutoModelForSeq2SeqLM
from IPython.display import display, HTML

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the fine-tuned model and tokenizer
MODEL_PATH = "./mbart_finetuned3"
mbart_saved_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(device)
mbart_saved_tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_PATH, src_lang="ta_IN", tgt_lang="te_IN")

# Translation function
def mbart_translate_text(input_text, debug=False):
    """Translate ``input_text`` using the loaded model and tokenizer.

    Args:
        input_text: Text to translate (tokenized with truncation at 128 tokens).
        debug: When True, print token ids and the decoded output with special
            tokens kept.

    Returns:
        The decoded first generated sequence, without special tokens and
        stripped of surrounding whitespace.
    """
    tokenizer = mbart_saved_tokenizer
    batch = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True,
    ).to(device)
    if debug:
        print("Tokenized Input IDs:", batch["input_ids"].tolist())

    # The "te_IN" language id is supplied as the forced first generated token.
    target_lang_id = tokenizer.lang_code_to_id["te_IN"]
    sequences = mbart_saved_model.generate(
        **batch,
        max_length=256,
        min_length=10,
        num_beams=5,
        early_stopping=False,
        length_penalty=1.0,
        no_repeat_ngram_size=2,
        forced_bos_token_id=target_lang_id,
    )
    top = sequences[0]

    if debug:
        print("Raw Output IDs:", top.tolist())
        print("Decoded with special tokens:", tokenizer.decode(top, skip_special_tokens=False))

    return tokenizer.decode(top, skip_special_tokens=True).strip()

# Interactive translation function for Jupyter
def translate_interactively():
    """Run a prompt loop that translates typed text and renders the result.

    Reads lines with ``input()`` until the user types ``exit``
    (case-insensitive) or interrupts the prompt (Ctrl-C / end of input).
    Blank input is rejected with a message. Translation errors are shown and
    the loop continues.
    """
    # Local import keeps this block self-contained; the name does not clash
    # with IPython's `HTML` display class.
    from html import escape

    display(HTML("<h3>Tamil to Telugu Translator</h3>"))
    print("Enter Tamil text below to translate to Telugu (type 'exit' to stop):")

    while True:
        # Interrupting the prompt is a normal way to leave the loop, so treat
        # it like typing 'exit' instead of surfacing a traceback.
        try:
            user_input = input("Tamil Input: ").strip()
        except (KeyboardInterrupt, EOFError):
            display(HTML("<p style='color: green;'>Exiting translator...</p>"))
            break

        if user_input.lower() == "exit":
            display(HTML("<p style='color: green;'>Exiting translator...</p>"))
            break

        if not user_input:
            display(HTML("<p style='color: red;'>Please enter some text.</p>"))
            continue

        try:
            translated_text = mbart_translate_text(user_input, debug=False)  # Set debug=True for detailed output
        except Exception as e:
            display(HTML(f"<p style='color: red;'>Error during translation: {escape(str(e))}</p>"))
            continue

        # Escape both strings: they are free text and may contain characters
        # such as '<' or '&' that would otherwise be interpreted as markup.
        display(HTML(
            f"<p><b>Tamil:</b> {escape(user_input)}<br>"
            f"<b>Telugu Translation:</b> {escape(translated_text)}</p>"
        ))

# Run the translator
translate_interactively()

Using device: cuda


2025-05-05 15:47:57.253181: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-05 15:47:57.264918: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746440277.278350   83313 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746440277.282536   83313 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746440277.292606   83313 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Enter Tamil text below to translate to Telugu (type 'exit' to stop):


Tamil Input:  அமெரிக்காவில் வட்டி விகிதங்கள் அதிகரித்தால், உணர்வு சந்தையில் தங்க விலைகள் மோசமடைகின்றன.


KeyboardInterrupt: Interrupted by user

In [None]:
# Testing on csv files
# Install required dependencies
!pip install -q torch==2.3.1 torchvision==0.18.1
!pip install -q transformers==4.41.2 datasets==2.20.0
!pip install -q sacrebleu==2.3.1 pandas==2.2.2 numpy==1.25.2 tqdm==4.66.4
!pip install -q bert-score==0.3.13
!pip install -q protobuf==3.20.3  # Compatible protobuf version
!pip install -q indic-nlp-library
# Uncomment the line below if you want to use COMET metric
# !pip install -q unbabel-comet

# Tamil-Telugu Translation Model Evaluation
# Combines BLEU, BERTScore, COMET, chrF++ and TER evaluation metrics

import numpy as np
import pandas as pd
import torch
import logging
from tqdm import tqdm
from datasets import Dataset
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration, AutoConfig, AutoModelForSeq2SeqLM
from sacrebleu import corpus_bleu, corpus_chrf, corpus_ter
from indicnlp.tokenize import indic_tokenize
from bert_score import score as bert_score

# Try importing COMET (optional)
try:
    from comet import download_model, load_from_checkpoint
    comet_available = True
except ImportError:
    comet_available = False
    print("COMET not available. Will skip COMET evaluation.")

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")

# Configuration - Update these paths for your local environment
MODEL_PATH = "./mbart_finetuned3"  # Path to your fine-tuned model
DATASET_PATH = "./merged_testing.csv"  # Path to your test dataset
NUM_SAMPLES =2000   # Adjust as needed

# Output paths
BLEU_OUTPUT_PATH = "./mBART3_RESULTS/bleu_evaluation_results.csv"
BERTSCORE_OUTPUT_PATH = "./mBART3_RESULTS/bertscore_evaluation_results.csv" 
COMET_OUTPUT_PATH = "./mBART3_RESULTS/comet_evaluation_results.csv"
CHRF_OUTPUT_PATH = "./mBART3_RESULTS/chrf_evaluation_results.csv"
TER_OUTPUT_PATH = "./mBART3_RESULTS/ter_evaluation_results.csv"

# Load the model configuration first
logger.info("Loading model configuration...")
config = AutoConfig.from_pretrained(MODEL_PATH)
if hasattr(config, 'generation_config'):
    if config.generation_config.early_stopping is None:
        config.generation_config.early_stopping = True
else:
    config.early_stopping = True

# Load the model with the fixed config
logger.info("Loading model and tokenizer...")
# Choose the appropriate model class based on your saved model
# Prefer the concrete mBART class; fall back to the auto class if it cannot
# load the checkpoint. Catch `Exception` rather than using a bare `except`
# so Ctrl-C / SystemExit are not swallowed, and log why the first attempt failed.
try:
    mbart_saved_model = MBartForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        config=config
    ).to(device)
except Exception as exc:
    logger.warning(
        "MBartForConditionalGeneration failed to load %s (%s); "
        "falling back to AutoModelForSeq2SeqLM.",
        MODEL_PATH,
        exc,
    )
    mbart_saved_model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_PATH,
        config=config
    ).to(device)

# Fix generation_config to avoid further issues
if hasattr(mbart_saved_model, 'generation_config'):
    mbart_saved_model.generation_config.early_stopping = True

mbart_saved_tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_PATH, src_lang="ta_IN", tgt_lang="te_IN")

# Load the test dataset
logger.info("Loading dataset...")
df = pd.read_csv(DATASET_PATH)
print(f"Dataset columns: {df.columns.tolist()}")
print(f"Dataset shape: {df.shape}")
print(f"First few rows:\n{df.head()}")

# Find the actual column names for Tamil and Telugu sentences
# Common column name patterns to check
tamil_patterns = ['tamil_sentence', 'tamil', 'source', 'src', 'Tamil', 'tamil_text']
telugu_patterns = ['telugu_sentence', 'telugu', 'target', 'tgt', 'Telugu', 'telugu_text']


def _match_column(columns, patterns, exclude=None):
    """Return the first column matching ``patterns``, or None.

    A case-insensitive exact name match takes precedence over a substring
    match, and ``exclude`` is never returned, so one column cannot be chosen
    for both languages.
    """
    lowered = [pattern.lower() for pattern in patterns]
    candidates = [col for col in columns if col != exclude]

    for col in candidates:
        if str(col).lower() in lowered:
            return col
    for col in candidates:
        name = str(col).lower()
        if any(pattern in name for pattern in lowered):
            return col
    return None


tamil_col = _match_column(df.columns, tamil_patterns)
telugu_col = _match_column(df.columns, telugu_patterns, exclude=tamil_col)

if tamil_col is None or telugu_col is None:
    raise ValueError(f"Could not identify Tamil and Telugu columns. Available columns: {df.columns.tolist()}")

print(f"Using Tamil column: {tamil_col}")
print(f"Using Telugu column: {telugu_col}")

# Select the appropriate columns and drop NaN values
df = df[[tamil_col, telugu_col]].dropna()
# Rename columns for consistency
df = df.rename(columns={tamil_col: 'tamil_sentence', telugu_col: 'telugu_sentence'})
test_dataset = Dataset.from_pandas(df)
print(f"Test dataset size: {len(test_dataset)}")

# Function to tokenize Telugu text using IndicNLP
def indic_tokenize_text(text):
    """Return ``text`` split by the IndicNLP trivial tokenizer and re-joined with spaces.

    Falsy or missing (NaN/None) input yields an empty string.
    """
    is_blank = not text or pd.isna(text)
    if is_blank:
        return ""
    tokens = indic_tokenize.trivial_tokenize(text, lang='te')
    return ' '.join(tokens)

# Translation function (optimized parameters)
def mbart_translate_text(input_text, debug=False):
    """Translate ``input_text`` with the evaluation-time generation settings.

    Args:
        input_text: Text to translate (tokenized with truncation at 128 tokens).
        debug: When True, log the input ids, raw output ids and the decoded
            output including special tokens at INFO level.

    Returns:
        The first generated sequence decoded without special tokens, stripped
        of surrounding whitespace.
    """
    encoded = mbart_saved_tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True,
    )
    encoded = encoded.to(device)
    if debug:
        logger.info(f"Tokenized Input IDs: {encoded['input_ids'].tolist()}")

    # The "te_IN" language id is passed as the forced first generated token.
    generation_options = {
        "max_length": 256,
        "min_length": 10,
        "num_beams": 5,
        "early_stopping": True,
        "length_penalty": 1.2,
        "no_repeat_ngram_size": 3,
        "forced_bos_token_id": mbart_saved_tokenizer.lang_code_to_id["te_IN"],
    }
    generated = mbart_saved_model.generate(**encoded, **generation_options)
    best_sequence = generated[0]

    if debug:
        logger.info(f"Raw Output IDs: {best_sequence.tolist()}")
        logger.info(f"Decoded with special tokens: {mbart_saved_tokenizer.decode(best_sequence, skip_special_tokens=False)}")

    return mbart_saved_tokenizer.decode(best_sequence, skip_special_tokens=True).strip()

# Generate translations for evaluation
def generate_translations(dataset, num_samples=NUM_SAMPLES):
    """Translate the first ``num_samples`` rows of ``dataset``.

    Args:
        dataset: Dataset whose rows have "tamil_sentence" and
            "telugu_sentence" entries.
        num_samples: Upper bound on the number of rows to translate.

    Returns:
        A tuple ``(test_data, sources, references, hypotheses)`` where
        ``test_data`` is the selected subset and the three lists are aligned
        row by row. A row whose translation raises gets an empty hypothesis.
    """
    subset_size = min(num_samples, len(dataset))
    test_data = dataset.select(range(subset_size))
    logger.info(f"Generating translations for {len(test_data)} samples")

    sources, references, hypotheses = [], [], []
    for row in tqdm(test_data, desc="Generating translations"):
        source_text = row["tamil_sentence"]
        reference_text = row["telugu_sentence"]

        # Best effort: a failed sentence is logged and scored as empty output
        # rather than aborting the whole run.
        try:
            translation = mbart_translate_text(source_text, debug=False)
        except Exception as e:
            logger.warning(f"Error translating '{source_text}': {e}")
            translation = ""

        sources.append(source_text)
        references.append(reference_text)
        hypotheses.append(translation)

    return test_data, sources, references, hypotheses

# Compute BLEU score
def compute_bleu(test_data, sources, references, hypotheses):
    """Compute corpus-level BLEU on IndicNLP-tokenized text and save a CSV report.

    Args:
        test_data: Unused; kept so all ``compute_*`` functions share a signature.
        sources: Source sentences, written to the report.
        references: Reference translations, one per hypothesis.
        hypotheses: System translations, aligned with ``references``.

    Returns:
        The corpus BLEU score as a float.
    """
    logger.info("Computing BLEU score...")

    # Tokenize for BLEU calculation
    tokenized_hypotheses = [indic_tokenize_text(hyp) for hyp in hypotheses]
    # sacreBLEU takes a list of reference *streams*, each aligned one-to-one
    # with the hypotheses. With a single reference per sentence that is one
    # stream holding every reference; wrapping each reference in its own list
    # would instead declare N one-sentence streams and mis-score the corpus.
    tokenized_references = [[indic_tokenize_text(ref) for ref in references]]

    # Compute SacreBLEU score
    bleu = corpus_bleu(tokenized_hypotheses, tokenized_references, tokenize='none')  # Tokenization already done
    bleu_score = bleu.score
    logger.info(f"BLEU Score: {bleu_score:.2f}")

    # Save results
    results_df = pd.DataFrame({
        "tamil_sentence": sources,
        "telugu_sentence": references,
        "telugu_hypothesis": hypotheses,
        "bleu_score": [bleu_score] * len(sources)  # Same corpus BLEU for all rows
    })
    results_df.to_csv(BLEU_OUTPUT_PATH, index=False)
    logger.info(f"BLEU results saved to {BLEU_OUTPUT_PATH}")

    return bleu_score
    
# Compute chrF++ score
def compute_chrf(test_data, sources, references, hypotheses):
    """Compute the corpus-level chrF++ score and save a CSV report.

    Args:
        test_data: Unused; kept so all ``compute_*`` functions share a signature.
        sources: Source sentences, written to the report.
        references: Reference translations, one per hypothesis.
        hypotheses: System translations, aligned with ``references``.

    Returns:
        The corpus chrF++ score as a float.
    """
    logger.info("Computing chrF++ score...")

    # sacreBLEU takes a list of reference *streams*, each aligned one-to-one
    # with the hypotheses. A single reference per sentence means one stream
    # containing all references, not one single-item list per sentence.
    refs_list = [list(references)]

    # Compute chrF++ score (char order=6, word order=2, beta=2 are standard settings)
    chrf = corpus_chrf(hypotheses, refs_list, char_order=6, word_order=2, beta=2)
    chrf_score = chrf.score
    logger.info(f"chrF++ Score: {chrf_score:.2f}")

    # Save results
    results_df = pd.DataFrame({
        "tamil_sentence": sources,
        "telugu_sentence": references,
        "telugu_hypothesis": hypotheses,
        "chrf_score": [chrf_score] * len(sources)  # Same corpus chrF for all rows
    })
    results_df.to_csv(CHRF_OUTPUT_PATH, index=False)
    logger.info(f"chrF++ results saved to {CHRF_OUTPUT_PATH}")

    return chrf_score
    
# Compute TER score (Translation Edit Rate)
def compute_ter(test_data, sources, references, hypotheses):
    """Compute the corpus-level TER (Translation Edit Rate) and save a CSV report.

    Args:
        test_data: Unused; kept so all ``compute_*`` functions share a signature.
        sources: Source sentences, written to the report.
        references: Reference translations, one per hypothesis.
        hypotheses: System translations, aligned with ``references``.

    Returns:
        The corpus TER score as a float (an error rate: lower is better).
    """
    logger.info("Computing TER score...")

    # sacreBLEU takes a list of reference *streams*, each aligned one-to-one
    # with the hypotheses. A single reference per sentence means one stream
    # containing all references, not one single-item list per sentence.
    refs_list = [list(references)]

    # Compute TER score
    ter = corpus_ter(hypotheses, refs_list)
    ter_score = ter.score
    logger.info(f"TER Score: {ter_score:.2f}")

    # Save results
    results_df = pd.DataFrame({
        "tamil_sentence": sources,
        "telugu_sentence": references,
        "telugu_hypothesis": hypotheses,
        "ter_score": [ter_score] * len(sources)  # Same corpus TER for all rows
    })
    results_df.to_csv(TER_OUTPUT_PATH, index=False)
    logger.info(f"TER results saved to {TER_OUTPUT_PATH}")

    return ter_score

# Compute BERTScore
def compute_bertscore(test_data, sources, references, hypotheses):
    """Score hypotheses against references with BERTScore and save a CSV report.

    Args:
        test_data: Unused by this function.
        sources: Source sentences, written to the report.
        references: Reference translations, one per hypothesis.
        hypotheses: System translations, aligned with ``references``.

    Returns:
        The mean F1 over all sentence pairs as a float.
    """
    logger.info("Computing BERTScore...")

    scorer_options = {
        "lang": "te",  # Telugu language code
        "model_type": "bert-base-multilingual-cased",
        "device": device,
        "verbose": True,
    }
    # Precision and recall are returned as well; only F1 is reported here.
    _precision, _recall, f1_tensor = bert_score(hypotheses, references, **scorer_options)

    avg_f1 = f1_tensor.mean().item()
    logger.info(f"BERTScore F1: {avg_f1:.4f}")

    # One F1 value per sentence pair, as plain Python floats.
    per_sentence_f1 = f1_tensor.tolist()

    report = pd.DataFrame({
        "tamil_sentence": sources,
        "telugu_sentence": references,
        "telugu_hypothesis": hypotheses,
        "bertscore_f1": per_sentence_f1,
    })
    report.to_csv(BERTSCORE_OUTPUT_PATH, index=False)
    logger.info(f"BERTScore results saved to {BERTSCORE_OUTPUT_PATH}")

    return avg_f1

# Compute COMET score
def compute_comet(test_data, sources, references, hypotheses):
    """Score translations with the COMET model and save a CSV report.

    Args:
        test_data: Unused by this function.
        sources: Source sentences.
        references: Reference translations, one per hypothesis.
        hypotheses: System translations, aligned with ``sources``.

    Returns:
        The system-level COMET score, or None when the ``comet`` package
        could not be imported.
    """
    if not comet_available:
        logger.warning("COMET not available. Skipping COMET evaluation.")
        return None

    logger.info("Computing COMET score...")

    logger.info("Downloading COMET model...")
    checkpoint_path = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(checkpoint_path)
    comet_model.to(device)

    # One record per sentence with the keys "src", "mt" and "ref".
    samples = [
        {"src": src, "mt": hyp, "ref": ref}
        for src, hyp, ref in zip(sources, hypotheses, references)
    ]

    logger.info("Running COMET evaluation...")
    gpu_count = 1 if device == "cuda" else 0
    prediction = comet_model.predict(samples, batch_size=8, gpus=gpu_count)
    comet_scores = prediction.scores
    avg_comet = prediction.system_score

    logger.info(f"COMET Score: {avg_comet:.4f}")

    report = pd.DataFrame({
        "tamil_sentence": sources,
        "telugu_sentence": references,
        "telugu_hypothesis": hypotheses,
        "comet_score": comet_scores,
    })
    report.to_csv(COMET_OUTPUT_PATH, index=False)
    logger.info(f"COMET results saved to {COMET_OUTPUT_PATH}")

    return avg_comet

# Main evaluation function
def evaluate_model():
    """Translate the test set, compute every metric and print a summary.

    Runs BLEU, BERTScore, chrF++ and TER in that order, then COMET when the
    package is available, prints the scores, and finally translates one
    sample sentence with debug output enabled.

    Returns:
        A dict with the keys "bleu", "chrf", "ter", "bertscore" and "comet"
        ("comet" is None when COMET was skipped).
    """
    # All metric functions take the same four positional arguments.
    metric_inputs = generate_translations(test_dataset, NUM_SAMPLES)
    sources = metric_inputs[1]

    bleu_score = compute_bleu(*metric_inputs)
    bertscore_f1 = compute_bertscore(*metric_inputs)
    chrf_score = compute_chrf(*metric_inputs)
    ter_score = compute_ter(*metric_inputs)
    comet_score = compute_comet(*metric_inputs) if comet_available else None

    # Print summary
    rule = "=" * 50
    print("\n" + rule)
    print("EVALUATION SUMMARY")
    print(rule)
    print(f"Number of samples: {len(sources)}")
    print(f"BLEU Score: {bleu_score:.2f}")
    print(f"chrF++ Score: {chrf_score:.2f}")
    print(f"TER Score: {ter_score:.2f} (lower is better)")
    print(f"BERTScore F1: {bertscore_f1:.4f}")
    if comet_score is not None:
        print(f"COMET Score: {comet_score:.4f}")
    print(rule)

    # Sanity check on a single sentence
    test_input = "வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?"  # "Hello, how are you?"
    translated_text = mbart_translate_text(test_input, debug=True)
    print("\nTest Translation:")
    print(f"Source (Tamil): {test_input}")
    print(f"Target (Telugu): {translated_text}")

    scores = {
        "bleu": bleu_score,
        "chrf": chrf_score,
        "ter": ter_score,
        "bertscore": bertscore_f1,
        "comet": comet_score,
    }
    return scores

# Run the evaluation
evaluate_model()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[33 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "/home/mca/anaconda3/envs/nmt/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 389, in <module>
  [31m   [0m     main()
  [31m   [0m   File "/home/mca/anaconda3/envs/nmt/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 373, in main
  [31m   [0m     json_out["return_val"] = hook(**hook_input["kwargs"])
  [31m   [0m                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  [31m   [0m   File "/home/mca/anaconda3/envs/nmt/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 137, in get_requires_for_build_wheel
  [31m   [0m     backend = _build_backend()
  [31

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0m

Using device: cuda
Loading model configuration...
Loading model and tokenizer...
Loading dataset...
Generating translations for 2000 samples


Dataset columns: ['Telugu', 'Tamil']
Dataset shape: (2051, 2)
First few rows:
                                              Telugu  \
0  "చిత్రము ""{0}""ను అప్‌లోడు చేయుచున్నది ({2} ల...   
1  "బుక్‌మార్కు సమాచారము సరికూర్పరి దర్శనమునందు చ...   
2             %s గదిని వదిలినదిfoo has left the room   
3  'బాహుబలి ' సినిమా తరువాత ప్రభాస్ నటిస్తున్న భా...   
4                          ( 1 యోహాను 3: 17 చదవండి.)   

                                               Tamil  
0      """{0}"" படத்தை பதிவேற்றுகிறது ({2} இல் {1})"  
1  "திருத்தி காட்சியில் காட்டப்படும் புத்தகக்குறி...  
2  %s அறையில் இருந்து வெளியேறினார்foo has left th...  
3  'பாகுபலி' என்ற பிரம்மாண்ட படத்திற்கு பிறகு பிர...  
4                 ( 1 யோவான் 3: 17 - ஐ வாசியுங்கள்.)  
Using Tamil column: Tamil
Using Telugu column: Telugu
Test dataset size: 2051


Generating translations: 100%|██████████████| 2000/2000 [15:13<00:00,  2.19it/s]
Computing BLEU score...
BLEU Score: 72.77
BLEU results saved to ./mBART3_RESULTS/bleu_evaluation_results.csv
Computing BERTScore...


calculating scores...
computing bert embedding.


  0%|          | 0/63 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/32 [00:00<?, ?it/s]

BERTScore F1: 0.8438
BERTScore results saved to ./mBART3_RESULTS/bertscore_evaluation_results.csv
Computing chrF++ score...


done in 3.90 seconds, 512.40 sentences/sec


chrF++ Score: 42.60
chrF++ results saved to ./mBART3_RESULTS/chrf_evaluation_results.csv
Computing TER score...
TER Score: 42.59
TER results saved to ./mBART3_RESULTS/ter_evaluation_results.csv
Computing COMET score...
Downloading COMET model...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/home/mca/anaconda3/envs/nmt/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
Running COMET evaluation...
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelis


EVALUATION SUMMARY
Number of samples: 2000
BLEU Score: 72.77
chrF++ Score: 42.60
TER Score: 42.59 (lower is better)
BERTScore F1: 0.8438
COMET Score: 0.8888

Test Translation:
Source (Tamil): வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?
Target (Telugu): నమస్కారం, మీరు ఎలా ఉన్నారు?


{'bleu': 72.76817202342089,
 'chrf': 42.59973856124301,
 'ter': 42.591251756889136,
 'bertscore': 0.843758225440979,
 'comet': 0.8887818599641323}