# Evaluation of finetuned model on benchmark datasets
* Evaluation dataset: 
    1. IN22 Gen (https://huggingface.co/datasets/ai4bharat/IN22-Gen)
    2. Tatoeba Challenge (https://github.com/Helsinki-NLP/Tatoeba-Challenge)
* Finetuned model:finetuned-mbart50-en-tel
* Evaluation metrics: BLEU score

## Setup

In [1]:
!pip install datasets transformers torch accelerate sacremoses sacrebleu --quiet

In [2]:
import torch, os
from datasets import load_dataset
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import sacrebleu

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["WANDB_DISABLED"] = "true"

## Load the model and Tokenizer

In [4]:
# Load your model and tokenizer
model_name ='/kaggle/input/lora-telegu/finetuned-mbart50-en-tel'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# IN22 Gen

In [5]:
# Load your dataset
df = load_dataset('ai4bharat/IN22-Gen', "eng_Latn-tel_Telu", trust_remote_code=True, split='gen')
english_sentences = df['sentence_eng_Latn']
telugu_sentences = df['sentence_tel_Telu']

Downloading builder script:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Generating gen split: 0 examples [00:00, ? examples/s]

In [6]:
# Function to generate translation for a given input text
def generate_translation(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text

In [7]:
translations = []
references = []
for i in range(0, len(english_sentences)):
    translations.append(generate_translation(english_sentences[i]))
    references.append([telugu_sentences[i]])

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU score on IN22 Gen: {bleu.score}")

BLEU score on IN22 Gen: 14.753637948612852


# Tatoeba Challenge

In [8]:
# Load the dataset
df = pd.read_csv('/kaggle/input/tatoeba-telugu/Tatoeba-telugu.csv')

In [9]:
english_sentences = df['English']
telugu_sentences = df['Telugu']

In [10]:
translations = []
references = []
for i in range(0, len(english_sentences)):
    translations.append(generate_translation(english_sentences[i]))
    references.append([telugu_sentences[i]])

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU score on Tatoeba Challenge: {bleu.score}")

BLEU score on Tatoeba Challenge: 35.930411196308434
