# Setup

In [1]:
!pip install datasets transformers torch accelerate sacremoses sacrebleu --quiet
!pip install dataset --quiet

In [2]:
import torch, os
from transformers import MarianTokenizer, MarianMTModel
import pandas as pd
import sacrebleu
from tqdm import tqdm
from datasets import load_dataset


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["WANDB_DISABLED"] = "true"

# Load the Model and Tokenizer

In [4]:
model_name = '/kaggle/input/finetuned-opusmt-en-to-ta-model'
tokenizer = MarianTokenizer.from_pretrained(model_name)
# Load the model
model = MarianMTModel.from_pretrained(model_name)
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(64110, 512, padding_idx=64109)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(64110, 512, padding_idx=64109)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

# Tatoeba Benchmark Evaluation

## Load Tatoeba dataset

In [5]:
# Load the dataset 
df = pd.read_csv('/kaggle/input/tatoeba-tamil/Tatoeba-tamil.csv')
english_sentences = df['English'].tolist()
tamil_sentences = df['Tamil'].to_list()

In [6]:
print(english_sentences[:10])
print(tamil_sentences[:10])

['All of them went there.', 'All of us were silent.', 'Are you ready to go?', 'As all letters have the letter A for their first, so the world has the eternal God for its first.', 'A square has four equal sides.', 'A square has four sides.', "Because he's sick, he can't come.", 'Be kind to old people.', 'Beware of pickpockets.', 'Beware of the dog!']
['அவர்கள் எல்லோரும் அங்கே சென்றார்கள்', 'நாங்கள் அனைவரும் அமைதியாக இருந்தோம்', 'நீங்கள் போகத் தயாராக இருக்கிறீர்களா?', 'அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு.', 'ஒரு சதுரத்திற்கு நான்கு சமமான பக்கங்கள் உள்ளன', 'ஒரு சதுரத்திற்கு நான்கு பக்கங்கள் உள்ளன', 'அவனுக்கு உடல் நிலை சரியில்லாததனால் அவனால் வர இயலாது', 'வயோதிகர்களிடம் அன்பாக இரு', 'ஜேப்படிகாரர்களிடம் ஜாக்கிரதையாக இருக்கவும்', 'நாய் ஜாக்கிரதை!']


## Calculate BLEU: Tatoeba

In [7]:
# Function to generate translation for a given input text
def generate_translation(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text

translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append([tamil_sentences[i]])

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU score: {bleu.score}")


Translating: 100%|██████████| 356/356 [03:10<00:00,  1.87it/s]

BLEU score: 14.058533129758727





# IN-22 Benchmark Evaluation

## Load IN-22 dataset

In [8]:
# download and load specific pairs
dataset = load_dataset("ai4bharat/IN22-Gen", "eng_Latn-tam_Taml")


The repository for ai4bharat/IN22-Gen contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ai4bharat/IN22-Gen.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Generating gen split: 0 examples [00:00, ? examples/s]

In [9]:
dataset

DatasetDict({
    gen: Dataset({
        features: ['id', 'context', 'source', 'url', 'domain', 'num_words', 'bucket', 'sentence_eng_Latn', 'sentence_tam_Taml'],
        num_rows: 1024
    })
})

In [10]:
english_sentences = dataset['gen']['sentence_eng_Latn']
tamil_sentences = dataset['gen']['sentence_tam_Taml']

# Convert them to lists
english_sentences = list(english_sentences)
tamil_sentences = list(tamil_sentences)

# Verify the first few elements of each list
print(english_sentences[:5])
print(tamil_sentences[:5])


['An appearance is a bunch of attributes related to the service person, like their shoes, clothes, tie, jewellery, hairstyle, make-up, watch, cosmetics, perfume, etc.', 'Ajanta, located in the Aurangabad District of Maharashtra has twenty-nine caitya and vihara caves decorated with sculptures and paintings from the first century B.C.E. to the fifth century C.E.', 'Body colour gets merged with the outer line, creating the effect of volume.', 'Ashoka started making extensive use of stone for sculptures and great monuments, whereas the previous tradition consisted of working with wood and clay.', 'Potatoes mixed in masalas, coated in besan batter and deep fried to perfection form this delicious and famous dish of Maharashtra.']
['தோற்றம் என்பது சேவை ஊழியரின் காலணிகள், உடை, டை, நகை, சிகையலங்காரம், மேக்-அப், கைக்கடிகாரம், அழகு சாதனங்கள், நறுமணம் போன்ற அவர் சார்ந்த பண்புகளின் ஒரு தொகுப்பைக் குறிக்கிறது.', 'மகாராஷ்டிரத்தின் அவுரங்காபாத் மாவட்டத்தில் உள்ள அஜந்தாவில் இருபத்தி ஒன்பது சைத்யா மற்ற

In [11]:
from transformers import MarianTokenizer, MarianMTModel

model_name = '/kaggle/input/finetuned-opusmt-en-to-ta-model'
tokenizer = MarianTokenizer.from_pretrained(model_name)
# Load the model
model = MarianMTModel.from_pretrained(model_name)
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(64110, 512, padding_idx=64109)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(64110, 512, padding_idx=64109)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

## Calculate BLEU: IN-22

In [12]:
# Function to generate translation for a given input text
def generate_translation(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text

translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append([tamil_sentences[i]])

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU score: {bleu.score}")


Translating: 100%|██████████| 1024/1024 [37:37<00:00,  2.21s/it]

BLEU score: 5.8823705487151745





## Calculate chrF

In [13]:
translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append(tamil_sentences[i])  
# Calculate chrF score
chrf = sacrebleu.corpus_chrf(translations, references)
print(f"chrF score: {chrf.score}")

Translating: 100%|██████████| 1024/1024 [37:21<00:00,  2.19s/it]


chrF score: 4.400440044004401
