In [1]:
!pip install datasets transformers torch accelerate sacremoses sacrebleu --quiet

In [2]:
import torch, os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["WANDB_DISABLED"] = "true"

In [3]:
from transformers import MBart50Tokenizer,MBartForConditionalGeneration
model_name ='/kaggle/input/tamil-val'
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
# Load the model
model = MBartForConditionalGeneration.from_pretrained(model_name)
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_

In [4]:
import pandas as pd
# Load your dataset (assuming it's a CSV file with an 'english' column)
df = pd.read_csv('/kaggle/input/tatoeba-tamil/Tatoeba-tamil.csv')
english_sentences = df['English'].tolist()
tamil_sentences = df['Tamil'].to_list()

In [5]:
import sacrebleu

# Function to generate translation for a given input text
def generate_translation(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text


translations = []
references = []
for i in range(0, len(english_sentences)):
    translations.append(generate_translation(english_sentences[i]))
    references.append([tamil_sentences[i]])

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU score: {bleu.score}")



BLEU score: 14.058533129758727


In [8]:
!pip install tqdm
from tqdm import tqdm



In [10]:
translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append(tamil_sentences[i])  # For chrF++, references should be a flat list

# Calculate chrF++ score
chrf = sacrebleu.corpus_chrf(translations, references)
print(f"chrF++ score: {chrf.score}")

Translating: 100%|██████████| 356/356 [01:47<00:00,  3.31it/s]


chrF++ score: 10.94890510948905


In [11]:
from datasets import load_dataset

# download and load specific pairs
dataset = load_dataset("ai4bharat/IN22-Gen", "eng_Latn-tam_Taml")


Downloading builder script:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

The repository for ai4bharat/IN22-Gen contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ai4bharat/IN22-Gen.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Generating gen split: 0 examples [00:00, ? examples/s]

In [12]:
dataset

DatasetDict({
    gen: Dataset({
        features: ['id', 'context', 'source', 'url', 'domain', 'num_words', 'bucket', 'sentence_eng_Latn', 'sentence_tam_Taml'],
        num_rows: 1024
    })
})

In [13]:
# Assuming 'sentence_eng_Latn' is the English column and 'sentence_hin_Deva' is the Hindi column
english_sentences = dataset['gen']['sentence_eng_Latn']
tamil_sentences = dataset['gen']['sentence_tam_Taml']

# Convert them to lists
english_sentences = list(english_sentences)
tamil_sentences = list(tamil_sentences)

# Verify the first few elements of each list
print(english_sentences[:5])
print(tamil_sentences[:5])


['An appearance is a bunch of attributes related to the service person, like their shoes, clothes, tie, jewellery, hairstyle, make-up, watch, cosmetics, perfume, etc.', 'Ajanta, located in the Aurangabad District of Maharashtra has twenty-nine caitya and vihara caves decorated with sculptures and paintings from the first century B.C.E. to the fifth century C.E.', 'Body colour gets merged with the outer line, creating the effect of volume.', 'Ashoka started making extensive use of stone for sculptures and great monuments, whereas the previous tradition consisted of working with wood and clay.', 'Potatoes mixed in masalas, coated in besan batter and deep fried to perfection form this delicious and famous dish of Maharashtra.']
['தோற்றம் என்பது சேவை ஊழியரின் காலணிகள், உடை, டை, நகை, சிகையலங்காரம், மேக்-அப், கைக்கடிகாரம், அழகு சாதனங்கள், நறுமணம் போன்ற அவர் சார்ந்த பண்புகளின் ஒரு தொகுப்பைக் குறிக்கிறது.', 'மகாராஷ்டிரத்தின் அவுரங்காபாத் மாவட்டத்தில் உள்ள அஜந்தாவில் இருபத்தி ஒன்பது சைத்யா மற்ற

In [14]:
import torch, os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
from transformers import MBart50Tokenizer,MBartForConditionalGeneration
model_name ='/kaggle/input/tamil-val'
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
# Load the model
model = MBartForConditionalGeneration.from_pretrained(model_name)
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_

In [16]:
import sacrebleu
from tqdm import tqdm

# Function to generate translation for a given input text
def generate_translation(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text


In [17]:
translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append([tamil_sentences[i]])

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU score: {bleu.score}")


Translating: 100%|██████████| 1024/1024 [19:43<00:00,  1.16s/it]


BLEU score: 10.161064908809763


In [18]:
translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append(tamil_sentences[i])  # For chrF++, references should be a flat list

# Calculate chrF++ score
chrf = sacrebleu.corpus_chrf(translations, references)
print(f"chrF++ score: {chrf.score}")

Translating: 100%|██████████| 1024/1024 [19:44<00:00,  1.16s/it]


chrF++ score: 4.118404118404119
