In [1]:
!pip install datasets transformers torch accelerate sacremoses sacrebleu --quiet

In [2]:
import torch, os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["WANDB_DISABLED"] = "true"

In [3]:
from transformers import MBart50Tokenizer,MBartForConditionalGeneration
model_name ='/kaggle/input/hindi-1-val'
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
# Load the model
model = MBartForConditionalGeneration.from_pretrained(model_name)
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_

In [4]:
import pandas as pd
# Load your dataset (assuming it's a CSV file with an 'english' column)
df = pd.read_csv('/kaggle/input/tatoeba-dataset/Tatoeba-Challenge.csv')
english_sentences = df['English'].tolist()
hindi_sentences = df['Hindi'].to_list()

In [5]:
print(english_sentences[:10])
print(hindi_sentences[:10])

["A baby is God's opinion that the world should go on.", 'Absence of rain caused the plants to die.', 'A button has come off my raincoat.', 'A cat ran after a mouse.', 'A clock has two hands.', 'A country is a dangerous machine.', 'A crow is as black as coal.', 'Actinium was discovered by André-Louis Debierne in 1899.', 'Add a little sugar and cream.', 'Adopt the pace of nature: her secret is patience.']
['नन्हे शिशु के जन्म का अर्थ है कि भगवान यह चाहते हैं कि यह दुनिया बनी रहे।', 'पौधे बारिश के बिना मर गए।', 'मेरे रेनकोट से एक बटन निकल आया है।', 'एक बिल्ली चूहे के पीछे भागी।', 'घड़ी के दो हाथ होते हैं.', 'देश एक खतरनाक मशीन होती है।', 'कौआ कोयले जैसा काला होता है।', 'ऐक्टिनियम का खोज आंड्रे-लूई डेबिएर्न ने साल १८९९ में किया था।', 'थोड़ी शक़्क़र और मलाई डालो।', 'प्रकृति की गति अपनाएं: उसका रहस्य है धीरज।']


In [6]:
import sacrebleu
from tqdm import tqdm

# Function to generate translation for a given input text
def generate_translation(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text


translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append([hindi_sentences[i]])

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU score: {bleu.score}")

Translating: 100%|██████████| 5000/5000 [23:18<00:00,  3.58it/s]


BLEU score: 11.208466750961147


In [7]:
translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append(hindi_sentences[i])  # For chrF++, references should be a flat list

# Calculate chrF++ score
chrf = sacrebleu.corpus_chrf(translations, references)
print(f"chrF++ score: {chrf.score}")

Translating: 100%|██████████| 5000/5000 [23:24<00:00,  3.56it/s]


chrF++ score: 14.563106796116504


In [8]:
from datasets import load_dataset

# download and load specific pairs
dataset = load_dataset("ai4bharat/IN22-Gen", "eng_Latn-hin_Deva")


Downloading builder script:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

The repository for ai4bharat/IN22-Gen contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ai4bharat/IN22-Gen.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Generating gen split: 0 examples [00:00, ? examples/s]

In [9]:
# Assuming 'sentence_eng_Latn' is the English column and 'sentence_hin_Deva' is the Hindi column
english_sentences = dataset['gen']['sentence_eng_Latn']
hindi_sentences = dataset['gen']['sentence_hin_Deva']

# Convert them to lists
english_sentences = list(english_sentences)
hindi_sentences = list(hindi_sentences)

# Verify the first few elements of each list
print(english_sentences[:5])
print(hindi_sentences[:5])


['An appearance is a bunch of attributes related to the service person, like their shoes, clothes, tie, jewellery, hairstyle, make-up, watch, cosmetics, perfume, etc.', 'Ajanta, located in the Aurangabad District of Maharashtra has twenty-nine caitya and vihara caves decorated with sculptures and paintings from the first century B.C.E. to the fifth century C.E.', 'Body colour gets merged with the outer line, creating the effect of volume.', 'Ashoka started making extensive use of stone for sculptures and great monuments, whereas the previous tradition consisted of working with wood and clay.', 'Potatoes mixed in masalas, coated in besan batter and deep fried to perfection form this delicious and famous dish of Maharashtra.']
['सेवा संबंधी लोगों के लिए भेष कई गुणों का संयोजन है, जैसे कि उनके जूते, कपड़े, टाई, आभूषण, केश शैली, मेक-अप, घड़ी, कॉस्मेटिक, इत्र, आदि।', 'महाराष्ट्र के औरंगाबाद जिले में स्थित अजंता में उन्तीस चैत्य और विहार गुफाएँ हैं जो पहली शताब्दी ई.पू. से ले कर पाँचवीं शताब

In [10]:
import torch, os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
from transformers import MBart50Tokenizer,MBartForConditionalGeneration
model_name ='/kaggle/input/hindi-1-val'
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
# Load the model
model = MBartForConditionalGeneration.from_pretrained(model_name)
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_

In [12]:
import sacrebleu
from tqdm import tqdm

# Function to generate translation for a given input text
def generate_translation(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text


In [13]:
translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append([hindi_sentences[i]])

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU score: {bleu.score}")


Translating: 100%|██████████| 1024/1024 [18:44<00:00,  1.10s/it]


BLEU score: 26.069430553765887


In [14]:
translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append(hindi_sentences[i])  # For chrF++, references should be a flat list

# Calculate chrF++ score
chrf = sacrebleu.corpus_chrf(translations, references)
print(f"chrF++ score: {chrf.score}")

Translating: 100%|██████████| 1024/1024 [18:38<00:00,  1.09s/it]


chrF++ score: 4.439746300211416
