# Setup

In [1]:
!pip install datasets transformers torch accelerate sacremoses sacrebleu --quiet
!pip install dataset --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython-sql 0.5.0 requires sqlalchemy>=2.0, but you have sqlalchemy 1.4.53 which is incompatible.[0m[31m
[0m

In [2]:
import torch, os
from transformers import MarianTokenizer, MarianMTModel
import pandas as pd
import sacrebleu
from tqdm import tqdm
from datasets import load_dataset

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["WANDB_DISABLED"] = "true"

# Load the Model and Tokenizer

In [4]:
model_name = '/kaggle/input/finetuned-opusmt-en-hi-ta-model'
tokenizer = MarianTokenizer.from_pretrained(model_name)
# Load the model
model = MarianMTModel.from_pretrained(model_name)
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(64110, 512, padding_idx=64109)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(64110, 512, padding_idx=64109)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

# Tatoeba Benchmark Evaluation

## Load Tatoeba dataset

In [5]:
# Load the dataset 
df = pd.read_csv('/kaggle/input/tatoeba-gujarati/Tatoeba-gujarati.csv')
english_sentences = df['English'].tolist()
gujarati_sentences = df['Gujarati'].to_list()

In [6]:
print(english_sentences[:10])
print(gujarati_sentences[:10])

['Ahmedabad is the largest city in Gujarat.', 'Ajay is a bad boy.', 'Ajay is bad.', 'Ajay is poor.', 'Algeria is a country in North Africa.', 'A nephew is the son of a sibling.', 'A niece is the daughter of a brother.', 'A niece is the daughter of a sibling.', 'A niece is the daughter of a sister.', 'Are you alone?']
['અમદાવાદ ગુજરાતનું સૌથી મોટુ શહેર છે.', 'અજય ગરીબ છે.', 'અજય ગરીબ છે.', 'અજય ગરીબ છે.', 'ઉત્તર આફ્રિકામાં અલજીર્યા એક દેશ છે.', 'ભાઈ કે બહેનના દીકરાને ભત્રીજો કહેવાય', 'ભાઈની પુત્રી ને ભત્રીજી ક્હેવાય', 'ભાઈ કે બહેનની પુત્રી ને ભત્રીજી ક્હેવાય', 'બહેનની પુત્રી ને ભાણેજ ક્હેવાય', 'તું એકલો છો?']


## Calculate BLEU: Tatoeba

In [7]:
# Function to generate translation for a given input text
def generate_translation(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text

translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append([gujarati_sentences[i]])

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU score: {bleu.score}")

Translating: 100%|██████████| 154/154 [01:24<00:00,  1.83it/s]

BLEU score: 26.269098944241588





# IN-22 Benchmark Evaluation

## Load IN-22 dataset

In [8]:
# download and load specific pairs
dataset = load_dataset("ai4bharat/IN22-Gen", "eng_Latn-guj_Gujr", trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Generating gen split: 0 examples [00:00, ? examples/s]

In [9]:
dataset

DatasetDict({
    gen: Dataset({
        features: ['id', 'context', 'source', 'url', 'domain', 'num_words', 'bucket', 'sentence_eng_Latn', 'sentence_guj_Gujr'],
        num_rows: 1024
    })
})

In [10]:
english_sentences = dataset['gen']['sentence_eng_Latn']
gujarati_sentences = dataset['gen']['sentence_guj_Gujr']

# Convert them to lists
english_sentences = list(english_sentences)
gujarati_sentences = list(gujarati_sentences)

# Verify the first few elements of each list
print(english_sentences[:5])
print(gujarati_sentences[:5])

['An appearance is a bunch of attributes related to the service person, like their shoes, clothes, tie, jewellery, hairstyle, make-up, watch, cosmetics, perfume, etc.', 'Ajanta, located in the Aurangabad District of Maharashtra has twenty-nine caitya and vihara caves decorated with sculptures and paintings from the first century B.C.E. to the fifth century C.E.', 'Body colour gets merged with the outer line, creating the effect of volume.', 'Ashoka started making extensive use of stone for sculptures and great monuments, whereas the previous tradition consisted of working with wood and clay.', 'Potatoes mixed in masalas, coated in besan batter and deep fried to perfection form this delicious and famous dish of Maharashtra.']
['દેખાવ એ સેવા કર્મીના લક્ષણોનો સમૂહ છે, જેમ કે તેમના બૂટ, કપડાં, ટાઈ, આભૂષણો, કેશકલાપ. શણગાર, ઘડિયાળ, શૃંગાર દ્રવ્યો, અત્તર, વગેરે.', 'મહારાષ્ટ્રના ઔરંગાબાદ જીલ્લામાં સ્થિત અજંતામાં ઓગણત્રીસ કૈત્ય અને વિહાર ગુફાઓ છે, જે ઈ.સ.પૂ. પ્રથમ સદીથી ઈ.સ. પાંચમી સદી સુધીના શ

In [11]:
from transformers import MarianTokenizer, MarianMTModel

model_name = '/kaggle/input/finetuned-opusmt-en-hi-ta-model'
tokenizer = MarianTokenizer.from_pretrained(model_name)
# Load the model
model = MarianMTModel.from_pretrained(model_name)
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(64110, 512, padding_idx=64109)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(64110, 512, padding_idx=64109)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

## Calculate BLEU: IN-22

In [12]:
# Function to generate translation for a given input text
def generate_translation(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text

translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append([gujarati_sentences[i]])

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU score: {bleu.score}")

Translating: 100%|██████████| 1024/1024 [45:15<00:00,  2.65s/it]


BLEU score: 5.811055908327921


## Calculate chrF

In [13]:
translations = []
references = []
for i in tqdm(range(0, len(english_sentences)), desc="Translating"):
    translations.append(generate_translation(english_sentences[i]))
    references.append(gujarati_sentences[i])  
# Calculate chrF score
chrf = sacrebleu.corpus_chrf(translations, references)
print(f"chrF score: {chrf.score}")

Translating: 100%|██████████| 1024/1024 [45:07<00:00,  2.64s/it]


chrF score: 5.447470817120622
