In [1]:
import pandas as pd
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import sacrebleu
from tqdm import tqdm

# Function to load input text files and split by lines
def load_input_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read().split('\n')

# Function to read references for BLEU calculation
def load_references(file_path):
    with open(file_path, 'rb') as file:
        return [[line.decode('utf-8').strip()] for line in file.read().splitlines()]

# Function to chunk text
def chunk_text(text, max_length=128):
    sentences = text
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())
        if current_length + sentence_length > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# Function to translate text using the M2M100 model
def test_translation(source_text, source_lang, target_lang='en', max_length=128):
    tokenizer.src_lang = source_lang
    encoded_text = tokenizer(source_text, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to('cuda')
    generated_tokens = model.generate(**encoded_text, forced_bos_token_id=tokenizer.get_lang_id(target_lang),max_length=max_length)
    translated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return translated_text

In [2]:
# Load input texts
input_texts = {
    'gujarati': load_input_file('dev.guj_Gujr'),
    'galician': load_input_file('dev.glg_Latn'),
    'khmer': load_input_file('dev.khm_Khmr'),
    'burmese': load_input_file('dev.mya_Mymr'),
    'nepali': load_input_file('dev.npi_Deva')
}

# Load English references
english_labels = load_references('dev.eng_Latn')

# Language codes for M2M100
language_codes = {
    'gujarati': 'gu',
    'nepali': 'ne',
    'burmese': 'my',
    'khmer': 'km',
    'galician': 'gl',
    'english': 'en'
}

# Load the M2M100 model and tokenizer
model_name = 'facebook/m2m100_418M'
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name).to('cuda')

  return self.fget.__get__(instance, owner)()


In [3]:
# Process translations and compute BLEU scores
for lang, text in input_texts.items():
    print(f"Processing translation from {lang} to English...")
    chunks = chunk_text(text)
    translated_chunks = [test_translation(chunk, language_codes[lang]) for chunk in tqdm(chunks, desc=f"Translating {lang}")]
    bleu_score = sacrebleu.corpus_bleu(translated_chunks, english_labels)
    print(f"BLEU score for {lang}: {bleu_score.score}")

Processing translation from gujarati to English...


Translating gujarati: 100%|██████████| 172/172 [04:05<00:00,  1.43s/it]


BLEU score for gujarati: 22.66277494897848
Processing translation from galician to English...


Translating galician: 100%|██████████| 195/195 [05:32<00:00,  1.70s/it]


BLEU score for galician: 46.63945526554835
Processing translation from khmer to English...


Translating khmer: 100%|██████████| 46/46 [00:49<00:00,  1.07s/it]


BLEU score for khmer: 34.405943453197025
Processing translation from burmese to English...


Translating burmese: 100%|██████████| 82/82 [01:19<00:00,  1.04it/s]


BLEU score for burmese: 15.568939784469132
Processing translation from nepali to English...


Translating nepali: 100%|██████████| 151/151 [03:49<00:00,  1.52s/it]


BLEU score for nepali: 42.31838992507398
