In [1]:
import pandas as pd
import numpy as np
def read_binary_file(file_path):
    with open(file_path, 'rb') as file:
        lines = file.read().decode('utf-8').split('\n')
    return lines


gujarati_text = read_binary_file('../test_datasets/dev.guj_Gujr')
nepali_text = read_binary_file('../test_datasets/dev.npi_Deva')
burmese_text = read_binary_file('../test_datasets/dev.mya_Mymr')
khmer_text = read_binary_file('../test_datasets/dev.khm_Khmr')
galician_text = read_binary_file('../test_datasets/dev.glg_Latn')
english_labels = read_binary_file('../test_datasets/dev.eng_Latn')
english_labels = [[i] for i in english_labels]

In [2]:
# Correct language codes for M2M100
language_codes = {
    'gujarati': 'gu',
    'nepali': 'ne',
    'burmese': 'my',
    'khmer': 'km',
    'galician': 'gl',
    'english': 'en'
}

# Test sentences in different languages
input_texts = {
    'gujarati': gujarati_text,  
    'nepali': nepali_text,  
    'burmese': burmese_text,  
    'khmer': khmer_text,  
    'galician': galician_text  
}

In [7]:
import sacrebleu
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load the input files
def load_input_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Correct language codes for small-100
language_codes = {
    'gujarati': 'gu',
    'nepali': 'ne',
    'burmese': 'my',
    'khmer': 'km',
    'galician': 'gl',
    'english': 'en'
}

# Load the SMaLL-100 model and tokenizer
model_name = 'alirezamsh/small100'
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Function to translate text
def test_translation(source_text, source_lang, target_lang='en', max_length=128):
    input_text = f"{source_text} </s> {target_lang}"
    encoded_text = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    generated_tokens = model.generate(**encoded_text, max_length=max_length)
    translated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return translated_text

# Process translations
for lang, text in input_texts.items():
    print(f"Processing translation from {lang} to English...")
    translated_text = test_translation(text, language_codes[lang])
    blue_score = sacrebleu.corpus_bleu(translated_text, english_labels)
    print(f"BLEU score for {lang}: {blue_score.score}")

    # Save the translated text to a file
    with open(f"{lang}.txt", "w", encoding='utf-8') as file:
        file.write(translated_text)


Processing translation from gujarati to English...
BLEU score for gujarati: 0.0
Processing translation from nepali to English...
BLEU score for nepali: 0.0
Processing translation from burmese to English...
BLEU score for burmese: 0.0
Processing translation from khmer to English...
BLEU score for khmer: 0.0
Processing translation from galician to English...
BLEU score for galician: 0.0


In [10]:
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load the SMaLL-100 model and tokenizer
model_name = 'alirezamsh/small100'
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

def translate(src_lang, tokenizer, model, text):
    tokenizer.src_lang = src_lang
    encoded_text = tokenizer(text, return_tensors="pt", padding=True)
    generated_tokens = model.generate(**encoded_text, 
                                      forced_bos_token_id=tokenizer.lang_code_to_id['en'])
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return translation

def batch_translate(src_lang, tokenizer, model, texts, batch_size=16):
    results = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_results = translate(src_lang, tokenizer, model, batch_texts)
        results.extend(batch_results)
    return results

# Perform translations
gujarati_translations = batch_translate("gu", tokenizer, model, gujarati_text)
nepali_translations = batch_translate("ne", tokenizer, model, nepali_text)
burmese_translations = batch_translate("my", tokenizer, model, burmese_text)
khmer_translations = batch_translate("km", tokenizer, model, khmer_text)
galician_translations = batch_translate("gl", tokenizer, model, galician_text)

# Save translations to files
def save_translations(file_name, translations):
    with open(file_name, 'w', encoding='utf-8') as file:
        for line in translations:
            file.write(f"{line}\n")

save_translations("gujarati_translations.txt", gujarati_translations)
save_translations("nepali_translations.txt", nepali_translations)
save_translations("burmese_translations.txt", burmese_translations)
save_translations("khmer_translations.txt", khmer_translations)
save_translations("galician_translations.txt", galician_translations)


In [11]:
gujarati_bleu = sacrebleu.corpus_bleu(gujarati_translations, english_labels)
print(f"BLEU score on Gujarati: {gujarati_bleu.score}")

nepali_bleu = sacrebleu.corpus_bleu(nepali_translations, english_labels)
print(f"BLEU score on Nepali: {nepali_bleu.score}")

burmese_bleu = sacrebleu.corpus_bleu(burmese_translations, english_labels)
print(f"BLEU score on Burmese: {burmese_bleu.score}")

khmer_bleu = sacrebleu.corpus_bleu(khmer_translations, english_labels)
print(f"BLEU score on Khmer: {khmer_bleu.score}")

galician_bleu = sacrebleu.corpus_bleu(galician_translations, english_labels)
print(f"BLEU score on Galician: {galician_bleu.score}")

# overall_translations = [[gujarati_translations[i], nepali_translations[i], 
#                          burmese_translations[i], khmer_translations[i], 
#                          galician_translations[i]] for i in range(len(english_labels))]

# overall_bleu = sacrebleu.corpus_bleu(english_labels, overall_translations)
# print(f"BLEU score on Overall: {overall_bleu.score}")

BLEU score on Gujarati: 0.43384866842485925
BLEU score on Nepali: 0.9896324688555468
BLEU score on Burmese: 8.9730240870212
BLEU score on Khmer: 46.713797772819994
BLEU score on Galician: 1.1448714311538606
