In [None]:
!pip install sacrebleu



In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)


In [None]:
import torch
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from datasets import Dataset
import sacrebleu

In [None]:
def read_binary_file(file_path):
    with open(file_path, 'rb') as file:
        lines = file.read().decode('utf-8').split('\n')
    return lines


gujarati_text = read_binary_file('devtest.guj_Gujr')
nepali_text = read_binary_file('devtest.npi_Deva')
burmese_text = read_binary_file('devtest.mya_Mymr')
khmer_text = read_binary_file('devtest.khm_Khmr')
galician_text = read_binary_file('devtest.glg_Latn')
english_labels = read_binary_file('devtest.eng_Latn')

In [None]:
model_name = 'facebook/mbart-large-50-many-to-one-mmt'
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

def encode(dataset, max_length=128):
    inputs = tokenizer(dataset['source'], padding="max_length", truncation=True, max_length=max_length)
    outputs = tokenizer(dataset['target'], padding="max_length", truncation=True, max_length=max_length)
    return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask,
            "labels": outputs.input_ids}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Gujarati

In [None]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/302.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1

In [None]:
from sklearn.model_selection import train_test_split

# Assuming you already have gujarati_text and english_labels
gujarati_text_train, gujarati_text_eval, english_labels_train, english_labels_eval = train_test_split(gujarati_text, english_labels, test_size=0.2, random_state=42)

# Create datasets
gujarati_train_dataset = Dataset.from_dict({'source': gujarati_text_train, 'target': english_labels_train})
gujarati_eval_dataset = Dataset.from_dict({'source': gujarati_text_eval, 'target': english_labels_eval})

# Encode datasets
encoded_gujarati_train = gujarati_train_dataset.map(encode, batched=True)
encoded_gujarati_eval = gujarati_eval_dataset.map(encode, batched=True)

Map:   0%|          | 0/810 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

In [None]:
tokenizer.src_lang = "gu_IN"

model = MBartForConditionalGeneration.from_pretrained(model_name).to('cuda')

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    fp16=True,
    gradient_accumulation_steps=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_gujarati_train,
    eval_dataset=encoded_gujarati_eval,
    tokenizer=tokenizer
)



ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,5.26194
1,No log,2.655127
2,No log,1.867862


TrainOutput(global_step=150, training_loss=4.515906168619792, metrics={'train_runtime': 166.2056, 'train_samples_per_second': 14.62, 'train_steps_per_second': 0.902, 'total_flos': 649055365496832.0, 'train_loss': 4.515906168619792, 'epoch': 2.955665024630542})

In [None]:
# Create datasets
gujarati_test_dataset = Dataset.from_dict({'source': gujarati_text_train, 'target': english_labels_train})
gujarati_eval_dataset = Dataset.from_dict({'source': gujarati_text_eval, 'target': english_labels_eval})

# Encode datasets
encoded_gujarati_train = gujarati_train_dataset.map(encode, batched=True)
encoded_gujarati_eval = gujarati_eval_dataset.map(encode, batched=True)

Map:   0%|          | 0/810 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

In [None]:
guj_test_fp = 'dev.guj_Gujr'
eng_test_fp = 'dev.eng_Latn'


read_binary_file('devtest.guj_Gujr')

gujarati_test_dataset = Dataset.from_dict({'source': read_binary_file(guj_test_fp), 'target': read_binary_file(eng_test_fp)})
encoded_gujarati_test = gujarati_test_dataset.map(encode, batched=True)


Map:   0%|          | 0/998 [00:00<?, ? examples/s]

In [None]:
def translate(src_lang, tokenizer, model, text):
    tokenizer.src_lang = src_lang
    encoded_text = tokenizer(text, return_tensors = "pt", padding = True).to('cuda')
    generated_tokens = model.generate(**encoded_text,
                                      forced_bos_token_id = tokenizer.lang_code_to_id['en_XX'])
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens = True)
    return translation


def batch_translate(src_lang, tokenizer, model, texts, batch_size=16):
    results = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_results = translate(src_lang, tokenizer, model, batch_texts)
        results.extend(batch_results)
    return results


In [None]:
gujarati_translations = batch_translate("gu_IN", tokenizer, model, read_binary_file('dev.guj_Gujr'))

In [None]:
def read_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    # Strip any extra whitespace characters
    sentences = [sentence.strip() for sentence in sentences]
    return sentences


eng_truth = [[sentence]for sentence in read_sentences('dev.eng_Latn')]

In [None]:
bleu = sacrebleu.corpus_bleu(gujarati_translations, eng_truth)
bleu

BLEU = 43.95 97.8/64.4/31.8/18.6 (BP = 1.000 ratio = 1.000 hyp_len = 46 ref_len = 46)

# Nepali


In [None]:
# Nepali
from sklearn.model_selection import train_test_split

# Assuming you already have nepali_text and english_labels,  #TODO: edit first arg of train_test_split
text_train, text_eval, english_labels_train, english_labels_eval = train_test_split(nepali_text, english_labels, test_size=0.2, random_state=42)

# Create datasets
train_dataset = Dataset.from_dict({'source': text_train, 'target': english_labels_train})
eval_dataset = Dataset.from_dict({'source': text_eval, 'target': english_labels_eval})

# Encode datasets
encoded_train = train_dataset.map(encode, batched=True)
encoded_eval = eval_dataset.map(encode, batched=True)

Map:   0%|          | 0/810 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

In [None]:
tokenizer.src_lang = "ne_NP" #TODO: edit

model = MBartForConditionalGeneration.from_pretrained(model_name).to('cuda')

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    fp16=True,
    gradient_accumulation_steps=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
    tokenizer=tokenizer
)



In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,5.035835
1,No log,2.474657
2,No log,1.704185


TrainOutput(global_step=150, training_loss=4.299611002604166, metrics={'train_runtime': 90.8216, 'train_samples_per_second': 26.756, 'train_steps_per_second': 1.652, 'total_flos': 649055365496832.0, 'train_loss': 4.299611002604166, 'epoch': 2.955665024630542})

In [None]:
test_fp = 'dev.npi_Deva' # TODO: edit
eng_test_fp = 'dev.eng_Latn'

test_dataset = Dataset.from_dict({'source': read_binary_file(test_fp), 'target': read_binary_file(eng_test_fp)})
encoded_test = test_dataset.map(encode, batched=True)


Map:   0%|          | 0/998 [00:00<?, ? examples/s]

In [None]:
nep_translations = batch_translate(tokenizer.src_lang, tokenizer, model, read_binary_file(test_fp))
bleu = sacrebleu.corpus_bleu(nep_translations, eng_truth)
bleu

BLEU = 75.37 100.0/83.3/66.0/58.7 (BP = 1.000 ratio = 1.000 hyp_len = 49 ref_len = 49)

# Burmese

In [None]:
# Burmese, mya_Mymr
tokenizer.src_lang = "mya_Mymr"

# Assuming you already have burmese_text and english_labels
text_train, text_eval, english_labels_train, english_labels_eval = train_test_split(burmese_text, english_labels, test_size=0.2, random_state=42)

# Create datasets
train_dataset = Dataset.from_dict({'source': text_train, 'target': english_labels_train})
eval_dataset = Dataset.from_dict({'source': text_eval, 'target': english_labels_eval})

# Encode datasets
encoded_train = train_dataset.map(encode, batched=True)
encoded_eval = eval_dataset.map(encode, batched=True)


Map:   0%|          | 0/810 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

In [None]:
model = MBartForConditionalGeneration.from_pretrained(model_name).to('cuda')


training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    fp16=True,
    gradient_accumulation_steps=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
    tokenizer=tokenizer
)



In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,5.309566
1,No log,2.833019
2,No log,1.914884


TrainOutput(global_step=150, training_loss=4.7055436197916665, metrics={'train_runtime': 93.0985, 'train_samples_per_second': 26.101, 'train_steps_per_second': 1.611, 'total_flos': 649055365496832.0, 'train_loss': 4.7055436197916665, 'epoch': 2.955665024630542})

In [None]:
test_fp = 'dev.mya_Mymr'

test_dataset = Dataset.from_dict({'source': read_binary_file(test_fp), 'target': read_binary_file(eng_test_fp)})
encoded_test = test_dataset.map(encode, batched=True)

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

In [None]:
burmese_translations = batch_translate(tokenizer.src_lang, tokenizer, model, read_binary_file(test_fp))

In [None]:
bleu = sacrebleu.corpus_bleu(burmese_translations, eng_truth) # TODO: edit
bleu

BLEU = 36.99 69.7/49.3/28.4/19.2 (BP = 1.000 ratio = 1.407 hyp_len = 76 ref_len = 54)

In [None]:
# Khmer, khm_Khmr
tokenizer.src_lang = "km_KH" # TODO: edit

# Assuming you already have burmese_text and english_labels, #TODO: edit first arg of train_test_split
text_train, text_eval, english_labels_train, english_labels_eval = train_test_split(khmer_text, english_labels, test_size=0.2, random_state=42)

# Create datasets
train_dataset = Dataset.from_dict({'source': text_train, 'target': english_labels_train})
eval_dataset = Dataset.from_dict({'source': text_eval, 'target': english_labels_eval})

# Encode datasets
encoded_train = train_dataset.map(encode, batched=True)
encoded_eval = eval_dataset.map(encode, batched=True)


Map:   0%|          | 0/810 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

In [None]:
model = MBartForConditionalGeneration.from_pretrained(model_name).to('cuda')


training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    fp16=True,
    gradient_accumulation_steps=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
    tokenizer=tokenizer
)



In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,5.251957
1,No log,2.727656
2,No log,1.966222


TrainOutput(global_step=150, training_loss=4.543141682942708, metrics={'train_runtime': 92.3922, 'train_samples_per_second': 26.301, 'train_steps_per_second': 1.624, 'total_flos': 649055365496832.0, 'train_loss': 4.543141682942708, 'epoch': 2.955665024630542})

In [None]:
test_fp = 'dev.khm_Khmr'

test_dataset = Dataset.from_dict({'source': read_binary_file(test_fp), 'target': read_binary_file(eng_test_fp)})
encoded_test = test_dataset.map(encode, batched=True)

# TODO: edit var name
khmer_translations = batch_translate(tokenizer.src_lang, tokenizer, model, read_binary_file(test_fp))


Map:   0%|          | 0/998 [00:00<?, ? examples/s]

In [None]:
bleu = sacrebleu.corpus_bleu(khmer_translations, eng_truth) # TODO: edit first arg name
bleu

BLEU = 38.53 83.3/46.8/28.3/20.0 (BP = 1.000 ratio = 1.000 hyp_len = 48 ref_len = 48)

# Galician

In [None]:
# Galician, glg_Latn
tokenizer.src_lang = "gl_ES" # TODO: edit

# Assuming you already have burmese_text and english_labels, #TODO: edit first arg of train_test_split
text_train, text_eval, english_labels_train, english_labels_eval = train_test_split(galician_text, english_labels, test_size=0.2, random_state=42)

# Create datasets
train_dataset = Dataset.from_dict({'source': text_train, 'target': english_labels_train})
eval_dataset = Dataset.from_dict({'source': text_eval, 'target': english_labels_eval})

# Encode datasets
encoded_train = train_dataset.map(encode, batched=True)
encoded_eval = eval_dataset.map(encode, batched=True)

Map:   0%|          | 0/810 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

In [None]:
model = MBartForConditionalGeneration.from_pretrained(model_name).to('cuda')


training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    fp16=True,
    gradient_accumulation_steps=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
    tokenizer=tokenizer
)



In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,5.084138
1,No log,2.481589
2,No log,1.701294


TrainOutput(global_step=150, training_loss=4.360506184895834, metrics={'train_runtime': 91.3423, 'train_samples_per_second': 26.603, 'train_steps_per_second': 1.642, 'total_flos': 649055365496832.0, 'train_loss': 4.360506184895834, 'epoch': 2.955665024630542})

In [None]:
test_fp = 'dev.glg_Latn'

test_dataset = Dataset.from_dict({'source': read_binary_file(test_fp), 'target': read_binary_file(eng_test_fp)})
encoded_test = test_dataset.map(encode, batched=True)

# TODO: edit var name
galician_translations = batch_translate(tokenizer.src_lang, tokenizer, model, read_binary_file(test_fp))

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

In [None]:
bleu = sacrebleu.corpus_bleu(galician_translations, eng_truth) # TODO: edit first arg name
bleu

BLEU = 46.67 85.0/55.9/37.9/26.3 (BP = 1.000 ratio = 1.111 hyp_len = 60 ref_len = 54)