In [2]:
from torch.utils.data import Dataset, DataLoader
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers.optimization import AdamW
from nltk.translate.bleu_score import corpus_bleu
import torch
from datasets import load_dataset
from tqdm import tqdm

In [37]:
class LanguageDataset(Dataset):                                  

    def __init__(self, ne_file, en_file, max_length, tokenizer):
        self.ne_file = ne_file
        self.en_file = en_file
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.ne_token = []
        self.en_token = []
        self.en = []
        self.ne = []
        with open(self.ne_file) as file:
            for sentence in file:
                # is padding token 1?
                sentence = sentence.strip()
                tokens = tokenizer(sentence, max_length=self.max_length, return_tensors="pt", truncation=True, padding='max_length')
                self.ne.append(sentence)
                self.ne_token.append(tokens)
        with open(self.en_file) as file_en:
            for sentence in file_en:
                # is padding token 1?
                sentence = sentence.strip()
                tokens = tokenizer(sentence, max_length=self.max_length, return_tensors="pt", truncation=True, padding='max_length')
                self.en.append(sentence)
                self.en_token.append(tokens)
                    
    def __len__(self):
        return len(self.ne_token)
              
    # input_ids attention_mask encoder_mask decoder_mask 
    # come back and fix shitty [0]
    def __getitem__(self, idx):
        return {
            'ne_tokens': self.ne_token[idx]['input_ids'][0],
            'ne_mask': self.ne_token[idx]['attention_mask'][0],
            'en_tokens': self.en_token[idx]['input_ids'][0],
            'en_mask': self.en_token[idx]['attention_mask'][0],
            'en': self.en[idx],
            'ne': self.ne[idx]
        }

In [4]:
# datasets
DIR_PATH = "/workspace"
BATCH_SIZE = 32
# A flag to see whether we are fine-tuning the model or not
fine_tune = True

In [41]:
print('init models')
#initlaize model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name)
model = model.cuda()
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer.src_lang = "ne_NP"
tokenizer.tgt_lang = "en_XX"
print('done')

init models
done


In [7]:
print('prepping data')
train_dataset = LanguageDataset(f'{DIR_PATH}/train.ne_NP.txt', f'{DIR_PATH}/train.en_XX.txt', BATCH_SIZE, tokenizer)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
print('done')

prepping data
done


In [13]:
'''
Perform fine-tuning of mBART in case the flag is set appropriately
You can check out references like these to understand the code better:
    - https://colab.research.google.com/drive/1d2mSWLiv93X2hgt9WV8IJFms9dQ6arLn?usp=sharing
    - https://github.com/huggingface/transformers/issues/23185#issuecomment-1537690520
'''
if fine_tune:
    print('fine tuning')
    # Moving the model to CUDA
    model = model.cuda()

    optimizer = AdamW(model.parameters(), lr=1e-4)
    model.train()
        
    num_epochs = 1
    # Fine-tune for the specified number of epochs
    for i in range(num_epochs):
        print(f'epoch {i+1} of {num_epochs}')
        total_batches = len(train_loader)
        counter = 0
        
        for batch in tqdm(train_loader, total=total_batches, desc="Training", unit="batch"):
            counter += 1
            model_inputs = {
                'input_ids': batch['ne_tokens'].to('cuda'),
                'attention_mask': batch['ne_mask'].to('cuda')
            }
            labels = batch['en_tokens'].to('cuda')
            optimizer.zero_grad()
            output = model(**model_inputs, labels=labels)  # Forward pass
            loss = output.loss
            loss.backward()  # Backward pass
            optimizer.step()

fine tuning
epoch 1 of 1


Training: 100%|██████████| 5114/5114 [32:08<00:00,  2.65batch/s]


In [79]:
test_dataset = LanguageDataset(f'{DIR_PATH}/test.ne_NP.txt', f'{DIR_PATH}/test.en_XX.txt', BATCH_SIZE, tokenizer)

#init generated translations
generated_translations = []
reference = []

#for each nepali sentence, generate english translation, and add to generated translations
for sentence in test_dataset:

    input_ids = tokenizer(sentence['ne'], return_tensors="pt").input_ids.to('cuda')
    forced_bos_token_id = tokenizer.lang_code_to_id["ne_NP"]
    # input_ids = sentence['en_tokens'].to('cuda').reshape(1, -1)
    outputs = model.generate(input_ids=input_ids, forced_bos_token_id=forced_bos_token_id, max_length=BATCH_SIZE)

    english_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    generated_translations.append(english_translation)

    reference.append(sentence['en'])
    
hypotheses = [gen.split() for gen in generated_translations]
references = [[ref.split()] for ref in reference]

print("bleu score: ", bleu_score)

bleu score:  0.14348966088442014


In [76]:
# Test
#get nepali sentences and english references
dataset_np = load_dataset("text", data_files= {"train": f"{DIR_PATH}/train.ne_NP.txt", "test": f"{DIR_PATH}/test.ne_NP.txt"})
dataset_en = load_dataset("text", data_files={"train": f"{DIR_PATH}/train.en_XX.txt", "test": f"{DIR_PATH}/test.en_XX.txt"})
nepali_sentences = dataset_np["test"]
english_references = dataset_en["test"]

#init generated translations
generated_translations = []

#for each nepali sentence, generate english translation, and add to generated translations
for nepali_sentence in nepali_sentences:

    input_ids = tokenizer(nepali_sentence['text'], return_tensors="pt").input_ids.to('cuda')
    
    #find forced beginning of sentence token id
    forced_bos_token_id = tokenizer.lang_code_to_id["en_XX"]

    outputs = model.generate(input_ids=input_ids, forced_bos_token_id=forced_bos_token_id, max_length=BATCH_SIZE)

    #decode generated english translation back into a sentence
    english_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # add english sentence to generated translations
    generated_translations.append(english_translation)

#calculate bleu score
# map each sentence to a [ sentence.split() ]

references = [[reference.split()] for reference in english_references["text"]]
hypotheses = [gen.split() for gen in generated_translations]

bleu_score = corpus_bleu(references, hypotheses)
print("bleu score: ", bleu_score)

bleu score:  0.14348966088442014
