In [None]:
%%capture
!pip install transformers
!pip install accelerate
!pip install peft
!pip install datasets
!pip install unbabel-comet
!pip install wandb

In [None]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mgianfree_romani[0m ([33mmt2magic[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import pandas as pd
from tqdm import tqdm
import os

import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.translate.bleu_score import SmoothingFunction
from comet import download_model, load_from_checkpoint

import warnings
warnings.filterwarnings("ignore")

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, BloomModel, MBartForConditionalGeneration, MBart50TokenizerFast
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType
from datasets import load_dataset, DatasetDict
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup

import logging
logging.disable(logging.CRITICAL)

import wandb

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Start a Weights & Biases run in project 'translated-challenge' under the
# 'mt2magic' entity; later cells attach the run config and log metrics to it.
wandb.init(project='translated-challenge', entity='mt2magic')

[34m[1mwandb[0m: Currently logged in as: [33mgianfree_romani[0m ([33mmt2magic[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Model family selector. Later cells branch on this value to choose the
# checkpoint and tokenizer, and FloresDataset uses it to decide whether the
# task `prefix` is prepended to source sentences.
m = "t5" # "bloom" "mbart" "nllb" "t5"

In [None]:
# Three-letter language codes for the translation direction. They are
# interpolated into the FLORES file names and passed as the config name to
# `load_dataset` further down.
src_lang ="eng"
trg_lang = "spa"

In [None]:
# Task instruction placed in front of source sentences for prompt-style models.
# NOTE(review): the language names are hard-coded here and do not follow
# `src_lang` / `trg_lang`; keep them in sync manually.
prefix = "translate English to Spanish: "

# Evaluator

In [None]:
class Evaluator:
    """Score machine translations with BLEU, chrF and COMET.

    Every method works on a pandas DataFrame with three text columns:
    ``source`` (input sentence), ``translation`` (system output) and
    ``target`` (reference translation).
    """

    # File name used when a caller passes a directory as ``save_path``.
    DEFAULT_RESULT_FILENAME = 'df_result_with_evaluation.csv'

    def __init__(self, model_name='Unbabel/wmt22-comet-da'):
        """
            Download the COMET checkpoint and remember where it was stored.
            model_name (:obj:`str`): COMET model name, see https://huggingface.co/Unbabel
        """
        self.COMET_model_path = download_model(model_name, saving_directory='./models/')

    @staticmethod
    def _effective_gpus(gpu_numbers):
        """Return ``gpu_numbers`` when CUDA is available, otherwise 0 (CPU)."""
        return gpu_numbers if torch.cuda.is_available() else 0

    @staticmethod
    def _comet_inputs(df_evaluation):
        """Build the list of ``{'src', 'mt', 'ref'}`` dicts passed to COMET's ``predict``."""
        return [
            {'src': src, 'mt': mt, 'ref': ref}
            for src, mt, ref in zip(df_evaluation['source'],
                                    df_evaluation['translation'],
                                    df_evaluation['target'])
        ]

    @classmethod
    def _prepare_csv_path(cls, save_path):
        """
            Turn ``save_path`` into a writable CSV file path.
            A path that ends with a separator or is an existing directory is
            treated as a directory and gets the default file name appended.
            Only the *parent* directory is created, never the file path itself.
        """
        save_path = os.fspath(save_path)
        if save_path.endswith(('/', os.sep)) or os.path.isdir(save_path):
            save_path = os.path.join(save_path, cls.DEFAULT_RESULT_FILENAME)

        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        return save_path

    def _score_and_save(self, df_evaluation, save_path, batch_size, gpu_numbers):
        """Add the three sentence-level metric columns and write the result as CSV."""
        df_evaluation = self.calculate_sentence_bleu(df_evaluation)
        df_evaluation = self.calculate_sentence_chrf(df_evaluation)
        df_evaluation = self.calculate_COMET(df_evaluation
                                             , batch_size=batch_size, gpu_numbers=gpu_numbers)

        df_evaluation.to_csv(self._prepare_csv_path(save_path), sep=',')
        return df_evaluation

    def calculate_sentence_bleu(self, df_evaluation):
        """
            Calculating the sentence BLEU score for each translation.
            Adds a float 'BLEU' column to ``df_evaluation`` (in place) and returns it.
        """
        smoothie = SmoothingFunction().method4
        scores = [
            sentence_bleu([word_tokenize(reference)], word_tokenize(hypothesis)
                          , smoothing_function=smoothie)
            for reference, hypothesis in zip(df_evaluation['target'], df_evaluation['translation'])
        ]
        # Assign by position with an explicit float dtype: the column never starts
        # out as int, and rows are not addressed through a possibly non-unique index.
        df_evaluation['BLEU'] = pd.Series(scores, index=df_evaluation.index, dtype=float)
        return df_evaluation

    def calculate_sentence_chrf(self, df_evaluation):
        """
            Calculating the sentence chrf score for each translation.
            Adds a float 'chrf' column to ``df_evaluation`` (in place) and returns it.
        """
        scores = [
            sentence_chrf(reference, hypothesis)
            for reference, hypothesis in zip(df_evaluation['target'], df_evaluation['translation'])
        ]
        df_evaluation['chrf'] = pd.Series(scores, index=df_evaluation.index, dtype=float)
        return df_evaluation

    def calculate_COMET(self, df_evaluation, batch_size=8, gpu_numbers=1):
        """
            Calculating the COMET score for each translation.
            The checkpoint downloaded in ``__init__`` is used (by default
            'Unbabel/wmt22-comet-da', see https://huggingface.co/Unbabel).
            Keys and Values:
                df_evaluation (:obj:`pandas dataframe'): Dataframe with source, translation and target columns
                batch_size (:obj: 'int'): batch_size
                gpu_numbers (:obj: 'int'): Number of GPUs (forced to 0 when CUDA is unavailable)
            Output:
                dataframe (:obj: 'pandas dataframe'): The same dataframe with a 'COMET' column
        """
        data = self._comet_inputs(df_evaluation)
        scores = []
        if data:
            model = load_from_checkpoint(self.COMET_model_path)
            # One predict call for all rows; calling it once per row repeats the
            # prediction set-up for every sentence and never fills a batch.
            model_output = model.predict(data, batch_size=batch_size
                                         , gpus=self._effective_gpus(gpu_numbers))
            scores = list(model_output.scores)

        df_evaluation['COMET'] = pd.Series(scores, index=df_evaluation.index, dtype=float)
        return df_evaluation

    def evaluating_from_dataframe(self, dataframe, save_path='/data/df_result_with_evaluation.csv'
                                  , COMET_model_batch_size=8, COMET_model_gpu_numbers=1):
        """
                    Evaluating translations from provided dataframe.
                    Keys and Values:
                        dataframe (:obj:`pandas dataframe'): Translation dataframe with agreed structure
                            (it is copied, the caller's dataframe is left untouched)
                        save_path (:obj: 'str'): CSV file path (or directory) for saving the result dataframe
                    Output:
                        dataframe (:obj: 'pandas dataframe'): The dataframe with 3 evaluation metrics columns (BLEU, chrf, COMET)
        """
        return self._score_and_save(dataframe.copy(), save_path
                                    , COMET_model_batch_size, COMET_model_gpu_numbers)

    def evaluating_from_file_path(self, prediction_file_path, sep=',', encoding='utf-8', save_path='/data/'
                                  , COMET_model_batch_size=8, COMET_model_gpu_numbers=1):
        """
                    Evaluating translations from provided csv file path.
                    Keys and Values:
                        prediction_file_path (:obj:`str'): CSV file path with agreed structure
                        sep (:obj: 'str'): separator of csv file
                        encoding (:obj: 'str'): encoding of csv file
                        save_path (:obj: 'str'): CSV file path (or directory) for saving the result dataframe
                    Output:
                        dataframe (:obj: 'pandas dataframe'): The dataframe with 3 evaluation metrics columns (BLEU, chrf, COMET)
        """
        df_evaluation = pd.read_csv(prediction_file_path, sep=sep, encoding=encoding)
        return self._score_and_save(df_evaluation, save_path
                                    , COMET_model_batch_size, COMET_model_gpu_numbers)

    def calculate_corpus_bleu(self, df_evaluation):
        """
            Calculating the corpus BLEU score over entire translations.
        """
        # Imported locally so the method keeps working even if a notebook cell
        # rebinds the module-level name `corpus_bleu` to a score value.
        from nltk.translate.bleu_score import corpus_bleu as nltk_corpus_bleu

        list_of_references = [[word_tokenize(sentence)] for sentence in df_evaluation['target'].values]
        hypotheses = [word_tokenize(sentence) for sentence in df_evaluation['translation'].values]

        smoothie = SmoothingFunction().method4
        return nltk_corpus_bleu(list_of_references, hypotheses, smoothing_function=smoothie)

    def calculate_mean_bleu(self, df_evaluation):
        """
            Calculating the mean BLEU score over entire translations.
            Requires the 'BLEU' column produced by ``calculate_sentence_bleu``.
        """
        return df_evaluation.loc[:, 'BLEU'].mean()

    def calculate_corpus_chrf(self, df_evaluation):
        """
            Calculating the corpus chrf score over entire translations.
        """
        # Local import for the same reason as in ``calculate_corpus_bleu``.
        from nltk.translate.chrf_score import corpus_chrf as nltk_corpus_chrf

        list_of_references = [[sentence] for sentence in df_evaluation['target'].values]
        hypotheses = [[sentence] for sentence in df_evaluation['translation'].values]

        return nltk_corpus_chrf(list_of_references, hypotheses)

    def calculate_mean_chrf(self, df_evaluation):
        """
            Calculating the mean chrf score over entire translations.
            Requires the 'chrf' column produced by ``calculate_sentence_chrf``.
        """
        return df_evaluation.loc[:, 'chrf'].mean()

    def calculate_system_score_COMET(self, df_evaluation, batch_size=256, gpu_numbers=1):
        """
            Calculate system_score (mean) COMET score over entire translations.
            The checkpoint downloaded in ``__init__`` is used (by default
            'Unbabel/wmt22-comet-da', see https://huggingface.co/Unbabel).
            Keys and Values:
                df_evaluation (:obj:`pandas dataframe'): Dataframe contains source text, reference text ,and translation text
                batch_size (:obj: 'int'): batch_size
                gpu_numbers (:obj: 'int'): Number of GPUs (forced to 0 when CUDA is unavailable)
            Output:
                system_score (:obj: 'float'): The mean COMET score over entire translations.
        """
        model = load_from_checkpoint(self.COMET_model_path)
        data_list = self._comet_inputs(df_evaluation)

        model_output = model.predict(data_list, batch_size=batch_size
                                     , gpus=self._effective_gpus(gpu_numbers))
        return model_output.system_score

#  PEFT

This is valid for sequence-to-sequence models, like:

*   T5
*   BLOOM
*   mBART
*   NLLB

T5 (and its variants) and BLOOM need a task prefix before the source sentence; mBART and NLLB do not.

In [None]:
# mBART-50 identifies languages with its own locale-style tags, not with the
# three-letter codes used elsewhere in this notebook. Codes missing from this
# table fall back to the "<code>_XX" pattern.
MBART50_LANG_CODES = {
  "eng": "en_XX",
  "spa": "es_XX",
  "fra": "fr_XX",
  "ita": "it_IT",
  "deu": "de_DE",
  "por": "pt_XX",
}

# Pick the checkpoint and the matching tokenizer for the selected model family.
if m == "t5":
  #model_name = "google/flan-t5-small"
  model_name = "google/mt5-small"
  #model_name = "google/flan-ul2"
  tokenizer = T5Tokenizer.from_pretrained(model_name)
elif m == "bloom":
  model_name = "bigscience/mt0-small"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
elif m == "mbart":
  model_name = "facebook/mbart-large-50"
  tokenizer = MBart50TokenizerFast.from_pretrained(
      model_name,
      src_lang=MBART50_LANG_CODES.get(src_lang, "{}_XX".format(src_lang)),
      tgt_lang=MBART50_LANG_CODES.get(trg_lang, "{}_XX".format(trg_lang)),
  )
elif m == "nllb":
  model_name = "facebook/nllb-200-distilled-600M" # "facebook/nllb-200-distilled-1.3B"
  # NLLB tags are "<code>_<script>"; the "_Latn" script suffix mirrors the
  # FLORES file names used by this notebook, so it only fits Latin-script languages.
  tokenizer = AutoTokenizer.from_pretrained(
      model_name,
      src_lang="{}_Latn".format(src_lang),
      tgt_lang="{}_Latn".format(trg_lang),
  )
else:
  # Fail here with a clear message instead of a NameError on `model_name` below.
  raise ValueError(
      'Unknown model family {!r}; expected one of "t5", "bloom", "mbart", "nllb"'.format(m)
  )

print("Model chosen: {}".format(model_name))

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Model chosen: google/mt5-small


In [None]:
# Training hyper-parameters; the next cell records them in the W&B run config.
max_length = 256  # sequence-length setting
lr = 1e-3         # learning rate handed to the optimizer
num_epochs = 3    # number of passes of the training loop
batch_size = 8    # batch size of both DataLoaders

In [None]:
# Record the experiment settings on the active W&B run so that every run can
# be traced back to the model and hyper-parameters that produced it.
config = wandb.config
config.model = model_name
config.batch_size = batch_size
config.learning_rate = lr
config.max_length = max_length
config.epochs = num_epochs

## Data

In [None]:
class FloresDataset(Dataset):
  """Parallel corpus read from two line-aligned text files (one sentence per line).

  Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
  tensors, every one padded or truncated to ``max_length``.
  """

  def __init__(self, src_file, trg_file, tokenizer, max_length=128, source_prefix=None):
    """
    Args:
      src_file: path of the source-language file.
      trg_file: path of the target-language file; line i translates line i of src_file.
      tokenizer: tokenizer callable accepting ``text`` and ``text_target``.
      max_length: length every sequence is padded/truncated to.
      source_prefix: string put in front of every source sentence. When None
        (default) the notebook globals decide: ``prefix`` is used if ``m`` is
        "t5" or "bloom", otherwise nothing is prepended.

    Raises:
      ValueError: if the two files do not contain the same number of lines.
    """
    self.tokenizer = tokenizer
    self.max_length = max_length

    if source_prefix is None:
      source_prefix = prefix if m in ["t5", "bloom"] else ""

    # Explicit UTF-8 so non-ASCII text is read the same way on every platform.
    with open(src_file, 'r', encoding='utf-8') as f:
      self.src_sentences = [source_prefix + line.strip() for line in f]

    with open(trg_file, 'r', encoding='utf-8') as f:
      self.trg_sentences = [line.strip() for line in f]

    if len(self.src_sentences) != len(self.trg_sentences):
      raise ValueError(
        "Source and target files are not aligned: {} vs {} lines".format(
          len(self.src_sentences), len(self.trg_sentences)
        )
      )

  def __len__(self):
    """Number of sentence pairs."""
    return len(self.src_sentences)

  def __getitem__(self, index):
    """Tokenize the sentence pair at ``index`` and return model-ready tensors."""
    # `text_target` makes the tokenizer encode the reference in target mode, so
    # multilingual tokenizers (mBART, NLLB) attach the target-language tag.
    encoding = self.tokenizer(
      self.src_sentences[index],
      text_target=self.trg_sentences[index],
      truncation=True,
      padding='max_length',
      max_length=self.max_length,
      return_tensors='pt',
    )

    # squeeze(0) drops only the batch axis, even when max_length is 1.
    input_ids = encoding['input_ids'].squeeze(0)
    attention_mask = encoding['attention_mask'].squeeze(0)
    labels = encoding['labels'].squeeze(0).clone()

    # Padding positions must not contribute to the loss: -100 is the index
    # ignored by the cross-entropy loss of the Hugging Face seq2seq models.
    pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
    if pad_token_id is not None:
      labels[labels == pad_token_id] = -100

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

In [None]:
# FLORES "dev" is used for training and "devtest" for evaluation.
# `max_length` is passed explicitly so the datasets use the value logged to
# W&B instead of silently falling back to the class default.
train_dataset = FloresDataset(
    "/content/{}_Latn.dev".format(src_lang),
    "/content/{}_Latn.dev".format(trg_lang),
    tokenizer,
    max_length=max_length,
)
eval_dataset = FloresDataset(
    "/content/{}_Latn.devtest".format(src_lang),
    "/content/{}_Latn.devtest".format(trg_lang),
    tokenizer,
    max_length=max_length,
)
print("Number of samples in the train set: {}".format(len(train_dataset)))
print("Number of samples in the eval set: {}".format(len(eval_dataset)))

FileNotFoundError: ignored

In [None]:
# Shuffle only the training data. Evaluation keeps the dataset order so that
# per-batch predictions can be matched back to their sentences.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

NameError: ignored

## Fine-Tuning

In [None]:
# Choose the compute device: the first CUDA GPU when one is visible, else the CPU.
# AVAIL_GPUS ends up holding the number of visible GPUs (0 on CPU-only machines).
if not torch.cuda.is_available():
    AVAIL_GPUS = 0
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    AVAIL_GPUS = torch.cuda.device_count()
    print(f'There are {AVAIL_GPUS} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

In [None]:
# Load the base seq2seq checkpoint, then wrap it with LoRA adapters.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

peft_model = get_peft_model(model, peft_config)
peft_model

In [None]:
# Sanity check after wrapping the model: PEFT's own report of its trainable
# parameters (presumably trainable vs. total counts — confirm against the output).
peft_model.print_trainable_parameters()

In [None]:
# AdamW over the PEFT model's parameters, with a learning rate that decays
# linearly to zero over the whole run (no warm-up steps).
total_training_steps = len(train_dataloader) * num_epochs

optimizer = torch.optim.AdamW(peft_model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# Keep the optimizer choice next to the other settings of the W&B run.
config.optimizer = "AdamW"

In [None]:
# Training and evaluation: every epoch runs one optimisation pass over
# `train_dataloader` followed by one loss-only pass over `eval_dataloader`.
peft_model = peft_model.to(device)
# Let W&B track the model (log="all").
wandb.watch(peft_model, log="all")

for epoch in range(num_epochs):
    # ---- training pass ----
    peft_model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Move every tensor of the batch to the compute device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains `labels`, so the forward pass returns the loss.
        outputs = peft_model(**batch)
        loss = outputs.loss
        # Accumulate a detached copy so the autograd graph is not kept alive.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients here so the next step starts from zero.
        optimizer.zero_grad()

    # ---- evaluation pass ----
    peft_model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = peft_model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Decoded argmax of the logits from the labelled forward pass (not
        # `generate` output). NOTE(review): `eval_preds` is not read by any of
        # the cells visible here.
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Per-epoch means over batches; perplexity is exp(mean loss).
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
    # Epochs are reported 1-based to W&B.
    wandb.log({'epoch': epoch + 1, 'train_loss': train_epoch_loss, 'eval_loss':eval_epoch_loss})

## Evaluation

In [None]:
def get_predictions(model, samples, target):
  """Translate `samples` with `model` and pair each output with its reference.

  Args:
    model: seq2seq model exposing `generate` (plain or PEFT-wrapped).
    samples: source sentences.
    target: reference translations, aligned with `samples`.

  Returns:
    DataFrame with the columns "source", "translation" (model output) and
    "target" (reference), as read by `Evaluator`.

  Raises:
    ValueError: if `samples` and `target` differ in length.
  """
  if len(samples) != len(target):
    raise ValueError(
      "samples and target must be aligned: {} vs {} sentences".format(len(samples), len(target))
    )

  # Same rule as FloresDataset: only the prompt-style models get the prefix.
  source_prefix = prefix if m in ["t5", "bloom"] else ""
  # Run on whatever device the model lives on instead of assuming CUDA.
  model_device = next(model.parameters()).device

  results = []
  for sentence, reference in zip(samples, target):
    inputs = tokenizer(source_prefix + sentence, return_tensors="pt").to(model_device)
    with torch.no_grad():
      # Without an explicit limit `generate` stops at its short default length
      # and cuts translations off.
      output = model.generate(**inputs, max_new_tokens=max_length)
    # Strip pad/eos markers so they do not leak into the metric computation.
    translation = tokenizer.decode(output[0], skip_special_tokens=True)
    results.append([sentence, translation, reference])

  # Column labels follow the order of the values in each row: the model output
  # is the "translation", the reference is the "target".
  return pd.DataFrame(results, columns=["source", "translation", "target"])

In [None]:
# Load the FLORES-101 data for the source and the target language.
# NOTE(review): the variable names are historical — `data_eng` holds `src_lang`
# and `data_fra` holds `trg_lang` (currently "spa", not French).
data_eng = load_dataset("gsarti/flores_101",src_lang)
data_fra = load_dataset("gsarti/flores_101",trg_lang)

In [None]:
# Take the first five devtest sentences and their references for a quick check.
samples = data_eng["devtest"]["sentence"][:5]
target = data_fra["devtest"]["sentence"][:5]
# NOTE(review): `results` is not read by any cell visible here.
results = []

In [None]:
evaluated_version = "peft" # "peft" or anything else to evaluate the normal model

In [None]:
# Translate the samples with either the untouched base checkpoint (reloaded
# from the hub) or the LoRA fine-tuned model.
if evaluated_version != "peft":
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
  df = get_predictions(model, samples, target)
else:
  df = get_predictions(peft_model, samples, target)

In [None]:
# Create the metric helper (its constructor downloads the COMET checkpoint) and
# add the BLEU, chrf and COMET columns; the result is also written as CSV to
# the method's default `save_path`.
# NOTE(review): the name `eval` shadows Python's built-in `eval` from here on.
eval = Evaluator()
df_translation = eval.evaluating_from_dataframe(df)
df_translation

In [None]:
# Corpus-level and mean sentence-level scores over the evaluated samples.
# NOTE(review): `corpus_bleu` and `corpus_chrf` rebind the names imported from
# nltk at the top of the notebook; after this cell those names hold numbers,
# not the nltk functions.
corpus_bleu = eval.calculate_corpus_bleu(df_translation)
mean_bleu = eval.calculate_mean_bleu(df_translation)
corpus_chrf = eval.calculate_corpus_chrf(df_translation)
mean_chrf = eval.calculate_mean_chrf(df_translation)
mean_comet = eval.calculate_system_score_COMET(df_translation)
print('*** *** ***')
print(f'Corpus BLEU: {corpus_bleu}')
print(f'Mean BLEU: {mean_bleu}')
print('*** *** ***')
print(f'Corpus chrf: {corpus_chrf}')
print(f'Mean chrf: {mean_chrf}')
print('*** *** ***')
print(f'\nMean COMET: {mean_comet}')
print('*** *** ***')

In [None]:
wandb.log({'corpus_bleu': corpus_bleu, 'mean_bleu': mean_bleu, 'corpus_chrf': corpus_chrf, 'mean_chrf': mean_chrf, 'mean_comet':mean_comet})