In [None]:
# Mount Google Drive at /content/drive; the data files read later in the notebook live under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
# Install the third-party packages used by this notebook (output suppressed by %%capture).
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases
# than the ones that produced the stored outputs.
!pip install transformers
!pip install accelerate
!pip install peft
!pip install datasets
!pip install tensorRT
!pip install unbabel-comet
!pip install wandb
!pip install bitsandbytes
!pip install pytorch_lightning

In [None]:
# Diagnostic: list every libnvinfer shared library found on the filesystem.
!sudo find / -name libnvinfer*.so* -print

find: ‘/proc/54/task/54/net’: Invalid argument
find: ‘/proc/54/net’: Invalid argument
/usr/local/lib/python3.9/dist-packages/tensorrt/libnvinfer.so.8
/usr/local/lib/python3.9/dist-packages/tensorrt/libnvinfer_plugin.so.8
/usr/local/lib/python3.9/dist-packages/tensorrt/libnvinfer_builder_resource.so.8.6.0


In [None]:
import pandas as pd
from tqdm import tqdm
import os

import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
from nltk.translate.chrf_score import sentence_chrf, corpus_chrf
from nltk.translate.bleu_score import SmoothingFunction
from comet import download_model, load_from_checkpoint

import warnings
warnings.filterwarnings("ignore")

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, BloomModel, MBartForConditionalGeneration, MBart50TokenizerFast
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType, prepare_model_for_int8_training
from datasets import load_dataset, DatasetDict
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup

import logging
logging.disable(logging.CRITICAL)

import wandb

from pytorch_lightning import (
    LightningDataModule, LightningModule, 
    Trainer, seed_everything)
from pytorch_lightning.loggers import WandbLogger
from sklearn.model_selection import train_test_split
from typing import Optional

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


In [None]:
# Model family to fine-tune: "t5" or "bloom" (possibly "mbart" / "nllb" in the future).
m = "t5"

In [None]:
# Which model is evaluated later: "peft" (the LoRA fine-tuned model) or "normal" (the plain pretrained checkpoint).
method = "peft"

# Evaluator

In [None]:
class Evaluator:
    """Scores machine translations with BLEU, chrF and COMET.

    Every method that takes a dataframe expects the columns ``source``,
    ``target`` (reference translation) and ``translation`` (system output).
    The sentence-level methods add their score columns to the dataframe that
    is passed in and return that same object.
    """

    # File name used when a ``save_path`` argument designates a directory.
    DEFAULT_RESULT_FILENAME = 'df_result_with_evaluation.csv'

    # n-gram weights for BLEU-2, BLEU-3 and BLEU-4, in that order.
    BLEU_WEIGHTS = [
        (1./2., 1./2.),
        (1./3., 1./3., 1./3.),
        (1./4., 1./4., 1./4., 1./4.)
    ]

    def __init__(self, model_name='Unbabel/wmt22-comet-da'):
        """Download the COMET checkpoint ``model_name`` into ``./models/``."""
        self.COMET_model_path = download_model(model_name, saving_directory='./models/')
        # 0 doubles as the "not computed yet" marker read by get_system_score_COMET().
        self.COMET_sytem_score = 0

    @classmethod
    def _resolve_save_path(cls, save_path):
        """Turn ``save_path`` into a writable CSV file path.

        A path that ends with a separator or names an existing directory gets
        ``DEFAULT_RESULT_FILENAME`` appended; the parent directory is created
        when missing so that ``DataFrame.to_csv`` does not fail on it.
        """
        save_path = str(save_path)
        if save_path.endswith(('/', os.sep)) or os.path.isdir(save_path):
            save_path = os.path.join(save_path, cls.DEFAULT_RESULT_FILENAME)
        parent = os.path.dirname(save_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        return save_path

    def _predict_COMET(self, dataframe, batch_size, gpu_numbers):
        """Load the COMET checkpoint and run it over every row of ``dataframe``."""
        if not torch.cuda.is_available():
            gpu_numbers = 0

        model = load_from_checkpoint(self.COMET_model_path)
        data_list = [
            {'src': str(src), 'mt': str(mt), 'ref': str(ref)}
            for src, mt, ref in zip(dataframe['source'], dataframe['translation'], dataframe['target'])
        ]
        return model.predict(data_list, batch_size=batch_size, gpus=gpu_numbers)

    def calculate_sentence_bleu(self, dataframe):
        """
        Calculating the sentence BLEU score for each translation.
        Adds the float columns BLEU2, BLEU3 and BLEU4 and returns the dataframe.
        """
        smoothie = SmoothingFunction().method4
        scores = [
            sentence_bleu([word_tokenize(str(ref))], word_tokenize(str(hyp)),
                          self.BLEU_WEIGHTS, smoothing_function=smoothie)
            for ref, hyp in zip(dataframe['target'], dataframe['translation'])
        ]
        # Columns are assigned as whole float lists so they never start out with an int dtype.
        for position, column in enumerate(('BLEU2', 'BLEU3', 'BLEU4')):
            dataframe[column] = [float(row_scores[position]) for row_scores in scores]

        return dataframe

    def calculate_sentence_chrf(self, dataframe):
        """
        Calculating the sentence chrf score for each translation.
        Adds the float column chrf and returns the dataframe.
        """
        dataframe['chrf'] = [
            float(sentence_chrf(str(ref), str(hyp)))
            for ref, hyp in zip(dataframe['target'], dataframe['translation'])
        ]
        return dataframe

    def calculate_COMET(self, dataframe, batch_size=16, gpu_numbers=1):
        """
        Calculating the COMET score for each translation and also COMET sytem_score for entire translations.
        Args
            batch_size (:obj: 'int'): batch_size
            gpu_numbers (:obj: 'int'): Number of GPUs (forced to 0 when CUDA is unavailable)
        Returns
            dataframe with added COMET score
        """
        model_output = self._predict_COMET(dataframe, batch_size, gpu_numbers)
        dataframe['COMET'] = model_output.scores

        # Cache the system score so get_system_score_COMET() can return it without recomputation.
        self.COMET_sytem_score = model_output.system_score

        return dataframe

    def evaluating_from_dataframe(self, dataframe, save_path='/data/df_result_with_evaluation.csv'
                                  , COMET_model_batch_size=8, COMET_model_gpu_numbers=1):
        """
        Evaluating translations from the provided dataframe.
        Args
            dataframe (:obj:`pandas dataframe'): Translation dataframe with agreed structure
            save_path (:obj: 'str'): CSV file path for the result; a directory path is also
                accepted, in which case DEFAULT_RESULT_FILENAME is written inside it
        Returns
            dataframe (:obj: 'pandas dataframe'): The dataframe with the evaluation metric columns (BLEU, chrf, COMET)
        """
        dataframe = self.calculate_sentence_bleu(dataframe)
        dataframe = self.calculate_sentence_chrf(dataframe)
        dataframe = self.calculate_COMET(dataframe
                                         , batch_size=COMET_model_batch_size, gpu_numbers=COMET_model_gpu_numbers)

        dataframe.to_csv(self._resolve_save_path(save_path), sep=',')
        return dataframe

    def evaluating_from_file_path(self, prediction_file_path, sep=',', encoding='utf-8', save_path='/data/'
                                  , COMET_model_batch_size=8, COMET_model_gpu_numbers=1):
        """
        Evaluating translations from the provided csv file path.
        Args
            prediction_file_path (:obj:`str'): CSV file path with agreed structure
            sep (:obj: 'str'): seperator of csv file
            encoding (:obj: 'str'): encoding of csv file
            save_path (:obj: 'str'): CSV file path (or directory) for saving the result dataframe
        Returns
            dataframe (:obj: 'pandas dataframe'): The dataframe with the evaluation metric columns (BLEU, chrf, COMET)
        """
        dataframe = pd.read_csv(prediction_file_path, sep=sep, encoding=encoding)
        return self.evaluating_from_dataframe(
            dataframe, save_path=save_path,
            COMET_model_batch_size=COMET_model_batch_size,
            COMET_model_gpu_numbers=COMET_model_gpu_numbers)

    def calculate_corpus_bleu(self, dataframe):
        """
        Calculating the corpus BLEU score over entire translations.
        Args
            dataframe (:obj:`pandas dataframe`):
        Return
            dictionary (:obj: `dict`): dictionary of BLEU2, BLEU3, and BLEU4 scores
        """
        # Imported locally under a private alias: notebook cells rebind the global
        # name `corpus_bleu` to a result, which would otherwise break later calls.
        from nltk.translate.bleu_score import corpus_bleu as _nltk_corpus_bleu

        list_of_references = [[word_tokenize(str(sentence))] for sentence in dataframe['target'].values]
        hypotheses = [word_tokenize(str(sentence)) for sentence in dataframe['translation'].values]

        smoothie = SmoothingFunction().method4
        bleu_corpus_scores = _nltk_corpus_bleu(list_of_references, hypotheses,
                                               self.BLEU_WEIGHTS, smoothing_function=smoothie)
        return {'BLEU2': bleu_corpus_scores[0], 'BLEU3': bleu_corpus_scores[1], 'BLEU4': bleu_corpus_scores[2]}

    def calculate_mean_bleu(self, dataframe):
        """
        Calculating the mean BLEU score over entire translations.

        Uses a ``BLEU`` column when the dataframe has one; otherwise falls back to
        ``BLEU4``, which is what calculate_sentence_bleu() produces.
        """
        column = 'BLEU' if 'BLEU' in dataframe.columns else 'BLEU4'
        return dataframe.loc[:, column].mean()

    def calculate_corpus_chrf(self, dataframe):
        """
        Calculating the corpus chrf score over entire translations.
        """
        # Local aliased import for the same reason as in calculate_corpus_bleu().
        from nltk.translate.chrf_score import corpus_chrf as _nltk_corpus_chrf

        list_of_references = [str(sentence) for sentence in dataframe['target'].values]
        hypotheses = [str(sentence) for sentence in dataframe['translation'].values]

        return _nltk_corpus_chrf(list_of_references, hypotheses)

    def calculate_mean_chrf(self, dataframe):
        """
        Calculating the mean chrf score over entire translations.
        """
        return dataframe.loc[:, 'chrf'].mean()

    def get_system_score_COMET(self):
        """Return the cached COMET system score, or an explanatory message if none was computed."""
        if self.COMET_sytem_score == 0:
            return 'COMET system score has not been computed yet. Call calculate_system_score_COMET() to compute it directly.'
        else:
            return self.COMET_sytem_score

    def calculate_system_score_COMET(self, dataframe, batch_size=16, gpu_numbers=1):
        """
        Calculate system_score (mean) COMET score over entire translations.
        The COMET checkpoint used is the one downloaded in __init__.
        Args
            dataframe (:obj:`pandas dataframe'): Dataframe contains source text, reference text ,and translation text
            batch_size (:obj: 'int'): batch_size
            gpu_numbers (:obj: 'int'): Number of GPUs (forced to 0 when CUDA is unavailable)
        Returns
            system_score (:obj: 'float'): The mean COMET score over entire translations.
        """
        model_output = self._predict_COMET(dataframe, batch_size, gpu_numbers)
        return model_output.system_score

#  PEFT

This is valid for sequence-to-sequence models, like:

*   T5
*   BLOOM
*   mBART
*   NLLB

T5 (and its variants) and BLOOM need a task prefix before the source sentence; mBART and NLLB do not.

In [None]:
# Resolve the checkpoint name and tokenizer for the model family selected in `m`.
if m == "t5":
  model_name = "google/flan-t5-small"
  #model_name = "google/mt5-small"
  #model_name = "google/flan-ul2"
  tokenizer = T5Tokenizer.from_pretrained(model_name)
elif m == "bloom":
  model_name = "bigscience/mt0-small"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
# elif m == "mbart":
#   model_name = "facebook/mbart-large-50"
#   tokenizer = MBart50TokenizerFast.from_pretrained(
#                                                   model_name, 
#                                                   src_lang="{}_XX".format(src_lang), 
#                                                   tgt_lang="{}_XX".format(trg_lang)
#                                                   )
# elif m == "nllb":
#   model_name = "facebook/nllb-200-distilled-600M" # "facebook/nllb-200-distilled-1.3B"
#   tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
  # Fail here instead of continuing with an undefined (or stale) model_name/tokenizer.
  raise ValueError("Unsupported model family {!r}; expected 't5' or 'bloom'.".format(m))

print("Model chosen: {}".format(model_name))

Model chosen: google/flan-t5-small


In [None]:
# Training hyperparameters shared by the sections below.
max_length = 256     # token length used for padding/truncation
lr = 1e-4            # optimizer learning rate
num_epochs = 1
batch_size = 2
# LoRA settings passed to LoraConfig.
lora_alpha = 32
lora_dropout = 0.1
lora_r = 16

In [None]:
# Choose the compute device once; AVAIL_GPUS stays 0 on CPU-only runtimes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
AVAIL_GPUS = torch.cuda.device_count() if device.type == "cuda" else 0

if device.type == "cuda":
    print(f'There are {AVAIL_GPUS} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
Device name: Tesla T4


## Flores "Classic" Pytorch

In [None]:
# Task prefix prepended to every source sentence. The trailing space matters: the prefix is
# concatenated directly with the sentence, so without it the first word is glued to the colon.
prefix = "translate Italian to Spanish: "

In [None]:
# Language codes used to build the data file names and the wandb run name.
src_lang ="ita"
trg_lang = "spa"

In [None]:
# Start the wandb run for this section; the run name encodes model, method and language pair.
wandb.init(name=f"{model_name}_{method}_Flores_{src_lang}_{trg_lang}", project='translated-challenge', entity='mt2magic')

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


### Data

In [None]:
class FloresDataset(Dataset):
  """Parallel-sentence dataset built from two line-aligned text files.

  Line ``i`` of ``src_file`` is the source sentence whose reference translation
  is line ``i`` of ``trg_file``. Each item is tokenized on access and padded or
  truncated to ``max_length`` tokens.

  Args:
    src_file: path of the source-language file (one sentence per line).
    trg_file: path of the target-language file (one sentence per line).
    tokenizer: callable tokenizer returning ``input_ids`` and ``attention_mask``.
    max_length: padded/truncated token length of inputs and labels.
    src_prefix: text prepended to every source sentence. When ``None``
      (the default) the notebook globals decide: ``prefix`` is used if
      ``m`` is "t5" or "bloom", otherwise nothing is prepended.

  Raises:
    ValueError: if the two files do not contain the same number of lines.
  """
  def __init__(self, src_file, trg_file, tokenizer, max_length=1024, src_prefix=None):
    self.tokenizer = tokenizer
    self.max_length = max_length

    if src_prefix is None:
      src_prefix = prefix if m in ["t5", "bloom"] else ""

    # Explicit UTF-8 so accented characters do not depend on the platform default encoding.
    with open(src_file, 'r', encoding='utf-8') as f:
      self.src_sentences = [src_prefix + line.strip() for line in f]

    with open(trg_file, 'r', encoding='utf-8') as f:
      self.trg_sentences = [line.strip() for line in f]

    if len(self.src_sentences) != len(self.trg_sentences):
      raise ValueError(
          "Source and target files are not line-aligned: {} vs {} lines".format(
              len(self.src_sentences), len(self.trg_sentences)))

  def __len__(self):
    return len(self.src_sentences)

  def __getitem__(self, index):
    """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for one sentence pair."""
    src_encoding = self.tokenizer(self.src_sentences[index], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
    trg_encoding = self.tokenizer(self.trg_sentences[index], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

    # squeeze(0) drops only the batch dimension added by return_tensors='pt'.
    input_ids = src_encoding['input_ids'].squeeze(0)
    attention_mask = src_encoding['attention_mask'].squeeze(0)
    labels = trg_encoding['input_ids'].squeeze(0).clone()

    # Padding positions in the labels are set to -100 so the loss ignores them.
    pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
    if pad_token_id is not None:
      labels[labels == pad_token_id] = -100

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

In [None]:
# FLORES `dev` is used for training and `devtest` for evaluation.
# max_length is passed explicitly so padding/truncation follows the configured value
# (the one logged to wandb) instead of the FloresDataset default of 1024.
train_dataset = FloresDataset("/content/drive/MyDrive/Data/Flores/{}_Latn.dev".format(src_lang), "/content/drive/MyDrive/Data/Flores/{}_Latn.dev".format(trg_lang), tokenizer, max_length=max_length)
eval_dataset = FloresDataset("/content/drive/MyDrive/Data/Flores/{}_Latn.devtest".format(src_lang), "/content/drive/MyDrive/Data/Flores/{}_Latn.devtest".format(trg_lang), tokenizer, max_length=max_length)
print("Number of samples in the train set: {}".format(len(train_dataset)))
print("Number of samples in the eval set: {}".format(len(eval_dataset)))

Number of samples in the train set: 997
Number of samples in the eval set: 1012


In [None]:
# Only the training data is shuffled; the evaluation loader keeps dataset order so the
# predictions collected during evaluation line up with the sentences in the file.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

### Fine-tuning

In [None]:
# Wrap the pretrained seq2seq checkpoint with LoRA adapters configured from the hyperparameters above.
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=lora_r, lora_alpha=lora_alpha, lora_dropout=lora_dropout)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
peft_model = get_peft_model(model, peft_config)
#peft_model

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

In [None]:
# Report how many parameters are trainable compared with the whole model.
peft_model.print_trainable_parameters()

trainable params: 688128 || all params: 300864896 || trainable%: 0.2287166130541198


In [None]:
# Record the run's hyperparameters in the wandb config.
config = wandb.config
config.model = model_name
config.batch_size = batch_size
config.learning_rate = lr
config.max_length = max_length
config.epochs = num_epochs
config.lora_alpha = lora_alpha
config.lora_dropout = lora_dropout
# Key fixed from the misspelled `lor_r` so it matches the other lora_* entries.
config.lora_r = lora_r

In [None]:
# AdamW with a linear decay schedule (no warmup) spanning every optimizer step of the run.
optimizer = torch.optim.AdamW(peft_model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)
config.optimizer = "AdamW"

In [None]:
# Training and evaluation: one pass over train_dataloader followed by one pass over
# eval_dataloader per epoch; mean losses are printed and logged to wandb.
peft_model = peft_model.to(device)
wandb.watch(peft_model, log="all")

for epoch in range(num_epochs):
    peft_model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = peft_model(**batch)
        loss = outputs.loss
        # detach() keeps the running sum from holding on to the autograd graph.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Gradients are cleared after the update so each step starts from zero.
        optimizer.zero_grad()

    peft_model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = peft_model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Predictions are the argmax of the logits from the forward pass with labels
        # (not the output of generate()).
        # NOTE(review): eval_preds is not read by any visible cell.
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Per-epoch means over batches; perplexity is exp(mean loss).
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
    wandb.log({'epoch': epoch + 1, 'train_loss': train_epoch_loss, 'eval_loss':eval_epoch_loss})

### Evaluation

In [None]:
def get_predictions(model, samples, target, max_length=1024):
  """Translate every sentence in ``samples`` and pair it with its reference.

  Uses the notebook-level ``tokenizer`` and ``prefix``.

  Args:
    model: model exposing ``generate`` and ``parameters``.
    samples: sequence of source sentences.
    target: sequence of reference translations, aligned with ``samples``.
    max_length: upper bound on the generated length; without it ``generate``
      falls back to the model's default limit and can cut translations short.

  Returns:
    pandas DataFrame with the columns ``source``, ``target`` and ``translation``.

  Raises:
    ValueError: if ``samples`` and ``target`` differ in length.
  """
  if len(samples) != len(target):
    raise ValueError("samples and target must have the same length: {} vs {}".format(len(samples), len(target)))

  # Run on whatever device the model already lives on instead of assuming CUDA.
  model_device = next(model.parameters()).device
  results = []
  for sentence, reference in zip(samples, target):
    message = prefix + sentence
    inputs = tokenizer.encode(message, return_tensors="pt").to(model_device)
    output = model.generate(inputs=inputs, max_length=max_length)
    # Special tokens (padding, end-of-sequence) are dropped so they do not end up in the metrics.
    results.append([sentence, reference, tokenizer.decode(output[0], skip_special_tokens=True)])

  df = pd.DataFrame(results, columns=["source","target","translation"])
  return df

In [None]:
# Load the "gsarti/flores_101" configurations for the source and target languages.
# NOTE(review): the names say eng/fra, but the configurations loaded are src_lang/trg_lang.
data_eng = load_dataset("gsarti/flores_101",src_lang)
data_fra = load_dataset("gsarti/flores_101",trg_lang)

In [None]:
# Source sentences and reference translations from the devtest split
# (uncomment the slices to try a handful of sentences first).
samples = data_eng["devtest"]["sentence"]#[:5]
target = data_fra["devtest"]["sentence"]#[:5]
# NOTE(review): `results` is not read by any visible cell.
results = []

In [None]:
# Translate with the fine-tuned LoRA model, or with a freshly loaded pretrained checkpoint.
# NOTE(review): `df` is not assigned when `method` is neither "peft" nor "normal".
if method == "peft":
  df = get_predictions(peft_model, samples, target)
elif method == "normal":
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
  model = model.to(device)
  df = get_predictions(model, samples, target)

In [None]:
# Display the predictions dataframe (source, target, translation).
df

In [None]:
# Spot check: first source sentence next to its translation.
print(df.source[0], "\n", df.translation[0])

In [None]:
%%capture
# Score the predictions and store them as CSV. save_path must name a file, not a directory,
# and the directory is created first because to_csv does not create it.
# NOTE(review): `eval` shadows the Python built-in; the name is kept because later cells use it.
os.makedirs("/content/data", exist_ok=True)
eval = Evaluator()
df_translation = eval.evaluating_from_dataframe(df, save_path="/content/data/df_result_with_evaluation.csv")
#df_translation

In [None]:
# Spot check: second source sentence next to its reference translation.
print(df_translation["source"][1], "\n", df_translation["target"][1])

In [None]:
# Corpus-level and mean sentence-level metrics for the evaluated translations.
# NOTE(review): `corpus_bleu` and `corpus_chrf` reuse the names of the nltk functions imported
# at the top of the notebook, so those functions are no longer reachable under these names
# once this cell has run.
corpus_bleu = eval.calculate_corpus_bleu(df_translation)
mean_bleu = eval.calculate_mean_bleu(df_translation)
corpus_chrf = eval.calculate_corpus_chrf(df_translation)
mean_chrf = eval.calculate_mean_chrf(df_translation)
mean_comet = eval.calculate_system_score_COMET(df_translation)
print('*** *** ***')
print(f'Corpus BLEU: {corpus_bleu}')
print(f'Mean BLEU: {mean_bleu}')
print('*** *** ***')
print(f'Corpus chrf: {corpus_chrf}')
print(f'Mean chrf: {mean_chrf}')
print('*** *** ***')
print(f'\nMean COMET: {mean_comet}')
print('*** *** ***')

In [None]:
# Send the final metrics to the active wandb run.
wandb.log({'corpus_bleu': corpus_bleu, 'mean_bleu': mean_bleu, 'corpus_chrf': corpus_chrf, 'mean_chrf': mean_chrf, 'mean_comet':mean_comet})

## Flores Lightning

In [None]:
# Language pair for the Lightning section (used in file names and in the run name).
src_lang ="ita"
trg_lang = "spa"

In [None]:
# Folder holding the FLORES text files; the trailing slash is required because
# file names are appended to it by plain string concatenation.
data_path = "/content/drive/MyDrive/Data/Flores/"

In [None]:
# Task prefix prepended to every source sentence. The trailing space matters: the prefix is
# concatenated directly with the sentence, so without it the first word is glued to the colon.
prefix = "translate Italian to Spanish: "

In [None]:
# Logger for the Flores Lightning run. The run name says "Flores" (not "Translated") so the
# run is not confused with the ones trained on the Translated dataset.
wandb_logger = WandbLogger(name=f"{model_name}_{method}_Flores_{src_lang}_{trg_lang}", project='translated-challenge', entity='mt2magic', log_model=True)

[34m[1mwandb[0m: Currently logged in as: [33mgianfree_romani[0m ([33mmt2magic[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Data

In [None]:
class PEFTDataset(Dataset):
    """Index-aligned view over pre-tokenized input ids, attention masks and labels."""

    def __init__(self, input_id, attention, labels):
        self.input_id = input_id
        self.labels = labels
        self.attention = attention

    def __len__(self):
        # The label sequence defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx:int):
        """Return the ``idx``-th example as a dict in the format the model's forward expects."""
        return {
            "attention_mask": self.attention[idx],
            "input_ids": self.input_id[idx],
            "labels": self.labels[idx],
        }

In [None]:
class FloresDataModule(LightningDataModule):
  """LightningDataModule over the FLORES text files of one language pair.

  The ``dev`` split is divided 80/20 into train and validation data; the
  ``devtest`` split is used as the test set.

  Args:
    src_lang: source language code used in the file name ``{src_lang}_Latn.{split}``.
    trg_lang: target language code used in the file name ``{trg_lang}_Latn.{split}``.
    path: folder containing the files; file names are appended by string
      concatenation, so it must end with a path separator.
    tokenizer: model name or path handed to ``AutoTokenizer.from_pretrained``.
    max_length: padded/truncated token length of inputs and labels.
    batch_size: batch size of all three dataloaders.
    prefix: text prepended to every source sentence.
  """
  def __init__(self, src_lang:str, trg_lang:str, path:str, tokenizer, max_length:int=128, batch_size:int=32, prefix:str="Translate from Italian to Spanish:"):
    super().__init__()

    self.src_lang = src_lang
    self.trg_lang = trg_lang
    self.path = path
    self.batch_size = batch_size
    self.max_length = max_length
    # `tokenizer` arrives as a checkpoint name and is resolved to a tokenizer object here.
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=True)
    self.prefix = prefix

  def _read_sentences(self, lang:str, split:str):
    """Return ``(file_path, stripped_lines)`` for one language file of ``split``."""
    file_path = self.path + "{}_Latn.{}".format(lang, split)
    # Explicit UTF-8 so accented characters do not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
      return file_path, [line.strip() for line in f]

  def prepare_split(self, split:str="dev"):
    """Read one split into a dataframe with the columns ``original`` and ``translation``.

    Raises:
      ValueError: if the source and target files differ in line count.
    """
    self.src_file, src_sentences = self._read_sentences(self.src_lang, split)
    self.trg_file, trg_sentences = self._read_sentences(self.trg_lang, split)

    if len(src_sentences) != len(trg_sentences):
      raise ValueError(
          "{} and {} are not line-aligned: {} vs {} lines".format(
              self.src_file, self.trg_file, len(src_sentences), len(trg_sentences)))

    return pd.DataFrame({'original': src_sentences, 'translation': trg_sentences})

  def setup(self, stage:str=None):
    """Load and tokenize the train, validation and test data."""
    train_data, val_data = train_test_split(self.prepare_split("dev"), test_size=0.2, random_state=42)
    test_data = self.prepare_split("devtest")

    self.X_train_enc, self.X_train_attention, self.Y_train_enc = self.preprocess_data(train_data)
    self.X_val_enc, self.X_val_attention, self.Y_val_enc = self.preprocess_data(val_data)
    self.X_test_enc, self.X_test_attention, self.Y_test_enc = self.preprocess_data(test_data)

  def train_dataloader(self):
    train_dataset = PEFTDataset(self.X_train_enc,self.X_train_attention, self.Y_train_enc)
    return DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

  def val_dataloader(self):
    val_dataset = PEFTDataset(self.X_val_enc,self.X_val_attention, self.Y_val_enc)
    # Validation data is not shuffled: order has no effect on the loss and a fixed order is reproducible.
    return DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)

  def test_dataloader(self):
    test_dataset = PEFTDataset(self.X_test_enc, self.X_test_attention, self.Y_test_enc)
    return DataLoader(test_dataset, batch_size=self.batch_size)

  def preprocess_data(self, data:pd.DataFrame):
    """Tokenize a split.

    Returns:
      Tuple ``(input_ids, attention_mask, labels)`` of integer tensors, one row per
      sentence pair and ``max_length`` columns. Padding positions in ``labels`` are
      -100 so the loss ignores them; they must be mapped back to the pad token id
      before the labels can be decoded.
    """
    sources = [self.prefix + sentence for sentence in data["original"]]
    targets = list(data["translation"])

    # padding="max_length" replaces the deprecated pad_to_max_length=True.
    src_encoding = self.tokenizer(
        sources, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    trg_encoding = self.tokenizer(
        targets, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
    )

    trg_input_ids = trg_encoding["input_ids"].clone()
    trg_input_ids[trg_input_ids == self.tokenizer.pad_token_id] = -100

    return src_encoding["input_ids"], src_encoding["attention_mask"], trg_input_ids

In [None]:
# Fix the random seeds, then build the data module and tokenize all splits up front.
# The tokenizer is given by name (model_name); the data module resolves it itself.
seed_everything(42)
dm = FloresDataModule(src_lang,  
                      trg_lang,
                      data_path,
                      tokenizer=model_name, 
                      batch_size=batch_size,
                      max_length=max_length, 
                      prefix=prefix
                      )
dm.setup()

In [None]:
# t = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# it = iter(dm.train_dataloader())
# for i in range(15):
#   ex = next(it)
#   r = t.decode(ex["input_ids"][0], skip_special_tokens =True)
#   print(r)
#   r = t.decode(ex["labels"][0], skip_special_tokens =True)
#   print(r)

### Fine-Tuning

In [None]:
class PEFTModel(LightningModule):
  """LightningModule that fine-tunes a seq2seq checkpoint through LoRA adapters on the "q" and "v" modules."""
  def __init__(self, model_name:str, lora_r:float, lora_alpha:float, lora_dropout:float, device:str, lr=2e-5):
    super().__init__()

    lora_settings = dict(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        target_modules=["q", "v"],
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
    )
    self.peft_config = LoraConfig(**lora_settings)

    # 8-bit loading (load_in_8bit=True, device_map='auto' plus
    # prepare_model_for_int8_training) is intentionally left disabled.
    base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    adapted_model = get_peft_model(base_model, self.peft_config)
    self.peft_model = adapted_model.to(device)
    self.lr = lr
    self.save_hyperparameters()

  def forward(self, **inputs):
    """Delegate to the wrapped PEFT model."""
    return self.peft_model(**inputs)

  def predict_step(self, batch, batch_idx:int, dataloader_idx:int=0):
    """Return the raw model output for ``batch``."""
    return self(**batch)

  def _logged_loss(self, batch, metric_name:str):
    """Run a forward pass, log its loss under ``metric_name`` and return the loss."""
    loss = self(**batch).loss
    self.log(metric_name, loss)
    return loss

  def training_step(self, batch, batch_idx:int):
    return self._logged_loss(batch, 'train_loss')

  def validation_step(self, batch, batch_idx:int):
    return self._logged_loss(batch, 'val_loss')

  def configure_optimizers(self):
    """AdamW over the module's parameters with the configured learning rate."""
    return torch.optim.AdamW(self.parameters(), lr=self.lr)

In [None]:
# Build the LoRA LightningModule and its trainer.
# NOTE(review): `model` is rebound here from the earlier Hugging Face model to the LightningModule.
model = PEFTModel(model_name, lora_r, lora_alpha, lora_dropout, device=device, lr=lr)

# accelerator/devices replace the deprecated `gpus=1` argument, which newer
# pytorch_lightning releases no longer accept; "auto" also lets the cell run on CPU-only runtimes.
trainer = Trainer(
    max_epochs=num_epochs,
    accelerator="auto",
    devices=1,
    logger= wandb_logger)

In [None]:
# Train (and validate) using the dataloaders provided by the data module.
trainer.fit(model, datamodule=dm)

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Evaluation

In [None]:
def get_predictions(model, df_samples:pd.DataFrame=None, samples_path:str=None):
  """Translate every row of a sample set and pair it with its reference.

  Uses the notebook-level ``tokenizer`` and ``prefix``.

  Args:
    model: model exposing ``generate`` and ``parameters``.
    df_samples: dataframe with the columns ``original`` and ``translation``;
      takes precedence over ``samples_path`` when both are given.
    samples_path: CSV file with the same columns, read when ``df_samples`` is None.

  Returns:
    pandas DataFrame with the columns ``source``, ``target`` and ``translation``.

  Raises:
    ValueError: if neither ``df_samples`` nor ``samples_path`` is provided.
  """
  if df_samples is not None:
    data = df_samples
  elif samples_path is not None:
    data = pd.read_csv(samples_path)
  else:
    raise ValueError("Provide either df_samples or samples_path.")

  # Run on whatever device the model already lives on instead of assuming CUDA.
  model_device = next(model.parameters()).device
  results = []
  for i,s in tqdm(data.iterrows(), total=data.shape[0]):
    message = prefix + s["original"]
    inputs = tokenizer.encode(message, return_tensors="pt", padding=True).to(model_device)
    output = model.generate(inputs=inputs, max_length=1024)
    results.append([s["original"], s["translation"], tokenizer.decode(output[0], skip_special_tokens =True)])

  df = pd.DataFrame(results, columns=["source","target","translation"])
  return df

In [None]:
# Untokenized devtest sentence pairs (columns: original, translation) used for generation.
df_samples = dm.prepare_split("devtest")
df_samples

Unnamed: 0,original,translation
0,"""Abbiamo topi di quattro mesi che prima erano ...","«Actualmente, tenemos ratones de cuatro meses ..."
1,"Lo studio è ancora in fase iniziale, come dich...",La investigación todavía se ubica en su etapa ...
2,"Come altri esperti, è scettico circa la possib...","Al igual que otros especialistas, es escéptico..."
3,La segretaria permanente del Comitato per il N...,"El lunes, Sara Danius, secretaria permanente d..."
4,"Danius ha dischiarato: ""Ora come ora non stiam...",Danius declaró: «Actualmente no estamos hacien...
...,...,...
1007,Dal momento che i territori sono scarsamente p...,Gracias a la escasa población que reside en di...
1008,"In Giappone, la cultura del lavoro è più gerar...","En Japón, la cultura laboral tiene una estruct..."
1009,L'abbigliamento da ufficio è costituito normal...,La vestimenta típica del ámbito de los negocio...
1010,È di importanza cruciale l'armonia del luogo d...,La armonía en el lugar de trabajo es fundament...


In [None]:
# Translate with the LoRA model held by the LightningModule, or with a freshly loaded checkpoint.
# NOTE(review): `df` is not assigned when `method` is neither "peft" nor "normal".
if method == "peft":
  df = get_predictions(model.peft_model.to(device), df_samples=df_samples)
elif method == "normal":
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
  model = model.to(device)
  df = get_predictions(model, df_samples=df_samples)

 62%|██████▏   | 626/1012 [08:24<04:05,  1.57it/s]

In [None]:
%%capture
# Score the predictions and store them as CSV. save_path must name a file, not a directory,
# and the directory is created first because to_csv does not create it.
# NOTE(review): `eval` shadows the Python built-in; the name is kept because later cells use it.
os.makedirs("/content/data", exist_ok=True)
eval = Evaluator()
df_translation = eval.evaluating_from_dataframe(df, save_path="/content/data/df_result_with_evaluation_lightning.csv")

In [None]:
# Spot check: fifth source sentence next to its translation.
print(df_translation["source"][4], "\n", df_translation["translation"][4])

In [None]:
# Corpus-level and mean sentence-level metrics for the evaluated translations.
# NOTE(review): `corpus_bleu` and `corpus_chrf` reuse the names of the nltk functions imported
# at the top of the notebook, so those functions are no longer reachable under these names
# once this cell has run.
corpus_bleu = eval.calculate_corpus_bleu(df_translation)
mean_bleu = eval.calculate_mean_bleu(df_translation)
corpus_chrf = eval.calculate_corpus_chrf(df_translation)
mean_chrf = eval.calculate_mean_chrf(df_translation)
mean_comet = eval.calculate_system_score_COMET(df_translation)
print('*** *** ***')
print(f'Corpus BLEU: {corpus_bleu}')
print(f'Mean BLEU: {mean_bleu}')
print('*** *** ***')
print(f'Corpus chrf: {corpus_chrf}')
print(f'Mean chrf: {mean_chrf}')
print('*** *** ***')
print(f'\nMean COMET: {mean_comet}')
print('*** *** ***')

In [None]:
# Store the final metrics in the config of the wandb run owned by the Lightning logger.
wandb_logger.experiment.config["corpus_bleu"] = corpus_bleu
wandb_logger.experiment.config["mean_bleu"] = mean_bleu
wandb_logger.experiment.config["corpus_chrf"] = corpus_chrf
wandb_logger.experiment.config["mean_chrf"] = mean_chrf
wandb_logger.experiment.config["mean_comet"] = mean_comet

## Translated Dataset

In [None]:
# Language pair for the Translated dataset section.
src_lang ="ita"
trg_lang = "eng"

In [None]:
# Task prefix for Italian-to-English translation.
# NOTE(review): there is no trailing space after the colon; check how the code that consumes
# this prefix joins it with the sentence.
prefix = "translate Italian to English:"

In [None]:
# Full (unsplit) Italian-English CSV; this is the input passed to split_data().
data_translated="/content/drive/MyDrive/Data/Translated/it-en-cleaned.csv"

In [None]:
# Locations of the train / validation / test CSV files; the names follow the
# "<split>_<languages>translated.csv" pattern written by split_data().
train_path = "/content/drive/MyDrive/Data/Translated/split_data/train_it-en-translated.csv"
val_path = "/content/drive/MyDrive/Data/Translated/split_data/val_it-en-translated.csv"
test_path = "/content/drive/MyDrive/Data/Translated/split_data/test_it-en-translated.csv"

In [None]:
# Logger for the run on the Translated dataset; the run name encodes model, method and language pair.
wandb_logger = WandbLogger(name=f"{model_name}_{method}_Translated_{src_lang}_{trg_lang}", project='translated-challenge', entity='mt2magic', log_model=True)

[34m[1mwandb[0m: Currently logged in as: [33mgianfree_romani[0m ([33mmt2magic[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Data

In [None]:
"""
Split data stored in data_path in three splits
test_size: how many rows will be used for test split, 
val_size: percentage of train data that will be used for validation
random_state: set the random seed 
folder_csv_path: where the splits will be saved and stored
"""
import re
def split_data(data_path, test_size=1000, val_size=0.2, random_state=42, folder_csv_path="/content/drive/MyDrive/Data/Translated/split_data/"):
  """Split the CSV at ``data_path`` into train, validation and test CSV files.

  The test rows are taken from the full data first; the validation rows are then
  taken from what remains. Output files are named
  ``<split>_<languages>translated.csv``, where ``<languages>`` is the
  ``xx-yy-`` language tag found in the input file name (e.g. ``it-en-``).

  Args:
    data_path: path of the CSV file to split; its file name must contain an ``xx-yy-`` tag.
    test_size: size of the test split, forwarded to ``train_test_split``.
    val_size: share of the remaining data used for validation, forwarded to ``train_test_split``.
    random_state: random seed used for both splits.
    folder_csv_path: folder the three CSV files are written to (created if missing).

  Returns:
    Tuple ``(train_path, val_path, test_path)`` of the files that were written.

  Raises:
    ValueError: if the file name contains no ``xx-yy-`` language tag.
  """
  data = pd.read_csv(data_path)
  filename = os.path.basename(data_path)
  language_match = re.search("[A-Za-z]{2}-[A-Za-z]{2}-", filename)
  if language_match is None:
    raise ValueError("No 'xx-yy-' language tag found in file name {!r}".format(filename))
  languages = language_match.group(0)

  train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)
  train_data, val_data = train_test_split(train_data, test_size=val_size, random_state=random_state)

  os.makedirs(folder_csv_path, exist_ok=True)
  written_paths = {}
  for split_name, split_frame in (("train", train_data), ("test", test_data), ("val", val_data)):
    # os.path.join copes with a folder given with or without a trailing separator.
    split_path = os.path.join(folder_csv_path, f"{split_name}_{languages}translated.csv")
    split_frame.to_csv(split_path)
    written_paths[split_name] = split_path

  return written_paths["train"], written_paths["val"], written_paths["test"]

In [None]:
#split_data(data_translated)

In [None]:
class PEFTDataset(Dataset):
    """Map-style dataset over three index-aligned sequences.

    Item `idx` is a dict holding the `idx`-th entry of each sequence under the
    keys "attention_mask", "input_ids" and "labels". The dataset length is the
    length of `labels`.
    """

    def __init__(self, input_id, attention, labels):
        self.input_id, self.attention, self.labels = input_id, attention, labels

    def __len__(self):
        # The label sequence defines how many samples there are.
        return len(self.labels)

    def __getitem__(self, idx:int):
        return {
            "attention_mask": self.attention[idx],
            "input_ids": self.input_id[idx],
            "labels": self.labels[idx],
        }

In [None]:
class TranslatedDataModule(LightningDataModule):
  """LightningDataModule that tokenizes the train/val/test CSV splits.

  Each CSV must provide an "original" column (source text) and a "translation"
  column (target text). Sources are prefixed with `prefix`, and both sides are
  padded/truncated to `max_length` tokens.

  NOTE: the positional order of the file arguments is
  (train_file, test_file, val_file) — prefer keyword arguments when calling.

  Args:
    train_file: path of the training CSV.
    test_file: path of the test CSV.
    val_file: path of the validation CSV.
    tokenizer: name or path handed to `AutoTokenizer.from_pretrained`.
    max_length: number of tokens every source/target sequence is padded or
      truncated to.
    batch_size: batch size of the three dataloaders.
    prefix: text prepended (without separator) to every source sentence.
    mask_label_padding: when True, padded label positions are set to -100 so
      that they are ignored by the loss instead of being learned as targets.
  """

  def __init__(self, train_file:str, test_file:str, val_file:str, tokenizer, max_length:int=128, batch_size:int=32, prefix:str="Translate from Italian to Spanish", mask_label_padding:bool=True):
    super().__init__()

    self.train_file = train_file
    self.test_file = test_file
    self.val_file = val_file
    self.batch_size = batch_size
    self.max_length = max_length
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=True)
    self.prefix = prefix
    self.mask_label_padding = mask_label_padding

  def setup(self, stage:Optional[str]=None):
    """Read the three CSV files and tokenize them (`stage` is ignored)."""
    train_data = pd.read_csv(self.train_file)
    val_data = pd.read_csv(self.val_file)
    test_data = pd.read_csv(self.test_file)

    self.X_train_enc, self.X_train_attention, self.Y_train_enc = self.preprocess_data(train_data)
    self.X_val_enc, self.X_val_attention, self.Y_val_enc = self.preprocess_data(val_data)
    self.X_test_enc, self.X_test_attention, self.Y_test_enc = self.preprocess_data(test_data)

  def train_dataloader(self):
    train_dataset = PEFTDataset(self.X_train_enc, self.X_train_attention, self.Y_train_enc)
    return DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

  def val_dataloader(self):
    val_dataset = PEFTDataset(self.X_val_enc, self.X_val_attention, self.Y_val_enc)
    # Evaluation data is not shuffled: order does not matter for the loss and a
    # fixed order keeps validation runs comparable.
    return DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)

  def test_dataloader(self):
    test_dataset = PEFTDataset(self.X_test_enc, self.X_test_attention, self.Y_test_enc)
    return DataLoader(test_dataset, batch_size=self.batch_size)

  def preprocess_data(self, data:pd.DataFrame):
    """Tokenize one split.

    Returns:
      A tuple (input_ids, attention_mask, labels) of tensors with one row per
      DataFrame row and `max_length` columns.
    """
    sources = [self.prefix + text for text in data["original"]]
    targets = data["translation"].tolist()

    # Tokenize each side in a single batched call. The previous per-row
    # encoding produced tensors of shape (rows, 1, max_length) and then took
    # index [0], which kept only the first example of every split.
    src_encoding = self.tokenizer(
        sources, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    trg_encoding = self.tokenizer(
        targets, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
    )

    input_ids = src_encoding["input_ids"]
    attention_masks = src_encoding["attention_mask"]
    trg_input_ids = trg_encoding["input_ids"]

    if self.mask_label_padding:
      # Use the target attention mask (0 on padding) rather than comparing with
      # the pad id, so genuine tokens are never masked by accident.
      trg_input_ids = trg_input_ids.masked_fill(trg_encoding["attention_mask"] == 0, -100)

    return input_ids, attention_masks, trg_input_ids

In [None]:
seed_everything(42)
# Keyword arguments on purpose: the constructor's positional order is
# (train_file, test_file, val_file), so passing train/val/test positionally
# silently swaps the validation and test splits.
dm = TranslatedDataModule(train_file=train_path,
                          val_file=val_path,
                          test_file=test_path,
                          tokenizer=model_name,
                          batch_size=batch_size,
                          max_length=max_length,
                          prefix=prefix
                          )
dm.setup()

In [None]:
# ex = next(iter(dm.train_dataloader()))
# print(ex)

### Fine-tuning

In [None]:
class PEFTModel(LightningModule):
  """LightningModule that fine-tunes a seq2seq checkpoint through LoRA adapters.

  The checkpoint named `model_name` is loaded with `AutoModelForSeq2SeqLM`,
  wrapped by `get_peft_model` with a LoRA configuration targeting the modules
  named "q" and "v" (presumably the attention query/value projections of
  T5-style checkpoints — verify for other architectures), and moved to `device`.

  Args:
    model_name: checkpoint name or path.
    lora_r: LoRA `r` value.
    lora_alpha: LoRA `lora_alpha` value.
    lora_dropout: LoRA `lora_dropout` value.
    device: device the wrapped model is moved to.
    lr: learning rate of the AdamW optimizer.
  """

  def __init__(self, model_name:str, lora_r:float, lora_alpha:float, lora_dropout:float, device:str, lr=2e-5):
    super().__init__()

    self.peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        target_modules=["q", "v"],
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
    )
    # 8-bit loading (load_in_8bit=True, device_map='auto' followed by
    # prepare_model_for_int8_training) is intentionally left disabled.
    base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    lora_wrapped = get_peft_model(base_model, self.peft_config)
    self.peft_model = lora_wrapped.to(device)
    self.lr = lr
    self.save_hyperparameters()

  def forward(self, **inputs):
    """Forward the keyword inputs to the wrapped PEFT model."""
    return self.peft_model(**inputs)

  def predict_step(self, batch, batch_idx, dataloader_idx=0):
    """Return the raw forward output for one batch."""
    model_output = self(**batch)
    return model_output

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss of one batch."""
    loss = self(**batch).loss
    self.log('train_loss', loss)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss of one batch."""
    loss = self(**batch).loss
    self.log('val_loss', loss)
    return loss

  def configure_optimizers(self):
    """Use AdamW over the module's parameters with the configured learning rate."""
    return torch.optim.AdamW(self.parameters(), lr=self.lr)

In [None]:
# Store the run's hyper-parameters in the W&B run config, one key at a time.
hparam_config = wandb_logger.experiment.config
for hparam_name, hparam_value in {
    "max_length": max_length,
    "lr": lr,
    "num_epochs": num_epochs,
    "batch_size": batch_size,
    "lora_alpha": lora_alpha,
    "lora_dropout": lora_dropout,
    "lora_r": lora_r,
}.items():
  hparam_config[hparam_name] = hparam_value

In [None]:
# training
model = PEFTModel(model_name, lora_r, lora_alpha, lora_dropout, device=device, lr=lr)

# `gpus=1` was deprecated in Lightning 1.7 and removed in 2.0; since the
# install cell does not pin a version, use the accelerator/devices pair, which
# is accepted by both the 1.x and 2.x releases.
trainer = Trainer(
    max_epochs=num_epochs,
    accelerator="gpu",
    devices=1,
    logger=wandb_logger)

In [None]:
# Run training; the datamodule `dm` is handed to the trainer as the data source.
trainer.fit(model, datamodule=dm)

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Evaluation

In [None]:
# # Faster version
# def get_predictions(model, samples_path, dm):
  
#   translations = []
#   for inputs in dm:
#     outputs = model.generate(**inputs, max_length=1024)
#     translations += [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
#   results = pd.read_csv(samples_path)
#   results["translation"] = translations
#   return results[["source","target","translation"]]

In [None]:
# sample = "Il Data Protection Officer (DPO) è una figura introdotta dal Regolamento sulla protezione dei dati (Regolamento UE 2016/679), il quale deve osservare, valutare e organizzare la gestione del trattamento dei dati personali all’interno dell’azienda, affinché essi siano trattati nel rispetto delle normative sulla privacy a livello nazionale ed europeo."
# sample

In [None]:
# message = prefix + sample
# inputs = tokenizer.encode(message, return_tensors="pt", padding=True)#.to("cuda")
# output = model.peft_model.generate(inputs=inputs, max_length=512)
# r = tokenizer.decode(output[0], skip_special_tokens =True)
# r

In [None]:
# Load the pretrained checkpoint (no LoRA adapters are applied here) and place
# it on `device` in one step. Note that this rebinds the global `model_name`.
model_name = "google/flan-t5-large"
model2 = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [None]:
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# model = model.to(device)

In [None]:
# prefix = "translate Italian to Spanish:"

In [None]:
# Quick check of `model2` on a single sentence.
message = prefix + "Un aereo americano in difficoltà si schianta sopra il comune."
# Send the inputs to the same `device` the model was moved to instead of a
# hard-coded "cuda", so the cell also runs when `device` is not the GPU.
inputs = tokenizer.encode(message, return_tensors="pt", padding=True).to(device)
output = model2.generate(inputs=inputs, max_length=512)
# Special tokens are kept on purpose so the raw generated sequence is visible.
r = tokenizer.decode(output[0], skip_special_tokens =False)
r

In [None]:
def get_predictions(model, samples_path:str):
  """Translate every row of a CSV file with `model`, one sentence at a time.

  The CSV must provide the columns "original" (source text) and "translation"
  (reference text). The notebook-level `prefix` and `tokenizer` are used to
  build and encode the prompts.

  Args:
    model: module exposing `generate(inputs=..., max_length=...)`.
    samples_path: path of the CSV file with the samples to translate.

  Returns:
    DataFrame with the columns "source" (input text), "target" (reference
    translation) and "translation" (decoded model output).
  """
  data = pd.read_csv(samples_path)

  # Put the inputs on whatever device the model lives on instead of assuming
  # "cuda"; fall back to the CPU for a parameter-free model.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device("cpu")

  # Generate in eval mode so dropout cannot perturb the predictions (a model
  # coming straight out of training may still be in train mode); the previous
  # mode is restored afterwards.
  was_training = model.training
  model.eval()
  results = []
  try:
    for _, sample in tqdm(data.iterrows(), total=data.shape[0]):
      message = prefix + sample["original"]
      inputs = tokenizer.encode(message, return_tensors="pt", padding=True).to(model_device)
      output = model.generate(inputs=inputs, max_length=1024)
      results.append([sample["original"], sample["translation"], tokenizer.decode(output[0], skip_special_tokens =True)])
  finally:
    model.train(was_training)

  return pd.DataFrame(results, columns=["source","target","translation"])

In [None]:
# Selects what the next cell evaluates: "peft" scores the fine-tuned
# `model.peft_model`, "normal" scores a freshly loaded pretrained checkpoint.
method="normal"

In [None]:
# Produce the predictions DataFrame for the model chosen by `method`.
if method == "peft":
  df = get_predictions(model.peft_model.to(device), test_path)#, dm.test_dataloader)
elif method == "normal":
  # NOTE: this rebinds `model` to the plain pretrained checkpoint, so
  # `model.peft_model` is no longer available after this branch has run.
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
  model = model.to(device)
  df = get_predictions(model, test_path)#, dm.test_dataloader)
else:
  # Fail loudly instead of silently evaluating a stale (or undefined) `df`.
  raise ValueError(f"Unknown method {method!r}; expected 'peft' or 'normal'")

100%|██████████| 1000/1000 [16:34<00:00,  1.01it/s]


In [None]:
# df = get_predictions(model.peft_model, test_path, dm.test_dataloader)

In [None]:
# Run the evaluator on the predictions DataFrame and display the result.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of
# the session; it is kept because the metric cell below refers to it.
eval = Evaluator()
df_translation = eval.evaluating_from_dataframe(df, save_path="/content/data/")
df_translation

In [None]:
# Show the source sentence and the model translation stored under index label 4.
print(df_translation["source"][4], "\n", df_translation["translation"][4])

Un aereo americano in difficoltà si schianta sopra il comune.  
 A American scientist in the field is a common scientist.


In [None]:
# Compute corpus-level and mean BLEU/chrF plus the COMET system score, then print them.
# NOTE(review): `corpus_bleu` and `corpus_chrf` rebind the functions of the same
# name imported from nltk at the top of the notebook. If Evaluator looks those
# functions up as globals, running this cell a second time would fail — confirm,
# and consider renaming together with the logging cell below.
corpus_bleu = eval.calculate_corpus_bleu(df_translation)
mean_bleu = eval.calculate_mean_bleu(df_translation)
corpus_chrf = eval.calculate_corpus_chrf(df_translation)
mean_chrf = eval.calculate_mean_chrf(df_translation)
mean_comet = eval.calculate_system_score_COMET(df_translation)
print('*** *** ***')
print(f'Corpus BLEU: {corpus_bleu}')
print(f'Mean BLEU: {mean_bleu}')
print('*** *** ***')
print(f'Corpus chrf: {corpus_chrf}')
print(f'Mean chrf: {mean_chrf}')
print('*** *** ***')
print(f'\nMean COMET: {mean_comet}')
print('*** *** ***')

In [None]:
# Store the evaluation scores in the W&B run config, one key at a time.
metric_config = wandb_logger.experiment.config
for metric_name, metric_value in {
    "corpus_bleu": corpus_bleu,
    "mean_bleu": mean_bleu,
    "corpus_chrf": corpus_chrf,
    "mean_chrf": mean_chrf,
    "mean_comet": mean_comet,
}.items():
  metric_config[metric_name] = metric_value