Model: T5-base
Fine tune: FULL
Quantization: NO
Dataset: rotten tomatoes

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets --quiet
!pip install evaluate --quiet
!pip install rouge_score --quiet
!pip install accelerate -U --quiet
!pip install transformers[torch] -U --quiet
!pip install transformers[sentencepiece] -U --quiet
!pip install peft --quiet
!pip install sentencepiece --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━

In [1]:
#names "t5-base" "google/mt5-base" "facebook/bart-large" "facebook/mbart-large-50"
original_model_name = "facebook/mbart-large-50"

In [2]:
model_name = 'peft_mBart'
dataset_path = './drive/MyDrive/mba_tcc_datasets/rotten_tomatoes/bart_rt/'
dataset_name = 'rt'
max_length = 1024
peft = True
batch_size = 3
experiment_time = {}

In [3]:
from transformers import AutoConfig, DataCollatorForSeq2Seq, Seq2SeqTrainer, GenerationConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, MBart50TokenizerFast
from nltk.tokenize import RegexpTokenizer
from torch.utils.data import DataLoader
from datasets import load_from_disk
import numpy as np
import evaluate
import shutil
import torch
import time
import json
import os

In [4]:
#Model loading
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = AutoConfig.from_pretrained(
    original_model_name,
    max_length=max_length,
    length_penalty=0.6,
    no_repeat_ngram_size=2,
    num_beams=15,
)

original_model = (AutoModelForSeq2SeqLM
         .from_pretrained(original_model_name, config=config))
tokenizer = AutoTokenizer.from_pretrained(original_model_name, src_lang="en_XX", tgt_lang="en_XX", max_target_length=512, max_source_length=1024)

In [5]:
#Peft loading
if peft:
    from peft import LoraConfig, get_peft_model, TaskType

    lora_config = LoraConfig(
        r=8, # Rank
        lora_alpha=8,
        lora_dropout=0.05,
        bias="all",
        target_modules=['v_proj', 'q_proj'],
        task_type=TaskType.SEQ_2_SEQ_LM
    )

    model = get_peft_model(original_model,
                            lora_config).to(device) #carregar no cuda

else:
    model = original_model.to(device)

In [6]:
#Data loading
dataset = load_from_disk(dataset_path)

def tokenize_sample_data(data):
    input_feature = tokenizer(data["input"], truncation=True, max_length=max_length, padding='max_length')
    label         = tokenizer(data["summary"], truncation=True, max_length=512, padding='max_length')

    return {
        "input_ids": input_feature["input_ids"],
        "attention_mask": input_feature["attention_mask"],
        "labels": label["input_ids"],
    }

tokenized_ds = dataset.map(
    tokenize_sample_data,
    batched=True,
    remove_columns=["input", "summary"])

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="pt")

Map:   0%|          | 0/2984 [00:00<?, ? examples/s]

Map:   0%|          | 0/747 [00:00<?, ? examples/s]

In [7]:
rouge_metric = evaluate.load("rouge")

def tokenize_sentence(arg):
    encoded_arg = tokenizer(arg)
    return tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)

def metrics_func(eval_arg):
    preds, labels = eval_arg
    # Replace -100
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Convert id tokens to text
    text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Insert a line break (\n) in each sentence for ROUGE scoring
    # (Note : Please change this code, when you perform on other languages except for Japanese)
    text_preds = [(p if p.endswith(("!", "?", ".")) else p + ". ") for p in text_preds]
    text_labels = [(l if l.endswith(("!", "?", ".")) else l + ". ") for l in text_labels]
    sent_tokenizer = RegexpTokenizer(u'[^!?.]*[!?.]')
    text_preds = ["\n".join(np.char.strip(sent_tokenizer.tokenize(p))) for p in text_preds]
    text_labels = ["\n".join(np.char.strip(sent_tokenizer.tokenize(l))) for l in text_labels]
    # compute ROUGE score with custom tokenization
    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence
    )


In [10]:
#model training
training_args = Seq2SeqTrainingArguments(
    output_dir = f"{model_name}-{dataset_name}_checkpoint",
    log_level = "error",
    num_train_epochs = 4,
    learning_rate = 2e-3,
    lr_scheduler_type = "linear",
    warmup_steps = 20,
    optim = "adamw_torch",
    weight_decay = 0.01,
    gradient_accumulation_steps = 16,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 1,
    evaluation_strategy = "steps",
    eval_steps = 50,
    predict_with_generate=True,
    generation_max_length = 512,
    save_steps = 100,
    logging_steps = 10
)

trainer = Seq2SeqTrainer(
    model           = model,
    args            = training_args,
    compute_metrics = metrics_func,
    train_dataset   = tokenized_ds['train'],
    eval_dataset    = tokenized_ds["test"].select(range(20)),
    tokenizer       = tokenizer,
    data_collator   = data_collator
)

In [None]:
start = time.time()
trainer.train()
end = time.time()
experiment_time['training'] = end - start
print(experiment_time)

{'loss': 9.269, 'learning_rate': 0.001, 'epoch': 0.11}
{'loss': 9.2633, 'learning_rate': 0.002, 'epoch': 0.21}
{'loss': 9.2385, 'learning_rate': 0.001943181818181818, 'epoch': 0.32}
{'loss': 9.2311, 'learning_rate': 0.0018863636363636365, 'epoch': 0.43}
{'loss': 9.239, 'learning_rate': 0.0018295454545454546, 'epoch': 0.54}


In [None]:
name_of_model = f'{model_name}/{model_name}_{dataset_name}_model'
os.makedirs(f"./{name_of_model}", exist_ok=True)
if hasattr(trainer.model, "module"):
    trainer.model.module.save_pretrained(f"./{name_of_model}")
else:
    trainer.model.save_pretrained(f"./{name_of_model}")

In [None]:
start = time.time()

all_preds = []
all_labels = []
# Predict with test data (first 5 rows)
sample_dataloader = DataLoader(
    tokenized_ds["test"].with_format("torch"),
    batch_size=5
    )
for i, batch in enumerate(sample_dataloader):
    with torch.no_grad():
      preds = model.generate(
          input_ids = batch["input_ids"].to(device),
          generation_config=GenerationConfig(
              num_beams=2,
              max_new_tokens=200
          )
      )

    labels = batch["labels"]
    # Replace -100 (see above)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Convert id tokens to text
    text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    if i == 0 :
      print(labels[0])
      print(text_preds[0])
      print(text_labels[0])

    all_preds  += text_preds
    all_labels += text_labels
    #break

end = time.time()
experiment_time['prediction'] = end - start
print(experiment_time)

In [None]:
trainer = Seq2SeqTrainer(
    model           = model,
    args            = training_args,
    data_collator   = data_collator,
    compute_metrics = metrics_func,
    train_dataset   = tokenized_ds["train"],
    eval_dataset    = tokenized_ds["test"],
    tokenizer       = tokenizer
)
start = time.time()
results = trainer.evaluate()
end = time.time()
experiment_time['evaluation'] = end - start
print(experiment_time)

model_result_path = f'{model_name}/{model_name}_{dataset_name}_results'
os.mkdir(model_result_path)

def save_json(file, data):
  with open(file, 'w') as f:
      json.dump(data, f)

save_json(f'{model_result_path}/all_preds.json', all_preds)
save_json(f'{model_result_path}/all_labels.json', all_labels)
save_json(f'{model_result_path}/result.json', results)
save_json(f'{model_result_path}/experiment_time.json', experiment_time)

shutil.make_archive(f'{model_name}', 'zip', f'{model_name}')

In [None]:
#from google.colab import runtime
#runtime.unassign()