# Fine-tuning a ceshine/t5-paraphrase-paws-msrp-opinosis model on a detoxification task with prefix
Test is performed in collab because I do not have GPU

In [1]:
from google.colab import drive # Using drive to quckly use my dataset
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# installing huggingface libraries for dataset, models and metrics
!pip install datasets transformers[sentencepiece] sacrebleu
!pip install numpy==1.24.3



In [3]:
# Necessary inputs
import warnings

from datasets import load_from_disk, load_metric
import transformers
# import datasets
# import random
# import pandas as pd
# from IPython.display import display, HTML

warnings.filterwarnings('ignore')

## Selecting the model
For the example purpose we select as model checkpoint the smallest transformer in T5 family - `t5_small`. Other pre-trained models can be found [here](https://huggingface.co/docs/transformers/model_doc/t5#:~:text=T5%20comes%20in%20different%20sizes%3A).

In [4]:
# selecting model checkpoint
model_checkpoint = "ceshine/t5-paraphrase-paws-msrp-opinosis"

## Loading the dataset

In [5]:
# setting random seed for transformers library
transformers.set_seed(42)

# Load my dataset from interim/ ditrectory
raw_datasets = load_from_disk("/content/drive/MyDrive/interim/")

# Load the BLUE metric
metric = load_metric("sacrebleu", split='train')

## Dataset
This dataset is exact one from interim/ directory that was formed using my make_dataset.py script. It contains keys `["train", "validation", "test"]` - which represents a dataset splits

In [6]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['toxic_comment', 'neutral_comment'],
        num_rows: 404443
    })
    validation: Dataset({
        features: ['toxic_comment', 'neutral_comment'],
        num_rows: 57778
    })
    test: Dataset({
        features: ['toxic_comment', 'neutral_comment'],
        num_rows: 115556
    })
})

In [7]:
# samples from train dataset
raw_datasets["train"][:5]

{'toxic_comment': ['if alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.',
  "now you're getting nasty.",
  'well, we could spare your life, for one.',
  "ah! monkey, you've got to snap out of it.",
  "i've got orders to put her down."],
 'neutral_comment': ['if alkar floods her with her mental waste, it would explain the high levels of neurotransmitter.',
  "you're becoming disgusting.",
  'well, we can spare your life.',
  'monkey, you have to wake up.',
  'i have orders to kill her.']}

## Preprocessing the data
As usual we will need to preprocess data and tokenize it before passing to model

In [8]:
from transformers import AutoTokenizer

# we will use autotokenizer for this purpose
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# prefix for model input
prefix = "turn toxic to neutral"

In [10]:
max_input_length = 128
max_target_length = 128
toxic_sent = "toxic_comment"
target_sent = "neutral_comment"

def preprocess_function(examples):
    inputs = [prefix + ex for ex in examples[toxic_sent]]
    targets = [ex for ex in examples[target_sent]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
# example of preprocessing
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[919, 12068, 12, 7163, 99, 491, 4031, 19, 18368, 160, 28, 26829, 2670, 6, 24, 3, 9453, 8, 306, 593, 13, 6567, 7031, 1538, 4849, 5, 1], [919, 12068, 12, 7163, 7651, 25, 31, 60, 652, 23147, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[3, 99, 491, 4031, 8347, 7, 160, 28, 160, 2550, 2670, 6, 34, 133, 3209, 8, 306, 1425, 13, 6567, 7031, 1538, 449, 5, 1], [25, 31, 60, 2852, 27635, 53, 5, 1]]}

In [12]:
# for the example purpose we will crop the dataset and select first 5000 for train
# and 500 for validation and test
cropped_datasets = raw_datasets
cropped_datasets['train'] = raw_datasets['train'].select(range(5000))
cropped_datasets['validation'] = raw_datasets['validation'].select(range(500))
cropped_datasets['test'] = raw_datasets['test'].select(range(500))
tokenized_datasets = cropped_datasets.map(preprocess_function, batched=True)

{'toxic_comment': 'if alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.',
 'neutral_comment': 'if alkar floods her with her mental waste, it would explain the high levels of neurotransmitter.',
 'input_ids': [919,
  12068,
  12,
  7163,
  99,
  491,
  4031,
  19,
  18368,
  160,
  28,
  26829,
  2670,
  6,
  24,
  3,
  9453,
  8,
  306,
  593,
  13,
  6567,
  7031,
  1538,
  4849,
  5,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [3,
  99,
  491,
  4031,
  8347,
  7,
  160,
  28,
  160,
  2550,
  2670,
  6,
  34,
  133,
  3209,
  8,
  306,
  1425,
  13,
  6567,
  7031,
  1538,
  449,
  5,
  1]}

## Fine-tuning the model

In [13]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# create a model for the pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [14]:
# ! pip install accelerate -U

In [15]:
# defining the parameters for training
batch_size = 32
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{toxic_sent}-to-{target_sent}",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.00,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    report_to='tensorboard',
)

In [16]:
# instead of writing collate_fn function we will use DataCollatorForSeq2Seq
# simliarly it implements the batch creation for training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
import numpy as np

# simple postprocessing for text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

# compute metrics function to pass to trainer
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [18]:
# instead of writing train loop we will use Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [19]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.456775,24.0591,14.08
2,No log,1.413215,24.8854,13.936
3,No log,1.397439,25.1478,13.924
4,1.567100,1.392835,25.1933,13.87
5,1.567100,1.389202,25.4205,13.886
6,1.567100,1.39326,25.5133,13.822
7,1.310600,1.392417,25.3374,13.824
8,1.310600,1.394696,25.8742,13.796
9,1.310600,1.395465,25.7859,13.782
10,1.219100,1.397373,25.687,13.768


TrainOutput(global_step=1570, training_loss=1.3576157247944243, metrics={'train_runtime': 696.2629, 'train_samples_per_second': 71.812, 'train_steps_per_second': 2.255, 'total_flos': 3068776684707840.0, 'train_loss': 1.3576157247944243, 'epoch': 10.0})

In [20]:
# saving model
trainer.save_model('best')

In [21]:
# loading the model and run inference for it
model = AutoModelForSeq2SeqLM.from_pretrained('best')
model.eval()
model.config.use_cache = False

In [22]:
def translate(model, inference_request, tokenizer=tokenizer):
    input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
    outputs = model.generate(input_ids=input_ids)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True,temperature=0))

In [23]:
inference_request = prefix + "Now you're getting cocky, bastard"
translate(model, inference_request,tokenizer)

now you're a cocky, shit.
