In [1]:
# Necessary inputs
import warnings

from datasets import load_dataset, load_metric
import transformers
import datasets
import random
import pandas as pd
from IPython.display import display, HTML
import torch
from torch.utils.data import random_split

warnings.filterwarnings('ignore')

## Selecting the model
For the example purpose we select as model checkpoint the smallest transformer in T5 family - `t5_small`. Other pre-trained models can be found [here](https://huggingface.co/docs/transformers/model_doc/t5#:~:text=T5%20comes%20in%20different%20sizes%3A).

In [2]:
# selecting model checkpoint
model_checkpoint = "t5-small"

## Loading the dataset (???)

In [3]:
df_raw = pd.read_csv('../data/raw/filtered.tsv', delimiter='\t', index_col=0)
df_raw

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
...,...,...,...,...,...,...
577772,You didn't know that Estelle had stolen some f...,you didn't know that Estelle stole your fish f...,0.870322,0.030769,0.000121,0.949143
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.722897,0.058824,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.617511,0.212121,0.984538,0.000049
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.679613,0.358209,0.991945,0.000124


In [4]:
df = df_raw.drop(columns=["similarity", "lenght_diff"])

Make reference always more toxic than the translation:

In [5]:
mask = df['ref_tox'] < df['trn_tox']

df['translation'][mask], df['reference'][mask] = df['reference'][mask], df['translation'][mask]
df['ref_tox'][mask], df['trn_tox'][mask] = df['trn_tox'][mask], df['ref_tox'][mask]
df

Unnamed: 0,reference,translation,ref_tox,trn_tox
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t...",0.981983,0.014195
1,you're becoming disgusting.,Now you're getting nasty.,0.999039,0.065473
2,"well, we can spare your life.","Well, we could spare your life, for one.",0.985068,0.213313
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it.",0.994215,0.053362
4,I have orders to kill her.,I've got orders to put her down.,0.999348,0.009402
...,...,...,...,...
577772,you didn't know that Estelle stole your fish f...,You didn't know that Estelle had stolen some f...,0.949143,0.000121
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.984538,0.000049
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.991945,0.000124


In [6]:
import numpy as np
df = df[:][df['translation'].str.len() > 150]
df

Unnamed: 0,reference,translation,ref_tox,trn_tox
32,"and Murray has his eyes on his ass, he can't s...","And Murray, being the butt-plug that he is, do...",0.996764,0.059791
40,"Not content with merely killing my son, the Co...",the Count and his bloodlust-starved nephews we...,0.927254,0.049472
119,That fool's gonna look up and see that check t...,the poor driver looks at the payout that the c...,0.998072,0.000280
124,but it was a human race that put us all in thi...,But it was the human race that had jammed us a...,0.857898,0.074480
212,It was as if God had decided Delacroix needed ...,it seemed as if God had chosen to cast Delacro...,0.920242,0.001022
...,...,...,...,...
577552,"you know, just you and me... and that moment o...","You know, it's just that you and I finally hav...",0.859250,0.028703
577648,Or perhaps the ancient communal traditions... ...,"or, perhaps, those ancient social traditions t...",0.980451,0.022242
577711,"""I don't see how you get so upset,"" Arnie answ...","'I don't get what you're so upset about,' Arni...",0.999119,0.006092
577718,"but in the midst of her astonishment, she hear...",And then she was shockingly surprised to hear ...,0.996422,0.006014


In [7]:
class DetoxificationDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, prefix, max_length, return_scores=False, samples=-1):
        self.raw_data = dataframe.to_numpy()[:samples,:]
        self.return_scores = return_scores

        inputs = (prefix + self.raw_data[:,0]).astype(str).tolist()
        targets = self.raw_data[:,1].tolist()

        self.inputs = []
        for input, target in zip(inputs, targets):
            model_input = tokenizer(input, max_length=max_length, truncation=True)
            label = tokenizer(target, max_length=max_length, truncation=True)
            model_input['labels'] = label['input_ids']
            self.inputs.append(model_input)

        #for input, label in zip(self.inputs, labels):
        #    input['labels'] = label["input_ids"]

    def __getitem__(self, idx):
        return self.inputs[idx]

    def __len__(self):
        return len(self.inputs)

In [8]:
class LongerDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, prefix, max_length, return_scores=False, samples=-1):
        self.raw_data = dataframe.to_numpy()[:samples,:]
        self.return_scores = return_scores

        inputs = (prefix + self.raw_data[:,0]).astype(str).tolist()
        targets = self.raw_data[:,1].tolist()

        self.inputs = []
        for input in inputs + targets:
            model_input = tokenizer(input, max_length=max_length, truncation=True)
            model_input['labels'] = model_input['input_ids']
            self.inputs.append(model_input)

    def __getitem__(self, idx):
        return self.inputs[idx]

    def __len__(self):
        return len(self.inputs)

In [9]:
global_seed = 1984
# setting random seed for transformers library
transformers.set_seed(global_seed)

## Preprocessing the data
As usual we will need to preprocess data and tokenize it before passing to model

In [10]:
from transformers import AutoTokenizer

# we will use autotokenizer for this purpose
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

prefix = ""
dataset = LongerDataset(df, tokenizer, prefix=prefix, max_length=128)

val_ratio = 0.85
train_dataset, val_dataset = random_split(dataset, [1 - val_ratio, val_ratio])

## Fine-tuning the model

In [11]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [19]:
# create a model for the pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [18]:
from transformers import GenerationConfig

gcnf = GenerationConfig(max_length = 128, temperature = 0)

In [20]:
# defining the parameters for training
batch_size = 32
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-longer-2",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    report_to='tensorboard',
    generation_config=gcnf
)

In [21]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [22]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

In [23]:
# instead of writing train loop we will use Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics
)

In [24]:
trainer.train()

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/705 [00:00<?, ?it/s]

{'eval_loss': 0.0026724364142864943, 'eval_runtime': 60.2609, 'eval_samples_per_second': 373.891, 'eval_steps_per_second': 11.699, 'epoch': 1.0}


  0%|          | 0/705 [00:00<?, ?it/s]

{'eval_loss': 0.001692138030193746, 'eval_runtime': 31.5501, 'eval_samples_per_second': 714.133, 'eval_steps_per_second': 22.345, 'epoch': 2.0}


  0%|          | 0/705 [00:00<?, ?it/s]

{'eval_loss': 0.0018509796354919672, 'eval_runtime': 31.942, 'eval_samples_per_second': 705.372, 'eval_steps_per_second': 22.071, 'epoch': 3.0}
{'loss': 0.0104, 'learning_rate': 0.0001, 'epoch': 4.0}


  0%|          | 0/705 [00:00<?, ?it/s]

{'eval_loss': 0.0010793991386890411, 'eval_runtime': 33.3511, 'eval_samples_per_second': 675.569, 'eval_steps_per_second': 21.139, 'epoch': 4.0}


  0%|          | 0/705 [00:00<?, ?it/s]

{'eval_loss': 0.0010317499982193112, 'eval_runtime': 39.1672, 'eval_samples_per_second': 575.252, 'eval_steps_per_second': 18.0, 'epoch': 5.0}
{'train_runtime': 322.6652, 'train_samples_per_second': 61.627, 'train_steps_per_second': 1.937, 'train_loss': 0.008874515390396118, 'epoch': 5.0}


TrainOutput(global_step=625, training_loss=0.008874515390396118, metrics={'train_runtime': 322.6652, 'train_samples_per_second': 61.627, 'train_steps_per_second': 1.937, 'train_loss': 0.008874515390396118, 'epoch': 5.0})

In [25]:
# saving model
trainer.save_model('longer_best22')

In [26]:
# loading the model and run inference for it
model = AutoModelForSeq2SeqLM.from_pretrained('longer_best22')
model.eval()
model.config.use_cache = False

In [29]:
def translate(model, inference_request, tokenizer=tokenizer):
    input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
    outputs = model.generate(input_ids=input_ids)
    print(outputs[0].size())
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [18]:
#val_dataset[15]

In [19]:
#tokenizer.decode(val_dataset[15]['input_ids'], skip_special_tokens=False, temperature=0)

In [20]:
#tokenizer.decode(val_dataset[15]['labels'][:11], skip_special_tokens=False, temperature=0)

In [44]:
index = 0

In [68]:
index = index + 1
print(index)
dataset.raw_data[index][0]

17


'to quote Jake and Oppenheimer: "I must die, I must feel like a terrible god."'

In [30]:
inference_request = prefix + 'Another idea is to join the two columns (a and b) as strings, and check for 12 and 43, i.e.'
#inference_request = dataset.raw_data[index][0]
input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
translate(model, inference_request, tokenizer)

ValueError: `decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.