# Imports

In [1]:
import transformers
import torch
import random
import numpy as np
from torch.utils.data import random_split
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import GenerationConfig

import sys
if '../' not in sys.path: sys.path.insert(1, '../')
from src.data.make_dataset import load_detoxification_dataset, load_toxicity_dataset

# Load the pretrained T5

In [2]:
global_seed = 1984

transformers.set_seed(global_seed)
random.seed(global_seed)
np.random.seed(global_seed)
torch.manual_seed(global_seed)
torch.cuda.manual_seed_all(global_seed)
model_checkpoint = "t5-small"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
dataset_portion = 0.01
dataset_kwargs = {
    'path': '../data/raw/filtered.tsv', # path to raw data
    'cache_path': '../data/processed/tokenized.tsv', # path to processed data (or where to store it)
    'tokenizer': tokenizer, # tokenizer to tokenize texts
    'portion': dataset_portion # get only a portion of dataset [0..1]
}

# Dataset

In [5]:
dataset = load_detoxification_dataset(**dataset_kwargs)

val_ratio = 0.2
train_dataset, val_dataset = random_split(dataset, [1 - val_ratio, val_ratio])

# Training

In [6]:
# defining the parameters for training
genConfig = GenerationConfig.from_pretrained(model_checkpoint)
genConfig.max_new_tokens = 64

batch_size = 32
postfix = "-10"
save_model_path = f'../models/t5_detoxifier{postfix}'
args = Seq2SeqTrainingArguments(
    f"../models/{model_checkpoint}-detoxification{postfix}",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    report_to='tensorboard',
    logging_steps=5000,
    save_steps=10000,
    generation_config=genConfig
)

In [7]:
collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collator,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics
)

In [8]:
trainer.train()

  0%|          | 0/725 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/37 [00:00<?, ?it/s]

{'eval_loss': 2.121922016143799, 'eval_runtime': 0.9647, 'eval_samples_per_second': 1197.224, 'eval_steps_per_second': 38.353, 'epoch': 1.0}


  0%|          | 0/37 [00:00<?, ?it/s]

{'eval_loss': 2.0536093711853027, 'eval_runtime': 1.0399, 'eval_samples_per_second': 1110.679, 'eval_steps_per_second': 35.58, 'epoch': 2.0}


  0%|          | 0/37 [00:00<?, ?it/s]

{'eval_loss': 2.027437686920166, 'eval_runtime': 0.9848, 'eval_samples_per_second': 1172.771, 'eval_steps_per_second': 37.569, 'epoch': 3.0}


  0%|          | 0/37 [00:00<?, ?it/s]

{'eval_loss': 2.013796091079712, 'eval_runtime': 1.0278, 'eval_samples_per_second': 1123.806, 'eval_steps_per_second': 36.001, 'epoch': 4.0}


  0%|          | 0/37 [00:00<?, ?it/s]

{'eval_loss': 2.0110414028167725, 'eval_runtime': 1.0049, 'eval_samples_per_second': 1149.388, 'eval_steps_per_second': 36.82, 'epoch': 5.0}
{'train_runtime': 77.9113, 'train_samples_per_second': 296.619, 'train_steps_per_second': 9.305, 'train_loss': 2.274500942887931, 'epoch': 5.0}


TrainOutput(global_step=725, training_loss=2.274500942887931, metrics={'train_runtime': 77.9113, 'train_samples_per_second': 296.619, 'train_steps_per_second': 9.305, 'train_loss': 2.274500942887931, 'epoch': 5.0})

In [11]:
# saving model
trainer.save_model(save_model_path)

In [64]:
# loading the model and run inference for it
# model = AutoModelForSeq2SeqLM.from_pretrained(save_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained('../models backup/t5_detoxifier-10')
model.eval()
model.config.use_cache = False

# Testing ??

In [65]:
def translate(model, inference_request, tokenizer=tokenizer):
    tokenized = tokenizer.encode(inference_request, return_tensors="pt")
    outputs = model.generate(tokenized, generation_config=genConfig)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [76]:
inference_request = """
this guy is a con man. He's an actor. He's just a character.
"""
translate(model, inference_request)

This guy is a con, he's an actor, he's just a character.


# Validation ????

In [13]:
from src.models.t5_toxicity_evaluator import T5TEModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
evalutator = T5TEModel('../models/last_toxic_regressor/model.pt').to(device)
model.to(device)
_ = evalutator.model.eval()

In [14]:
eval_dataset = load_toxicity_dataset(**dataset_kwargs)
eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=128, shuffle=False, collate_fn=evalutator.collate_batch)

In [15]:
from tqdm.auto import tqdm
transformed = []

for batch in tqdm(eval_loader, total=len(eval_loader), desc='Translating'):
    output = model.generate(input_ids=batch.input_ids, attention_mask=batch.attention_mask, generation_config=genConfig)
    transformed += output.detach().cpu()

Translating:   0%|          | 0/452 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [16]:
ref_evaluations = []

torch.cuda.empty_cache()
for batch in tqdm(eval_loader, total=len(eval_loader), desc='Evaluation'):
    output = evalutator(batch)
    ref_evaluations += output.detach().cpu()

Evaluation:   0%|          | 0/452 [00:00<?, ?it/s]

In [17]:
transformed_keys = [{'input_ids': x} for x in transformed]
trn_loader = torch.utils.data.DataLoader(transformed_keys, batch_size=128, shuffle=False, collate_fn=evalutator.collate_batch)

In [18]:
trn_evaluations = []

torch.cuda.empty_cache()
for batch in tqdm(trn_loader, total=len(trn_loader), desc='Evaluation'):
    output = evalutator(batch)
    trn_evaluations += output.detach().cpu()

Evaluation:   0%|          | 0/452 [00:00<?, ?it/s]

In [19]:
torch.cuda.empty_cache()

In [20]:
threshold = 0.5

refevs = np.array(ref_evaluations)
trnevs = np.array(trn_evaluations)

ref_toxs = refevs > threshold
trn_toxs = trnevs > threshold

In [21]:
ref_neutrals = ref_toxs == False
ref_toxics = ref_toxs == True
trn_neutrals = trn_toxs == False
trn_toxics = trn_toxs == True

print(f'Neutral -> neutral: {np.sum(ref_neutrals)} -> {np.sum(np.logical_and(ref_neutrals, trn_neutrals))}')
print(f'Neutral -> toxic: {np.sum(ref_neutrals)} -> {np.sum(np.logical_and(ref_neutrals, trn_toxics))}')
print(f'Toxic -> neutral: {np.sum(ref_toxics)} -> {np.sum(np.logical_and(ref_toxics, trn_neutrals))}')
print(f'Toxic -> toxic: {np.sum(ref_toxics)} -> {np.sum(np.logical_and(ref_toxics, trn_toxics))}')

Neutral -> neutral: 27871 -> 27857
Neutral -> toxic: 27871 -> 14
Toxic -> neutral: 29906 -> 28996
Toxic -> toxic: 29906 -> 910
