## `In some places code can be simplified/removed/replaced`

### Importing all necessary packages and modules

In [1]:
import pandas as pd
import numpy as np
from datasets import load_metric
from datasets import Dataset
from transformers import (AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, 
                          Seq2SeqTrainingArguments, Seq2SeqTrainer)

### Loading and splitting data

In [2]:
# Read the tab-separated corpus of sentence pairs together with their toxicity scores.
df = pd.read_csv("../data/raw/filtered.tsv", sep="\t")

# Keep only the rows whose reference is scored as more toxic than its translation,
# then drop the first column of the frame.
reference_is_more_toxic = df["ref_tox"] > df["trn_tox"]
toxic = df.loc[reference_is_more_toxic].iloc[:, 1:]
toxic.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...,0.703185,0.206522,0.950956,0.035846
6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.,0.618866,0.230769,0.999492,0.000131
7,Maine was very short on black people back then.,there wasn't much black in Maine then.,0.720482,0.1875,0.96368,0.14871
11,"So now their spirits are cursed, walking back ...","their souls are cursed, they guard the paths, ...",0.755883,0.013245,0.842509,0.143992
13,"Come on, Cal, leave that shit alone.","come on, Cal, put it down.",0.660481,0.27027,0.999637,0.000279


### Sampling an example 

In [3]:
# Show one (reference, translation) pair -- the row at position 100 -- as a sanity check.
for column in ("reference", "translation"):
    print(toxic[column].values[100])

If you don't stop laughing, you'll end up dead... just like your idiot hyena cousins!
if you don't stop laughing, you come as your cousins of hyenas.


### Taking only a part of data with references & their translations

In [4]:
# Keep only the two text columns: the toxic source sentence and its paraphrase.
sins = toxic.loc[:, ["reference", "translation"]]

### Configuring how many samples are taken from the data for training (a small value is useful for a "fast check" of the training process)

In [8]:
# Number of leading rows of `sins` used by the following cells; defaults to every row.
part_factor = sins.shape[0]
# For a quick smoke test of the pipeline, use a small subset instead: part_factor = 1000

### Calculating the maximum length of a sentence (in words) to use as the length limit for tokenization and generation

In [9]:
# Sentence lengths counted in whitespace-separated words, for both text columns.
# `.str.len()` replaces the per-row Python lambda and yields NaN (instead of raising)
# for a missing cell; lower-casing is dropped because it cannot change a word count.
lens_r = sins[:part_factor]['reference'].str.split().str.len()
lens_t = sins[:part_factor]['translation'].str.split().str.len()

# Longest sentence over both columns, stored as a plain Python int so it can be passed
# on to the tokenizer and generation settings without carrying a NumPy scalar type.
# NOTE(review): this is a word count, but later cells use it as a token limit
# (`max_length`, `generation_max_length`) -- confirm that no sample is truncated.
max_len = int(max(lens_r.max(), lens_t.max()))
print(max_len)

### Creating a dataset to use .map() method for preparing data for the model training

In [12]:
# Build a Hugging Face dataset from the selected rows and hold out 20% of them as the "test" split.
# A fixed seed makes the shuffle and split identical on every run, so evaluation scores
# from different runs are measured on the same held-out samples.
dataset = Dataset.from_pandas(sins[:part_factor]).train_test_split(test_size=0.2, seed=42)

### Load model and tokenizer, tokenizing data, creating model and data collator objects

In [13]:
checkpoint = "t5-small" # Name of the pretrained checkpoint shared by the tokenizer and the model

tokenizer = AutoTokenizer.from_pretrained(checkpoint)   # Tokenizer matching `checkpoint`; reused for training, metrics and inference

# Function for preprocessing sentences
def preprocess_function(data):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Each sentence in ``data["reference"]`` (model input) and
    ``data["translation"]`` (target) is lower-cased and split on whitespace,
    then passed to the module-level ``tokenizer`` as pre-split words.
    Sequences are truncated to the module-level ``max_len``.

    Returns whatever the tokenizer call returns for the batch.
    """
    source_words = []
    for sentence in data["reference"]:
        source_words.append(sentence.lower().split())

    target_words = []
    for sentence in data["translation"]:
        target_words.append(sentence.lower().split())

    return tokenizer(
        source_words,
        text_target=target_words,
        max_length=max_len,
        truncation=True,
        is_split_into_words=True,
    )

# Tokenizing the data in the dataset (applied batch-wise to every split)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Creating a model's object
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Creating a data collator's object.
# The collator needs the model *instance*, not the checkpoint name: it uses the model's
# `prepare_decoder_input_ids_from_labels` method to build `decoder_input_ids` for each batch.
# Passing the string `checkpoint` silently disabled that step.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Map:   0%|          | 0/255313 [00:00<?, ? examples/s]

Map:   0%|          | 0/63829 [00:00<?, ? examples/s]

### Functions for calculating BLEU score metric

In [15]:
# SacreBLEU metric object; `compute_metrics` below calls `metric.compute(...)` on decoded text.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package -- confirm against the installed version before upgrading.
metric = load_metric("sacrebleu")

# Function for preprocessing data for calculating metrics
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the BLEU metric.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists, each holding
        the stripped reference for the prediction at the same position.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

# Main function for calculating metrics: BLEU score and average length of generated sentences
def compute_metrics(eval_preds):
    """Compute the BLEU score and the mean generated length for an evaluation pass.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays; when
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        Dict with ``"bleu"`` (the ``"score"`` entry of the SacreBLEU result,
        computed over all decoded predictions against their references) and
        ``"gen_len"`` (mean number of non-pad tokens per prediction), both
        rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    preds = preds[0] if isinstance(preds, tuple) else preds
    pad_id = tokenizer.pad_token_id

    # -100 marks ignored positions and cannot be decoded, so swap it for the pad token id.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Token ids -> text, then into the shape the metric expects.
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Length of each generated sequence, not counting padding.
    generated_lengths = [np.count_nonzero(pred != pad_id) for pred in preds]

    scores = {"bleu": bleu["score"], "gen_len": np.mean(generated_lengths)}
    return {name: round(value, 4) for name, value in scores.items()}

  metric = load_metric("sacrebleu")


### Choosing training hyperparameters, configuring the training process and launching it

In [17]:
# Directory where the trainer writes its checkpoints
new_model = '../models/T5-small_2e-5_32_10_MaxTokenLength'

# All training arguments/hyperparameters, grouped by purpose
training_args = Seq2SeqTrainingArguments(
    # Checkpointing
    output_dir=new_model,
    overwrite_output_dir=True,
    save_total_limit=3,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=1e-6,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    fp16=True,
    # Evaluation: once per epoch, scoring generated text
    evaluation_strategy="epoch",
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=max_len + 5,
)

# Trainer wiring: model, data splits, batching and metric callback
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics,
)

# Launch training; the returned TrainOutput is displayed as the cell result
trainer.train()

  0%|          | 0/79790 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.0181, 'learning_rate': 1.987542298533651e-05, 'epoch': 0.06}
{'loss': 1.8386, 'learning_rate': 1.975009399674145e-05, 'epoch': 0.13}
{'loss': 1.7833, 'learning_rate': 1.9624765008146387e-05, 'epoch': 0.19}
{'loss': 1.7439, 'learning_rate': 1.9499436019551323e-05, 'epoch': 0.25}
{'loss': 1.7236, 'learning_rate': 1.9374107030956263e-05, 'epoch': 0.31}
{'loss': 1.7051, 'learning_rate': 1.92487780423612e-05, 'epoch': 0.38}
{'loss': 1.7026, 'learning_rate': 1.912369971174333e-05, 'epoch': 0.44}
{'loss': 1.6885, 'learning_rate': 1.8998370723148265e-05, 'epoch': 0.5}
{'loss': 1.6686, 'learning_rate': 1.8873041734553205e-05, 'epoch': 0.56}
{'loss': 1.6598, 'learning_rate': 1.8747712745958142e-05, 'epoch': 0.63}
{'loss': 1.6543, 'learning_rate': 1.862238375736308e-05, 'epoch': 0.69}
{'loss': 1.6467, 'learning_rate': 1.8497054768768018e-05, 'epoch': 0.75}
{'loss': 1.6412, 'learning_rate': 1.8371725780172955e-05, 'epoch': 0.81}
{'loss': 1.6379, 'learning_rate': 1.8246396791577894e-05, 

  0%|          | 0/1995 [00:00<?, ?it/s]

{'eval_loss': 1.493385672569275, 'eval_bleu': 28.7427, 'eval_gen_len': 16.3043, 'eval_runtime': 2547.2054, 'eval_samples_per_second': 25.058, 'eval_steps_per_second': 0.783, 'epoch': 1.0}
{'loss': 1.6316, 'learning_rate': 1.7995738814387767e-05, 'epoch': 1.0}
{'loss': 1.6065, 'learning_rate': 1.7870660483769897e-05, 'epoch': 1.07}
{'loss': 1.6049, 'learning_rate': 1.7745331495174837e-05, 'epoch': 1.13}
{'loss': 1.6093, 'learning_rate': 1.7620002506579773e-05, 'epoch': 1.19}
{'loss': 1.6169, 'learning_rate': 1.749467351798471e-05, 'epoch': 1.25}
{'loss': 1.6062, 'learning_rate': 1.736934452938965e-05, 'epoch': 1.32}
{'loss': 1.606, 'learning_rate': 1.7244015540794586e-05, 'epoch': 1.38}
{'loss': 1.5861, 'learning_rate': 1.7118937210176715e-05, 'epoch': 1.44}
{'loss': 1.5889, 'learning_rate': 1.6993608221581652e-05, 'epoch': 1.5}
{'loss': 1.5837, 'learning_rate': 1.6868279232986592e-05, 'epoch': 1.57}
{'loss': 1.5847, 'learning_rate': 1.6742950244391528e-05, 'epoch': 1.63}
{'loss': 1.575

  0%|          | 0/1995 [00:00<?, ?it/s]

{'eval_loss': 1.4485167264938354, 'eval_bleu': 29.1851, 'eval_gen_len': 16.2659, 'eval_runtime': 2693.1319, 'eval_samples_per_second': 23.701, 'eval_steps_per_second': 0.741, 'epoch': 2.0}
{'loss': 1.5726, 'learning_rate': 1.5991477628775536e-05, 'epoch': 2.01}
{'loss': 1.5619, 'learning_rate': 1.5866148640180473e-05, 'epoch': 2.07}
{'loss': 1.5497, 'learning_rate': 1.5741070309562602e-05, 'epoch': 2.13}
{'loss': 1.5601, 'learning_rate': 1.5615741320967542e-05, 'epoch': 2.19}
{'loss': 1.5471, 'learning_rate': 1.549041233237248e-05, 'epoch': 2.26}
{'loss': 1.5544, 'learning_rate': 1.5365083343777415e-05, 'epoch': 2.32}
{'loss': 1.5678, 'learning_rate': 1.5239754355182355e-05, 'epoch': 2.38}
{'loss': 1.5409, 'learning_rate': 1.5114425366587293e-05, 'epoch': 2.44}
{'loss': 1.5606, 'learning_rate': 1.4989096377992231e-05, 'epoch': 2.51}
{'loss': 1.5546, 'learning_rate': 1.4863767389397168e-05, 'epoch': 2.57}
{'loss': 1.5486, 'learning_rate': 1.4738438400802106e-05, 'epoch': 2.63}
{'loss': 

  0%|          | 0/1995 [00:00<?, ?it/s]

{'eval_loss': 1.424839735031128, 'eval_bleu': 29.5602, 'eval_gen_len': 16.2472, 'eval_runtime': 2809.5533, 'eval_samples_per_second': 22.719, 'eval_steps_per_second': 0.71, 'epoch': 3.0}
{'loss': 1.5322, 'learning_rate': 1.3986965785186116e-05, 'epoch': 3.01}
{'loss': 1.5252, 'learning_rate': 1.3861636796591054e-05, 'epoch': 3.07}
{'loss': 1.5327, 'learning_rate': 1.373630780799599e-05, 'epoch': 3.13}
{'loss': 1.5401, 'learning_rate': 1.3610978819400929e-05, 'epoch': 3.2}
{'loss': 1.5239, 'learning_rate': 1.3485649830805867e-05, 'epoch': 3.26}
{'loss': 1.5178, 'learning_rate': 1.3360320842210805e-05, 'epoch': 3.32}
{'loss': 1.5273, 'learning_rate': 1.3234991853615742e-05, 'epoch': 3.38}
{'loss': 1.5289, 'learning_rate': 1.310966286502068e-05, 'epoch': 3.45}
{'loss': 1.5153, 'learning_rate': 1.298433387642562e-05, 'epoch': 3.51}
{'loss': 1.5208, 'learning_rate': 1.2859004887830558e-05, 'epoch': 3.57}
{'loss': 1.5185, 'learning_rate': 1.2733675899235494e-05, 'epoch': 3.63}
{'loss': 1.515

  0%|          | 0/1995 [00:00<?, ?it/s]

{'eval_loss': 1.4082581996917725, 'eval_bleu': 29.6821, 'eval_gen_len': 16.2147, 'eval_runtime': 2816.1135, 'eval_samples_per_second': 22.666, 'eval_steps_per_second': 0.708, 'epoch': 4.0}
{'loss': 1.5251, 'learning_rate': 1.1981952625642313e-05, 'epoch': 4.01}
{'loss': 1.5069, 'learning_rate': 1.1856623637047251e-05, 'epoch': 4.07}
{'loss': 1.5092, 'learning_rate': 1.1731545306429377e-05, 'epoch': 4.14}
{'loss': 1.5136, 'learning_rate': 1.1606216317834315e-05, 'epoch': 4.2}
{'loss': 1.5252, 'learning_rate': 1.1480887329239253e-05, 'epoch': 4.26}
{'loss': 1.5017, 'learning_rate': 1.1355558340644193e-05, 'epoch': 4.32}
{'loss': 1.5064, 'learning_rate': 1.123022935204913e-05, 'epoch': 4.39}
{'loss': 1.5047, 'learning_rate': 1.1105151021431258e-05, 'epoch': 4.45}
{'loss': 1.5049, 'learning_rate': 1.0979822032836196e-05, 'epoch': 4.51}
{'loss': 1.4827, 'learning_rate': 1.0854493044241134e-05, 'epoch': 4.57}
{'loss': 1.5033, 'learning_rate': 1.072916405564607e-05, 'epoch': 4.64}
{'loss': 1.

  0%|          | 0/1995 [00:00<?, ?it/s]

{'eval_loss': 1.3975569009780884, 'eval_bleu': 30.0868, 'eval_gen_len': 16.2677, 'eval_runtime': 2812.3094, 'eval_samples_per_second': 22.696, 'eval_steps_per_second': 0.709, 'epoch': 5.0}
{'loss': 1.4909, 'learning_rate': 9.97794209800727e-06, 'epoch': 5.01}
{'loss': 1.5038, 'learning_rate': 9.852613109412208e-06, 'epoch': 5.08}
{'loss': 1.4966, 'learning_rate': 9.727284120817145e-06, 'epoch': 5.14}
{'loss': 1.4964, 'learning_rate': 9.601955132222084e-06, 'epoch': 5.2}
{'loss': 1.4945, 'learning_rate': 9.476626143627021e-06, 'epoch': 5.26}
{'loss': 1.4969, 'learning_rate': 9.351297155031959e-06, 'epoch': 5.33}
{'loss': 1.5042, 'learning_rate': 9.225968166436897e-06, 'epoch': 5.39}
{'loss': 1.4878, 'learning_rate': 9.100639177841835e-06, 'epoch': 5.45}
{'loss': 1.4803, 'learning_rate': 8.975310189246774e-06, 'epoch': 5.51}
{'loss': 1.4755, 'learning_rate': 8.850231858628901e-06, 'epoch': 5.58}
{'loss': 1.4887, 'learning_rate': 8.72490287003384e-06, 'epoch': 5.64}
{'loss': 1.5035, 'lear

  0%|          | 0/1995 [00:00<?, ?it/s]

{'eval_loss': 1.3895801305770874, 'eval_bleu': 30.1083, 'eval_gen_len': 16.2335, 'eval_runtime': 2700.7207, 'eval_samples_per_second': 23.634, 'eval_steps_per_second': 0.739, 'epoch': 6.0}
{'loss': 1.4897, 'learning_rate': 7.973430254417848e-06, 'epoch': 6.02}
{'loss': 1.4743, 'learning_rate': 7.848101265822786e-06, 'epoch': 6.08}
{'loss': 1.4763, 'learning_rate': 7.722772277227724e-06, 'epoch': 6.14}
{'loss': 1.4912, 'learning_rate': 7.597443288632661e-06, 'epoch': 6.2}
{'loss': 1.4794, 'learning_rate': 7.4721143000375996e-06, 'epoch': 6.27}
{'loss': 1.4792, 'learning_rate': 7.346785311442537e-06, 'epoch': 6.33}
{'loss': 1.4801, 'learning_rate': 7.221456322847475e-06, 'epoch': 6.39}
{'loss': 1.4888, 'learning_rate': 7.096127334252413e-06, 'epoch': 6.45}
{'loss': 1.4865, 'learning_rate': 6.970798345657351e-06, 'epoch': 6.52}
{'loss': 1.4843, 'learning_rate': 6.845469357062289e-06, 'epoch': 6.58}
{'loss': 1.4715, 'learning_rate': 6.720391026444417e-06, 'epoch': 6.64}
{'loss': 1.4684, 'l

  0%|          | 0/1995 [00:00<?, ?it/s]

{'eval_loss': 1.3839020729064941, 'eval_bleu': 30.199, 'eval_gen_len': 16.2308, 'eval_runtime': 2813.5709, 'eval_samples_per_second': 22.686, 'eval_steps_per_second': 0.709, 'epoch': 7.0}
{'loss': 1.4772, 'learning_rate': 5.968667752851235e-06, 'epoch': 7.02}
{'loss': 1.485, 'learning_rate': 5.843338764256173e-06, 'epoch': 7.08}
{'loss': 1.4785, 'learning_rate': 5.718009775661111e-06, 'epoch': 7.14}
{'loss': 1.4679, 'learning_rate': 5.59268078706605e-06, 'epoch': 7.21}
{'loss': 1.4683, 'learning_rate': 5.467351798470987e-06, 'epoch': 7.27}
{'loss': 1.462, 'learning_rate': 5.3422734678531156e-06, 'epoch': 7.33}
{'loss': 1.4802, 'learning_rate': 5.216944479258053e-06, 'epoch': 7.39}
{'loss': 1.4826, 'learning_rate': 5.091615490662991e-06, 'epoch': 7.46}
{'loss': 1.4645, 'learning_rate': 4.966286502067928e-06, 'epoch': 7.52}
{'loss': 1.4817, 'learning_rate': 4.8409575134728666e-06, 'epoch': 7.58}
{'loss': 1.4616, 'learning_rate': 4.715628524877805e-06, 'epoch': 7.65}
{'loss': 1.464, 'lear

  0%|          | 0/1995 [00:00<?, ?it/s]

{'eval_loss': 1.3794429302215576, 'eval_bleu': 30.2507, 'eval_gen_len': 16.2425, 'eval_runtime': 2876.0606, 'eval_samples_per_second': 22.193, 'eval_steps_per_second': 0.694, 'epoch': 8.0}
{'loss': 1.4795, 'learning_rate': 3.9639052512846225e-06, 'epoch': 8.02}
{'loss': 1.4637, 'learning_rate': 3.83882692066675e-06, 'epoch': 8.08}
{'loss': 1.473, 'learning_rate': 3.7134979320716884e-06, 'epoch': 8.15}
{'loss': 1.4729, 'learning_rate': 3.5881689434766266e-06, 'epoch': 8.21}
{'loss': 1.468, 'learning_rate': 3.4628399548815644e-06, 'epoch': 8.27}
{'loss': 1.4652, 'learning_rate': 3.3375109662865025e-06, 'epoch': 8.33}
{'loss': 1.4666, 'learning_rate': 3.2121819776914403e-06, 'epoch': 8.4}
{'loss': 1.4792, 'learning_rate': 3.0868529890963785e-06, 'epoch': 8.46}
{'loss': 1.4684, 'learning_rate': 2.961524000501316e-06, 'epoch': 8.52}
{'loss': 1.4676, 'learning_rate': 2.8361950119062544e-06, 'epoch': 8.59}
{'loss': 1.4683, 'learning_rate': 2.711116681288382e-06, 'epoch': 8.65}
{'loss': 1.4709

  0%|          | 0/1995 [00:00<?, ?it/s]

{'eval_loss': 1.377905011177063, 'eval_bleu': 30.2792, 'eval_gen_len': 16.194, 'eval_runtime': 2716.4888, 'eval_samples_per_second': 23.497, 'eval_steps_per_second': 0.734, 'epoch': 9.0}
{'loss': 1.468, 'learning_rate': 1.9593934076952e-06, 'epoch': 9.02}
{'loss': 1.4739, 'learning_rate': 1.834064419100138e-06, 'epoch': 9.09}
{'loss': 1.4561, 'learning_rate': 1.708735430505076e-06, 'epoch': 9.15}
{'loss': 1.4717, 'learning_rate': 1.583406441910014e-06, 'epoch': 9.21}
{'loss': 1.4597, 'learning_rate': 1.458077453314952e-06, 'epoch': 9.27}
{'loss': 1.4724, 'learning_rate': 1.3329991226970797e-06, 'epoch': 9.34}
{'loss': 1.4575, 'learning_rate': 1.207920792079208e-06, 'epoch': 9.4}
{'loss': 1.4627, 'learning_rate': 1.0825918034841459e-06, 'epoch': 9.46}
{'loss': 1.4617, 'learning_rate': 9.572628148890838e-07, 'epoch': 9.53}
{'loss': 1.4635, 'learning_rate': 8.319338262940219e-07, 'epoch': 9.59}
{'loss': 1.4666, 'learning_rate': 7.066048376989598e-07, 'epoch': 9.65}
{'loss': 1.4647, 'learn

  0%|          | 0/1995 [00:00<?, ?it/s]

{'eval_loss': 1.3774222135543823, 'eval_bleu': 30.3103, 'eval_gen_len': 16.216, 'eval_runtime': 2770.9514, 'eval_samples_per_second': 23.035, 'eval_steps_per_second': 0.72, 'epoch': 10.0}
{'train_runtime': 44320.725, 'train_samples_per_second': 57.606, 'train_steps_per_second': 1.8, 'train_loss': 1.5262058627621873, 'epoch': 10.0}


TrainOutput(global_step=79790, training_loss=1.5262058627621873, metrics={'train_runtime': 44320.725, 'train_samples_per_second': 57.606, 'train_steps_per_second': 1.8, 'train_loss': 1.5262058627621873, 'epoch': 10.0})

### Loading a model checkpoint and generating a detoxified sentence
    I hope that there is no need to comment anything

In [18]:
# Load fine-tuned seq2seq weights from disk for inference.
# NOTE(review): this path differs from the `output_dir` used for training in this notebook,
# and no cell shown here saves a model to it -- confirm the checkpoint was exported separately.
test_model = AutoModelForSeq2SeqLM.from_pretrained("../models/T5-small_detox/final_checkpoint/")

In [91]:
# Encode one toxic sentence the same way the training data was prepared:
# lower-cased, split on whitespace, truncated to `max_len`.
toxic_sentence = "Are we going to bring these self-pissing dogs here?"
encoded_sentence = tokenizer(
    toxic_sentence.lower().split(),
    max_length=max_len,
    truncation=True,
    is_split_into_words=True,
    return_tensors="pt",
)
input_ids = encoded_sentence.input_ids

# Generate the paraphrase, allowing a few tokens beyond `max_len`.
outputs = test_model.generate(input_ids, max_new_tokens=max_len + 5)

In [92]:
# Turn the first generated sequence back into text, dropping special tokens.
detoxified_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(detoxified_sentence)

are we going to bring these dogs here?
