## `In some places code can be simplified/removed/replaced`

### Importing all necessary packages and modules

In [2]:
import pandas as pd
import numpy as np
from datasets import load_metric
from datasets import Dataset
from transformers import (DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, 
                          Seq2SeqTrainer, BartForConditionalGeneration, BartTokenizer)

### Loading and splitting data

In [None]:
# Read the raw tab-separated corpus.
df = pd.read_csv("../data/raw/filtered.tsv", sep="\t")

# Keep only the rows whose reference has a higher toxicity score than its
# translation, then drop the first column by position
# (presumably a saved row index — TODO confirm against the file).
reference_is_more_toxic = df["ref_tox"].gt(df["trn_tox"])
toxic = df.loc[reference_is_more_toxic].iloc[:, 1:]
toxic.head()

### Sampling an example 

In [4]:
# Print the reference and then the translation of the pair at position 100.
for column in ("reference", "translation"):
    print(toxic[column].iloc[100])

If you don't stop laughing, you'll end up dead... just like your idiot hyena cousins!
if you don't stop laughing, you come as your cousins of hyenas.


### Keeping only the reference and translation columns

In [5]:
# Keep just the two text columns: the toxic reference and its translation.
sins = toxic.loc[:, ["reference", "translation"]]

### Here it is possible to configure how many samples are taken from the data for training (useful for a fast check of the training process)

In [6]:
# Number of leading rows of `sins` to use; the full row count means "use everything".
part_factor = len(sins)
# part_factor = 10000

### Calculating the maximum sentence length (in words) used as the length limit for tokenization and generation

In [7]:
# Word counts (whitespace-separated) for every sentence in the selected subset.
# `.str.len()` on the split lists replaces the per-row Python lambda, and the
# lower-casing step is dropped because it cannot change the number of words.
selected = sins[:part_factor]
lens_r = selected["reference"].str.split().str.len()
lens_t = selected["translation"].str.split().str.len()

# Longest sentence across both columns, converted to a plain Python int so the
# value is not a numpy scalar when it is handed to the tokenizer / training config.
# NOTE(review): this is a count of *words*, but it is later passed as the
# tokenizer's `max_length`, which limits *tokens*. Subword and special tokens
# can make a sentence longer than its word count, so the longest sentences may
# be truncated — confirm this is intended.
max_len = int(max(lens_r.max(), lens_t.max()))
print(max_len)

### Creating a dataset to use .map() method for preparing data for the model training

In [12]:
# Build a dataset from the selected rows and hold out 20% of them as the "test" split.
# A fixed seed makes the shuffle — and therefore the train/test membership —
# the same on every run, so evaluation numbers stay comparable after a kernel restart.
SPLIT_SEED = 42
dataset = Dataset.from_pandas(sins[:part_factor]).train_test_split(test_size=0.2, seed=SPLIT_SEED)

### Loading the tokenizer and model, tokenizing the data, and creating the data collator

In [13]:
# Name of the pretrained checkpoint that both the tokenizer and the model are loaded from.
checkpoint = "ccdv/lsg-bart-base-4096"

# Tokenizer belonging to that checkpoint.
tokenizer = BartTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

def preprocess_function(data):
    """Tokenize a batch of reference/translation pairs for seq2seq training.

    Every sentence is lower-cased and split on whitespace; the resulting word
    lists are passed to the module-level ``tokenizer`` as pre-split input, with
    the references as model inputs and the translations as ``text_target``.
    Sequences are truncated to the module-level ``max_len``.

    Args:
        data: Batch mapping with "reference" and "translation" entries, each a
            list of sentences (strings).

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    def _to_words(sentences):
        # Lower-case first, then split on any run of whitespace.
        return [sentence.lower().split() for sentence in sentences]

    return tokenizer(
        _to_words(data["reference"]),
        text_target=_to_words(data["translation"]),
        max_length=max_len,
        truncation=True,
        is_split_into_words=True,
    )

# Apply the preprocessing to every split of the dataset, one batch at a time.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

# Seq2seq model initialised from the same checkpoint as the tokenizer.
model = BartForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
    forced_bos_token_id=0,
)

# Data collator built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

Map:   0%|          | 0/255313 [00:00<?, ? examples/s]

Map:   0%|          | 0/63829 [00:00<?, ? examples/s]

### Functions for calculating BLEU score metric

In [13]:
# SacreBLEU metric object used by `compute_metrics`.
metric = load_metric(path="sacrebleu")

def postprocess_text(preds, labels):
    """Prepare decoded predictions and labels for the metric.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded label strings.

    Returns:
        A tuple ``(preds, labels)`` where every string has its surrounding
        whitespace stripped and every label is wrapped in its own
        single-element list.
    """
    stripped_preds = []
    for prediction in preds:
        stripped_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute the BLEU score and the mean generated length for an evaluation run.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. If ``preds``
            is a tuple, only its first element is used.

    Returns:
        Dict with "bleu" (the single score reported by the metric for all
        predictions together) and "gen_len" (mean number of non-pad tokens per
        prediction), both rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _decode(token_ids):
        # -100 is not a real token id, so swap it for the pad id before decoding.
        cleaned = np.where(token_ids != -100, token_ids, pad_id)
        return cleaned, tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    preds, decoded_preds = _decode(preds)
    _, decoded_labels = _decode(labels)

    # Strip whitespace and wrap each label in a list before scoring.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)["score"]

    # Length of a prediction = number of token ids that are not padding.
    gen_len = np.mean([np.count_nonzero(pred != pad_id) for pred in preds])

    return {"bleu": round(bleu, 4), "gen_len": round(gen_len, 4)}

### Choosing the training hyperparameters, configuring the training process, and launching it

In [16]:
# Directory that receives the checkpoints written during training.
new_model = '../models/Bart_2e-5_16_10_MaxTokenLength'

# Directory for the final fine-tuned model. The inference section of this
# notebook loads its model from exactly this path, so it must be written here;
# otherwise a fresh "Restart & Run All" has nothing to load.
# NOTE: an existing model in this directory is overwritten.
final_model_dir = "../models/Bart-base-4096_detox/final_checkpoint/"

# All training arguments/hyperparameters
training_args = Seq2SeqTrainingArguments(
    output_dir=new_model,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",      # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=3,               # limit on the number of checkpoints kept
    num_train_epochs=4,
    predict_with_generate=True,       # evaluation metrics are computed on generated text
    fp16=True,
    generation_max_length=max_len + 5,
    weight_decay=1e-6,
)

# Trainer object creation
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run training, then persist the final model where the inference cells expect it.
train_output = trainer.train()
trainer.save_model(final_model_dir)

# Keep the training summary as the cell's displayed value.
train_output

  0%|          | 0/63832 [00:00<?, ?it/s]

{'loss': 1.8458, 'learning_rate': 1.9844592054142124e-05, 'epoch': 0.03}
{'loss': 1.701, 'learning_rate': 1.9688244140869785e-05, 'epoch': 0.06}
{'loss': 1.6346, 'learning_rate': 1.9531582905125955e-05, 'epoch': 0.09}
{'loss': 1.5996, 'learning_rate': 1.937492166938213e-05, 'epoch': 0.13}
{'loss': 1.5754, 'learning_rate': 1.9218260433638303e-05, 'epoch': 0.16}
{'loss': 1.5606, 'learning_rate': 1.9061599197894474e-05, 'epoch': 0.19}
{'loss': 1.5458, 'learning_rate': 1.8904937962150648e-05, 'epoch': 0.22}
{'loss': 1.5276, 'learning_rate': 1.8748276726406818e-05, 'epoch': 0.25}
{'loss': 1.5308, 'learning_rate': 1.8591615490662992e-05, 'epoch': 0.28}
{'loss': 1.4942, 'learning_rate': 1.8434954254919163e-05, 'epoch': 0.31}
{'loss': 1.5057, 'learning_rate': 1.8278293019175337e-05, 'epoch': 0.34}
{'loss': 1.4961, 'learning_rate': 1.8121631783431507e-05, 'epoch': 0.38}
{'loss': 1.4803, 'learning_rate': 1.796528387015917e-05, 'epoch': 0.41}
{'loss': 1.4853, 'learning_rate': 1.7808622634415342e-

  0%|          | 0/3990 [00:00<?, ?it/s]

{'eval_loss': 1.306068778038025, 'eval_bleu': 32.844, 'eval_gen_len': 15.2113, 'eval_runtime': 7134.6231, 'eval_samples_per_second': 8.946, 'eval_steps_per_second': 0.559, 'epoch': 1.0}
{'loss': 1.3761, 'learning_rate': 1.4989973680912395e-05, 'epoch': 1.0}
{'loss': 1.3226, 'learning_rate': 1.4833312445168569e-05, 'epoch': 1.03}
{'loss': 1.3055, 'learning_rate': 1.4676651209424741e-05, 'epoch': 1.07}
{'loss': 1.3019, 'learning_rate': 1.4519989973680915e-05, 'epoch': 1.1}
{'loss': 1.3184, 'learning_rate': 1.4363642060408573e-05, 'epoch': 1.13}
{'loss': 1.3254, 'learning_rate': 1.4206980824664747e-05, 'epoch': 1.16}
{'loss': 1.3049, 'learning_rate': 1.4050319588920917e-05, 'epoch': 1.19}
{'loss': 1.3084, 'learning_rate': 1.3893658353177091e-05, 'epoch': 1.22}
{'loss': 1.3093, 'learning_rate': 1.3736997117433264e-05, 'epoch': 1.25}
{'loss': 1.3146, 'learning_rate': 1.3580649204160923e-05, 'epoch': 1.28}
{'loss': 1.3018, 'learning_rate': 1.3423987968417095e-05, 'epoch': 1.32}
{'loss': 1.30

  0%|          | 0/3990 [00:00<?, ?it/s]

{'eval_loss': 1.2573332786560059, 'eval_bleu': 33.7363, 'eval_gen_len': 15.1838, 'eval_runtime': 6916.3259, 'eval_samples_per_second': 9.229, 'eval_steps_per_second': 0.577, 'epoch': 2.0}
{'loss': 1.2763, 'learning_rate': 9.978380749467352e-06, 'epoch': 2.01}
{'loss': 1.2245, 'learning_rate': 9.821719513723526e-06, 'epoch': 2.04}
{'loss': 1.2209, 'learning_rate': 9.665058277979698e-06, 'epoch': 2.07}
{'loss': 1.2109, 'learning_rate': 9.508710364707357e-06, 'epoch': 2.1}
{'loss': 1.214, 'learning_rate': 9.35204912896353e-06, 'epoch': 2.13}
{'loss': 1.214, 'learning_rate': 9.195387893219704e-06, 'epoch': 2.16}
{'loss': 1.2291, 'learning_rate': 9.038726657475874e-06, 'epoch': 2.19}
{'loss': 1.2159, 'learning_rate': 8.882065421732048e-06, 'epoch': 2.22}
{'loss': 1.2248, 'learning_rate': 8.725717508459707e-06, 'epoch': 2.26}
{'loss': 1.2237, 'learning_rate': 8.56905627271588e-06, 'epoch': 2.29}
{'loss': 1.2102, 'learning_rate': 8.412395036972052e-06, 'epoch': 2.32}
{'loss': 1.2158, 'learnin

  0%|          | 0/3990 [00:00<?, ?it/s]

{'eval_loss': 1.2376044988632202, 'eval_bleu': 34.1361, 'eval_gen_len': 15.1473, 'eval_runtime': 6921.0957, 'eval_samples_per_second': 9.222, 'eval_steps_per_second': 0.576, 'epoch': 3.0}
{'loss': 1.1857, 'learning_rate': 4.967101140493797e-06, 'epoch': 3.01}
{'loss': 1.1594, 'learning_rate': 4.8104399047499685e-06, 'epoch': 3.04}
{'loss': 1.1528, 'learning_rate': 4.653778669006142e-06, 'epoch': 3.07}
{'loss': 1.1554, 'learning_rate': 4.497117433262314e-06, 'epoch': 3.1}
{'loss': 1.1547, 'learning_rate': 4.340456197518486e-06, 'epoch': 3.13}
{'loss': 1.1543, 'learning_rate': 4.1837949617746585e-06, 'epoch': 3.16}
{'loss': 1.1514, 'learning_rate': 4.027133726030832e-06, 'epoch': 3.2}
{'loss': 1.147, 'learning_rate': 3.870472490287004e-06, 'epoch': 3.23}
{'loss': 1.1811, 'learning_rate': 3.713811254543176e-06, 'epoch': 3.26}
{'loss': 1.1619, 'learning_rate': 3.5571500187993486e-06, 'epoch': 3.29}
{'loss': 1.1492, 'learning_rate': 3.4004887830555213e-06, 'epoch': 3.32}
{'loss': 1.1531, 'l

  0%|          | 0/3990 [00:00<?, ?it/s]

{'eval_loss': 1.2308039665222168, 'eval_bleu': 34.3492, 'eval_gen_len': 15.1585, 'eval_runtime': 6708.1919, 'eval_samples_per_second': 9.515, 'eval_steps_per_second': 0.595, 'epoch': 4.0}
{'train_runtime': 39750.9662, 'train_samples_per_second': 25.691, 'train_steps_per_second': 1.606, 'train_loss': 1.2919149440636775, 'epoch': 4.0}


TrainOutput(global_step=63832, training_loss=1.2919149440636775, metrics={'train_runtime': 39750.9662, 'train_samples_per_second': 25.691, 'train_steps_per_second': 1.606, 'train_loss': 1.2919149440636775, 'epoch': 4.0})

### Loading the model checkpoint and generating a detoxified sentence
    The cells below load the saved checkpoint, encode one toxic sentence the same way as the training data, and decode the generated output.

In [14]:
# Load the fine-tuned model from the final checkpoint directory.
final_checkpoint = "../models/Bart-base-4096_detox/final_checkpoint/"
test_model = BartForConditionalGeneration.from_pretrained(final_checkpoint)

In [15]:
# Example toxic sentence, prepared the same way as the training data:
# lower-cased, split on whitespace and passed to the tokenizer as pre-split words.
toxic_sentence = "Do you want to know how stupid and ugly you are?"
encoded = tokenizer(
    toxic_sentence.lower().split(),
    max_length=max_len,
    truncation=True,
    is_split_into_words=True,
    return_tensors="pt",
)
input_ids = encoded.input_ids

# Generate the rewritten sentence, allowing at most `max_len + 5` new tokens.
outputs = test_model.generate(input_ids, max_new_tokens=max_len + 5)

In [16]:
# Turn the first generated sequence back into text, leaving out special tokens.
detoxified = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(detoxified)

 do you want to know how bad you are?
