# Third Hypothesis: Style transfer using t5-small

In [1]:
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from datasets import Dataset
import torch

import sys
sys.path.append('..')
from src.models.predict import detoxificate_style_transfer, PREFIX
from src.models.train import train

import warnings
warnings.filterwarnings('ignore')

RANDOM_SEED = 1337
torch.manual_seed(RANDOM_SEED)

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<torch._C.Generator at 0x1f475495890>

### Loading t5-small model

In [2]:
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

### Creating dataset

In [3]:
df = pd.read_csv('../data/interim/train.csv')
toxic_sentences = df['reference'].tolist()
non_toxic_sentences = df['translation'].tolist()

dataset = Dataset.from_dict({"text": toxic_sentences, "labels": non_toxic_sentences})

In [4]:
MAX_LEN = 128

def group_texts(examples):
    inputs = [PREFIX + ex for ex in examples['text']]
    target = [ex for ex in examples['labels']]

    batch = tokenizer(inputs, padding='max_length', max_length=MAX_LEN, truncation=True, return_tensors='pt')
    batch["labels"] = tokenizer(target, padding='max_length', max_length=MAX_LEN, truncation=True, return_tensors='pt').input_ids

    return batch

dataset = dataset.map(group_texts, batched=True)

Map:   0%|          | 0/101535 [00:00<?, ? examples/s]

In [5]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))

### Training using Hugging Face Trainer 

In [6]:
train('seq2seq', 
      model, 
      tokenizer, 
      train_dataset, 
      val_dataset, 
      data_collator,
      batch_size=16, 
      epochs=1,
      seed=RANDOM_SEED
)

  0%|          | 0/5712 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.9131, 'learning_rate': 1.8249299719887958e-05, 'epoch': 0.09}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.3491753041744232, 'eval_runtime': 82.8968, 'eval_samples_per_second': 122.49, 'eval_steps_per_second': 7.66, 'epoch': 0.09}
{'loss': 0.3268, 'learning_rate': 1.649859943977591e-05, 'epoch': 0.18}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.25028446316719055, 'eval_runtime': 83.8954, 'eval_samples_per_second': 121.032, 'eval_steps_per_second': 7.569, 'epoch': 0.18}
{'loss': 0.2768, 'learning_rate': 1.4747899159663868e-05, 'epoch': 0.26}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.24375779926776886, 'eval_runtime': 84.3467, 'eval_samples_per_second': 120.384, 'eval_steps_per_second': 7.528, 'epoch': 0.26}
{'loss': 0.2687, 'learning_rate': 1.2997198879551822e-05, 'epoch': 0.35}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.24030275642871857, 'eval_runtime': 84.4608, 'eval_samples_per_second': 120.222, 'eval_steps_per_second': 7.518, 'epoch': 0.35}
{'loss': 0.2645, 'learning_rate': 1.1246498599439776e-05, 'epoch': 0.44}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.2379130870103836, 'eval_runtime': 83.3083, 'eval_samples_per_second': 121.885, 'eval_steps_per_second': 7.622, 'epoch': 0.44}
{'loss': 0.2608, 'learning_rate': 9.49579831932773e-06, 'epoch': 0.53}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.2362850159406662, 'eval_runtime': 83.0681, 'eval_samples_per_second': 122.237, 'eval_steps_per_second': 7.644, 'epoch': 0.53}
{'loss': 0.2558, 'learning_rate': 7.745098039215687e-06, 'epoch': 0.61}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.23515287041664124, 'eval_runtime': 82.9637, 'eval_samples_per_second': 122.391, 'eval_steps_per_second': 7.654, 'epoch': 0.61}
{'loss': 0.2593, 'learning_rate': 5.994397759103642e-06, 'epoch': 0.7}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.23446246981620789, 'eval_runtime': 82.9348, 'eval_samples_per_second': 122.434, 'eval_steps_per_second': 7.657, 'epoch': 0.7}
{'loss': 0.2543, 'learning_rate': 4.243697478991597e-06, 'epoch': 0.79}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.23362945020198822, 'eval_runtime': 83.021, 'eval_samples_per_second': 122.306, 'eval_steps_per_second': 7.649, 'epoch': 0.79}
{'loss': 0.2559, 'learning_rate': 2.492997198879552e-06, 'epoch': 0.88}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.23317529261112213, 'eval_runtime': 83.0078, 'eval_samples_per_second': 122.326, 'eval_steps_per_second': 7.65, 'epoch': 0.88}
{'loss': 0.2549, 'learning_rate': 7.42296918767507e-07, 'epoch': 0.96}


  0%|          | 0/635 [00:00<?, ?it/s]

{'eval_loss': 0.23297300934791565, 'eval_runtime': 82.9367, 'eval_samples_per_second': 122.431, 'eval_steps_per_second': 7.656, 'epoch': 0.96}
{'train_runtime': 3121.7483, 'train_samples_per_second': 29.272, 'train_steps_per_second': 1.83, 'train_loss': 0.32360762641543434, 'epoch': 1.0}


### Some examples from the test dataset

In [2]:
best_model = AutoModelForSeq2SeqLM.from_pretrained("../models/detoxificator")
tokenizer = AutoTokenizer.from_pretrained("../models/detoxificator")

test_toxic_sentences = pd.read_csv('../data/interim/test.csv')['reference'].to_list()[:10]

detoxified = detoxificate_style_transfer(test_toxic_sentences, best_model, tokenizer)

for sentence, detoxified_sentence in zip(test_toxic_sentences, detoxified):
    print(f'Original: {sentence}')
    print(f'Detoxified: {detoxified_sentence}')
    print()

Original: Hurt you and I want to find out who that was.
Detoxified: you're gonna have to find out who's doing it.

Original: I can't believe we haven't fucked for two years, nine months, three weeks and... 69 hours.
Detoxified: I can't believe we've been a couple of years, nine months, three weeks and... 69 hours.

Original: So forgive me for being a little fidgety, but if it's lunch we're talking, I'm gonna eat a fat pork sandwich, and I'm sure as shit not gonna eat it here.
Detoxified: I'm sorry for being a little a little a bit a bit, but if it's lunch we're talking, I'll eat a pork sandwich, and I'm sure I'll not eat it here.

Original: There is no fucking soy milk!
Detoxified: there's no milk!

Original: What's wrong with people having sex?
Detoxified: what's wrong with people having sex?

Original: What the fuck are you talking about?
Detoxified: what are you talking about?

Original: There were fewer horses that needed stealing than men who needed killing, after all.
Detoxified: