# Third Hypothesis: Style transfer using t5-small

In [1]:
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from datasets import Dataset
import torch

import sys
sys.path.append('..')
from src.models.predict import detoxificate_style_transfer, PREFIX
from src.models.train import train

import warnings
warnings.filterwarnings('ignore')

RANDOM_SEED = 1337
torch.manual_seed(RANDOM_SEED)

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<torch._C.Generator at 0x2c360625890>

### Loading t5-small model

In [2]:
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

### Creating dataset

In [3]:
df = pd.read_csv('../data/interim/train.csv')
toxic_sentences = df['reference'].tolist()
non_toxic_sentences = df['translation'].tolist()

dataset = Dataset.from_dict({"text": toxic_sentences, "labels": non_toxic_sentences})

In [4]:
MAX_LEN = 128

def group_texts(examples):
    inputs = [PREFIX + ex for ex in examples['text']]
    target = [ex for ex in examples['labels']]

    batch = tokenizer(inputs, padding='max_length', max_length=MAX_LEN, truncation=True, return_tensors='pt')
    batch["labels"] = tokenizer(target, padding='max_length', max_length=MAX_LEN, truncation=True, return_tensors='pt').input_ids

    return batch

dataset = dataset.map(group_texts, batched=True)

Map:   0%|          | 0/97006 [00:00<?, ? examples/s]

In [5]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))

### Training using Hugging Face Trainer 

In [6]:
train('seq2seq', 
      model, 
      tokenizer, 
      train_dataset, 
      val_dataset, 
      data_collator,
      batch_size=16, 
      epochs=5,
      seed=RANDOM_SEED
)

  0%|          | 0/27285 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.8124, 'learning_rate': 1.9633498259116732e-05, 'epoch': 0.09}
{'loss': 0.2114, 'learning_rate': 1.9266996518233462e-05, 'epoch': 0.18}
{'loss': 0.1863, 'learning_rate': 1.8900494777350193e-05, 'epoch': 0.27}
{'loss': 0.1781, 'learning_rate': 1.8533993036466923e-05, 'epoch': 0.37}
{'loss': 0.1705, 'learning_rate': 1.8167491295583654e-05, 'epoch': 0.46}
{'loss': 0.1667, 'learning_rate': 1.7800989554700384e-05, 'epoch': 0.55}
{'loss': 0.168, 'learning_rate': 1.7434487813817118e-05, 'epoch': 0.64}
{'loss': 0.1655, 'learning_rate': 1.7067986072933848e-05, 'epoch': 0.73}
{'loss': 0.1668, 'learning_rate': 1.670148433205058e-05, 'epoch': 0.82}
{'loss': 0.1589, 'learning_rate': 1.633498259116731e-05, 'epoch': 0.92}


  0%|          | 0/607 [00:00<?, ?it/s]

{'eval_loss': 0.1460123360157013, 'eval_runtime': 79.3667, 'eval_samples_per_second': 122.23, 'eval_steps_per_second': 7.648, 'epoch': 1.0}
{'loss': 0.1607, 'learning_rate': 1.596848085028404e-05, 'epoch': 1.01}
{'loss': 0.1586, 'learning_rate': 1.560197910940077e-05, 'epoch': 1.1}
{'loss': 0.1564, 'learning_rate': 1.5235477368517502e-05, 'epoch': 1.19}
{'loss': 0.1561, 'learning_rate': 1.4868975627634232e-05, 'epoch': 1.28}
{'loss': 0.1577, 'learning_rate': 1.4502473886750962e-05, 'epoch': 1.37}
{'loss': 0.1562, 'learning_rate': 1.4135972145867693e-05, 'epoch': 1.47}
{'loss': 0.1534, 'learning_rate': 1.3769470404984425e-05, 'epoch': 1.56}
{'loss': 0.1532, 'learning_rate': 1.3402968664101155e-05, 'epoch': 1.65}
{'loss': 0.1563, 'learning_rate': 1.3036466923217886e-05, 'epoch': 1.74}
{'loss': 0.1518, 'learning_rate': 1.2669965182334618e-05, 'epoch': 1.83}
{'loss': 0.152, 'learning_rate': 1.2303463441451348e-05, 'epoch': 1.92}


  0%|          | 0/607 [00:00<?, ?it/s]

{'eval_loss': 0.13942165672779083, 'eval_runtime': 79.1137, 'eval_samples_per_second': 122.621, 'eval_steps_per_second': 7.673, 'epoch': 2.0}
{'loss': 0.1525, 'learning_rate': 1.1936961700568079e-05, 'epoch': 2.02}
{'loss': 0.1502, 'learning_rate': 1.157045995968481e-05, 'epoch': 2.11}
{'loss': 0.1491, 'learning_rate': 1.1203958218801541e-05, 'epoch': 2.2}
{'loss': 0.1486, 'learning_rate': 1.0837456477918271e-05, 'epoch': 2.29}
{'loss': 0.1488, 'learning_rate': 1.0470954737035002e-05, 'epoch': 2.38}
{'loss': 0.1503, 'learning_rate': 1.0104452996151732e-05, 'epoch': 2.47}
{'loss': 0.1471, 'learning_rate': 9.737951255268463e-06, 'epoch': 2.57}
{'loss': 0.1476, 'learning_rate': 9.371449514385193e-06, 'epoch': 2.66}
{'loss': 0.15, 'learning_rate': 9.004947773501925e-06, 'epoch': 2.75}
{'loss': 0.1525, 'learning_rate': 8.638446032618655e-06, 'epoch': 2.84}
{'loss': 0.1494, 'learning_rate': 8.271944291735386e-06, 'epoch': 2.93}


  0%|          | 0/607 [00:00<?, ?it/s]

{'eval_loss': 0.1365615874528885, 'eval_runtime': 78.949, 'eval_samples_per_second': 122.877, 'eval_steps_per_second': 7.689, 'epoch': 3.0}
{'loss': 0.1476, 'learning_rate': 7.905442550852118e-06, 'epoch': 3.02}
{'loss': 0.1507, 'learning_rate': 7.538940809968847e-06, 'epoch': 3.12}
{'loss': 0.1468, 'learning_rate': 7.172439069085579e-06, 'epoch': 3.21}
{'loss': 0.147, 'learning_rate': 6.80593732820231e-06, 'epoch': 3.3}
{'loss': 0.1454, 'learning_rate': 6.43943558731904e-06, 'epoch': 3.39}
{'loss': 0.1478, 'learning_rate': 6.0729338464357714e-06, 'epoch': 3.48}
{'loss': 0.1463, 'learning_rate': 5.706432105552502e-06, 'epoch': 3.57}
{'loss': 0.1456, 'learning_rate': 5.339930364669232e-06, 'epoch': 3.67}
{'loss': 0.1486, 'learning_rate': 4.973428623785963e-06, 'epoch': 3.76}
{'loss': 0.1465, 'learning_rate': 4.606926882902694e-06, 'epoch': 3.85}
{'loss': 0.1457, 'learning_rate': 4.240425142019425e-06, 'epoch': 3.94}


  0%|          | 0/607 [00:00<?, ?it/s]

{'eval_loss': 0.1352386772632599, 'eval_runtime': 79.053, 'eval_samples_per_second': 122.715, 'eval_steps_per_second': 7.678, 'epoch': 4.0}
{'loss': 0.1427, 'learning_rate': 3.873923401136156e-06, 'epoch': 4.03}
{'loss': 0.1461, 'learning_rate': 3.5074216602528866e-06, 'epoch': 4.12}
{'loss': 0.1455, 'learning_rate': 3.140919919369617e-06, 'epoch': 4.21}
{'loss': 0.1456, 'learning_rate': 2.7744181784863483e-06, 'epoch': 4.31}
{'loss': 0.1442, 'learning_rate': 2.407916437603079e-06, 'epoch': 4.4}
{'loss': 0.1465, 'learning_rate': 2.0414146967198094e-06, 'epoch': 4.49}
{'loss': 0.1491, 'learning_rate': 1.6749129558365405e-06, 'epoch': 4.58}
{'loss': 0.1438, 'learning_rate': 1.308411214953271e-06, 'epoch': 4.67}
{'loss': 0.1432, 'learning_rate': 9.419094740700019e-07, 'epoch': 4.76}
{'loss': 0.1454, 'learning_rate': 5.754077331867327e-07, 'epoch': 4.86}
{'loss': 0.1455, 'learning_rate': 2.0890599230346348e-07, 'epoch': 4.95}


  0%|          | 0/607 [00:00<?, ?it/s]

{'eval_loss': 0.1347780078649521, 'eval_runtime': 78.979, 'eval_samples_per_second': 122.83, 'eval_steps_per_second': 7.686, 'epoch': 5.0}
{'train_runtime': 10779.71, 'train_samples_per_second': 40.495, 'train_steps_per_second': 2.531, 'train_loss': 0.16572755782491058, 'epoch': 5.0}


### Some examples from the test dataset

In [7]:
best_model = AutoModelForSeq2SeqLM.from_pretrained("../models/detoxificator")
tokenizer = AutoTokenizer.from_pretrained("../models/detoxificator")

test_toxic_sentences = pd.read_csv('../data/interim/test.csv')['reference'].to_list()[:10]

detoxified = detoxificate_style_transfer(test_toxic_sentences, best_model, tokenizer)

for sentence, detoxified_sentence in zip(test_toxic_sentences, detoxified):
    print(f'Original: {sentence}')
    print(f'Detoxified: {detoxified_sentence}')
    print()

Original: Hurt you and I want to find out who that was.
Detoxified: you and I want to find out who that was.

Original: I can't believe we haven't fucked for two years, nine months, three weeks and... 69 hours.
Detoxified: I can't believe we haven't slept for two years, nine months, three weeks and... 69 hours.

Original: So forgive me for being a little fidgety, but if it's lunch we're talking, I'm gonna eat a fat pork sandwich, and I'm sure as shit not gonna eat it here.
Detoxified: so forgive me for being a little bit of a snitch, but if it's lunch we're talking, I'll eat a fat pork sandwich, and I'm sure I'm not gonna eat it here.

Original: There is no fucking soy milk!
Detoxified: there's no soy milk!

Original: What's wrong with people having sex?
Detoxified: what's wrong with people having sex?

Original: What the fuck are you talking about?
Detoxified: what are you talking about?

Original: There were fewer horses that needed stealing than men who needed killing, after all.
De