In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m89.9 MB/s[0m eta [36m0:00:00[0m
Co

In [None]:
import pandas as pd
# Read the tab-separated corpus into a DataFrame.
data = pd.read_csv('interim.tsv', delimiter='\t')

# Plain-text preview of the first rows.
print(data.head())

   Unnamed: 0                                          reference  \
0           0  if Alkar floods her with her mental waste, it ...   
1           1                        you're becoming disgusting.   
2           2                      well, we can spare your life.   
3           3                       monkey, you have to wake up.   
4           4                         I have orders to kill her.   

                                         translation  similarity  lenght_diff  \
0  If Alkar is flooding her with psychic waste, t...    0.785171     0.010309   
1                          Now you're getting nasty.    0.749687     0.071429   
2           Well, we could spare your life, for one.    0.919051     0.268293   
3          Ah! Monkey, you've got to snap out of it.    0.664333     0.309524   
4                   I've got orders to put her down.    0.726639     0.181818   

    ref_tox   trn_tox  
0  0.981983  0.014195  
1  0.999039  0.065473  
2  0.985068  0.213313  
3  0.994

In [None]:
import pandas as pd
from sklearn.utils import shuffle
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch

Reading the input dataset

In [None]:
# Source sentences come from the `reference` column and target sentences from
# the `translation` column. Column-wise `.tolist()` builds the same Python
# lists as a row-by-row `iterrows()` loop, without creating a Series per row.
df_train_toxic = data['reference'].tolist()
df_train_neutral = data['translation'].tolist()

In [None]:
# Pair every source sentence with its target sentence in a two-column frame.
df = pd.DataFrame({
    'toxic_comment': df_train_toxic,
    'neutral_comment': df_train_neutral
})

# Shuffle with a fixed seed. The later seeded train/validation split selects
# rows by position, so it is only reproducible if this row order is too.
df = shuffle(df, random_state=42)

Preparing data structures for training

In [None]:
class PairsDataset(torch.utils.data.Dataset):
    """Dataset of aligned (source, target) tokenized examples.

    `x` and `y` are mappings from field name to a per-example sequence; both
    must provide `'input_ids'`, and `y` must also provide `'attention_mask'`.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    @property
    def n(self):
        """Number of examples, measured on the source side."""
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        """Return one example: every source field plus the target ids/mask.

        The target `input_ids` are exposed as `labels` and the target
        `attention_mask` as `decoder_attention_mask`.
        """
        assert idx < self.n
        sample = {name: column[idx] for name, column in self.x.items()}
        sample.update(
            decoder_attention_mask=self.y['attention_mask'][idx],
            labels=self.y['input_ids'][idx],
        )
        return sample

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple
from sklearn.model_selection import train_test_split
import gc
from tqdm.auto import tqdm, trange

In [None]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    """Collate a list of examples into one padded batch of tensors.

    The tokenizer's `pad` method is called twice: once on the examples as
    given, and once on the `labels` / `decoder_attention_mask` fields
    (presented to the tokenizer as `input_ids` / `attention_mask`) so that the
    target side is padded as well.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # First pass over the examples as given.
        padded = self.tokenizer.pad(features, padding=True)

        # Second pass: re-key the target fields so the tokenizer pads them too.
        target_fields = {
            'input_ids': padded['labels'],
            'attention_mask': padded['decoder_attention_mask'],
        }
        padded_targets = self.tokenizer.pad(target_fields, padding=True)
        padded['labels'] = padded_targets['input_ids']
        padded['decoder_attention_mask'] = padded_targets['attention_mask']

        tensors = {}
        for name, values in padded.items():
            tensors[name] = torch.tensor(values)
        return tensors

In [None]:
def cleanup():
    """Run the Python garbage collector, then release PyTorch's cached CUDA memory."""
    # Collect first so that memory held by unreachable objects can be released
    # by the cache flush that follows.
    gc.collect()
    torch.cuda.empty_cache()

# Free whatever earlier cells may have left behind.
cleanup()

In [None]:
def evaluate_model(model, test_dataloader):
    """Return the example-weighted mean loss of `model` over `test_dataloader`.

    Args:
        model: callable accepting the batch fields as keyword arguments and
            returning an object with a scalar `.loss`; must expose `.device`.
        test_dataloader: iterable of dict batches mapping field name to a
            tensor whose first dimension is the batch size.

    Returns:
        The mean of the per-batch losses weighted by batch size, or NaN when
        the loader yields no examples.

    Label positions equal to 0 are replaced by -100 before the forward pass,
    the same masking the training loop applies, so that padded target
    positions do not count towards the loss and the value is comparable to
    the training loss. The batch supplied by the loader is not modified.
    """
    weighted_loss = 0.0
    n_examples = 0

    for batch in test_dataloader:
        inputs = {k: v.to(model.device) for k, v in batch.items()}
        if 'labels' in inputs:
            # Out-of-place masking keeps the caller's tensors untouched.
            inputs['labels'] = inputs['labels'].masked_fill(inputs['labels'] == 0, -100)

        # `len(batch)` would count dict keys; the number of examples is the
        # first dimension of any field.
        batch_size = len(next(iter(batch.values())))

        with torch.no_grad():
            loss = model(**inputs).loss
        weighted_loss += batch_size * loss.item()
        n_examples += batch_size

    if n_examples == 0:
        return float('nan')
    return weighted_loss / n_examples

Defining the training loop

In [None]:
def train_loop(
    model, train_dataloader, val_dataloader,
    max_epochs=30,
    max_steps=1_000,
    lr=3e-5,
    gradient_accumulation_steps=1,
    cleanup_step=100,
    report_step=300,
    window=100,
    save_step=1000,
    save_path=None,
):
    """Train `model` in place with Adam on `train_dataloader`.

    Args:
        model: model whose forward call accepts the batch fields as keyword
            arguments and returns an object with `.loss`; must expose
            `.device`, `.parameters()`, `.train()`, `.eval()` and
            `.save_pretrained()`.
        train_dataloader: iterable of dict batches containing a `'labels'`
            tensor.
        val_dataloader: loader passed to `evaluate_model` for periodic
            reports, or None to disable validation.
        max_epochs: upper bound on passes over `train_dataloader`.
        max_steps: upper bound on optimizer steps; training stops as soon as
            it is reached.
        lr: Adam learning rate.
        gradient_accumulation_steps: number of batches whose gradients are
            accumulated before each optimizer step.
        cleanup_step: call `cleanup()` every this many batches.
        report_step: report train/validation loss every this many batches
            (and on the last batch of each epoch).
        window: length of the running-average window for the training loss.
        save_step: write a checkpoint every this many optimizer steps.
        save_path: directory passed to `model.save_pretrained` for periodic
            checkpoints. When None (the default) no periodic checkpoint is
            written; the caller is expected to save the final model.

    Returns:
        None. The model is updated in place.
    """
    cleanup()
    optimizer = torch.optim.Adam(params=[p for p in model.parameters() if p.requires_grad], lr=lr)

    ewm_loss = 0
    step = 0
    model.train()

    for epoch in trange(max_epochs):
        print(step, max_steps)
        if step >= max_steps:
            break
        tq = tqdm(train_dataloader)
        for i, batch in enumerate(tq):
            try:
                # Label id 0 is treated as padding and replaced by -100 so
                # that those positions are excluded from the loss.
                batch['labels'][batch['labels'] == 0] = -100
                loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
                loss.backward()
            except Exception as e:
                # Best effort: report the failing batch, free memory, move on.
                print('error on step', i, e)
                loss = None
                cleanup()
                continue

            # Count batches from 1 so that the first window holds exactly
            # `gradient_accumulation_steps` batches (batch 0 included).
            if (i + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                step += 1
                # Checkpoint only right after an optimizer step, so the same
                # step is never saved twice and step 0 is never saved.
                if save_path is not None and save_step and step % save_step == 0:
                    model.save_pretrained(save_path)
                if step >= max_steps:
                    break

            if i % cleanup_step == 0:
                cleanup()

            # Cumulative mean over the first `window` batches of the epoch,
            # exponential moving average with weight 1/window afterwards.
            w = 1 / min(i+1, window)
            ewm_loss = ewm_loss * (1-w) + loss.item() * w
            tq.set_description(f'loss: {ewm_loss:4.4f}')

            if (i and i % report_step == 0 or i == len(train_dataloader)-1)  and val_dataloader is not None:
                model.eval()
                eval_loss = evaluate_model(model, val_dataloader)
                model.train()
                print(f'epoch {epoch}, step {i}/{step}: train loss: {ewm_loss:4.4f}  val loss: {eval_loss:4.4f}')

    cleanup()

In [None]:
import torch

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(x, y, model_name, test_size=0.1, batch_size=32, **kwargs):
    """Load a pretrained T5 model, train it on the (x, y) text pairs and return it.

    Args:
        x: source texts.
        y: target texts, aligned with `x`.
        model_name: identifier passed to `from_pretrained` for both the model
            and the tokenizer.
        test_size: share of the pairs held out for validation.
        batch_size: batch size of both data loaders.
        **kwargs: forwarded unchanged to `train_loop`.

    Returns:
        The trained model, placed on the module-level `device`.
    """
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Seeded split, so the same inputs always give the same partition.
    train_src, val_src, train_tgt, val_tgt = train_test_split(
        x, y, test_size=test_size, random_state=42
    )
    train_dataset = PairsDataset(tokenizer(train_src), tokenizer(train_tgt))
    test_dataset = PairsDataset(tokenizer(val_src), tokenizer(val_tgt))

    # Both loaders share the same settings and collator.
    loader_options = dict(
        batch_size=batch_size,
        drop_last=False,
        shuffle=True,
        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    train_dataloader = DataLoader(train_dataset, **loader_options)
    val_dataloader = DataLoader(test_dataset, **loader_options)

    train_loop(model, train_dataloader, val_dataloader, **kwargs)
    return model

Defining the type of the model

In [None]:
# Pretrained checkpoint identifier; `train_model` passes it to `from_pretrained`
# for both the model and the tokenizer.
model_name = 't5-small'

In [None]:
# Collect garbage and flush the CUDA cache before training starts.
cleanup()

In [None]:
# Maps a dataset name to its DataFrame. The training cell iterates over this
# dict and uses the name in the saved checkpoint path.
datasets = {
    'train': df
}

In [None]:
# Sanity check: show the dataset names and a truncated view of each frame.
print(datasets)

{'train':                                             toxic_comment  \
253661  There is nothing weak-minded or degenerate abo...   
365083  I just-- I've gotten concerned over what's gon...   
338456                         You're a dangerous person.   
54149   be assured that we will visit your house, find...   
211427  I don't want to kick your ass when you're on t...   
...                                                   ...   
384478  That idiotic student of Wong Fei Hung Butcher ...   
130563         I am not going anywhere with you lunatics.   
305329                   # Why can't a woman be a chum? #   
278398               Give my head a good scratch, please?   
62212   Even if he doesn't trigger them, the damn thin...   

                                          neutral_comment  
253661  there is no weakness or memory loss to Miss Ho...  
365083  I just... I have to keep thinking about what w...  
338456                                  you're dangerous.  
54149   Be sure, 

Training the model! The number of training steps is a hyperparameter: edit the list of `steps` values in the loop below to train with different step budgets.

In [None]:
# Train one fresh model per (step budget, dataset) pair and save each result.
# NOTE: `dname` and `steps` are module-level names; check `train_loop` for uses
# of them before renaming.
for steps in [300, 1000, 10000]:
    for dname, d in datasets.items():
        print(f'\n\n\n  {dname}  {steps} \n=====================\n\n')
        toxic_texts = d['toxic_comment'].tolist()
        neutral_texts = d['neutral_comment'].tolist()
        model = train_model(
            toxic_texts,
            neutral_texts,
            model_name=model_name,
            batch_size=16,
            max_epochs=1000,
            max_steps=steps,
        )
        model.save_pretrained(f't5_base_{dname}_{steps}')




  train  300 




Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

0 300


  0%|          | 0/26978 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


300 300



  train  1000 




  0%|          | 0/1000 [00:00<?, ?it/s]

0 1000


  0%|          | 0/26978 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 0, step 300/300: train loss: 2.4735  val loss: 10.1542
epoch 0, step 600/600: train loss: 2.3255  val loss: 10.6974
epoch 0, step 900/900: train loss: 2.2681  val loss: 11.0002
1000 1000



  train  10000 




  0%|          | 0/1000 [00:00<?, ?it/s]

0 10000


  0%|          | 0/26978 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 0, step 300/300: train loss: 2.5019  val loss: 10.0181
epoch 0, step 600/600: train loss: 2.3352  val loss: 10.6248
epoch 0, step 900/900: train loss: 2.2946  val loss: 10.8993
epoch 0, step 1200/1200: train loss: 2.2297  val loss: 11.2519
epoch 0, step 1500/1500: train loss: 2.1949  val loss: 11.3543
epoch 0, step 1800/1800: train loss: 2.1809  val loss: 11.5794
epoch 0, step 2100/2100: train loss: 2.1968  val loss: 11.5166
epoch 0, step 2400/2400: train loss: 2.1856  val loss: 11.5636
epoch 0, step 2700/2700: train loss: 2.1577  val loss: 11.6637
epoch 0, step 3000/3000: train loss: 2.1325  val loss: 11.7274
epoch 0, step 3300/3300: train loss: 2.1234  val loss: 11.8827
epoch 0, step 3600/3600: train loss: 2.1151  val loss: 11.9466
epoch 0, step 3900/3900: train loss: 2.1081  val loss: 12.1018
epoch 0, step 4200/4200: train loss: 2.0991  val loss: 12.1669
epoch 0, step 4500/4500: train loss: 2.0578  val loss: 12.2961
epoch 0, step 4800/4800: train loss: 2.0517  val loss: 12.576