In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
!wget https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip
!unzip filtered_paranmt.zip

In [None]:
import pandas as pd

In [None]:
# Read the tab-separated corpus into a DataFrame, report its dimensions and
# show ten randomly chosen rows as the cell output.
df = pd.read_csv(
    'filtered.tsv',
    sep='\t',
    encoding='utf-8',
)
print(df.shape)
df.sample(n=10)

In [None]:
# Fraction of rows whose `ref_tox` score is strictly greater than `trn_tox`.
df['ref_tox'].gt(df['trn_tox']).mean()

In [None]:
# Orient every pair so that `source` is the side with the higher toxicity score
# and `target` is the other side. Rows where `ref_tox` is not strictly greater
# than `trn_tox` (ties included) use the translation as the source.
# Vectorised with Series.where instead of a Python-level loop over all rows.
ref_is_more_toxic = df['ref_tox'] > df['trn_tox']

xydf = pd.DataFrame({
    'source': df['reference'].where(ref_is_more_toxic, df['translation']),
    'target': df['translation'].where(ref_is_more_toxic, df['reference']),
}).reset_index(drop=True)  # fresh 0..n-1 index, as when building from plain lists

In [None]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer, T5TokenizerFast,
    get_linear_schedule_with_warmup
)

In [None]:
# Identifier of the T5 paraphrase checkpoint referenced by later cells
# (presumably a Hugging Face Hub id of the form <user>/<model>).
t5Para = 'ceshine/t5-paraphrase-paws-msrp-opinosis'

In [None]:
# Load the fast T5 tokenizer for the paraphrase checkpoint named by `t5Para`
# (the only checkpoint identifier defined at this point in the notebook).
tokenizer = T5TokenizerFast.from_pretrained(t5Para)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 300 pairs for evaluation. The fixed seed makes the split identical
# after every kernel restart, so evaluation numbers stay comparable across runs.
df_train, df_test = train_test_split(xydf, test_size=300, random_state=42)
print(df_train.shape[0], df_test.shape[0])

In [None]:
%%time

x1 = tokenizer(df_train.source.tolist(), truncation=True)
y1 = tokenizer(df_train.target.tolist(), truncation=True)
x2 = tokenizer(df_test.source.tolist(), truncation=True)
y2 = tokenizer(df_test.target.tolist(), truncation=True)

In [None]:
import torch  # this cell runs before the notebook's later `import torch` cell


class PairsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenized source texts with tokenized targets.

    Args:
        x: tokenizer output for the sources; a mapping of feature name to
            per-example sequences that contains ``'input_ids'``.
        y: tokenizer output for the targets; must contain ``'input_ids'``
            and ``'attention_mask'``.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        """Return all source features of example ``idx`` plus its target.

        The target token ids are stored under ``'labels'`` and the target
        attention mask under ``'decoder_attention_mask'``.

        Raises:
            IndexError: if ``idx`` is at or beyond the number of examples.
                Raising IndexError (rather than asserting) lets plain
                ``for item in dataset`` iteration stop cleanly.
        """
        if idx >= self.n:
            raise IndexError(f'index {idx} is out of range for {self.n} examples')
        item = {key: val[idx] for key, val in self.x.items()}
        item['decoder_attention_mask'] = self.y['attention_mask'][idx]
        item['labels'] = self.y['input_ids'][idx]
        return item

    @property
    def n(self):
        """Number of examples, taken from the source ``input_ids``."""
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n
    
# Wrap the tokenized train/test splits and show both sizes as the cell output.
train_dataset, test_dataset = PairsDataset(x1, y1), PairsDataset(x2, y2)
tuple(len(ds) for ds in (train_dataset, test_dataset))

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
# NOTE(review): no `collate_fn` is passed, so PyTorch's default batching is used;
# confirm it can batch these items, which hold unpadded token lists of varying
# length (tokenization above uses truncation only).
# Training batches are shuffled and the last incomplete batch is dropped.
train_dataloader = DataLoader(train_dataset, batch_size=4, drop_last=True, shuffle=True, num_workers=1)
# Evaluation data is read in a fixed order and in full, so every held-out
# example is scored and results are repeatable.
test_dataloader = DataLoader(test_dataset, batch_size=4, drop_last=False, shuffle=False, num_workers=1)

In [None]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import torch

In [None]:
# Checkpoint identifier that the model-loading cell passes to `from_pretrained`.
# NOTE(review): this name differs from `t5Para` by the "-paranmt" suffix and the
# owner prefix - confirm that starting from this checkpoint is intended.
checkpoint_name = "SkolkovoInstitute/t5-paraphrase-paws-msrp-opinosis-paranmt"

In [None]:
# Instantiate the T5 conditional-generation model from the checkpoint named by
# `checkpoint_name`.
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

In [None]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs (slowly) on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Trailing ';' keeps the long module repr out of the cell output.
model.to(device);

In [None]:
import transformers
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple

class TrAr(TrainingArguments):
    """TrainingArguments variant whose device setup always yields the notebook's ``device``.

    NOTE(review): presumably this bypasses the parent's own device discovery so
    that training stays on the single device chosen earlier in the notebook.
    ``_setup_devices`` is a private hook of the parent class - confirm the
    override still has the intended effect with the installed transformers.
    """
    @cached_property
    def _setup_devices(self):
        # Hand back the module-level `device` instead of computing one.
        return device

In [None]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    """Pad a list of per-example feature dicts into one batch of tensors.

    Each feature dict is expected to carry the encoder inputs understood by
    ``tokenizer.pad`` plus ``'labels'`` (target token ids) and
    ``'decoder_attention_mask'`` (target attention mask), as produced by
    ``PairsDataset.__getitem__``.

    Args:
        tokenizer: object exposing ``pad(encoded_inputs, padding=True)``.
        label_pad_token_id: value written into padded label positions. The
            default ``-100`` is the index ignored by the cross-entropy loss,
            so padding does not contribute to the training or evaluation
            loss. Pass ``None`` to keep the tokenizer's own padding id in
            the labels.
    """

    def __init__(self, tokenizer, label_pad_token_id=-100):
        self.tokenizer = tokenizer
        self.label_pad_token_id = label_pad_token_id

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return a dict of equally sized tensors built from ``features``."""
        # First pass pads the encoder side; the target fields come back as
        # ragged per-example sequences.
        batch = self.tokenizer.pad(
            features,
            padding=True,
        )
        # Second pass pads the targets by presenting them to the tokenizer
        # under the key names it knows how to pad.
        ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
            padding=True,
        )
        labels = ybatch['input_ids']
        if self.label_pad_token_id is not None:
            # Positions whose attention mask is 0 are padding: replace them so
            # the loss skips them.
            labels = [
                [tok if keep else self.label_pad_token_id for tok, keep in zip(ids, mask)]
                for ids, mask in zip(labels, ybatch['attention_mask'])
            ]
        batch['labels'] = labels
        batch['decoder_attention_mask'] = ybatch['attention_mask']

        return {k: torch.tensor(v) for k, v in batch.items()}

In [None]:
# Directory that receives the trainer's checkpoints.
save_name = 'models/t5-cechine-nmt-mined-detox'

# Hyper-parameters gathered in one mapping, then handed to TrAr in a single call.
_training_kwargs = dict(
    # where to write, and permission to overwrite an existing run
    output_dir=save_name,
    overwrite_output_dir=True,
    # schedule: 3 epochs, 4 examples per step accumulated over 4 steps
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    # optimisation: linear warm-up for 300 steps, no weight decay
    warmup_steps=300,
    weight_decay=0,
    learning_rate=3e-5,
    # logging and evaluation both happen every 100 steps
    logging_dir='./logs',
    logging_steps=100,
    eval_steps=100,
    evaluation_strategy='steps',
    # checkpointing: every 5000 steps, keep at most one
    save_total_limit=1,
    save_steps=5000,
)
training_args = TrAr(**_training_kwargs)

In [None]:
# Collator that pads each batch (inputs and targets) with the shared tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Bundle model, hyper-parameters, data and batching logic into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Reclaim memory before training: run Python's garbage collector first, then
# ask PyTorch to release its cached CUDA memory. The trailing ';' suppresses
# any cell output.
import gc
gc.collect()
torch.cuda.empty_cache();

In [None]:
# Run fine-tuning with the settings held by `trainer`; the return value is
# shown as the cell output.
trainer.train()

In [None]:
# Evaluate on the `eval_dataset` given to the trainer and display the metrics.
trainer.evaluate()

In [None]:
# Put the model into evaluation mode before generating; ';' hides the repr.
model.eval();

In [None]:
# Persist the fine-tuned weights together with the tokenizer under the run's
# output directory, so the folder can be reloaded on its own.
# Do not save under `t5Para`: a local directory with that name would be picked
# up by later `from_pretrained(t5Para)` calls instead of the base checkpoint.
model.save_pretrained(save_name)
tokenizer.save_pretrained(save_name)

In [None]:
# Encode one example sentence and move every tensor to the model's device.
encoded = tokenizer('hey loser , try this get a fucking life and stay out of mine , which you know nothing about', return_tensors='pt')
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Beam search without sampling: 10 beams, and all 10 hypotheses are returned.
candidates = model.generate(**inputs, num_return_sequences=10, do_sample=False, num_beams=10)
for t in candidates:
    print(tokenizer.decode(t, skip_special_tokens=True))