# Second Hypothesis: Mask toxic words using classifier and then use MaskedLM to find appropriate alternatives

In [1]:
import pandas as pd
from tqdm import tqdm
from transformers import BertForMaskedLM, BertTokenizer, DataCollatorForLanguageModeling
from datasets import Dataset
import torch
import torch.nn as nn
from torch.utils.data import Dataset as TorchDataset, DataLoader

import sys
sys.path.append('..')
from src.data.preprocess import put_mask_with_classifier, get_toxicity
from src.models.predict import detoxificate_text_with_classifier
from src.models.train import train, train_classifier, evaluate_classifier
from src.models.classifier import ToxicWordsClassifier

import warnings
warnings.filterwarnings('ignore')

RANDOM_SEED = 1337
torch.manual_seed(RANDOM_SEED)

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<torch._C.Generator at 0x1e11dcde430>

### Loading bert-base-uncased model for MaskedLM 

In [2]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Creating dataset class for the classifier

In [17]:
class ToxicWordsDataset(TorchDataset):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

        positive_words = open('../data/interim/positive_words.txt').read().split('\n')
        toxic_words = open('../data/interim/toxic_words.txt').read().split('\n')

        toxic_words = [w for w in toxic_words if w.isalnum() and len(w) > 1]
        positive_words = [w for w in positive_words if w.isalnum() and len(w) > 1]

        self.texts = []
        self.labels = []

        for w in tqdm(toxic_words):
            word = self.tokenizer(w, add_special_tokens=False, max_length=1, truncation=True).input_ids
            self.texts.append(word[0])
            self.labels.append(1)

        for w in tqdm(positive_words):
            word = self.tokenizer(w, add_special_tokens=False, max_length=1, truncation=True).input_ids
            self.texts.append(word[0])
            self.labels.append(0)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]
    
dataset = ToxicWordsDataset(tokenizer)

100%|██████████| 5017/5017 [00:00<00:00, 7624.08it/s]
100%|██████████| 1904/1904 [00:00<00:00, 5987.00it/s]


### Splitting the classifier data into train and validation

In [18]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True)

In [25]:
toxicity_classifier = ToxicWordsClassifier(vocab_size=dataset.tokenizer.vocab_size, embedding_dim=512, dropout=0.3)

### Training classifier and saving the best one

In [26]:
EPOCHS = 20

optimizer = torch.optim.Adam(toxicity_classifier.parameters(), lr=1e-3)
criterion = nn.BCELoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

best_loss = 1e9

for epoch in range(EPOCHS):
    train_classifier(epoch, toxicity_classifier, optimizer, criterion, train_dataloader, device)
    loss = evaluate_classifier(epoch, toxicity_classifier, criterion, val_dataloader, device)
    if loss < best_loss:
        best_loss = loss
        torch.save(toxicity_classifier.state_dict(), '../models/toxicity_classifier.pth')

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch: 0, Loss: 0.57610, Acc: 0.46808: 100%|██████████| 195/195 [00:02<00:00, 86.62it/s]
	Epoch: 0, Loss: 0.57708, Acc: 0.49330: 100%|██████████| 22/22 [00:00<00:00, 205.59it/s]
Epoch: 1, Loss: 0.49233, Acc: 0.60946: 100%|██████████| 195/195 [00:02<00:00, 96.34it/s]
	Epoch: 1, Loss: 0.56154, Acc: 0.60768: 100%|██████████| 22/22 [00:00<00:00, 265.05it/s]
Epoch: 2, Loss: 0.42995, Acc: 0.69635: 100%|██████████| 195/195 [00:01<00:00, 97.93it/s] 
	Epoch: 2, Loss: 0.54478, Acc: 0.64969: 100%|██████████| 22/22 [00:00<00:00, 293.31it/s]
Epoch: 3, Loss: 0.36888, Acc: 0.75769: 100%|██████████| 195/195 [00:01<00:00, 98.28it/s] 
	Epoch: 3, Loss: 0.54644, Acc: 0.64678: 100%|██████████| 22/22 [00:00<00:00, 265.04it/s]
Epoch: 4, Loss: 0.32649, Acc: 0.79827: 100%|██████████| 195/195 [00:02<00:00, 95.16it/s] 
	Epoch: 4, Loss: 0.56609, Acc: 0.65165: 100%|██████████| 22/22 [00:00<00:00, 309.83it/s]
Epoch: 5, Loss: 0.28430, Acc: 0.83304: 100%|██████████| 195/195 [00:02<00:00, 96.86it/s]
	Epoch: 5, Loss: 0

### Some test to evaluate the performance of the classifier

In [27]:
toxic_word = 'buttcheeks'
non_toxic_word = 'university'

toxicity_classifier.load_state_dict(torch.load('../models/toxicity_classifier.pth'))

print(f'Word: {toxic_word}, Toxicity: {get_toxicity(toxic_word, tokenizer, toxicity_classifier)}')
print(f'Word: {non_toxic_word}, Toxicity: {get_toxicity(non_toxic_word, tokenizer, toxicity_classifier)}')

Word: buttcheeks, Toxicity: 0.9748811721801758
Word: university, Toxicity: 0.48354899883270264


### Creating dataset for MaskedLM

In [8]:
df = pd.read_csv('../data/interim/train.csv')
toxic_sentences = df['reference'].tolist()
non_toxic_sentences = df['translation'].tolist()

data = []
labels = []

for i in tqdm(range(len(toxic_sentences))):
    toxic_sentences[i] = put_mask_with_classifier(toxic_sentences[i], tokenizer, toxicity_classifier)
    if '[MASK]' in toxic_sentences[i]:
        data.append(toxic_sentences[i])
        labels.append(non_toxic_sentences[i])

dataset = Dataset.from_dict({"text": data, "labels": labels})

100%|██████████| 97006/97006 [07:26<00:00, 217.06it/s]


In [9]:
MAX_LEN = 128

def group_texts(examples):
    inputs = [ex for ex in examples['text']]
    target = [ex for ex in examples['labels']]

    batch = tokenizer(inputs, padding='max_length', max_length=MAX_LEN, truncation=True, return_tensors='pt')
    batch["labels"] = tokenizer(target, padding='max_length', max_length=MAX_LEN, truncation=True, return_tensors='pt').input_ids

    return batch

dataset = dataset.map(group_texts, batched=True)

Map:   0%|          | 0/93618 [00:00<?, ? examples/s]

In [10]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))

### Training using Hugging Face Trainer

In [35]:
train('maskedlm_with_classifier', 
      model, 
      tokenizer, 
      train_dataset, 
      val_dataset, 
      data_collator,
      batch_size=16, 
      epochs=3,
      seed=RANDOM_SEED
)

  0%|          | 0/5526 [00:00<?, ?it/s]

{'loss': 3.2831, 'learning_rate': 1.819037278320666e-05, 'epoch': 0.09}
{'loss': 3.0637, 'learning_rate': 1.638074556641332e-05, 'epoch': 0.18}
{'loss': 3.0873, 'learning_rate': 1.4571118349619979e-05, 'epoch': 0.27}
{'loss': 2.9544, 'learning_rate': 1.2761491132826638e-05, 'epoch': 0.36}
{'loss': 2.9274, 'learning_rate': 1.0951863916033298e-05, 'epoch': 0.45}
{'loss': 2.9715, 'learning_rate': 9.142236699239957e-06, 'epoch': 0.54}
{'loss': 2.8911, 'learning_rate': 7.332609482446616e-06, 'epoch': 0.63}
{'loss': 2.8747, 'learning_rate': 5.5229822656532765e-06, 'epoch': 0.72}
{'loss': 2.8676, 'learning_rate': 3.7133550488599353e-06, 'epoch': 0.81}
{'loss': 2.8604, 'learning_rate': 1.9037278320665944e-06, 'epoch': 0.9}
{'loss': 2.8661, 'learning_rate': 9.410061527325373e-08, 'epoch': 1.0}


  0%|          | 0/614 [00:00<?, ?it/s]

{'eval_loss': 2.749858856201172, 'eval_runtime': 157.18, 'eval_samples_per_second': 62.495, 'eval_steps_per_second': 3.906, 'epoch': 1.0}
{'train_runtime': 3724.7467, 'train_samples_per_second': 23.735, 'train_steps_per_second': 1.484, 'train_loss': 2.9680513250107268, 'epoch': 1.0}


### Some examples from the test dataset

In [28]:
best_model = BertForMaskedLM.from_pretrained("../models/bert_maskedlm")
tokenizer = BertTokenizer.from_pretrained("../models/bert_maskedlm")

test_toxic_sentences = pd.read_csv('../data/interim/test.csv')['reference'].to_list()[:10]

detoxified = detoxificate_text_with_classifier(test_toxic_sentences, tokenizer, best_model, toxicity_classifier)

for sentence, detoxified_sentence in zip(test_toxic_sentences, detoxified):
    print(f'Original: {sentence}')
    print(f'Masked: {put_mask_with_classifier(sentence, tokenizer, toxicity_classifier)}')
    print(f'Detoxified: {detoxified_sentence}')
    print()

Original: Hurt you and I want to find out who that was.
Masked: [MASK] you and i [MASK] to find out who that [MASK]
Detoxified: and you and i need to find out who that.

Original: I can't believe we haven't fucked for two years, nine months, three weeks and... 69 hours.
Masked: i [MASK] believe [MASK] [MASK] fucked [MASK] two years, nine months, [MASK] weeks and... 69 hours.
Detoxified: i cannot believe i just fucked up two years, nine months, two weeks and... 69 hours.

Original: So forgive me for being a little fidgety, but if it's lunch we're talking, I'm gonna eat a fat pork sandwich, and I'm sure as shit not gonna eat it here.
Masked: so forgive me [MASK] [MASK] a little [MASK] but if [MASK] lunch [MASK] talking, i'm [MASK] eat a [MASK] [MASK] sandwich, and i'm [MASK] as [MASK] [MASK] [MASK] eat [MASK] here.
Detoxified: so forgive me to just a little. but if your lunch stop talking, i'm to eat a c ing sandwich, and i'm good as good as you eat right here.

Original: There is no fuc