In [1]:
import pandas as pd
from tqdm import tqdm
from transformers import BertForMaskedLM, BertTokenizer, DataCollatorForLanguageModeling
from datasets import Dataset
import torch

import sys
sys.path.append('..')
from src.data.preprocess import put_mask
from src.models.predict import detoxificate_text
from src.models.train import train

import warnings
warnings.filterwarnings('ignore')

# Single seed shared across the notebook: it is also passed to train() and to
# random.seed() in later cells.
RANDOM_SEED = 1337
# Seed torch's default RNG. manual_seed() returns the Generator object, which is
# why this cell echoes a `<torch._C.Generator ...>` repr as its output.
torch.manual_seed(RANDOM_SEED)

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<torch._C.Generator at 0x1e8eec4e410>

In [2]:
# Pretrained checkpoint name shared by the tokenizer and the masked-LM model below.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)
# Collator created with its default settings; it is handed to train() further down.
# NOTE(review): with defaults this collator is believed to apply its own random
# masking and to regenerate the "labels" field from the inputs, which would
# discard the "labels" column produced by group_texts — confirm against the
# transformers documentation and src/models/train.py.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Load the parallel corpus. The 'reference' column is used as the toxic side and
# the 'translation' column as its paired target sentence.
df = pd.read_csv('../data/interim/train.csv')
toxic_sentences = df['reference'].tolist()
non_toxic_sentences = df['translation'].tolist()

# Read the toxic-word list inside a context manager so the file handle is closed
# deterministically instead of being left for the garbage collector.
with open('../data/interim/toxic_words.txt') as toxic_words_file:
    toxic_words = toxic_words_file.read().split('\n')

# Masked inputs and their aligned targets; only sentences where put_mask actually
# inserted a '[MASK]' token are kept.
data = []
labels = []

for i, toxic_sentence in enumerate(tqdm(toxic_sentences)):
    masked_sentence = put_mask(toxic_sentence, toxic_words)
    # Keep updating the list in place, as this cell always has, so any later
    # code that reads `toxic_sentences` still sees the masked text.
    toxic_sentences[i] = masked_sentence
    if '[MASK]' in masked_sentence:
        data.append(masked_sentence)
        labels.append(non_toxic_sentences[i])

dataset = Dataset.from_dict({"text": data, "labels": labels})

100%|██████████| 101535/101535 [01:02<00:00, 1613.41it/s]


In [4]:
MAX_LEN = 128  # fixed token length every example is padded / truncated to

def group_texts(examples):
    """Tokenize a batch of masked inputs together with their target sentences.

    Reads ``examples['text']`` and ``examples['labels']``, encodes both with the
    notebook-level ``tokenizer`` (padded and truncated to ``MAX_LEN``), and
    returns the input encoding with its ``"labels"`` entry replaced by the
    ``input_ids`` of the encoded targets.
    """
    # Inputs and targets are encoded with exactly the same tokenizer settings.
    encode_kwargs = dict(
        padding='max_length',
        max_length=MAX_LEN,
        truncation=True,
        return_tensors='pt',
    )

    encoded = tokenizer(list(examples['text']), **encode_kwargs)
    target_encoding = tokenizer(list(examples['labels']), **encode_kwargs)
    encoded["labels"] = target_encoding.input_ids

    return encoded

# Apply group_texts batch-wise and rebind `dataset` to the tokenized result.
dataset = dataset.map(group_texts, batched=True)

Map:   0%|          | 0/89914 [00:00<?, ? examples/s]

In [5]:
# Sequential (unshuffled) split: the first 90% of rows train, the remainder validate.
TRAIN_FRACTION = 0.9
n_examples = len(dataset)

train_size = int(TRAIN_FRACTION * n_examples)
val_size = n_examples - train_size

train_dataset = dataset.select(range(train_size))
# train_size + val_size == n_examples, so this covers every remaining row.
val_dataset = dataset.select(range(train_size, n_examples))

In [6]:
# Fine-tune through the project's train() helper (imported from src.models.train):
# one epoch, batch size 16, on the 90/10 split built above, with the shared seed.
# NOTE(review): the next cell loads weights from "../models/bert_maskedlm";
# presumably train() writes the model there — confirm in src/models/train.py.
train('maskedlm', 
      model, 
      tokenizer, 
      train_dataset, 
      val_dataset, 
      data_collator,
      batch_size=16, 
      epochs=1,
      seed=RANDOM_SEED
)

  0%|          | 0/5058 [00:00<?, ?it/s]

{'loss': 2.6986, 'learning_rate': 1.8022933965994466e-05, 'epoch': 0.1}
{'loss': 2.5543, 'learning_rate': 1.604586793198893e-05, 'epoch': 0.2}
{'loss': 2.5081, 'learning_rate': 1.4068801897983393e-05, 'epoch': 0.3}
{'loss': 2.4367, 'learning_rate': 1.2091735863977859e-05, 'epoch': 0.4}
{'loss': 2.4347, 'learning_rate': 1.0114669829972321e-05, 'epoch': 0.49}
{'loss': 2.4175, 'learning_rate': 8.137603795966786e-06, 'epoch': 0.59}
{'loss': 2.4153, 'learning_rate': 6.160537761961251e-06, 'epoch': 0.69}
{'loss': 2.3735, 'learning_rate': 4.183471727955714e-06, 'epoch': 0.79}
{'loss': 2.4114, 'learning_rate': 2.2064056939501782e-06, 'epoch': 0.89}
{'loss': 2.3053, 'learning_rate': 2.2933965994464219e-07, 'epoch': 0.99}


  0%|          | 0/562 [00:00<?, ?it/s]

{'eval_loss': 2.310804843902588, 'eval_runtime': 144.0731, 'eval_samples_per_second': 62.413, 'eval_steps_per_second': 3.901, 'epoch': 1.0}
{'train_runtime': 3410.0589, 'train_samples_per_second': 23.73, 'train_steps_per_second': 1.483, 'train_loss': 2.454359945264811, 'epoch': 1.0}


In [7]:
import random

# Re-seed Python's RNG so the same example sentences are drawn on every run.
random.seed(RANDOM_SEED)

# Load the saved model and its tokenizer from one directory.
# Note that this rebinds the notebook-level `tokenizer`.
saved_model_dir = "../models/bert_maskedlm"
best_model = BertForMaskedLM.from_pretrained(saved_model_dir)
tokenizer = BertTokenizer.from_pretrained(saved_model_dir)

# Draw three reference sentences to inspect by eye.
reference_pool = df['reference'].tolist()
random_toxic_sentences = random.sample(reference_pool, 3)

for sentence in random_toxic_sentences:
    print(f'Original: {sentence}')
    masked = put_mask(sentence, toxic_words)
    print(f'Masked: {masked}')
    detoxified = detoxificate_text(sentence, toxic_words, tokenizer, best_model)
    # The extra newline replaces the separate blank print() between examples.
    print(f'Detoxified: {detoxified}', end='\n\n')

Original: Suddenly, to the delight and outrage of the congregation, a raucous saxophone broke the solemnity, and a jazz rendering of "Fools Rush In" was blaring over the loudspeakers.
Masked: suddenly, to the delight and out[MASK] of the congregation, a raucous s[MASK]ophone [MASK] the [MASK]ity, and a jazz rendering of "[MASK]s rush in" was blaring over the [MASK]speakers.
Detoxified: suddenly, to the delight and outflow of the congregation, a raucous s was ophoned the - ity, and a jazz rendering of "'s rush in " was blaring over the stage speakers.

Original: This place is such a dump.
Masked: this place is such a [MASK].
Detoxified: this place is such a place.

Original: Doesn't mean a damn thing!
Masked: doesn't mean a [MASK] thing!
Detoxified: doesn't mean a first thing!

