# Imports

In [1]:
import datasets
import pandas as pd

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "OxAISH-AL-LLM/wiki_toxic" dataset through Hugging Face `datasets`.
# The returned object holds every split; it is displayed in the next cell.
dataset = load_dataset("OxAISH-AL-LLM/wiki_toxic")

In [3]:
# Display the available splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'label'],
        num_rows: 127656
    })
    validation: Dataset({
        features: ['id', 'comment_text', 'label'],
        num_rows: 31915
    })
    test: Dataset({
        features: ['id', 'comment_text', 'label'],
        num_rows: 63978
    })
    balanced_train: Dataset({
        features: ['id', 'comment_text', 'label'],
        num_rows: 25868
    })
})

In [4]:
# Bind the three splits used in this notebook to their own names.
# The 'balanced_train' split is not used here.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [7]:
# Export each split to a CSV file in the current working directory.
# The integer shown under the cell is the return value of the last call
# (the test split export), because it is the cell's final expression.
train_dataset.to_csv("train.csv")
val_dataset.to_csv("val.csv")
test_dataset.to_csv("test.csv")

Creating CSV from Arrow format: 100%|██████████| 128/128 [00:01<00:00, 65.01ba/s]
Creating CSV from Arrow format: 100%|██████████| 32/32 [00:00<00:00, 49.14ba/s]
Creating CSV from Arrow format: 100%|██████████| 64/64 [00:01<00:00, 54.81ba/s]


26534261

In [5]:
# Row counts of the train / validation / test splits, in that order.
tuple(len(split) for split in (train_dataset, val_dataset, test_dataset))

(127656, 31915, 63978)

In [7]:
# First record of each split (train, validation, test) to eyeball the schema.
tuple(split[0] for split in (train_dataset, val_dataset, test_dataset))

({'id': '794c30aff0931384',
  'comment_text': "And that's not a personal attack^^ ?",
  'label': 0},
 {'id': 'c9e4af184d5246e7',
  'comment_text': "Military Keynesianism \n\nI'm not sure how to request more information, so hopefully this gets somewhere.  I wanted to get some sources that discuss the basic definition of military Keynesianism. Thanks.",
  'label': 0},
 {'id': '0001ea8717f6de06',
  'comment_text': 'Thank you for understanding. I think very highly of you and would not revert without discussion.',
  'label': 0})

In [8]:
# Inspect column types; 'label' is a ClassLabel, so integer labels map to names.
train_dataset.features

{'id': Value(dtype='string', id=None),
 'comment_text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['non', 'tox'], id=None)}

In [11]:
# Peek at the first five training comments with label 1 ('tox' in the
# ClassLabel names). The filter runs over the whole training split; slicing
# the result with [:5] gives a dict of column lists.
train_dataset.filter(lambda example: example['label'] == 1)[:5]

Filter: 100%|██████████| 127656/127656 [00:00<00:00, 138221.39 examples/s]


{'id': ['30dc274513233575',
  '3a9656dae4b54ef5',
  'e6051913321df290',
  'a2f59e64d8d5c396',
  'd3b0c70d7a5f5dfa'],
 'comment_text': ['FUCK FUCKITY FUCK FUCK FUCK',
  'You LOVE Anime!? There are some ANIMES out there that suck! You idiot!68.84.225.186',
  "To faithlessfaggotboy:  Suck my dick you goddamn gay-lovin piece of shit liberal.  If I ever met you on the street I'd knock your empty head off your shoulders.",
  "And I don't understand what you try to tell me. So shut up and don't change my contributions!!!!",
  'THE IMAGE IS PORNAGRAPHIC \n\nWHAT IS IT THAT YOU DONT UNDERSTAND\n\nCUNT'],
 'label': [1, 1, 1, 1, 1]}

In [12]:
# Peek at the first five training comments with label 0 ('non' in the
# ClassLabel names). The filter runs over the whole training split; slicing
# the result with [:5] gives a dict of column lists.
train_dataset.filter(lambda example: example['label'] == 0)[:5]

Filter: 100%|██████████| 127656/127656 [00:01<00:00, 116754.19 examples/s]


{'id': ['794c30aff0931384',
  '23a5fedbe63573bd',
  '24c21e4b0b831f6f',
  'd4f19d76911a174d',
  '7da2032cd711584e'],
 'comment_text': ["And that's not a personal attack^^ ?",
  '"\n\n Please do not vandalize pages, as you did with this edit to Bruce Stern. If you continue to do so, you will be blocked from editing.  (Bloom) "',
  '"\n\nUser:Public Juju/Sandbox\n\nYour Featured picture candidate has been promoted Your nomination for featured picture status, File:Canis lupus 265b.jpg, gained a consensus of support, and has been promoted. If you would like to nominate another image, please do so at Wikipedia:Featured picture candidates. \'\'\'\'\'\'\xa0Talk "',
  "So why M.K. didn't blocked? He deleted my message on his talk? Greets;] Alden or  talk with Alden",
  '"\nYou can only do it with the PSP dev kit, which is 8 grand. @ "'],
 'label': [0, 0, 0, 0, 0]}

# Tokenizing

In [14]:
from transformers import AutoTokenizer

In [15]:
# Load the tokenizer published with the "google/bert_uncased_L-2_H-128_A-2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")

In [17]:
# Show the raw text of the first training comment, then its tokenizer output
# (input_ids, token_type_ids, attention_mask) for comparison.
print(train_dataset[0]['comment_text'])
tokenizer(train_dataset[0]['comment_text'])

And that's not a personal attack^^ ?


{'input_ids': [101, 1998, 2008, 1005, 1055, 2025, 1037, 3167, 2886, 1034, 1034, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
# Round-trip the first training comment (tokenize, then decode the ids) to see
# what the tokenizer does to the text: special tokens added, text lower-cased.
tokenizer.decode(tokenizer(train_dataset[0]['comment_text'])['input_ids'])

"[CLS] and that's not a personal attack ^ ^? [SEP]"

In [20]:
def encode(examples):
    """Tokenize the ``comment_text`` entry of ``examples`` to a fixed length.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded rows have the same length.

    Args:
        examples: Mapping with a ``"comment_text"`` entry; it is passed
            straight to the module-level ``tokenizer``.

    Returns:
        Whatever ``tokenizer`` returns for the given text(s).
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["comment_text"], **tokenizer_options)

In [21]:
# Apply `encode` to the training split in batches and rebind `train_dataset`
# to the tokenized result. NOTE(review): only the training split is encoded in
# the cells visible here; confirm whether validation/test are handled later.
train_dataset = train_dataset.map(encode, batched=True)

Map: 100%|██████████| 127656/127656 [01:05<00:00, 1962.14 examples/s]


# Formatting

In [22]:
import torch

# Have the dataset return torch tensors, and only the columns listed here
# (token ids, attention mask, label).
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [23]:
# Batch the formatted training split 32 examples at a time.
# NOTE(review): `shuffle` is not passed, so batches presumably follow dataset
# order; confirm this is intended if the loader is used for training.
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)

In [24]:
# Pull a single batch to inspect its keys and tensor contents.
next(iter(dataloader))

{'label': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
         1, 0, 1, 0, 0, 0, 0, 0]),
 'input_ids': tensor([[ 101, 1998, 2008,  ...,    0,    0,    0],
         [ 101, 1000, 3531,  ...,    0,    0,    0],
         [ 101, 1000, 5310,  ...,    0,    0,    0],
         ...,
         [ 101, 1000, 5678,  ...,    0,    0,    0],
         [ 101, 2339, 2064,  ...,    0,    0,    0],
         [ 101, 2417, 7442,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [25]:
# Sanity-check tensor shapes on the first few batches only. Iterating the whole
# DataLoader just to print shapes walks every batch of the training split and
# floods the cell output with thousands of near-identical lines.
NUM_BATCHES_TO_CHECK = 3

# zip() stops once the range is exhausted, so no extra batch is fetched.
for _, batch in zip(range(NUM_BATCHES_TO_CHECK), dataloader):
    print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['label'].shape)

torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) to