In [None]:
%%capture
!pip install datasets
!pip install transformers

In [None]:
import numpy as np
import pandas as pd

from pathlib import Path

from datasets import load_dataset
from datasets import Dataset,DatasetDict
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import TrainingArguments,Trainer
from datasets import load_metric

# Downloading and saving the dataset

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# and model directories configured below are reachable.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Google Drive folders for the fine-tuned model and for the saved dataset.
MODEL_DIR, DATA_DIR = map(
    Path,
    (
        "/content/drive/MyDrive/ML Projects/Hate Speech Classification/models",
        "/content/drive/MyDrive/ML Projects/Hate Speech Classification/data",
    ),
)

In [None]:
# Fetch the `hate_speech18` dataset from the Hugging Face Hub
# (later calls reuse the locally cached copy).
dataset = load_dataset("hate_speech18")

Downloading builder script:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.61k [00:00<?, ?B/s]

Downloading and preparing dataset hate_speech18/default to /root/.cache/huggingface/datasets/hate_speech18/default/0.0.0/775598e7a56b8706cb7cc3781ce7abd83f8a6d0831d6363f1c93b5f963d7c733...


Downloading data:   0%|          | 0.00/3.19M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10944 [00:00<?, ? examples/s]

Dataset hate_speech18 downloaded and prepared to /root/.cache/huggingface/datasets/hate_speech18/default/0.0.0/775598e7a56b8706cb7cc3781ce7abd83f8a6d0831d6363f1c93b5f963d7c733. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Persist the downloaded dataset under DATA_DIR so it can be reloaded from Drive.
dataset.save_to_disk(DATA_DIR / "hate_speech.hf")

Saving the dataset (0/1 shards):   0%|          | 0/10944 [00:00<?, ? examples/s]

# Preprocessing

In [None]:
# Reload the saved copy through the class-level loader: no pre-existing
# `dataset` variable is required, so this section also runs on a fresh kernel
# without repeating the download cell.
dataset = DatasetDict.load_from_disk(DATA_DIR / "hate_speech.hf")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'user_id', 'subforum_id', 'num_contexts', 'label'],
        num_rows: 10944
    })
})

In [None]:
# List the splits contained in the DatasetDict.
dataset.keys()

dict_keys(['train'])

In [None]:
# Distinct label ids that occur in the train split.
set(dataset['train']['label'])

{0, 1, 2, 3}

In [None]:
# Show full cell contents instead of truncating long strings.
pd.set_option('display.max_colwidth', None)
# Wrapping the DatasetDict directly gives a single "train" column in which
# every cell is one complete record dict (text, ids, label).
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,train
0,"{'text': 'As of March 13th , 2014 , the booklet had been downloaded over 18,300 times and counting .', 'user_id': 572066, 'subforum_id': 1346, 'num_contexts': 0, 'label': 0}"
1,"{'text': 'In order to help increase the booklets downloads , it would be great if all Stormfronters who had YouTube accounts , could display the following text in the description boxes of their uploaded YouTube videos .', 'user_id': 572066, 'subforum_id': 1346, 'num_contexts': 0, 'label': 0}"
2,"{'text': '( Simply copy and paste the following text into your YouTube videos description boxes. )', 'user_id': 572066, 'subforum_id': 1346, 'num_contexts': 0, 'label': 0}"
3,"{'text': 'Click below for a FREE download of a colorfully illustrated 132 page e-book on the Zionist-engineered INTENTIONAL destruction of Western civilization .', 'user_id': 572066, 'subforum_id': 1346, 'num_contexts': 0, 'label': 1}"
4,"{'text': 'Click on the `` DOWNLOAD ( 7.42 MB ) '' green banner link .', 'user_id': 572066, 'subforum_id': 1346, 'num_contexts': 0, 'label': 0}"


In [None]:
# Inspect the first row: a single "train" entry holding the record dict.
df.iloc[0]

train    {'text': 'As of March 13th , 2014 , the booklet had been downloaded over 18,300 times and counting .', 'user_id': 572066, 'subforum_id': 1346, 'num_contexts': 0, 'label': 0}
Name: 0, dtype: object

In [None]:
# (rows, columns) of the frame.
df.shape

(10944, 1)

In [None]:
# Type of the "train" column as a whole.
type(df["train"])

pandas.core.series.Series

In [None]:
# Type of a single cell in the "train" column.
type(df["train"][0])

dict

In [None]:
# Move each record's label out of the nested dict into its own column.
# `pop` mutates the dicts in place, so the step is guarded: re-running this
# cell would otherwise raise KeyError because the "label" key is already gone.
if "label" not in df.columns:
    df["label"] = df["train"].map(lambda record: record.pop("label"))
df["label"][:5]

0    0
1    0
2    0
3    1
4    0
Name: label, dtype: int64

In [None]:
# Class balance: number of rows per label id.
df['label'].value_counts()

0    9507
1    1196
3     168
2      73
Name: label, dtype: int64

In [None]:
# Preview the first five record dicts in the "train" column.
df["train"][:5]

0                                                                                                                           {'text': 'As of March 13th , 2014 , the booklet had been downloaded over 18,300 times and counting .', 'user_id': 572066, 'subforum_id': 1346, 'num_contexts': 0}
1    {'text': 'In order to help increase the booklets downloads , it would be great if all Stormfronters who had YouTube accounts , could display the following text in the description boxes of their uploaded YouTube videos .', 'user_id': 572066, 'subforum_id': 1346, 'num_contexts': 0}
2                                                                                                                             {'text': '( Simply copy and paste the following text into your YouTube videos description boxes. )', 'user_id': 572066, 'subforum_id': 1346, 'num_contexts': 0}
3                                                              {'text': 'Click below for a FREE download of a colorfully illustrated 132 page 

In [None]:
# Convert the frame (record dicts in "train" plus the "label" column) back
# into a Hugging Face Dataset.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['train', 'label'],
    num_rows: 10944
})

# Tokenizing

In [None]:
# Hub id of the pretrained checkpoint used for both the tokenizer and the model.
model_nm = "facebook/roberta-hate-speech-dynabench-r4-target"

In [None]:
# Load the tokenizer that matches the chosen checkpoint.
tokz = AutoTokenizer.from_pretrained(model_nm)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Sanity check: split a short sample sentence into sub-word tokens.
tokz.tokenize("Hi, My name is Sahar.")

['Hi', ',', 'ĠMy', 'Ġname', 'Ġis', 'ĠSah', 'ar', '.']

In [None]:
# String form of the first record only; indexing the row directly avoids
# stringifying every record in the dataset just to read one of them.
str(ds[0]["train"])

"{'num_contexts': 0, 'subforum_id': 1346, 'text': 'As of March 13th , 2014 , the booklet had been downloaded over 18,300 times and counting .', 'user_id': 572066}"

In [None]:
# Tokenize the string form of the first record to see how the raw dict
# (keys, punctuation and metadata included) is split into sub-word tokens.
# Only the first row is read, not the whole column.
tokz.tokenize(str(ds[0]["train"]))

['{',
 "'",
 'num',
 '_',
 'context',
 's',
 "':",
 'Ġ0',
 ',',
 "Ġ'",
 'sub',
 'forum',
 '_',
 'id',
 "':",
 'Ġ13',
 '46',
 ',',
 "Ġ'",
 'text',
 "':",
 "Ġ'",
 'As',
 'Ġof',
 'ĠMarch',
 'Ġ13',
 'th',
 'Ġ,',
 'Ġ2014',
 'Ġ,',
 'Ġthe',
 'Ġbooklet',
 'Ġhad',
 'Ġbeen',
 'Ġdownloaded',
 'Ġover',
 'Ġ18',
 ',',
 '300',
 'Ġtimes',
 'Ġand',
 'Ġcounting',
 'Ġ.',
 "',",
 "Ġ'",
 'user',
 '_',
 'id',
 "':",
 'Ġ57',
 '20',
 '66',
 '}']

In [None]:
def tok_func(x):
    """Tokenize one batch of rows for ``Dataset.map(..., batched=True)``.

    Args:
        x: Batch mapping column names to lists. ``x["train"]`` is expected to
            be a list of record dicts, each containing a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch, with one entry per input row.
    """
    # Tokenize the rows of *this* batch rather than the whole global dataset,
    # and feed the model the post text only, matching the raw strings that
    # are tokenized at inference time.
    texts = [record["text"] for record in x["train"]]
    # Truncate to the model's maximum length so over-long posts cannot break
    # batching or exceed the model's position limit.
    return tokz(texts, truncation=True)

In [None]:
# Apply the tokenizer function to the dataset in batches.
token_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/10944 [00:00<?, ? examples/s]

In [None]:
# Spot-check one tokenized example: its original record next to its token ids.
row = token_ds[3]
row["train"], row["input_ids"]

In [None]:
# Vocabulary id of the token "label".
tokz.vocab["label"]

33480

# Splitting the dataset

In [None]:
# Hold out 20% of the rows for evaluation. A fixed seed keeps the split, and
# therefore the reported metrics, reproducible across kernel restarts.
dds = token_ds.train_test_split(test_size=0.2, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['train', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8755
    })
    test: Dataset({
        features: ['train', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2189
    })
})

# Training and saving the model

In [None]:
# Training hyperparameters consumed by TrainingArguments below.
bs = 16        # per-device train batch size (evaluation uses bs * 2)
epochs = 6     # number of training epochs
lr = 8e-5      # learning rate

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the ``Trainer`` evaluation loop.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds one score per
            class along the last axis; ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of rows
        whose highest-scoring class equals the reference label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy instead of the deprecated
    # `load_metric("accuracy")` helper, which emits a FutureWarning and has to
    # download a metric script before it can be used.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Keyword settings for the run, grouped by concern and then handed to
# TrainingArguments together with the output directory.
training_kwargs = dict(
    # optimisation schedule
    learning_rate=lr,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_train_epochs=epochs,
    # batching (evaluation uses twice the training batch size)
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    # evaluation and reporting
    do_eval=True,
    evaluation_strategy="epoch",
    report_to='none',
)
args = TrainingArguments('outputs', **training_kwargs)

In [None]:
# Load the checkpoint with a 4-class head; the checkpoint's own head has a
# different shape, so the mismatched classifier weights are re-initialised.
model = AutoModelForSequenceClassification.from_pretrained(
    model_nm,
    num_labels=4,
    ignore_mismatched_sizes=True,
)
# Wire the model, arguments, data splits, tokenizer and metric into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=compute_metrics,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at facebook/roberta-hate-speech-dynabench-r4-target and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the model; evaluation runs once per epoch as configured in `args`.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3997,0.33806,0.883965
2,0.3431,0.343596,0.891275
3,0.2512,0.374576,0.905436
4,0.2068,0.381377,0.915487
5,0.1341,0.442791,0.921425
6,0.095,0.467573,0.919141


TrainOutput(global_step=3288, training_loss=0.22577871545387881, metrics={'train_runtime': 5085.3994, 'train_samples_per_second': 10.33, 'train_steps_per_second': 0.647, 'total_flos': 1.3499601161296224e+16, 'train_loss': 0.22577871545387881, 'epoch': 6.0})

In [None]:
# Run the fine-tuned model on the held-out split; the result carries the raw
# per-class scores, the reference label ids and summary metrics.
preds = trainer.predict(dds["test"])
preds

PredictionOutput(predictions=array([[ 6.3362837 , -2.7093852 , -2.4692988 , -0.08730839],
       [-1.0815823 ,  4.8038974 , -3.848165  ,  0.26459253],
       [ 5.378582  , -1.7797365 , -3.2512376 ,  0.15955278],
       ...,
       [ 6.305857  , -2.8147345 , -2.30258   , -0.14246814],
       [ 6.314907  , -2.7374923 , -2.4256873 , -0.10189091],
       [ 6.3061633 , -2.7380552 , -2.424643  , -0.09792607]],
      dtype=float32), label_ids=array([0, 1, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.1331893652677536, 'test_runtime': 61.1806, 'test_samples_per_second': 35.779, 'test_steps_per_second': 4.479})

In [None]:
# The model returns raw, unbounded per-class logits, so clipping them to
# [0, 1] does not produce meaningful scores for a 4-way classifier. Convert
# them to class probabilities with a numerically stable softmax and take the
# arg-max as the predicted class id.
logits = preds.predictions
exp_scores = np.exp(logits - logits.max(axis=-1, keepdims=True))
pred_probs = exp_scores / exp_scores.sum(axis=-1, keepdims=True)
pred_labels = pred_probs.argmax(axis=-1)
print(pred_probs, pred_labels, preds.label_ids)

[[1.         0.         0.         0.        ]
 [0.         1.         0.         0.26459253]
 [1.         0.         0.         0.15955278]
 ...
 [1.         0.         0.         0.        ]
 [1.         0.         0.         0.        ]
 [1.         0.         0.         0.        ]] [0 1 0 ... 0 0 0]


In [None]:
# Save the fine-tuned model under MODEL_DIR / "roberta" on Drive.
trainer.save_model(MODEL_DIR / "roberta")

# Loading and testing the model

In [None]:
# Reload the fine-tuned model from MODEL_DIR / "roberta" and wrap it in a
# Trainer (no training arguments or datasets) to use it for prediction.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR / "roberta")
trainer = Trainer(model, tokenizer=tokz)

In [None]:
def test_model(text):
    """Classify a single piece of text with the reloaded model.

    Args:
        text: Raw input string to classify.

    Returns:
        The index of the highest-scoring class for ``text``.
    """
    # Truncate so that input longer than the model's maximum sequence length
    # is shortened instead of failing inside the model.
    test_token = tokz(text, truncation=True)
    # The Trainer expects a dataset-like object, so wrap the single encoding
    # in a one-element list.
    test_pred = trainer.predict([test_token])
    # Predictions hold one row of class scores; pick the best class of that row.
    label = np.argmax(test_pred.predictions, axis=-1)[0]
    return label

In [None]:
# Smoke test: classify a short sample sentence.
test_model("Hi, My name is Sahar.")

0