In [1]:
# This code is originally obtained from https://github.com/paul-rottger/hatecheck-experiments and modified.

In [2]:
import numpy as np
import pandas as pd
import pickle
import argparse
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
import os
import random
#os.environ["TOKENIZERS_PARALLELISM"] = "false"

import wandb

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [3]:
class HateDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [4]:
class WeightedTrainer(Trainer):
    def __init__(self, class_weights, **kwargs):
       # self.class_weights = torch.FloatTensor(class_weights)
        self.weighted_loss = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor(class_weights)).to(DEVICE)
        super().__init__(**kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        loss = self.weighted_loss(logits, labels)
        if return_outputs:
            return loss, outputs
        else:
            return loss

In [None]:
def create_datasets(data_dir, tokenizer):
    torch.manual_seed(42)
    train_df = pd.read_csv(data_dir + "/train.csv")
    train_df = train_df.dropna()
    valid_df = pd.read_csv(data_dir + "/valid.csv")
    valid_df = valid_df.dropna()
    test_df = pd.read_csv(data_dir + "/test.csv")
    test_df = test_df.dropna()

    train_texts = train_df['text'].astype("string").tolist()
    valid_texts = valid_df['text'].astype("string").tolist()
    test_texts = test_df['text'].astype("string").tolist()

    train_labels = train_df['label'].astype("int").tolist()
    valid_labels = valid_df['label'].astype("int").tolist()
    test_labels = test_df['label'].astype("int").tolist()

    # add special tokens for URLs, emojis and mentions (--> see pre-processing)
    special_tokens_dict = {'additional_special_tokens': ['[USER]', '[EMOJI]', '[URL]']}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

    train_encodings = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")#.to(DEVICE)
    valid_encodings = tokenizer(valid_texts, padding=True, truncation=True, return_tensors="pt")#.to(DEVICE)
    test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

    train_dataset = HateDataset(train_encodings, train_labels)
    valid_dataset = HateDataset(valid_encodings, valid_labels)
    test_dataset = HateDataset(test_encodings, test_labels)
    tok_len = len(tokenizer)

    return train_dataset, valid_dataset, test_dataset, tok_len

In [6]:
def calculate_class_weights(data_dir):
    dataset = pd.read_csv(data_dir + "/train.csv")
    train_labels = dataset.label.to_numpy()
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    print("class weights are {}".format(class_weights))
    return class_weights

In [7]:
dataset_dir = "./Data/"
output_dir = "./Model/"

dataset = "Davidson_hate"

dd_dir = dataset_dir + dataset
oo_dir = output_dir + dataset

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Loading tokenizer...")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

print("Creating datasets...")
train_dataset, valid_dataset, test_dataset, tok_len = create_datasets(dd_dir, tokenizer)

print("Loading model...")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(DEVICE)
model.resize_token_embeddings(tok_len)

print("Calculating class weights...")
class_weights = calculate_class_weights(dd_dir)

training_args = TrainingArguments(
    seed=123,
    output_dir=output_dir,  # output directory
    num_train_epochs=3,  # total number of training epochs
    warmup_steps=100,  # number of warmup steps for learning rate scheduler

    learning_rate = 5e-5,
    per_device_train_batch_size=64,  # batch size per device during training
    weight_decay=0.01,  # strength of weight decay
    
    logging_steps=10,
    report_to="wandb",

    per_device_eval_batch_size=64,  # batch size for evaluation
    evaluation_strategy="epoch",

    save_steps=1e8,
)

trainer = WeightedTrainer(
    model=model,
    class_weights=class_weights,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

wandb.init(project="hatecheck", name="experiment_1", config=training_args)

# try:
#     trainer.train(resume_from_checkpoint=True)
#     print("resuming from checkpoint...")
# except ValueError:
print("No checkpoints found. training from scratch...")
trainer.train()
wandb.finish()

print("Training done, saving...")
trainer.save_model(oo_dir)
tokenizer.save_pretrained(oo_dir)

print("Training done, evaluating...")
valid_preds = np.argmax(trainer.predict(valid_dataset)[0], axis=1) #should be numpy ndarray
valid_labels = np.array(valid_dataset.labels)

# cls_report_valid = classification_report(valid_labels, valid_preds, output_dict=True)
# pickle.dump(cls_report_valid, open(oo_dir + "/cls_report_valid.pickle", "wb"))

# test_preds = np.argmax(trainer.predict(test_dataset)[0], axis=1)
# test_labels = np.array(test_dataset.labels)

# cls_report_test = classification_report(test_labels, test_preds, output_dict=True)
# pickle.dump(cls_report_test, open(oo_dir + "/cls_report_test.pickle", "wb"))

Loading tokenizer...
Creating datasets...
Loading model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Calculating class weights...
class weights are [0.53061771 8.66520979]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mthomasrobertparis[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
huggingface/tokenizers: The current process just got fo

No checkpoints found. training from scratch...


Epoch,Training Loss,Validation Loss
1,0.4477,0.436011
2,0.3955,0.38875
3,0.2742,0.533919


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0,1
eval/loss,▃▁█
eval/runtime,▁██
eval/samples_per_second,█▁▁
eval/steps_per_second,█▁▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇██
train/grad_norm,▂▄▂▂▃▃▃▂▂▂▂▂▁▄▅▃▂▂▂▂▂▂▂▂▂▂▂▁▄▂▁▁▂▃▁▃▁▄▁█
train/learning_rate,▂▃▄▆▇███▇▇▇▇▇▇▇▆▆▆▅▅▅▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁
train/loss,█▆▆█▅▅▆▄▆▅▆▄▅▃▅▅▅▄▄▃▄▄▄▄▃▃▃▄▃▂▃▃▃▂▁▂▁▃▃▃

0,1
eval/loss,0.53392
eval/runtime,2.725
eval/samples_per_second,909.356
eval/steps_per_second,14.312
total_flos,2292380764263000.0
train/epoch,3.0
train/global_step,930.0
train/grad_norm,2.83612
train/learning_rate,0.0
train/loss,0.2742


Training done, saving...
Training done, evaluating...
