In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import transformers

import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.nn.functional import softmax

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

from datasets import Dataset
from transformers import TrainingArguments
from transformers import Trainer

from evaluate import load

# 1.) Basic EDA of the CSV

In [12]:
# Load the labelled debate statements from a CSV file in the working directory.
df = pd.read_csv("PrimaryVPHostileLabels.csv")

In [13]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,statement,debate_id,year,hostility_label
0,Senator J.D. Vance (R-OH) andGovernor Tim Walz...,2024_VP_1001,2024,0.0
1,Good evening. I'm Norah O'Donnell and thank yo...,2024_VP_1001,2024,0.0
2,I'm Margaret Brennan. In order to have a thoug...,2024_VP_1001,2024,0.0
3,"Thank you, Norah. Earlier today, Iran launched...",2024_VP_1001,2024,1.0
4,"Well, thank you. And thank you for those joini...",2024_VP_1001,2024,1.0


In [14]:
# Keep only the rows that actually carry a hostility label.
df = df[df['hostility_label'].notna()]

In [15]:
# Sanity check: the filtered result is still a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [16]:
# Preview the rows again after dropping the unlabelled statements.
df.head()

Unnamed: 0,statement,debate_id,year,hostility_label
0,Senator J.D. Vance (R-OH) andGovernor Tim Walz...,2024_VP_1001,2024,0.0
1,Good evening. I'm Norah O'Donnell and thank yo...,2024_VP_1001,2024,0.0
2,I'm Margaret Brennan. In order to have a thoug...,2024_VP_1001,2024,0.0
3,"Thank you, Norah. Earlier today, Iran launched...",2024_VP_1001,2024,1.0
4,"Well, thank you. And thank you for those joini...",2024_VP_1001,2024,1.0


In [17]:
# Convert the hostility labels to int values (they load as floats, e.g. 0.0 / 1.0).
# astype() with a column mapping returns a new frame instead of assigning a
# column into the filtered frame from the earlier cell, which avoids pandas'
# SettingWithCopyWarning.
df = df.astype({'hostility_label': int})

In [18]:
# Convert the frame to a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the dataset, so there is no
# '__index_level_0__' column to strip afterwards. That column only appears when
# the frame's index is not a default RangeIndex, and asking remove_columns() for
# a column that does not exist raises an error.
statements_df = Dataset.from_pandas(df, preserve_index=False)

# Only the statement text and its label are needed downstream.
statements_df = statements_df.remove_columns(['debate_id', 'year'])

print(statements_df)

Dataset({
    features: ['statement', 'hostility_label'],
    num_rows: 5865
})


# Hate BERT Pipeline

## Model tokenization for HateBERT

In [19]:
# Load the tokenizer published with the GroNLP/hateBERT checkpoint.
hb_tokenizer = AutoTokenizer.from_pretrained("GroNLP/hateBERT")

In [20]:
# Test a sample tokenization: the printed result should contain input_ids,
# token_type_ids and attention_mask for the sentence.
test = hb_tokenizer("This is an example statement.")
print(test)

{'input_ids': [101, 2023, 2003, 2019, 2742, 4861, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [21]:
# General Tokenization Function
def tokenization_function(examples, tokenizer, max_length=512):
    """Tokenize the ``statement`` field of a batch of examples.

    The tokenizer is asked to truncate and to pad to ``max_length``, so with a
    Hugging Face tokenizer every encoded statement has the same length.

    Args:
        examples: Mapping with a ``"statement"`` entry holding the text to
            tokenize (a single string or a list of strings when used with
            ``Dataset.map(..., batched=True)``).
        tokenizer: Callable tokenizer that accepts the ``padding``,
            ``truncation`` and ``max_length`` keyword arguments.
        max_length: Sequence length to pad/truncate to. Defaults to 512, the
            value that was previously hard-coded here.

    Returns:
        Whatever ``tokenizer`` returns for the given statements.
    """
    return tokenizer(
        examples["statement"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [22]:
# Tokenize the whole dataset in batches; the tokenizer is handed to the mapping
# function as a keyword argument.
tokenized_dataset = statements_df.map(
    tokenization_function,
    batched=True,
    fn_kwargs={"tokenizer": hb_tokenizer},
)

Map:   0%|          | 0/5865 [00:00<?, ? examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 5865/5865 [00:00<00:00, 8210.49 examples/s]


In [23]:
# Load hateBERT with a sequence-classification head on top. The classifier
# weights are newly initialized (see the warning printed below), so the model
# has to be fine-tuned before its predictions are meaningful.
hb_model = AutoModelForSequenceClassification.from_pretrained(
    "GroNLP/hateBERT",
    num_labels=2    # Important! 2 output classes for the binary hostility label
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Inspect the configuration of the loaded model.
print(hb_model.config)

BertConfig {
  "_attn_implementation_autoset": true,
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [25]:

# 80/20 train/validation split of the (untokenized) statements, with a fixed seed.
train_test_split = statements_df.train_test_split(seed=1, test_size=0.2)

train_dataset, val_dataset = train_test_split["train"], train_test_split["test"]

print(train_dataset, val_dataset, sep="\n")

Dataset({
    features: ['statement', 'hostility_label'],
    num_rows: 4692
})
Dataset({
    features: ['statement', 'hostility_label'],
    num_rows: 1173
})


In [47]:
# Preparing the trainer: fine-tuning hyperparameters plus checkpoint and logging setup.

training_args = TrainingArguments(
    output_dir="./output/hatebert_finetuned",
    save_strategy="epoch",
    eval_strategy="epoch",  # evaluated and saved on the same schedule, as load_best_model_at_end requires
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./output/logs",
    logging_steps=50,
    load_best_model_at_end=True,  # reload the best checkpoint (by eval_loss) when training ends
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    report_to="none",
    # No `use_mps_device` flag: it is deprecated, and the Apple MPS backend is
    # picked automatically when available (like CUDA), so the same cell runs
    # unchanged on machines with or without MPS.
    disable_tqdm=False,  # progress bar
)

In [48]:
# Accuracy metric, loaded through the `evaluate` library; used by compute_metrics.
accuracy_metric = load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the index of its largest logit along the last axis.

    Returns:
        The result of ``accuracy_metric.compute`` for the predicted classes
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy_metric.compute(
        references=references,
        predictions=predicted_classes,
    )

In [49]:
# The model computes its loss from a column named "labels", so training errors
# out on "hostility_label" -- rename it.
# The guard keeps this cell safe to re-run: renaming a column that has already
# been renamed raises an error, and leaving the rename commented out means a
# fresh kernel never creates the "labels" column at all.
if "hostility_label" in tokenized_dataset.column_names:
    tokenized_dataset = tokenized_dataset.rename_column("hostility_label", "labels")

tokenized_dataset

Dataset({
    features: ['statement', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5865
})

In [50]:
# Split the tokenized dataset 80/20 into train and validation sets (fixed seed).
train_test_split = tokenized_dataset.train_test_split(seed=1, test_size=0.2)

train_dataset, val_dataset = train_test_split["train"], train_test_split["test"]

In [51]:
# Trainer: ties together the model, the training arguments, the train/validation
# splits and the accuracy function used at evaluation time.

trainer = Trainer(
    model=hb_model,
    args=training_args, 
    train_dataset=train_dataset, 
    eval_dataset=val_dataset,     
    compute_metrics=compute_metrics,    
)

In [54]:
##trainer.train()

# ROBERTA Pipeline

## Model Tokenization

In [None]:
# Load the tokenizer for the RoBERTa hate-speech checkpoint.
rb_tokenizer = AutoTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")