In [12]:
from toxigen_dataset import ToxiGenDataset
from hatebert_model import HateBERTClassifier
from torch.utils.data import DataLoader
import torch
import torch.nn as  nn
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score


## Toxigen Dataset
### Loading the Dataset

In [None]:
# Initialize and load the ToxiGen dataset
HUGGING_FACE_TOKEN = "A AJOUTER VOTRE TOKEN HUGGINGFACE"
toxigen_dataset = ToxiGenDataset(HUGGING_FACE_TOKEN)
toxigen = toxigen_dataset.load_dataset()
print(toxigen)

DatasetDict({
    test: Dataset({
        features: ['text', 'target_group', 'labels'],
        num_rows: 940
    })
    train: Dataset({
        features: ['text', 'target_group', 'labels'],
        num_rows: 8960
    })
})


### Statistics about the dataset

In [14]:
# Convert to pandas DataFrames for easier analysis
train_df = pd.DataFrame(toxigen['train'])
test_df = pd.DataFrame(toxigen['test'])

print("=== Dataset Overview ===")
print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print(f"Total dataset size: {len(train_df) + len(test_df)}")


print("\n=== Label Distribution ===")
print("\nTraining set:")
print(train_df['labels'].value_counts(normalize=True))
print("\nTest set:")
print(test_df['labels'].value_counts(normalize=True))

print("\n=== Target Group Distribution ===")
print("\nTraining set:")
print(train_df['target_group'].value_counts())
print("\nTest set:")
print(test_df['target_group'].value_counts())

# Text length statistics
train_df['text_length'] = train_df['text'].str.len()
test_df['text_length'] = test_df['text'].str.len()

print("\n=== Text Length Statistics ===")
print("\nTraining set:")
print(train_df['text_length'].describe())
print("\nTest set:")
print(test_df['text_length'].describe())

=== Dataset Overview ===
Training set size: 8960
Test set size: 940
Total dataset size: 9900

=== Label Distribution ===

Training set:
labels
0    0.624107
1    0.375893
Name: proportion, dtype: float64

Test set:
labels
0    0.568085
1    0.431915
Name: proportion, dtype: float64

=== Target Group Distribution ===

Training set:
target_group
women              717
lgbtq              714
mental_dis         714
black              713
chinese            706
asian              702
native_american    702
middle_east        697
muslim             688
physical_dis       685
mexican            684
jewish             684
latino             554
Name: count, dtype: int64

Test set:
target_group
physical_dis       95
black              92
jewish             87
muslim             83
chinese            77
mexican            73
middle_east        68
mental_dis         68
lgbtq              66
women              65
latino             61
native_american    54
asian              51
Name: count, dtype:

### Tokenize the dataset

In [15]:
# Initialize HateBERT classifier
model_hatebert = HateBERTClassifier(
    model_name="GroNLP/hateBERT",
    num_labels=2,
    device=None,  # Will automatically detect cuda/cpu
    max_length=512
)

INFO:hatebert_model:Using device: cpu
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFor

In [16]:
# Tokenize function is now handled by HateBERT's prepare_inputs method
# We can directly use the tokenizer from our hatebert instance
tokenizer = model_hatebert.tokenizer

def tokenize_function(tokenizer, example, text_field):
    """
    Tokenize texts using the HateBERT tokenizer.
    
    Args:
        tokenizer: HateBERT tokenizer
        example: Dataset example
        text_field: Name of the text field to tokenize
        
    Returns:
        Tokenized features
    """
    # Tokenize the texts
    tokenized = tokenizer(
        example[text_field],
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )
    
    # Convert to lists for dataset storage
    return {
        'input_ids': tokenized['input_ids'].squeeze().tolist(),
        'attention_mask': tokenized['attention_mask'].squeeze().tolist(),
        'token_type_ids': tokenized['token_type_ids'].squeeze().tolist()
    }

'''
 apply the function to all the elements in the dataset (individually or in batches)
 https://huggingface.co/docs/datasets/v1.11.0/package_reference/main_classes.html?highlight=dataset%20map#datasets.Dataset.map
 batch mode is very powerful. It allows you to speed up processing
 more info here: https://huggingface.co/docs/datasets/en/about_map_batch
'''
cache_files = {
    "test": ".cache/datasets/toxigen/toxigen_test_tokenized.arrow",
    "train": ".cache/datasets/toxigen/toxigen_train_tokenized.arrow"
} #path to the local cache files, where the current computation from the following function will be stored. 
# Caching saves RAM when working with large datasets and saves time instead of doing transformations on the fly.
tokenized_toxigen = toxigen.map(lambda x: tokenize_function(tokenizer, x, "text"), batched=True, cache_file_names=cache_files)

In [17]:
print(tokenized_toxigen)


DatasetDict({
    test: Dataset({
        features: ['text', 'target_group', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 940
    })
    train: Dataset({
        features: ['text', 'target_group', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8960
    })
})


In [18]:
def collate_fn(batch):
    batch_dict = {k: [dic[k] for dic in batch] for k in batch[0]}
    for k in ['input_ids', 'attention_mask', 'token_type_ids', 'labels']:
        if k in batch_dict:
            # Convert lists to tensors before stacking
            batch_dict[k] = torch.stack([torch.tensor(x) for x in batch_dict[k]])
    # Optionally, keep 'text' and 'target_group' as lists of strings if you need them
    return batch_dict

In [19]:
# create a smaller subset of the dataset as previously shown to speed up the fine-tuning
small_train_dataset = tokenized_toxigen["train"].shuffle(seed=42).select(range(300))
small_eval_dataset = tokenized_toxigen["test"].shuffle(seed=42).select(range(100))

In [20]:
# create a DataLoader for your training and test datasets so you can iterate over batches of data:
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [21]:
model_hatebert.train(train_dataloader=train_dataloader,
                     num_epochs=3,
                     learning_rate=2e-5,
                     save_path=None)

INFO:hatebert_model:Epoch 1/3
  inputs['labels'] = torch.tensor(labels).to(self.device)
Training: 100%|██████████| 38/38 [01:23<00:00,  2.21s/it, loss=0.803]
INFO:hatebert_model:Epoch 1 - Train Loss: 0.6727, Train Accuracy: 0.5600
INFO:hatebert_model:Epoch 2/3
  inputs['labels'] = torch.tensor(labels).to(self.device)
Training: 100%|██████████| 38/38 [01:28<00:00,  2.33s/it, loss=0.526]
INFO:hatebert_model:Epoch 2 - Train Loss: 0.5713, Train Accuracy: 0.6667
INFO:hatebert_model:Epoch 3/3
  inputs['labels'] = torch.tensor(labels).to(self.device)
Training: 100%|██████████| 38/38 [01:41<00:00,  2.67s/it, loss=0.314]
INFO:hatebert_model:Epoch 3 - Train Loss: 0.4879, Train Accuracy: 0.7667


{'train_loss': [0.6726961026066228, 0.571270069793651, 0.4879345933073445],
 'train_accuracy': [0.56, 0.6666666666666666, 0.7666666666666667]}

In [22]:
model_hatebert.predict(test_dataloader, return_probs=True)

  inputs['labels'] = torch.tensor(labels).to(self.device)
Testing: 100%|██████████| 13/13 [00:10<00:00,  1.24it/s]


(array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([[0.6371546 , 0.3628454 ],
        [0.4520047 , 0.5479952 ],
        [0.6962602 , 0.3037398 ],
        [0.53668576, 0.46331427],
        [0.6436471 , 0.35635293],
        [0.7979959 , 0.20200408],
        [0.79391813, 0.20608185],
        [0.6720464 , 0.32795355],
        [0.7851104 , 0.21488956],
        [0.8026144 , 0.19738556],
        [0.78238255, 0.21761744],
        [0.7546486 , 0.24535131],
        [0.72492355, 0.27507642],
        [0.6535448 , 0.34645528],
        [0.70093304, 0.299067  ],
        [0.650606  , 0.34939393],
        [0.7866784 , 0.21332163],
        [0.5156771 , 0.48432294],
        [0.6903167 , 0.30968335],
        [0.7

In [23]:
def evaluate_model_by_target_group(test_dataloader, predictions, target_groups=None):
    """
    Compute confusion matrix and F1 score for each target group.
    
    Args:
        test_dataloader: DataLoader containing test data
        predictions: Model predictions array
        target_groups: Optional list of target groups to evaluate specifically
    """
    from sklearn.metrics import confusion_matrix, f1_score
    
    # Extract all labels and target groups from test dataloader
    all_labels = []
    all_target_groups = []
    
    for batch in test_dataloader:
        all_labels.extend(batch['labels'].numpy())
        all_target_groups.extend(batch['target_group'])
    
    # Get unique target groups if not specified
    if target_groups is None:
        target_groups = list(set(all_target_groups))
    
    print("Overall metrics:")
    print("Confusion Matrix:")
    print(confusion_matrix(all_labels, predictions))
    print(f"F1 Score: {f1_score(all_labels, predictions):.4f}\n")
    
    print("Metrics by target group:")
    for group in target_groups:
        # Get indices for this target group
        group_indices = [i for i, g in enumerate(all_target_groups) if g == group]
        
        if len(group_indices) == 0:
            continue
            
        # Get predictions and labels for this group
        group_preds = predictions[group_indices]
        group_labels = [all_labels[i] for i in group_indices]
        
        print(f"\nTarget Group: {group}")
        print("Confusion Matrix:")
        print(confusion_matrix(group_labels, group_preds))
        print(f"F1 Score: {f1_score(group_labels, group_preds):.4f}")

# Example usage:
preds, _ = model_hatebert.predict(test_dataloader, return_probs=True)
evaluate_model_by_target_group(test_dataloader, preds)


Testing:   0%|          | 0/13 [00:00<?, ?it/s]

Testing: 100%|██████████| 13/13 [00:09<00:00,  1.38it/s]


Overall metrics:
Confusion Matrix:
[[59  1]
 [35  5]]
F1 Score: 0.2174

Metrics by target group:

Target Group: muslim
Confusion Matrix:
[[5 0]
 [1 0]]
F1 Score: 0.0000

Target Group: black
Confusion Matrix:
[[3 0]
 [6 0]]
F1 Score: 0.0000

Target Group: jewish
Confusion Matrix:
[[2 0]
 [3 1]]
F1 Score: 0.4000

Target Group: chinese
Confusion Matrix:
[[6 0]
 [2 0]]
F1 Score: 0.0000

Target Group: women
Confusion Matrix:
[[2 0]
 [5 0]]
F1 Score: 0.0000

Target Group: mexican
Confusion Matrix:
[[7 0]
 [0 1]]
F1 Score: 1.0000

Target Group: asian
Confusion Matrix:
[[5 0]
 [1 0]]
F1 Score: 0.0000

Target Group: middle_east
Confusion Matrix:
[[0 1]
 [3 1]]
F1 Score: 0.3333

Target Group: native_american
Confusion Matrix:
[[4 0]
 [2 0]]
F1 Score: 0.0000

Target Group: lgbtq
Confusion Matrix:
[[7 0]
 [2 1]]
F1 Score: 0.5000

Target Group: physical_dis
Confusion Matrix:
[[10  0]
 [ 5  0]]
F1 Score: 0.0000

Target Group: mental_dis
Confusion Matrix:
[[3 0]
 [3 0]]
F1 Score: 0.0000

Target Group