In [1]:
pip install torch requests scipy datasets nltk transformers lime shap pandas datasets nltk matplotlib scipy requests

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import ast
import re
import random
from io import StringIO

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
nltk.download('wordnet')

import requests
from datasets import load_dataset

from scipy.stats import spearmanr, kendalltau
import scipy as sp

import shap
from lime.lime_text import LimeTextExplainer

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import transformers

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## **Model**

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {device}!")

model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
tokenizer = AutoTokenizer.from_pretrained(model_name)
NON_HATE_SPEECH_CLASS = 0
HATE_SPEECH_CLASS = 1

model = AutoModelForSequenceClassification.from_pretrained(model_name, output_attentions=True)
model.to(device)
model.eval()

Running on cuda!


config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: facebook/roberta-hate-speech-dynabench-r4-target
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [6]:
text = "i want to cut your throat, dont then to kill you!!!!"

inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
outputs = model(**inputs)
probabilities = F.softmax(outputs.logits, dim=-1)

predicted_class_id = probabilities.argmax().item()
predicted_label = model.config.id2label[predicted_class_id]
predicted_score = probabilities[0][predicted_class_id].item()

print(f"Text: '{text}'")
print(f"Tokens: {tokens}")
print(f"Predicted label: {predicted_label}")
print(f"Confidence: {predicted_score:.4f}")

Text: 'i want to cut your throat, dont then to kill you!!!!'
Tokens: ['<s>', 'i', 'Ġwant', 'Ġto', 'Ġcut', 'Ġyour', 'Ġthroat', ',', 'Ġdont', 'Ġthen', 'Ġto', 'Ġkill', 'Ġyou', '!!!!', '</s>']
Predicted label: hate
Confidence: 0.9969


In [7]:
def is_word_token(token: str) -> bool:
  global tokenizer

  if token in tokenizer.all_special_tokens:
    return False

  token = token.lstrip("Ġ")
  return any(c.isalpha() for c in token)

In [8]:
def get_probabilities(text: str) -> torch.Tensor:
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
  with torch.no_grad():
    outputs = model(**inputs)

  probabilities = F.softmax(outputs.logits, dim=-1).cpu().numpy()
  return probabilities

# [0]: non hate speech
# [1]: hate speech
get_probabilities(text)

array([[0.00311355, 0.99688643]], dtype=float32)

## **Dataset**

In [10]:
dataset_url = "https://raw.githubusercontent.com/VladWero08/xai-hate-speech-classifier/refs/heads/main/dataset/hate_speech_dataset_with_explainers.csv"
response = requests.get(dataset_url)
ucb_hate_speech_df = pd.read_csv(StringIO(response.text), sep=",")
    
print("Hate Speech Tweets Dataset")
print("Shape:", ucb_hate_speech_df.shape)
print("Columns:", ucb_hate_speech_df.columns.tolist())
print("First 5 rows:")
print(ucb_hate_speech_df.head())

Hate Speech Tweets Dataset
Shape: (500, 147)
Columns: ['comment_id', 'annotator_id', 'platform', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'hate_speech_score', 'text', 'infitms', 'outfitms', 'annotator_severity', 'std_err', 'annotator_infitms', 'annotator_outfitms', 'hypothesis', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'origin', 'target_gender_men', 'target_gender_non_binary',

## **Explainers**

In [18]:
def compute_token_attention_score(
  attentions: tuple[torch.Tensor],
  token_pos: int,
  last_n_layers: int = 4
) -> float:
  """
  Given the attention matrix resulted from BERT inference, and a token position,
  computes the attention score for that token by summing the attention of the
  last 4 layers of BERT.

  Parameters
  ----------
  attentions : tuple[torch.Tensor]
             Attention resulted from BERT inference of a tokenized paragraph.
  token_pos  : int
             Position of the token, should be > 0 (class token)
  """
  # (num_layers, num_heads, query, key)
  # the query in this case will be token 0, corresponding to the class token
  # the key in this case will be the token_pos received as parameter
  attention_score = 0
  for i in range(len(attentions) - 1, len(attentions) - last_n_layers - 1, -1):
      attention_score += attentions[i][:, :, 0, token_pos].mean().item()

  return attention_score

def compute_attention_scores(
    attentions: tuple[torch.Tensor],
    tokens: list[str],
    last_n_layers: int = 4
) -> list[tuple]:
  """
  Given the attention matrix resulted from BERT inference, computes the attention
  score for each token by summing the attention of the last 4 layers of BERT.

  Parameters
  ----------
  attentions : tuple[torch.Tensor]
              Attention resulted from BERT inference of a tokenized paragraph.
  """
  global tokenizer
  attention_scores = []

  for i in range(len(tokens)):
    token = tokens[i].lstrip("Ġ")
    attention_scores.append(
        (token, compute_token_attention_score(attentions, i, last_n_layers))
    )

  return attention_scores

inputs = tokenizer(ucb_hate_speech_df[:4]["text"].tolist()[1], return_tensors="pt", add_special_tokens=False).to(device)
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
outputs = model(**inputs)

compute_attention_scores(outputs.attentions, tokens)

[('I', 0.19648334383964539),
 ('fucking', 0.04118870850652456),
 ('H', 0.18134414218366146),
 ('ATE', 0.33032047376036644),
 ('americ', 0.8735839948058128),
 ('ans', 0.5834043361246586),
 ('.', 0.5437215864658356),
 ('I', 0.181207574903965),
 ('hope', 0.06883443333208561),
 ('your', 0.19421817548573017),
 ('country', 0.10570773109793663),
 ('BUR', 0.0880456380546093),
 ('NS', 0.07501181587576866),
 ('.', 0.5369281955063343)]

In [19]:
def compute_lime_scores(
  text: str,
  tokens: list[str],
  label: int = HATE_SPEECH_CLASS,
  num_samples: int = 100,
) -> dict:
  """
  Given a text, its tokens and the predicted class by the hate speech classifier, it
  uses LIME to extract the importance of each token. The number of features used by
  the LimeTextExplainer is equal to the total number of non special tokens.
  """
  global model, tokenizer

  # compute the number of features as the number of tokens different from the special ones
  num_features = len([token for token in tokens if token not in tokenizer.all_special_tokens])

  # define the LIME explainer
  explainer = LimeTextExplainer(class_names=list(model.config.id2label.values()))
  explanation = explainer.explain_instance(
      text,
      get_probabilities,
      num_features=num_features,
      num_samples=num_samples,
  )

  # extract the LIME scores and store them in a dictionary
  feature_attributions = explanation.as_list(label=label)
  tokens = [(word.item(), score) for _, (word, score) in enumerate(feature_attributions)]

  return tokens

# compute the LIME score for the tokens in the example sentence
compute_lime_scores(ucb_hate_speech_df[:4]["text"].tolist()[1], tokens, predicted_class_id)

[('I', -0.4584079330608032),
 ('BURNS', -0.14306140561571845),
 ('fucking', 0.13477088462484987),
 ('your', 0.1080903565336436),
 ('HATE', 0.08692601050602869),
 ('americans', -0.04190788188265134),
 ('country', -0.04098660678550688),
 ('hope', 0.03739524395181921)]

In [20]:
# define a prediction function
def f(x):
    tv = torch.tensor([tokenizer.encode(v, padding="max_length", max_length=500, truncation=True, add_special_tokens=False) for v in x]).to(device)
    with torch.no_grad():
      outputs = model(tv)[0].detach().cpu().numpy()
    scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T
    val = sp.special.logit(scores[:, 1])  # use one vs rest logit units
    return val

# build an explainer using a token masker
shap_explainer = shap.Explainer(f, tokenizer)

def compute_shap_scores(text: str) -> np.ndarray:
  global shap_explainer
  shap_values = shap_explainer([text], fixed_context=1, batch_size=1)
  shap_values_list = []

  for i, value in enumerate(shap_values.values[0]):
      token = shap_values.data[0][i]
      if token == '' or token in tokenizer.all_special_tokens:
          continue

      shap_values_list.append((token, value.item()))

  return shap_values_list

compute_shap_scores(ucb_hate_speech_df[:4]["text"].tolist()[1])

[('I ', 0.9005785882472992),
 ('fucking ', 0.027191132307052612),
 ('H', 0.11832618117332458),
 ('ATE ', 0.29801749587059023),
 ('americ', 0.1651911675930023),
 ('ans', 0.1309647500514984),
 ('. ', 0.415344113111496),
 ('I ', 0.174204843384879),
 ('hope ', 0.42708660875047955),
 ('your ', 0.27539290700639996),
 ('country ', 0.16818346296037948),
 ('BUR', 0.09716351543154034),
 ('NS', 0.003816204411642876),
 ('.', 0.01946146999086651)]

## **Distribution Metrics**

For each sample, the normalized entropy, gini and top-5 mass coefficients are computed, and afterwards these metrics will be averaged per hate speech category.

In [11]:
def compute_entropy(distribution: list[float]) -> float:
  n = len(distribution)
  entropy = -sum([p * np.log2(p) for p in distribution]) / np.log2(n)
  entropy = np.round(entropy, 4)
  return entropy

def compute_gini(distribution: list[float]) -> float:
  distribution = np.sort(distribution)
  n = len(distribution)

  i = np.arange(1, n + 1)
  gini = 1 - 2 * np.sum(distribution * (n - i + 0.5) / n)

  return np.round(gini, 4)

def compute_top_k_mass(
    distribution: list[float],
    k: int = 5,
) -> float:
  k = min(len(distribution), k)
  distribution = np.sort(distribution)[::-1]
  distribution = distribution[:k]
  mass = np.sum(distribution)

  return mass

distribution = [0.05, 0.9, 0.05]
print(f"Entropy: {compute_entropy(distribution)}")
print(f"Gini: {compute_gini(distribution)}")
print(f"Top-5 Mass: {compute_top_k_mass(distribution)}")

Entropy: 0.359
Gini: 0.5667
Top-5 Mass: 1.0


In [13]:
def compute_comparison_metrics(row):
  attention = F.softmax(torch.Tensor([score for token, score in ast.literal_eval(row["attention_scores"])]), dim=0).tolist()
  lime = F.softmax(torch.Tensor([score for token, score in ast.literal_eval(row["lime_scores"])]), dim=0).tolist()
  shap = F.softmax(torch.Tensor([score for token, score in ast.literal_eval(row["shap_scores"])]), dim=0).tolist()

  # attention distribution metrics
  attention_entropy = compute_entropy(attention)
  attention_gini = compute_gini(attention)
  attention_top_k_mass = compute_top_k_mass(attention)

  # lime distribution metrics
  lime_entropy = compute_entropy(lime)
  lime_gini = compute_gini(lime)
  lime_top_k_mass = compute_top_k_mass(lime)

  # shap distribution metrics
  shap_entropy = compute_entropy(shap)
  shap_gini = compute_gini(shap)
  shap_top_k_mass = compute_top_k_mass(shap)

  return pd.Series({
    'attention_entropy':      np.round(attention_entropy, 4),
    'attention_gini':         np.round(attention_gini, 4),
    'attention_top_k_mass':   np.round(attention_top_k_mass, 4),
    'lime_entropy':           np.round(lime_entropy, 4),
    'lime_gini':              np.round(lime_gini, 4),
    'lime_top_k_mass':        np.round(lime_top_k_mass, 4),
    'shap_entropy':           np.round(shap_entropy, 4),
    'shap_gini':              np.round(shap_gini, 4),
    'shap_top_k_mass':        np.round(shap_top_k_mass, 4)
  })

# apply to entire dataframe
ucb_hate_speech_df[[
  'attention_entropy',
  'attention_gini',
  'attention_top_k_mass',
  'lime_entropy',
  'lime_gini',
  'lime_top_k_mass',
  'shap_entropy',
  'shap_gini',
  'shap_top_k_mass'
]] = ucb_hate_speech_df.apply(compute_comparison_metrics, axis=1)

  ucb_hate_speech_df[[
  ucb_hate_speech_df[[
  ucb_hate_speech_df[[
  ucb_hate_speech_df[[
  ucb_hate_speech_df[[
  ucb_hate_speech_df[[
  ucb_hate_speech_df[[
  ucb_hate_speech_df[[
  ucb_hate_speech_df[[


In [14]:
metrics = ['attention_entropy', 'attention_gini', 'attention_top_k_mass', 'lime_entropy', 'lime_gini', 'lime_top_k_mass', 'shap_entropy', 'shap_gini', 'shap_top_k_mass']

for metric in metrics:
  average_by_category = ucb_hate_speech_df.groupby('hate_speech_label')[metric].mean()

  print(f"Metrics = {metric}")
  print(average_by_category)
  print()

Metrics = attention_entropy
hate_speech_label
gender       0.987397
origin       0.988522
race         0.989092
religion     0.988492
sexuality    0.992650
Name: attention_entropy, dtype: float64

Metrics = attention_gini
hate_speech_label
gender       0.092174
origin       0.093591
race         0.085079
religion     0.084955
sexuality    0.078559
Name: attention_gini, dtype: float64

Metrics = attention_top_k_mass
hate_speech_label
gender       0.449039
origin       0.358162
race         0.406818
religion     0.362582
sexuality    0.610836
Name: attention_top_k_mass, dtype: float64

Metrics = lime_entropy
hate_speech_label
gender       0.993632
origin       0.996073
race         0.994454
religion     0.993900
sexuality    0.989986
Name: lime_entropy, dtype: float64

Metrics = lime_gini
hate_speech_label
gender       0.063598
origin       0.056510
race         0.057544
religion     0.061757
sexuality    0.067141
Name: lime_gini, dtype: float64

Metrics = lime_top_k_mass
hate_speech_lab

In [15]:
def top_k_jaccard(a: list[float], b: list[float], k: int = 5):
  k = min(len(a), k)
  top_k_tokens_a = set([token for token, score in sorted(a, key=lambda x: x[1], reverse=True)[:k]])
  top_k_tokens_b = set([token for token, score in sorted(b, key=lambda x: x[1], reverse=True)[:k]])

  intersection = len(top_k_tokens_a & top_k_tokens_b)
  union = len(top_k_tokens_a | top_k_tokens_b)
  jaccard_index = intersection / union if union > 0 else 0

  return jaccard_index

def compute_rank_metrics(row):
  attention = F.softmax(torch.Tensor([score for token, score in ast.literal_eval(row["attention_scores"])]), dim=0).tolist()
  shap = F.softmax(torch.Tensor([score for token, score in ast.literal_eval(row["shap_scores"])]), dim=0).tolist()

  spearman_corr, spearman_pval = spearmanr(attention, shap)
  kendall_corr, kendall_pval = kendalltau(attention, shap)
  jaccard_index = top_k_jaccard(
    a=ast.literal_eval(row["attention_scores"]),
    b=ast.literal_eval(row["shap_scores"]),
  )

  return pd.Series({
    'attention_shap_spearman':  np.round(spearman_corr, 4),
    'attention_shap_kendall':   np.round(kendall_corr, 4),
    'attention_shap_jaccard':   np.round(jaccard_index, 4)
  })

# apply to entire dataframe
ucb_hate_speech_df[['attention_shap_spearman', 'attention_shap_kendall', 'attention_shap_jaccard']] = ucb_hate_speech_df.apply(compute_rank_metrics, axis=1)

  ucb_hate_speech_df[['attention_shap_spearman', 'attention_shap_kendall', 'attention_shap_jaccard']] = ucb_hate_speech_df.apply(compute_rank_metrics, axis=1)
  ucb_hate_speech_df[['attention_shap_spearman', 'attention_shap_kendall', 'attention_shap_jaccard']] = ucb_hate_speech_df.apply(compute_rank_metrics, axis=1)
  ucb_hate_speech_df[['attention_shap_spearman', 'attention_shap_kendall', 'attention_shap_jaccard']] = ucb_hate_speech_df.apply(compute_rank_metrics, axis=1)


In [16]:
rank_metrics = ['attention_shap_spearman', 'attention_shap_kendall', 'attention_shap_jaccard']

for rank_metric in rank_metrics:
  average_by_category = ucb_hate_speech_df.groupby('hate_speech_label')[rank_metric].mean()

  print(f"Metrics = {rank_metric}")
  print(average_by_category)
  print()

Metrics = attention_shap_spearman
hate_speech_label
gender       0.033107
origin       0.047723
race         0.015232
religion     0.014872
sexuality   -0.177919
Name: attention_shap_spearman, dtype: float64

Metrics = attention_shap_kendall
hate_speech_label
gender       0.023235
origin       0.038814
race         0.007516
religion     0.010347
sexuality   -0.157412
Name: attention_shap_kendall, dtype: float64

Metrics = attention_shap_jaccard
hate_speech_label
gender       0.085989
origin       0.075370
race         0.094758
religion     0.081096
sexuality    0.178402
Name: attention_shap_jaccard, dtype: float64



In [35]:
def get_first_synonym(word: str) -> str | None:
    synsets = wordnet.synsets(word.lower())

    if synsets:
        lemmas = synsets[0].lemmas()
        for lemma in lemmas:
            if lemma.name() != word.lower():
                return lemma.name()
    
    return None

def create_adversarial_example(
    text: str, 
    scores: list[tuple], 
    replacements: int = 5
) -> str:
    top_tokens = sorted(scores, key=lambda x: x[1], reverse=True)
    words = word_tokenize(text)    
    adversarial = words.copy()
    replaced = 0

    for i, word in enumerate(words):
        if replaced >= replacements:
            break
        
        for top_token, score in top_tokens:
            if word.lower() == top_token.lower() or top_token.lower() in word.lower():
                synonym = get_first_synonym(word)
                
                if synonym:
                    adversarial[i] = synonym
                    replaced += 1
                    break
    if replaced == 0:
        return None
        
    return " ".join(adversarial)

n_sample = 93
text = ucb_hate_speech_df.iloc[n_sample]["text"]
scores = ast.literal_eval(ucb_hate_speech_df.iloc[n_sample]["lime_scores"])
adversarial = create_adversarial_example(text, scores)

print(f"Original: {text}")
print(f"Adversarial: {adversarial}")

Original: EAT SHIT REDNECK CUNTS
Adversarial: EAT crap cracker cunt


In [36]:
def compute_adversarial_examples(row):
    global tokenizer
    attention_adversarial_scores = None
    attention_adversarial_probs = None
    lime_adversarial_scores = None
    lime_adversarial_probs = None
    shap_adversarial_scores = None
    shap_adversarial_probs = None
    
    text = row["text"]
    
    # get default predictions
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(device)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    outputs = model(**inputs)
    default_probs = F.softmax(outputs.logits, dim=-1).detach().cpu().numpy()
    
    attention = ast.literal_eval(row["attention_scores"])
    lime = ast.literal_eval(row["lime_scores"])
    shap = ast.literal_eval(row["shap_scores"])
    
    # compute adversarial examples
    attention_adversarial = create_adversarial_example(text, attention)
    lime_adversarial = create_adversarial_example(text, lime)
    shap_adversarial = create_adversarial_example(text, shap)
    
    # compute attention adversarial
    if attention_adversarial:
        inputs = tokenizer(attention_adversarial, return_tensors="pt", add_special_tokens=False).to(device)
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        outputs = model(**inputs)
        attention_adversarial_scores = compute_attention_scores(outputs.attentions, tokens)
        attention_adversarial_probs = F.softmax(outputs.logits, dim=-1).detach().cpu().numpy()
    
    # compute LIME adversarial
    if lime_adversarial:
        inputs = tokenizer(lime_adversarial, return_tensors="pt", add_special_tokens=False).to(device)
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        outputs = model(**inputs)
        lime_adversarial_scores = compute_lime_scores(lime_adversarial, tokens)
        lime_adversarial_probs = F.softmax(outputs.logits, dim=-1).detach().cpu().numpy()
    
    # compute SHAP adversarial
    if shap_adversarial:
        inputs = tokenizer(shap_adversarial, return_tensors="pt", add_special_tokens=False).to(device)
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        outputs = model(**inputs)
        shap_adversarial_scores = compute_shap_scores(shap_adversarial)
        shap_adversarial_probs = F.softmax(outputs.logits, dim=-1).detach().cpu().numpy()
    
    return pd.Series({
        'text_probs': default_probs,
        'attention_adversarial_scores': attention_adversarial_scores,
        'attention_adversarial_probs': attention_adversarial_probs,
        'lime_adversarial_scores': lime_adversarial_scores,
        'lime_adversarial_probs': lime_adversarial_probs,
        'shap_adversarial_scores': shap_adversarial_scores,
        'shap_adversarial_probs': shap_adversarial_probs
    })


ucb_hate_speech_df[[
    'text_probs',
    'attention_adversarial_scores',
    'attention_adversarial_probs',
    'lime_adversarial_scores',
    'lime_adversarial_probs',
    'shap_adversarial_scores',
    'shap_adversarial_probs'
]] = ucb_hate_speech_df.apply(compute_adversarial_examples, axis=1)

In [37]:
def compute_flip_rate_and_cd(df: pd.DataFrame, adversarial_metric: str):
    """
    Compute flip rate and confidence distribution for adversarial examples.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with original and adversarial predictions
    adversarial_metric : str
        Column name for adversarial probabilities
    
    Returns:
    --------
    dict : Flip rate, CD mean, and CD std per hate speech category
    """
    
    # extract original hate speech probability (class 1)
    original_hate_probs = df['text_probs'].apply(
        lambda x: x[0, 1] if x is not None else np.nan
    )
    original_predictions = original_hate_probs >= 0.5
    
    # extract adversarial hate speech probability (class 1)
    adversarial_hate_probs = df[adversarial_metric].apply(
        lambda x: x[0, 1] if x is not None else np.nan
    )
    adversarial_predictions = adversarial_hate_probs >= 0.5
    
    # compute flips: was hate speech (1), now is not hate speech (0)
    flips = original_predictions & ~adversarial_predictions
    
    # compute confidence distribution: CD = p_original - p_adversarial
    cd = original_hate_probs - adversarial_hate_probs
    
    # group by hate_speech_label and compute metrics
    results = {}
    
    print(f"\n{'='*80}")
    print(f"Adversarial Robustness Analysis: {adversarial_metric}")
    print(f"{'='*80}\n")
    print(f"{'Category':<20} {'Flip Rate':<15} {'CD Mean':<15} {'CD Std':<15} {'# Flips':<10} {'# Total':<10}")
    print(f"{'-'*80}")
    
    for label in sorted(df['hate_speech_label'].unique()):
        mask = df['hate_speech_label'] == label
        
        # flip rate calculation
        label_flips = flips[mask].sum()
        label_total = mask.sum()
        label_flip_rate = (label_flips / label_total * 100) if label_total > 0 else 0
        
        # confidence distribution (CD) statistics
        label_cd = cd[mask].dropna()
        label_cd_mean = label_cd.mean()
        label_cd_std = label_cd.std()
        
        results[label] = {
            'flip_rate': label_flip_rate,
            'cd_mean': label_cd_mean,
            'cd_std': label_cd_std,
            'num_flips': label_flips,
            'total_samples': label_total
        }
        
        print(f"{str(label):<20} {label_flip_rate:>6.2f}%        {label_cd_mean:>8.4f}      {label_cd_std:>8.4f}      {label_flips:>8} {label_total:>8}")
    
    # overall statistics
    overall_flip_rate = (flips.sum() / len(df) * 100)
    overall_cd_mean = cd.dropna().mean()
    overall_cd_std = cd.dropna().std()
    
    print(f"{'-'*80}")
    print(f"{'OVERALL':<20} {overall_flip_rate:>6.2f}%        {overall_cd_mean:>8.4f}      {overall_cd_std:>8.4f}")
    print(f"{'='*80}\n")
    
    return results


print("\n" + "="*80)
print("ADVERSARIAL ROBUSTNESS EVALUATION")
print("="*80)

metrics = ['attention_adversarial_probs', 'lime_adversarial_probs', 'shap_adversarial_probs']
for metric in metrics:
    compute_flip_rate_and_cd(ucb_hate_speech_df, metric)



ADVERSARIAL ROBUSTNESS EVALUATION

Adversarial Robustness Analysis: attention_adversarial_probs

Category             Flip Rate       CD Mean         CD Std          # Flips    # Total   
--------------------------------------------------------------------------------
gender                 1.04%         -0.0785        0.2790             1       96
origin                 7.41%         -0.0404        0.3505             6       81
race                   3.68%         -0.0693        0.3023             6      163
religion               6.86%          0.0102        0.3457             7      102
sexuality              5.17%          0.0027        0.3110             3       58
--------------------------------------------------------------------------------
OVERALL                4.60%         -0.0418        0.3174


Adversarial Robustness Analysis: lime_adversarial_probs

Category             Flip Rate       CD Mean         CD Std          # Flips    # Total   
------------------------------

In [None]:
ucb_hate_speech_df.to_csv("hate_speech_with_explainers_adversarial.csv")