# **Explainer Feature Extraction**

In this notebook, the explanation features will be extracted from the attention heads of the model, and the LIME and SHAP scores. Afterwards, they will be saved to be reused for analysis.

In [66]:
pip install torch transformers lime shap pandas datasets nltk matplotlib scipy requests

[0mNote: you may need to restart the kernel to use updated packages.


In [67]:
import torch
import torch.nn.functional as F
import shap
import pandas as pd
import requests
import re
import nltk
import nltk.tokenize
import random
import requests
import numpy as np
import matplotlib.pyplot as plt
import shap
import scipy as sp
import transformers

from io import StringIO
from datasets import load_dataset
from lime.lime_text import LimeTextExplainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## **Model**

In [68]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {device}!")

model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
tokenizer = AutoTokenizer.from_pretrained(model_name)
NON_HATE_SPEECH_CLASS = 0
HATE_SPEECH_CLASS = 1

model = AutoModelForSequenceClassification.from_pretrained(model_name, output_attentions=True)
model.to(device)
model.eval()

Running on cuda!


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: facebook/roberta-hate-speech-dynabench-r4-target
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [69]:
num_attention_heads = model.config.num_attention_heads
num_hidden_layers = model.config.num_hidden_layers

print(f"Special tokens: {tokenizer.all_special_tokens}")
print(f"Number of attention heads: {num_attention_heads}")
print(f"Number of hidden layers: {num_hidden_layers}")

Special tokens: ['<s>', '</s>', '<unk>', '<pad>', '<mask>']
Number of attention heads: 12
Number of hidden layers: 12


In [70]:
text = "i want to cut your throat, dont then to kill you!!!!"

inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
outputs = model(**inputs)
probabilities = F.softmax(outputs.logits, dim=-1)

predicted_class_id = probabilities.argmax().item()
predicted_label = model.config.id2label[predicted_class_id]
predicted_score = probabilities[0][predicted_class_id].item()

print(f"Text: '{text}'")
print(f"Tokens: {tokens}")
print(f"Predicted label: {predicted_label}")
print(f"Confidence: {predicted_score:.4f}")

Text: 'i want to cut your throat, dont then to kill you!!!!'
Tokens: ['<s>', 'i', 'Ġwant', 'Ġto', 'Ġcut', 'Ġyour', 'Ġthroat', ',', 'Ġdont', 'Ġthen', 'Ġto', 'Ġkill', 'Ġyou', '!!!!', '</s>']
Predicted label: hate
Confidence: 0.9969


In [71]:
def is_word_token(token: str) -> bool:
  global tokenizer

  if token in tokenizer.all_special_tokens:
    return False

  token = token.lstrip("Ġ")
  return any(c.isalpha() for c in token)

In [72]:
def get_probabilities(text: str) -> torch.Tensor:
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
  with torch.no_grad():
    outputs = model(**inputs)

  probabilities = F.softmax(outputs.logits, dim=-1).cpu().numpy()
  return probabilities

# [0]: non hate speech
# [1]: hate speech
get_probabilities(text)

array([[0.00311355, 0.99688643]], dtype=float32)

## **Dataset**

In [73]:
ucb_hate_speech_ds = load_dataset("ucberkeley-dlab/measuring-hate-speech")
ucb_hate_speech_df = ucb_hate_speech_ds["train"].to_pandas()

print("UC Berkeley Hate Speech Dataset")
print("Shape:", ucb_hate_speech_df.shape)
print("Columns:", ucb_hate_speech_df.columns.tolist())

UC Berkeley Hate Speech Dataset
Shape: (135556, 143)
Columns: ['comment_id', 'annotator_id', 'platform', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'hate_speech_score', 'text', 'infitms', 'outfitms', 'annotator_severity', 'std_err', 'annotator_infitms', 'annotator_outfitms', 'hypothesis', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin', 'target_gender_men'

In [74]:
ucb_categories = ["race", "religion", "origin", "gender", "sexuality", "age", "disability", "politics"]
ucb_hate_speech_df.rename(columns={
    "target_race": "race",
    "target_religion": "religion",
    "target_origin": "origin",
    "target_gender": "gender",
    "target_sexuality": "sexuality",
    "target_age": "age",
    "target_disability": "disability",
    "target_politics": "politics",
}, inplace=True)

Next, we need to aggregate the columns by the scores given by each annotator. The hyphotesis score will be the *mean* of all the scores for that comment, and the final hate speech category will be the most voted one.

In [75]:
# gather the aggregate type for each comment
ucb_aggregations = {}
for col in ucb_hate_speech_df.columns:
    if col == 'comment_id':
        continue

    if col == 'hypothesis':
        ucb_aggregations[col] = 'mean'
    elif ucb_hate_speech_df[col].dtype == 'bool':
        ucb_aggregations[col] = 'sum'
    else:
        ucb_aggregations[col] = 'first'

# group each comment based on the annotator's score
ucb_hate_speech_df = ucb_hate_speech_df.groupby('comment_id', as_index=False).agg(ucb_aggregations)

# add a new column with the hate speech label
ucb_hate_speech_df['hate_speech_label'] = ucb_hate_speech_df[ucb_categories].idxmax(axis=1)

print("UC Berkeley Hate Speech Dataset Aggregated")
print("Shape:", ucb_hate_speech_df.shape)

UC Berkeley Hate Speech Dataset Aggregated
Shape: (39565, 144)


  ucb_hate_speech_df = ucb_hate_speech_df.groupby('comment_id', as_index=False).agg(ucb_aggregations)
  ucb_hate_speech_df['hate_speech_label'] = ucb_hate_speech_df[ucb_categories].idxmax(axis=1)


Extract the comments that are definetely hate speech, depending on their hate speech score.

In [76]:
# keep only the comments with a hate speech score larger than 0.5
ucb_hate_speech_df = ucb_hate_speech_df[ucb_hate_speech_df['hypothesis'] >= 0.5]
ucb_hate_speech_df = ucb_hate_speech_df.drop_duplicates(subset=['comment_id'])

# remove the categories with very few samples
ucb_hate_speech_df = ucb_hate_speech_df[~ucb_hate_speech_df["hate_speech_label"].isin(["politics", "disability"])]

print("Only Hate Speech Dataset")
print("Shape:", ucb_hate_speech_df.shape)

Only Hate Speech Dataset
Shape: (501, 144)


In [77]:
ucb_hate_speech_df[:10]["text"].tolist()

['Eat cock and die',
 'I fucking HATE americans. I hope your country BURNS.',
 'kill yourself you fucking idiot cunt',
 'Fuck those Saudi rats',
 'Fuck off you nazi cunt. Die in a fire.',
 'Stupid fucking cunt. I hope your mother dies in crippling agony.',
 'Nuke the border area, Nuke Algeria, Nuke Morocco, Nuke the UN',
 'Burn their fucking church down!',
 'Shoot yourself in the pussy!',
 'You are just Another retarded white boy ready to shoot up a school smh you are such trash.... please go to your closet and make a noose and kill yourself your fuckin trash']

## **Explainers**

### **Attention Weights**

In [88]:
def compute_token_attention_score(
  attentions: tuple[torch.Tensor],
  token_pos: int,
  last_n_layers: int = 4
) -> float:
  """
  Given the attention matrix resulted from BERT inference, and a token position,
  computes the attention score for that token by summing the attention of the
  last 4 layers of BERT.

  Parameters
  ----------
  attentions : tuple[torch.Tensor]
             Attention resulted from BERT inference of a tokenized paragraph.
  token_pos  : int
             Position of the token, should be > 0 (class token)
  """
  # (num_layers, num_heads, query, key)
  # the query in this case will be token 0, corresponding to the class token
  # the key in this case will be the token_pos received as parameter
  attention_score = 0
  for i in range(len(attentions) - 1, len(attentions) - last_n_layers - 1, -1):
      attention_score += attentions[i][:, :, 0, token_pos].mean().item()

  return attention_score

def compute_attention_scores(
    attentions: tuple[torch.Tensor],
    tokens: list[str],
    last_n_layers: int = 4
) -> list[tuple]:
  """
  Given the attention matrix resulted from BERT inference, computes the attention
  score for each token by summing the attention of the last 4 layers of BERT.

  Parameters
  ----------
  attentions : tuple[torch.Tensor]
              Attention resulted from BERT inference of a tokenized paragraph.
  """
  global tokenizer
  attention_scores = []

  for i in range(len(tokens)):
    token = tokens[i].lstrip("Ġ")
    attention_scores.append(
        (token, compute_token_attention_score(attentions, i, last_n_layers))
    )

  return attention_scores

inputs = tokenizer(ucb_hate_speech_df[:4]["text"].tolist()[1], return_tensors="pt", add_special_tokens=False).to(device)
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
outputs = model(**inputs)

compute_attention_scores(outputs.attentions, tokens)

[('I', 0.19648334383964539),
 ('fucking', 0.04118870850652456),
 ('H', 0.18134414218366146),
 ('ATE', 0.33032047376036644),
 ('americ', 0.8735839948058128),
 ('ans', 0.5834043361246586),
 ('.', 0.5437215864658356),
 ('I', 0.181207574903965),
 ('hope', 0.06883443333208561),
 ('your', 0.19421817548573017),
 ('country', 0.10570773109793663),
 ('BUR', 0.0880456380546093),
 ('NS', 0.07501181587576866),
 ('.', 0.5369281955063343)]

### **LIME**

In [79]:
def compute_lime_scores(
  text: str,
  tokens: list[str],
  label: int = HATE_SPEECH_CLASS,
  num_samples: int = 100,
) -> dict:
  """
  Given a text, its tokens and the predicted class by the hate speech classifier, it
  uses LIME to extract the importance of each token. The number of features used by
  the LimeTextExplainer is equal to the total number of non special tokens.
  """
  global model, tokenizer

  # compute the number of features as the number of tokens different from the special ones
  num_features = len([token for token in tokens if token not in tokenizer.all_special_tokens])

  # define the LIME explainer
  explainer = LimeTextExplainer(class_names=list(model.config.id2label.values()))
  explanation = explainer.explain_instance(
      text,
      get_probabilities,
      num_features=num_features,
      num_samples=num_samples,
  )

  # extract the LIME scores and store them in a dictionary
  feature_attributions = explanation.as_list(label=label)
  tokens = [(word.item(), score) for _, (word, score) in enumerate(feature_attributions)]

  return tokens

# compute the LIME score for the tokens in the example sentence
compute_lime_scores(ucb_hate_speech_df[:4]["text"].tolist()[1], tokens, predicted_class_id)

[('I', -0.3403289444333246),
 ('fucking', 0.14628814036484936),
 ('BURNS', -0.13557863173949833),
 ('your', 0.10478344049653388),
 ('country', -0.05991141416140099),
 ('americans', 0.029500551878973054),
 ('hope', 0.014137257887937729),
 ('HATE', 0.0025525026786472546)]

### **SHAP**

In [80]:
# define a prediction function
def f(x):
    tv = torch.tensor([tokenizer.encode(v, padding="max_length", max_length=500, truncation=True, add_special_tokens=False) for v in x]).to(device)
    with torch.no_grad():
      outputs = model(tv)[0].detach().cpu().numpy()
    scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T
    val = sp.special.logit(scores[:, 1])  # use one vs rest logit units
    return val

# build an explainer using a token masker
shap_explainer = shap.Explainer(f, tokenizer)

def compute_shap_scores(text: str) -> np.ndarray:
  global shap_explainer
  shap_values = shap_explainer([text], fixed_context=1, batch_size=1)
  shap_values_list = []

  for i, value in enumerate(shap_values.values[0]):
      token = shap_values.data[0][i]
      if token == '' or token in tokenizer.all_special_tokens:
          continue
          
      shap_values_list.append((token, value.item()))

  return shap_values_list

compute_shap_scores(ucb_hate_speech_df[:4]["text"].tolist()[1])

[('I ', 0.9005785882472992),
 ('fucking ', 0.027191132307052612),
 ('H', 0.11832618117332458),
 ('ATE ', 0.29801749587059023),
 ('americ', 0.1651911675930023),
 ('ans', 0.1309647500514984),
 ('. ', 0.415344113111496),
 ('I ', 0.174204843384879),
 ('hope ', 0.42708660875047955),
 ('your ', 0.27539290700639996),
 ('country ', 0.16818346296037948),
 ('BUR', 0.09716351543154034),
 ('NS', 0.003816204411642876),
 ('.', 0.01946146999086651)]

## **Extend Dataset**

In [90]:
def compute_all_explanations(row):
    global tokenizer
    text = row['text']
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(device)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    outputs = model(**inputs)
    
    # compute attention scores
    attention_scores = compute_attention_scores(outputs.attentions, tokens)
    # compute LIME scores
    lime_scores = compute_lime_scores(text, tokens)
    # compute SHAP scores
    shap_scores = compute_shap_scores(text)
    
    return pd.Series({
        'attention_scores': attention_scores,
        'lime_scores': lime_scores,
        'shap_scores': shap_scores
    })

# Apply to entire dataframe (iterates through each row)
ucb_hate_speech_df[['attention_scores', 'lime_scores', 'shap_scores']] = ucb_hate_speech_df.apply(compute_all_explanations, axis=1)

  ucb_hate_speech_df[['attention_scores', 'lime_scores', 'shap_scores']] = ucb_hate_speech_df.apply(compute_all_explanations, axis=1)
  ucb_hate_speech_df[['attention_scores', 'lime_scores', 'shap_scores']] = ucb_hate_speech_df.apply(compute_all_explanations, axis=1)
  ucb_hate_speech_df[['attention_scores', 'lime_scores', 'shap_scores']] = ucb_hate_speech_df.apply(compute_all_explanations, axis=1)


In [111]:
attention_scores_length = ucb_hate_speech_df["attention_scores"].apply(len).tolist()
shap_scores_length = ucb_hate_speech_df["shap_scores"].apply(len).tolist()
different_length = []

for i in range(len(attention_scores_length)):
    if attention_scores_length[i] != shap_scores_length[i]:
        different_length.append(i)

if len(different_length) > 0:
    print(f"There are different lengths for attentions and SHAP features in: {different_length}")
    
    ucb_hate_speech_df = ucb_hate_speech_df.drop(ucb_hate_speech_df.index[different_length])
    ucb_hate_speech_df = ucb_hate_speech_df.reset_index(drop=True)
    
    print(f"Deleted {len(different_length)} rows")
    print("UC Berkerly Hate Speech Dataset Filtered:", ucb_hate_speech_df.shape)
else:
    print("Attention and SHAP features match perfectly!")

Attention and SHAP features match perfectly!
