# Models

### Testing models

Using the two following models:

https://huggingface.co/cardiffnlp/twitter-roberta-base-hate-latest

https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import dtale
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline
import torch

from master_thesis.config import PROCESSED_DATA_DIR, load_dataframe_from_pickle

In [None]:
annotated_and_targeted_hatespeech_df = load_dataframe_from_pickle(
    PROCESSED_DATA_DIR / "annotated_and_targeted_hatespeech.pkl"
)

display(annotated_and_targeted_hatespeech_df.head(10))

In [None]:
test_text_0 = annotated_and_targeted_hatespeech_df['text'][2]
test_text_1 = annotated_and_targeted_hatespeech_df['text'][6]

In [None]:
device = 0 if torch.cuda.is_available() else -1


In [None]:

pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-hate-latest")


In [None]:
result = pipe(test_text_0)
print(result)

In [None]:
result = pipe(test_text_1)
print(result)

In [None]:
model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
classifier = pipeline("sentiment-analysis", model=model_name, framework="pt", device=device)


In [None]:
result = classifier(test_text_0)
print(result)

In [None]:
result = classifier(test_text_1)
print(result)

### Using LIME


In [None]:
from lime.lime_text import LimeTextExplainer

In [None]:
class_names = ['not_hate', 'hate']

# Model 1: CardiffNLP
pipe_cardiff = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-hate-latest", return_all_scores=True)

# Model 2: Facebook Dynabench
pipe_fb = pipeline("text-classification", model="facebook/roberta-hate-speech-dynabench-r4-target", return_all_scores=True)


In [None]:
def predict_cardiff(texts):
    return np.array([[score['score'] for score in pipe_cardiff(text)[0]] for text in texts])

def predict_fb(texts):
    return np.array([[score['score'] for score in pipe_fb(text)[0]] for text in texts])

In [None]:
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
explanation_cardiff = explainer.explain_instance(test_text_0, predict_cardiff, num_features=10)
explanation_cardiff.show_in_notebook(text=True)


In [None]:
explanation_cardiff = explainer.explain_instance(test_text_1, predict_cardiff, num_features=10)
explanation_cardiff.show_in_notebook(text=True)

In [None]:
explanation_fb = explainer.explain_instance(test_text_0, predict_fb, num_features=10)
explanation_fb.show_in_notebook(text=True)

In [None]:
explanation_fb = explainer.explain_instance(test_text_1, predict_fb, num_features=10)
explanation_fb.show_in_notebook(text=True)

### SHAP

In [None]:
import shap

In [None]:
text_list_0 = [test_text_0]
text_list_1 = [test_text_1]

In [None]:
# Model 1: CardiffNLP
pipe_cardiff = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-hate-latest", return_all_scores=True)

# Model 2: Facebook Dynabench
pipe_fb = pipeline("text-classification", model="facebook/roberta-hate-speech-dynabench-r4-target", return_all_scores=True)


In [None]:
from transformers import AutoTokenizer
tokenizer_cardiff = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-hate-latest")
tokenizer_fb = AutoTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")

In [None]:
def predict_cardiff(texts):
    return np.array([[label['score'] for label in pipe_cardiff(text)[0]] for text in texts])

def predict_fb(texts):
    return np.array([[label['score'] for label in pipe_fb(text)[0]] for text in texts])


In [None]:
explainer_cardiff = shap.Explainer(
    predict_cardiff,
    masker=shap.maskers.Text(tokenizer_cardiff),
    output_names=class_names
)

explainer_fb = shap.Explainer(
    predict_fb,
    masker=shap.maskers.Text(tokenizer_fb),
    output_names=class_names
)


In [None]:
shap_values_cardiff = explainer_cardiff(text_list_0)
shap_values_fb = explainer_fb(text_list_0)

In [None]:
shap.plots.text(shap_values_cardiff[0])

In [None]:
shap.plots.text(shap_values_fb[0])

In [None]:
shap_values_cardiff = explainer_cardiff(text_list_1)
shap_values_fb = explainer_fb(text_list_1)

In [None]:
shap.plots.text(shap_values_cardiff[0])

In [None]:
shap.plots.text(shap_values_fb[0])

In [None]:
# Compute SHAP values for a single instance and a specific class
shap_values_cardiff = explainer_cardiff(text_list_0)
shap_values_fb = explainer_fb(text_list_0)

# Select SHAP values for the "hate" class (index 1)
shap_values_cardiff_hate = shap_values_cardiff[0][:, 1]
shap_values_fb_hate = shap_values_fb[0][:, 1]

# Generate waterfall plots for the "hate" class
print("Waterfall plot for CardiffNLP model:")
shap.plots.waterfall(shap_values_cardiff_hate)
print("Waterfall plot for Facebook Dynabench model:")
shap.plots.waterfall(shap_values_fb_hate)

In [None]:
# Compute SHAP values for a single instance and a specific class
shap_values_cardiff = explainer_cardiff(text_list_1)
shap_values_fb = explainer_fb(text_list_1)

# Select SHAP values for the "hate" class (index 1)
shap_values_cardiff_hate = shap_values_cardiff[0][:, 1]
shap_values_fb_hate = shap_values_fb[0][:, 1]

# Generate waterfall plots for the "hate" class
print("Waterfall plot for CardiffNLP model:")
shap.plots.waterfall(shap_values_cardiff_hate)
print("Waterfall plot for Facebook Dynabench model:")
shap.plots.waterfall(shap_values_fb_hate)

### Applying to dataframe

In [1]:
from lime.lime_text import LimeTextExplainer
import shap
import numpy as np
from transformers import pipeline, AutoTokenizer
import random
import torch


  from .autonotebook import tqdm as notebook_tqdm
2025-04-20 19:59:29.314786: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-20 19:59:29.338640: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-20 19:59:29.338672: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-20 19:59:29.338694: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-20 19:59:29.344105: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical o

In [2]:

def set_seed(seed_value):
    """Set seed for reproducibility."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        # Optional: for determinism with CuDNN
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False



In [3]:
seed_value = 42
set_seed(seed_value)

class_names = ['not_hate', 'hate']

# Pipelines
pipe_cardiff = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-hate-latest")
pipe_fb = pipeline("text-classification", model="facebook/roberta-hate-speech-dynabench-r4-target")

# Tokenizers
tokenizer_cardiff = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-hate-latest")
tokenizer_fb = AutoTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")

# Prediction functions for LIME (no [0] indexing)
def predict_cardiff(texts):
    # Ensure input is list of strings
    if isinstance(texts, np.ndarray):
        texts = texts.tolist()
    return np.array([
        [res['score'] for res in pipe_cardiff(text, top_k=5)]
        for text in texts
    ])

def predict_fb(texts):
    if isinstance(texts, np.ndarray):
        texts = texts.tolist()
    return np.array([
        [res['score'] for res in pipe_fb(text, top_k=5)]
        for text in texts
    ])

# LIME & SHAP explainers
explainer_lime = LimeTextExplainer(class_names=class_names)
explainer_shap_cardiff = shap.Explainer(predict_cardiff, masker=shap.maskers.Text(tokenizer_cardiff), output_names=class_names)
explainer_shap_fb = shap.Explainer(predict_fb, masker=shap.maskers.Text(tokenizer_fb), output_names=class_names)


Device set to use cpu
Device set to use cpu


In [4]:
import pandas as pd
data = {'text': ["nawt yall niggers ignoring me",
                 "and this is why i end up with nigger trainee doctors who can not speak properly lack basic knowledge of biology it truly scary if the public only knew",
                 "this is a neutral sentence"]}
annotated_and_targeted_hatespeech_df = pd.DataFrame(data)

The following was individually

In [5]:
lime_cardiff_explanations = []
lime_fb_explanations = []
shap_cardiff_explanations = []
shap_fb_explanations = []


In [6]:

# ---- LIME: Cardiff ----
for text in annotated_and_targeted_hatespeech_df["text"]:
    lime_cardiff = explainer_lime.explain_instance(text, predict_cardiff, num_features=5)
    lime_cardiff_keywords = [f"{word}: {weight:.2f}" for word, weight in lime_cardiff.as_list()]
    lime_cardiff_explanations.append(", ".join(lime_cardiff_keywords))


In [7]:
annotated_and_targeted_hatespeech_df["lime_cardiff"] = lime_cardiff_explanations

In [8]:

# ---- LIME: Facebook ----
for text in annotated_and_targeted_hatespeech_df["text"]:
    lime_fb = explainer_lime.explain_instance(text, predict_fb, num_features=5)
    lime_fb_keywords = [f"{word}: {weight:.2f}" for word, weight in lime_fb.as_list()]
    lime_fb_explanations.append(", ".join(lime_fb_keywords))


In [9]:
annotated_and_targeted_hatespeech_df["lime_fb"] = lime_fb_explanations

In [10]:

# ---- SHAP: Cardiff ----
for text in annotated_and_targeted_hatespeech_df["text"]:
    shap_vals_cardiff = explainer_shap_cardiff([text])
    shap_cardiff_words = [
        f"{feature}: {value:.2f}"
        for feature, value in zip(shap_vals_cardiff[0].data, shap_vals_cardiff[0].values[:, 1])
        if abs(value) > 0.01
    ]
    shap_cardiff_explanations.append(", ".join(shap_cardiff_words))


PartitionExplainer explainer: 2it [00:20, 20.15s/it]               


In [11]:
annotated_and_targeted_hatespeech_df["shap_cardiff"] = shap_cardiff_explanations

In [12]:

# ---- SHAP: Facebook ----
for text in annotated_and_targeted_hatespeech_df["text"]:
    shap_vals_fb = explainer_shap_fb([text])
    shap_fb_words = [
        f"{feature}: {value:.2f}"
        for feature, value in zip(shap_vals_fb[0].data, shap_vals_fb[0].values[:, 1])
        if abs(value) > 0.01
    ]
    shap_fb_explanations.append(", ".join(shap_fb_words))


PartitionExplainer explainer: 2it [00:20, 20.41s/it]               


In [13]:
annotated_and_targeted_hatespeech_df["shap_fb"] = shap_fb_explanations


In [25]:
annotated_and_targeted_hatespeech_df

Unnamed: 0,text,lime_cardiff,lime_fb,shap_cardiff,shap_fb
0,nawt yall niggers ignoring me,"nawt: -0.09, niggers: 0.08, me: 0.07, yall: 0....","nawt: -0.02, ignoring: -0.02, yall: -0.01, me:...","n: 0.01, nig: 0.04, gers : -0.03, ignoring : 0.02","n: 0.05, aw: -0.02, t : 0.04, y: -0.01, all : ..."
1,and this is why i end up with nigger trainee d...,"with: -0.01, nigger: 0.01, trainee: -0.00, i: ...","nigger: -0.02, scary: 0.00, trainee: 0.00, and...",,"and : -0.03, end : -0.01, n: -0.03, igger : -0..."
2,this is a neutral sentence,"a: -0.00, is: -0.00, this: -0.00, sentence: -0...","neutral: -0.00, this: -0.00, sentence: 0.00, a...",,"this : -0.10, is : 0.04, a : 0.04, sentence: -..."


Following is for each text

In [30]:
class_names = ['not_hate', 'hate']
explainer_lime = LimeTextExplainer(class_names=class_names)

# Simplified LIME explainers
def lime_explain(text, predictor, num_features=4):
    exp = explainer_lime.explain_instance(text, predictor, num_features=num_features)
    return ", ".join([f"{word}: {weight:.2f}" for word, weight in exp.as_list()])

# Predictor wrappers
def predict_cardiff(texts):
    if isinstance(texts, np.ndarray):
        texts = texts.tolist()
    outputs = pipe_cardiff(texts, top_k=2)
    return np.array([
        [label['score'] for label in sorted(res, key=lambda x: x['label'])]
        for res in outputs
    ])

def predict_fb(texts):
    if isinstance(texts, np.ndarray):
        texts = texts.tolist()
    outputs = pipe_fb(texts, top_k=2)
    return np.array([
        [label['score'] for label in sorted(res, key=lambda x: x['label'])]
        for res in outputs
    ])

In [31]:
def get_shap_values(text, explainer):
    shap_vals = explainer([text])
    return ", ".join([
        f"{feature}: {value:.2f}"
        for feature, value in zip(shap_vals[0].data, shap_vals[0].values)
        if abs(value) > 0.01
    ])

In [33]:
for text in annotated_and_targeted_hatespeech_df["text"]:
    # Get predictions
    pred_cardiff = pipe_cardiff(text)[0]
    pred_fb = pipe_fb(text)[0]

    # LIME
    lime_exp_cardiff = lime_explain(text, predict_cardiff)
    lime_exp_fb = lime_explain(text, predict_fb)

    # SHAP
    shap_vals_cardiff = explainer_shap_cardiff([text])
    shap_exp_cardiff = ", ".join([
        f"{feature}: {value:.2f}"
        for feature, value in zip(shap_vals_cardiff[0].data, shap_vals_cardiff[0].values)
        if abs(value.item()) > 0.01
    ])

    shap_vals_fb = explainer_shap_fb([text])
    shap_exp_fb = ", ".join([
        f"{feature}: {value:.2f}"
        for feature, value in zip(shap_vals_fb[0].data, shap_vals_fb[0].values)
        if abs(value.item()) > 0.01
    ])

    # Store results
    results.append({
        "text": text,
        "cardiff_label": pred_cardiff['label'],
        "cardiff_score": pred_cardiff['score'],
        "fb_label": pred_fb['label'],
        "fb_score": pred_fb['score'],
        "lime_cardiff": lime_exp_cardiff,
        "lime_fb": lime_exp_fb,
        "shap_cardiff": shap_exp_cardiff,
        "shap_fb": shap_exp_fb
    })

ValueError: can only convert an array of size 1 to a Python scalar

In [None]:
final_df = pd.DataFrame(results)

# Preview
final_df.head()