# Stage 1

In [1]:
# Single input sentence shared by the cyberbullying classifier, the hate-speech
# classifier and the person-target detector further down.
test_sentence = "You are the worst person I know"

In [2]:
# CyberBullyingClassifier

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


# Local checkpoint directory; both the tokenizer and the classifier are restored from it.
# (Presumably a BERT model fine-tuned for cyberbullying detection, judging by the path — confirm.)
model_path = "./trained_bert_cyberbullying_mendeley"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# NOTE(review): `trainer` is not used by any of the cells visible here — confirm it is still needed.
trainer = Trainer(model=model)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
import torch.nn.functional as F
from captum.attr import LayerIntegratedGradients

# Ensure the model is on CPU
device = torch.device("cpu")
model.to(device)
# Make evaluation mode explicit: with dropout active, neither the prediction nor the
# Integrated Gradients attributions would be reproducible (the hate-speech cell does the same).
model.eval()

# Tokenize and prepare input
inputs = tokenizer(test_sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Run inference
with torch.no_grad():
    outputs = model(**inputs)

# Get logits and predicted class
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()

# Get prediction confidence.
# NOTE: `confidence` is the probability of whichever class was predicted. It is NOT
# "probability of cyberbullying" when the prediction is "Not Cyberbullying"; use
# probs[0, 0] when the cyberbullying probability itself is needed.
probs = F.softmax(logits, dim=1)
confidence = probs[0, predicted_class].item()

# Map predicted class to label
label_map = {1: "Not Cyberbullying", 0: "Cyberbullying"}
result = label_map[predicted_class]

print(f"Prediction: {result} (Confidence: {confidence:.2%})")


def forward_func(input_ids, attention_mask):
    """Return the logit of the predicted class for each row of the batch.

    Integrated Gradients needs one scalar per example to differentiate; the
    logit of `predicted_class` (read from the enclosing cell) is that scalar.
    """
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    return output.logits[:, predicted_class]


# Attributions are computed with respect to the output of the input-embedding layer.
embedding_layer = model.get_input_embeddings()

# Setup Layer Integrated Gradients
lig = LayerIntegratedGradients(forward_func, embedding_layer)

# Create baseline (PAD tokens)
# NOTE(review): the baseline also replaces [CLS]/[SEP] with PAD, which is why those two
# positions receive non-zero attribution — confirm this is the intended reference input.
baseline_input_ids = torch.full_like(inputs["input_ids"], tokenizer.pad_token_id).to(device)

# Compute attributions. Only the token ids are varied between baseline and input; the
# attention mask is the same for both, so it is forwarded unchanged as an extra argument.
attributions, delta = lig.attribute(
    inputs=inputs["input_ids"],
    baselines=baseline_input_ids,
    additional_forward_args=(inputs["attention_mask"],),
    return_convergence_delta=True,
)

# Sum attributions across the embedding dimension -> one score per token
attributions_sum = attributions.sum(dim=-1).squeeze(0)

# Decode tokens (index the single batch row explicitly instead of a blanket squeeze)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

# Display token-level attributions
print("\nToken Attributions (Layer Integrated Gradients):")
for token, score in zip(tokens, attributions_sum.detach().cpu().numpy()):
    print(f"{token}: {score:.4f}")

# The convergence delta measures the approximation error of the path integral;
# a value far from zero means the attributions above should not be trusted.
print(f"\nConvergence delta: {delta.item():.4f}")


Prediction: Cyberbullying (Confidence: 99.72%)

Token Attributions (Layer Integrated Gradients):
[CLS]: 0.5309
you: 0.4722
are: 0.3987
the: 1.6299
worst: 2.6743
person: 0.6498
i: 0.8518
know: 0.5079
[SEP]: 0.4447


### Hate speech classifier

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Load model and tokenizer
hate_model_path = "hate_speech_recall_optimized_model"

hate_tokenizer = AutoTokenizer.from_pretrained(hate_model_path)
# output_attentions=True makes the forward pass return per-layer attention maps.
hate_model = AutoModelForSequenceClassification.from_pretrained(hate_model_path, output_attentions=True)
hate_model.eval()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hate_model.to(device)

# === Step 2: Input sentence ===
# NOTE(review): the recorded output of this cell shows "You" and "I" tokenized as [UNK]
# while lowercase words are kept. That pattern suggests the tokenizer is not lowercasing
# against an uncased vocabulary — confirm against the training setup before changing it.
inputs = hate_tokenizer(test_sentence, return_tensors="pt", truncation=True, padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

# === Step 3: Forward pass ===
with torch.no_grad():
    outputs = hate_model(**inputs)
    # torch.sigmoid is numerically stable; a hand-written 1 / (1 + exp(-x)) overflows
    # in float32 for strongly negative logits.
    hate_probs = torch.sigmoid(outputs.logits).cpu().numpy()
    logits = outputs.logits.cpu().numpy()
    attentions = outputs.attentions  # one tensor per layer, indexed below as (batch, head, query, key)

# === Step 4: Threshold the sigmoid probabilities ===
# Decision threshold applied to the sigmoid probability.
HATE_THRESHOLD = 0.51
preds = (hate_probs > HATE_THRESHOLD).astype(int).flatten()

# === Step 5: Optional - If true label is known, compute metrics ===
# Dummy label for demo (replace with real one if available)
labels = np.array([1])
print("Prediction:", preds[0])
print("Probability:", hate_probs[0][0])


# === Step 6: Attention weights from [CLS] token ===
# Only head 0 of the last layer is inspected here; the other heads are ignored.
last_layer_attention = attentions[-1]  # last layer
attention_matrix = last_layer_attention[0][0]  # first batch row, head 0
cls_attention = attention_matrix[0]  # row 0: how position 0 ([CLS]) attends to every position

# Decode tokens (pass plain Python ints rather than a tensor)
tokens = hate_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())

# Show token attention weights
print("\nAttention weights from [CLS] to each token:")
for token, weight in zip(tokens, cls_attention):
    print(f"{token}: {weight.item():.4f}")

Prediction: 1
Probability: 0.6927011

Attention weights from [CLS] to each token:
[CLS]: 0.0000
[UNK]: 0.0026
are: 0.0000
the: 0.0545
worst: 0.1406
person: 0.0195
[UNK]: 0.0032
know: 0.0000
[SEP]: 0.7795


In [29]:
# Sentence target
import spacy

# spaCy pipeline used by detect_person_targets for tokenization and named entities.
nlp = spacy.load("en_core_web_lg")

# Set of pronouns we consider as PERSONs.
# Entries are lowercase because tokens are lowercased before the membership test.
PERSON_PRONOUNS = {"you", "he", "she", "they", "him", "her"}

def detect_person_targets(text):
    """Collect the person references found in `text`.

    A reference is either a token whose lowercase form is in PERSON_PRONOUNS,
    or a named entity that the spaCy pipeline labels "PERSON".

    Returns a dict with:
        "person_tokens": the distinct references, pronoun matches first and
            entity matches after them, de-duplicated case-insensitively while
            keeping the first spelling encountered.
        "is_directed_towards_someone": True when at least one reference was found.
    """
    parsed = nlp(text)

    pronoun_hits = [tok.text for tok in parsed if tok.text.lower() in PERSON_PRONOUNS]
    entity_hits = [span.text for span in parsed.ents if span.label_ == "PERSON"]

    # Keyed by lowercase form; setdefault keeps only the first spelling, and
    # dict insertion order preserves the order of first appearance.
    first_spelling = {}
    for mention in pronoun_hits + entity_hits:
        first_spelling.setdefault(mention.lower(), mention)
    unique_persons = list(first_spelling.values())

    return {
        "person_tokens": unique_persons,
        "is_directed_towards_someone": bool(unique_persons),
    }

# Person references in the test sentence; the Stage 1 score reads the
# 'is_directed_towards_someone' flag from this result.
ner_person = detect_person_targets(test_sentence)

In [31]:
# Weights of the two Stage 1 signals (cyberbullying classifier vs. hate-speech classifier).
CYBERBULLYING_WEIGHT = 0.7
HATE_SPEECH_WEIGHT = 0.3

# Index of the "Cyberbullying" class in the cyberbullying classifier's label_map.
CYBERBULLYING_CLASS = 0

# `confidence` is the probability of whichever class was predicted, so a confident
# "Not Cyberbullying" prediction would *raise* the score. Read the probability of
# the cyberbullying class itself instead.
cyberbullying_prob = probs[0, CYBERBULLYING_CLASS].item()

# The hate-speech signal only contributes when the sentence targets a person.
is_targeted = int(ner_person['is_directed_towards_someone'])
hate_prob = float(hate_probs[0][0])  # plain Python float, not a NumPy scalar

stage_1_prob = CYBERBULLYING_WEIGHT * cyberbullying_prob + HATE_SPEECH_WEIGHT * is_targeted * hate_prob

print(stage_1_prob)

0.9071002423763275
