# Single Samples

Visuals for the samples with ids [53812, 15185, 38012, 18694] from the previous notebook. The Colab settings are the same as in the experiment, except that the output names in CLASS_NAMES were changed to make the graphs easier to read.

In [None]:
#%pip install lime shap hf_xet

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import torch
from transformers import pipeline, AutoTokenizer
import lime
from lime.lime_text import LimeTextExplainer
import shap
import os
from typing import List, Union


In [None]:
def set_seed(seed_value):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed_value: Seed handed unchanged to ``random.seed``,
            ``np.random.seed`` and ``torch.manual_seed``.

    When CUDA is available, the CUDA generator is seeded with the same
    value and cuDNN is switched to deterministic mode.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    # Nothing GPU-related to configure on CPU-only machines.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    # Optional: trades speed for repeatable cuDNN results.
    torch.backends.cudnn.deterministic = True

In [None]:
# Number of texts handed to the classification pipeline per call.
BATCH_SIZE = 512
# Human-readable class names passed to the LIME and SHAP explainers.
# NOTE(review): presumably in the same order as the label-sorted score
# columns returned by batch_predict — verify against the model's label names.
CLASS_NAMES = [
    "HATE",
    "NOT_HATE",
]

In [None]:
print("Loading models and tokenizers...")
# Text-classification pipeline for the Cardiff hate-speech model; runs on
# device 0 when CUDA is available and is given -1 otherwise.
pipe_cardiff = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-hate-latest",
    device=0 if torch.cuda.is_available() else -1,
    batch_size=BATCH_SIZE,
)
# Tokenizer of the same checkpoint (used by the SHAP text masker below).
tokenizer_cardiff = AutoTokenizer.from_pretrained(
    "cardiffnlp/twitter-roberta-base-hate-latest"
)

In [None]:
class HateSpeechDataset:
    """Minimal indexable wrapper around a collection of texts."""

    def __init__(self, texts):
        # Keeps a reference to the caller's collection; no copy is made.
        self.texts = texts

    def __len__(self):
        """Return how many texts are wrapped."""
        size = len(self.texts)
        return size

    def __getitem__(self, idx):
        """Return the text stored under ``idx`` in the wrapped collection."""
        item = self.texts[idx]
        return item

In [None]:
def batch_predict(texts: Union[List[str], np.ndarray], pipeline_fn):
    """Score texts with a classification pipeline, BATCH_SIZE texts per call.

    Args:
        texts: Texts to classify. A NumPy array is converted to a plain list
            before batching.
        pipeline_fn: Callable invoked as ``pipeline_fn(batch, top_k=2)``. For
            each text it is expected to yield a list of dicts carrying
            ``'label'`` and ``'score'`` keys.

    Returns:
        ``np.ndarray`` built from one row per text, holding that text's scores
        ordered by ascending label name so every row uses the same column order.
    """
    if isinstance(texts, np.ndarray):
        texts = texts.tolist()

    total = len(texts)
    raw_results = []
    for start in range(0, total, BATCH_SIZE):
        stop = min(start + BATCH_SIZE, total)
        chunk = [texts[pos] for pos in range(start, stop)]
        raw_results.extend(pipeline_fn(chunk, top_k=2))

    # Sorting by label fixes the column order regardless of score ranking.
    score_rows = []
    for per_text in raw_results:
        by_label = sorted(per_text, key=lambda entry: entry['label'])
        score_rows.append([entry['score'] for entry in by_label])
    return np.array(score_rows)

In [None]:
def predict_cardiff(texts):
    """Return the class-score array for ``texts`` from ``pipe_cardiff``.

    Single-argument adapter around ``batch_predict``; this is the prediction
    function handed to the LIME and SHAP explainers in this notebook.
    """
    scores = batch_predict(texts, pipe_cardiff)
    return scores

In [None]:
def generate_lime_visualization(sample_id, text, seed, evaluation):
    """Explain one text with LIME and render the result inline in the notebook.

    Args:
        sample_id: Identifier of the sample; only used in the progress message.
        text: Raw text to explain.
        seed: Value passed to ``set_seed`` and used as the explainer's
            ``random_state``.
        evaluation: Short tag for the sample (the notebook passes 'TN', 'FN',
            'FP' or 'TP'); only used in the progress message.

    Returns:
        None. The explanation is shown via ``show_in_notebook``; nothing is
        written to disk.
    """
    print(f"Generating {evaluation} LIME visualization for sample {sample_id} with seed {seed}...")

    # Seed the global RNGs before the explainer is created.
    set_seed(seed)

    lime_explainer = LimeTextExplainer(class_names=CLASS_NAMES, random_state=seed)
    explanation = lime_explainer.explain_instance(text, predict_cardiff)
    explanation.show_in_notebook(text=True)



In [None]:
def generate_shap_visualization(sample_id, text, seed, evaluation):
    """Explain one text with SHAP and render the text plot inline.

    Args:
        sample_id: Identifier of the sample; only used in the progress message.
        text: Raw text to explain.
        seed: Value passed to ``set_seed`` and to ``shap.Explainer(seed=...)``.
        evaluation: Short tag for the sample (the notebook passes 'TN', 'FN',
            'FP' or 'TP'); only used in the progress message.

    Returns:
        None. The plot is displayed via ``shap.plots.text``.
    """
    # Announce the run before the explainer does its work, as
    # generate_lime_visualization does, so the message precedes the output.
    print(f"Generating {evaluation} SHAP visualization for sample {sample_id} with seed {seed}...")
    set_seed(seed)

    explainer = shap.Explainer(
        predict_cardiff,
        masker=shap.maskers.Text(tokenizer_cardiff),
        output_names=CLASS_NAMES,
        seed=seed,
    )
    # The explainer is called on a batch, so wrap the single text in a list
    # and plot the first (only) explanation.
    shap_values = explainer([text])
    shap.plots.text(shap_values[0])

In [None]:
import pandas as pd
import requests
from io import BytesIO

# GitHub raw content URL for the pickle file
# NOTE(review): the "experiement" spelling is part of the remote path; left as-is.
github_raw_url = "https://github.com/Takosaga/master_thesis/raw/main/data/processed/experiement_samples.pkl"
# Download the pickle file. The timeout keeps the cell from hanging forever
# if the server never answers; requests applies no timeout by default.
response = requests.get(github_raw_url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # SECURITY NOTE: unpickling can execute arbitrary code. Only load this
    # file because the repository above is trusted; do not point the URL at
    # an untrusted source.
    df = pd.read_pickle(BytesIO(response.content))
    print(f"DataFrame loaded successfully with {len(df)} rows")

    # Display the first few rows
    print("\nPreview of the DataFrame:")
    display(df.head())
else:
    # `df` stays undefined on this path, so the cells below will fail until
    # the download succeeds.
    print(f"Failed to download the file. Status code: {response.status_code}")
    print(f"Response: {response.text}")

In [None]:
# Pull the text of each showcased sample by its id, in the order TN, FN, FP, TP.
text_TN, text_FN, text_FP, text_TP = (
    df.loc[df['id'] == wanted_id, 'text'].values[0]
    for wanted_id in (53812, 15185, 38012, 18694)
)

In [None]:
# Sample ids, indexed in the same order as the text_* lookups.
id_used = [
    53812,  # TN
    15185,  # FN
    38012,  # FP
    18694,  # TP
]

In [None]:
# Two seeds per sample category; each list is used as [first run, second run].
seeds_TN, seeds_FN = [83811, 97197], [83811, 97197]
seeds_FP, seeds_TP = [3279, 83811], [3279, 14593]

### LIME Visuals

In [None]:
# LIME visual: TN sample, first seed.
generate_lime_visualization(sample_id=id_used[0], text=text_TN, seed=seeds_TN[0], evaluation='TN')

In [None]:
# LIME visual: TN sample, second seed.
generate_lime_visualization(sample_id=id_used[0], text=text_TN, seed=seeds_TN[1], evaluation='TN')

In [None]:
# LIME visual: FN sample, first seed.
generate_lime_visualization(sample_id=id_used[1], text=text_FN, seed=seeds_FN[0], evaluation='FN')


In [None]:
# LIME visual: FN sample, second seed.
generate_lime_visualization(sample_id=id_used[1], text=text_FN, seed=seeds_FN[1], evaluation='FN')

In [None]:
# LIME visual: FP sample, first seed.
generate_lime_visualization(sample_id=id_used[2], text=text_FP, seed=seeds_FP[0], evaluation='FP')


In [None]:
# LIME visual: FP sample, second seed.
generate_lime_visualization(sample_id=id_used[2], text=text_FP, seed=seeds_FP[1], evaluation='FP')

In [None]:
# LIME visual: TP sample, first seed.
generate_lime_visualization(sample_id=id_used[3], text=text_TP, seed=seeds_TP[0], evaluation='TP')

In [None]:

# LIME visual: TP sample, second seed.
generate_lime_visualization(sample_id=id_used[3], text=text_TP, seed=seeds_TP[1], evaluation='TP')

### SHAP Visuals

In [None]:
# SHAP visual: TN sample, first seed.
generate_shap_visualization(sample_id=id_used[0], text=text_TN, seed=seeds_TN[0], evaluation='TN')

In [None]:
# SHAP visual: TN sample, second seed.
generate_shap_visualization(sample_id=id_used[0], text=text_TN, seed=seeds_TN[1], evaluation='TN')

In [None]:
# SHAP visual: FN sample, first seed.
generate_shap_visualization(sample_id=id_used[1], text=text_FN, seed=seeds_FN[0], evaluation='FN')


In [None]:
# SHAP visual: FN sample, second seed.
generate_shap_visualization(sample_id=id_used[1], text=text_FN, seed=seeds_FN[1], evaluation='FN')

In [None]:
# SHAP visual: FP sample, first seed.
generate_shap_visualization(sample_id=id_used[2], text=text_FP, seed=seeds_FP[0], evaluation='FP')

In [None]:
# SHAP visual: FP sample, second seed. The original passed the FN text, seed
# and tag together with the FP id, which re-rendered the FN sample instead.
generate_shap_visualization(id_used[2], text_FP, seeds_FP[1], 'FP')

In [None]:
# SHAP visual: TP sample, first seed.
generate_shap_visualization(sample_id=id_used[3], text=text_TP, seed=seeds_TP[0], evaluation='TP')


In [None]:
# SHAP visual: TP sample, second seed.
generate_shap_visualization(sample_id=id_used[3], text=text_TP, seed=seeds_TP[1], evaluation='TP')