In [None]:
# Silence every warning category for the rest of the session
# (note: this also hides numerical and deprecation warnings from libraries)
import warnings
warnings.simplefilter('ignore')

In [None]:
# Import required libraries
import json
import os
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

# Import data utilities
from Preprocess.dataCollect import get_annotated_data

## 1. Load and Prepare Data

In [None]:
# Dataset file and class-label file, keyed by the number of target classes
dict_data_folder = {
    num_classes_key: {'data_file': 'Data/dataset.json', 'class_label': label_file}
    for num_classes_key, label_file in (
        ('2', 'Data/classes_two.npy'),
        ('3', 'Data/classes.npy'),
    )
}

# Bias evaluation is binary (toxic vs non-toxic), so pick the 2-class entry
params = {'num_classes': 2}
params.update(
    data_file=dict_data_folder[str(params['num_classes'])]['data_file'],
    class_names=dict_data_folder[str(params['num_classes'])]['class_label'],
)

# Read the annotated dataset through the project loader
data_all_labelled = get_annotated_data(params)
print(f"Loaded {len(data_all_labelled)} samples")

In [None]:
# Show the first five rows as the cell output
data_all_labelled.head(n=5)

In [None]:
# Count of samples per final label
print("\nLabel distribution:", data_all_labelled['final_label'].value_counts(), sep="\n")

## 2. Extract Target Community Information

In [None]:
def generate_target_information(dataset):
    """
    Extract target community information using majority voting.
    A community is selected if at least 2 out of 3 annotators identified it.

    Each annotator contributes at most one vote per community, so a community
    repeated inside a single annotator's list cannot reach the majority
    threshold on its own.

    Args:
        dataset: DataFrame with the columns 'post_id', 'target1', 'target2'
            and 'target3'; each target cell is an iterable of community names.

    Returns:
        final_target_output: defaultdict(list) mapping post_id to the selected
            communities, or to ['None'] when no community reached a majority
        all_communities_selected: Flat list of every selection made, in row
            order (one entry per post/community pair)
    """
    final_target_output = defaultdict(list)
    all_communities_selected = []

    for _, row in dataset.iterrows():
        post_id = row['post_id']

        # One vote per annotator per community; dict.fromkeys de-duplicates
        # while keeping first-seen order, so the output order is deterministic.
        votes = Counter()
        for annotator_column in ('target1', 'target2', 'target3'):
            votes.update(dict.fromkeys(row[annotator_column], 1))

        # Select communities with majority agreement (>= 2 annotators)
        majority = [name for name, n_votes in votes.items() if n_votes > 1]

        if majority:
            final_target_output[post_id].extend(majority)
            all_communities_selected.extend(majority)
        elif post_id not in final_target_output:
            # Mark as 'None' if no majority consensus
            final_target_output[post_id].append('None')
            all_communities_selected.append('None')

    return final_target_output, all_communities_selected

In [None]:
# Run the majority-vote extraction over the full labelled dataset
(target_information,
 all_communities_selected) = generate_target_information(data_all_labelled)
print(f"Extracted target info for {len(target_information)} posts")

In [None]:
# Tally how often each community was selected across all posts
community_count_dict = Counter()
community_count_dict.update(all_communities_selected)

print("\nAll communities (sorted by frequency):")
for community, count in community_count_dict.most_common():
    print(f"  {community}: {count}")

In [None]:
# Rank communities by frequency, ignoring the 'None' and 'Other' buckets,
# and keep the ten most frequent ones
community_count_filtered = Counter({
    name: total
    for name, total in community_count_dict.items()
    if name not in ('None', 'Other')
})

list_selected_community = [
    ranked[0] for ranked in community_count_filtered.most_common(10)
]
print(f"\nSelected top 10 communities for bias evaluation:")
for i, community in enumerate(list_selected_community, start=1):
    print(f"  {i}. {community} ({community_count_dict[community]} mentions)")

In [None]:
# Filter target information to include only the selected (top 10) communities.
# The lookup set is built once instead of once per post, and the per-post
# list keeps the original target order so results are reproducible across
# runs (set intersection order depends on string hash randomization).
selected_community_lookup = set(list_selected_community)

final_target_information = {}
for post_id, post_targets in target_information.items():
    matched_communities = [
        name for name in dict.fromkeys(post_targets)
        if name in selected_community_lookup
    ]
    # Posts that target none of the selected communities are stored as None
    final_target_information[post_id] = matched_communities or None

# Add to dataset
data_all_labelled['final_target_category'] = data_all_labelled['post_id'].map(final_target_information)

## 3. Filter to Test Set

In [None]:
# Read the train/val/test post-id partition from disk
with open('./Data/post_id_divisions.json', 'r') as fp:
    post_id_dict = json.loads(fp.read())

# Report the size of every split
print(f"Train: {len(post_id_dict['train'])} samples")
print(f"Val:   {len(post_id_dict['val'])} samples")
print(f"Test:  {len(post_id_dict['test'])} samples")

In [None]:
# Keep only rows whose post_id belongs to the test split (as an independent copy)
data_all_labelled_bias = data_all_labelled.loc[
    data_all_labelled['post_id'].isin(post_id_dict['test'])
].copy()
print(f"\nFiltered to {len(data_all_labelled_bias)} test samples for bias evaluation")

## 4. Define Bias Evaluation Functions

In [None]:
def convert_to_score(label_name, label_dict):
    """
    Map a predicted label and its score to a toxicity score.
    Higher score = more toxic.

    Args:
        label_name: Predicted label; 'non-toxic' is treated as the benign class
        label_dict: Mapping from label name to its score (presumably a
            probability in [0, 1] - confirm against the prediction files)

    Returns:
        1 - score of the predicted label when it is 'non-toxic',
        otherwise the score of the predicted label unchanged
    """
    predicted_score = label_dict[label_name]
    return 1 - predicted_score if label_name == 'non-toxic' else predicted_score

In [None]:
def bias_evaluation_metric(dataset, method, community):
    """
    Divide post IDs into positive (toxic) and negative (non-toxic) sets
    based on the bias evaluation method.

    Methods:
        - subgroup: Posts mentioning the community
        - bpsn: Background Positive, Subgroup Negative
          (toxic posts NOT mentioning the community vs
          non-toxic posts mentioning it)
        - bnsp: Background Negative, Subgroup Positive
          (toxic posts mentioning the community vs
          non-toxic posts NOT mentioning it)

    Rows whose 'final_target_category' is None (or NaN) are skipped for
    every method, including as background.

    Args:
        dataset: DataFrame with the columns 'post_id', 'final_label' and
            'final_target_category'
        method: One of 'subgroup', 'bpsn', 'bnsp'
        community: Target community name

    Returns:
        Dict with 'positiveID' and 'negativeID' lists. For an unknown method
        a single message is printed and both lists are empty.
    """
    positive_ids = []
    negative_ids = []

    # Validate once up front instead of reporting the same problem per row
    if method not in ('subgroup', 'bpsn', 'bnsp'):
        print(f'Unknown method: {method}')
        return {'positiveID': positive_ids, 'negativeID': negative_ids}

    for _, row in dataset.iterrows():
        targets = row['final_target_category']
        # None marks "no selected community"; NaN appears when a post id had
        # no entry in the mapping, and would break the membership test below
        if targets is None or (isinstance(targets, float) and targets != targets):
            continue

        is_community = community in targets
        is_toxic = row['final_label'] != 'non-toxic'

        # In every method the class is decided by toxicity; the methods only
        # differ in which (community, toxicity) combinations are included.
        if method == 'subgroup':
            included = is_community
        elif method == 'bpsn':
            # subgroup & non-toxic, or background & toxic
            included = is_community != is_toxic
        else:  # 'bnsp'
            # subgroup & toxic, or background & non-toxic
            included = is_community == is_toxic

        if included:
            (positive_ids if is_toxic else negative_ids).append(row['post_id'])

    return {'positiveID': positive_ids, 'negativeID': negative_ids}

## 5. Calculate Bias Scores

In [None]:
# Configuration
# (roc_auc_score is imported in the notebook's top import cell)
parent_path = './explanations_dicts/'          # folder holding the per-model bias prediction files
method_list = ['subgroup', 'bpsn', 'bnsp']     # bias AUC variants to compute
community_list = list(list_selected_community)  # independent copy of the selected communities

# Model bias score file mapping: display name -> file name inside parent_path
# Add your model files here
bias_score_file_mapping = {
    'BiRNN-Attn': 'bestModel_birnnatt_bias.json',
    'BiRNN-Scrat': 'bestModel_birnnscrat_bias.json',
    'CNN-GRU': 'bestModel_cnn_gru_bias.json',
    'BERT-Base': 'bestModel_bert_base_uncased_Attn_train_FALSE_bias.json',
    'BERT-HateXplain': 'bestModel_bert_base_uncased_Attn_train_TRUE_bias.json',
}

In [None]:
# Report which configured bias score files are present on disk and
# remember the ones that are, so later cells only process those models
print("Available bias score files:")
available_models = {}
for model_name, filename in bias_score_file_mapping.items():
    if not os.path.exists(os.path.join(parent_path, filename)):
        print(f"  ✗ {model_name}: {filename} (not found)")
        continue
    print(f"  ✓ {model_name}: {filename}")
    available_models[model_name] = filename

if len(available_models) == 0:
    print("\nNo bias score files found. Run testing_for_bias.py first.")

In [None]:
# Calculate bias scores for each model
# Result layout: final_bias_dictionary[model_name][method][community] -> AUC
final_bias_dictionary = defaultdict(lambda: defaultdict(dict))

# Ground-truth label -> binary target expected by roc_auc_score
label_to_value = {'toxic': 1.0, 'non-toxic': 0.0}

# The positive/negative post-id split depends only on the test set, the method
# and the community - not on the model - so it is computed once per
# (method, community) pair and reused for every model.
id_split_cache = {}

for model_name in tqdm(available_models, desc="Processing models"):
    filepath = os.path.join(parent_path, available_models[model_name])

    # Load model predictions: one JSON object per line, keyed by annotation_id
    total_data = {}
    with open(filepath) as fp:
        for line in fp:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            data = json.loads(line)
            total_data[data['annotation_id']] = data

    # Calculate AUC for each method and community
    for method in method_list:
        for community in community_list:
            split_key = (method, community)
            if split_key not in id_split_cache:
                id_split_cache[split_key] = bias_evaluation_metric(
                    data_all_labelled_bias, method, community
                )
            community_data = id_split_cache[split_key]

            truth_values = []
            prediction_values = []

            # Positive and negative ids are handled identically: the class
            # comes from the prediction file's ground truth. Ids without a
            # prediction for this model are skipped.
            for post_id in community_data['positiveID'] + community_data['negativeID']:
                record = total_data.get(post_id)
                if record is None:
                    continue
                truth_values.append(label_to_value[record['ground_truth']])
                prediction_values.append(convert_to_score(
                    record['classification'],
                    record['classification_scores']
                ))

            # AUC is only defined when both classes are present; otherwise the
            # community is left out of this model/method entry
            if len(set(truth_values)) > 1:
                auc_score = roc_auc_score(truth_values, prediction_values)
                final_bias_dictionary[model_name][method][community] = auc_score

## 6. Display Results

In [None]:
# Display per-community bias scores, one section per model and method
# (pandas is imported in the notebook's top import cell)
for model_name in final_bias_dictionary:
    print(f"\n{'='*60}")
    print(f"Model: {model_name}")
    print(f"{'='*60}")

    for method in method_list:
        print(f"\n{method.upper()} AUC:")
        # Communities without a computable AUC are simply absent here
        for community, score in final_bias_dictionary[model_name][method].items():
            print(f"  {community:20s}: {score:.4f}")

In [None]:
# Calculate generalized mean of bias scores
# Using power value of -5 as in the original paper
power_value = -5
# Number of communities requested. Not used as the divisor below: communities
# whose AUC could not be computed are absent from final_bias_dictionary, and
# dividing by the full count would inflate the mean for a negative power.
num_communities = len(community_list)

print("\n" + "="*60)
print("GENERALIZED MEAN BIAS SCORES (p=-5)")
print("="*60)
print(f"{'Model':25s} {'Subgroup':>12s} {'BPSN':>12s} {'BNSP':>12s}")
print("-"*60)

for model_name in final_bias_dictionary:
    scores = []
    for method in method_list:
        # AUC^p for every community that actually has a score
        temp_values = [
            pow(auc_value, power_value)
            for auc_value in final_bias_dictionary[model_name][method].values()
        ]
        if temp_values:
            # Power mean over the values that were aggregated
            gen_mean = pow(np.sum(temp_values) / len(temp_values), 1 / power_value)
            scores.append(gen_mean)
        else:
            # No community had a computable AUC for this method
            scores.append(0.0)

    if scores:
        print(f"{model_name:25s} {scores[0]:12.4f} {scores[1]:12.4f} {scores[2]:12.4f}")

In [None]:
# Persist the per-model / per-method / per-community AUCs as JSON
output_file = './bias_evaluation_results.json'

# json cannot be relied on to keep nested defaultdicts tidy, so rebuild the
# structure from plain dicts first
results_dict = dict(
    (
        model,
        dict((method, dict(communities)) for method, communities in methods.items()),
    )
    for model, methods in final_bias_dictionary.items()
)

with open(output_file, 'w') as fp:
    fp.write(json.dumps(results_dict, indent=2))

print(f"\nResults saved to {output_file}")

## 7. Interpretation Guide

### Bias Metrics:

- **Subgroup AUC**: Measures model performance on posts mentioning a specific community. 
  - Higher is better (closer to 1.0)
  - Low score indicates poor discrimination for that subgroup

- **BPSN (Background Positive, Subgroup Negative) AUC**: Measures false positive bias.
  - Higher is better
  - Low score indicates the model may incorrectly classify benign mentions of a community as toxic

- **BNSP (Background Negative, Subgroup Positive) AUC**: Measures false negative bias.
  - Higher is better
  - Low score indicates the model may miss toxic content targeting a community

### Generalized Mean:
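The generalized (power) mean of the per-community AUCs $m_s$ is $M_p = \left(\frac{1}{N}\sum_{s=1}^{N} m_s^{p}\right)^{1/p}$, where $N$ is the number of communities. With $p=-5$ it emphasizes lower scores, making it sensitive to worst-case performance across communities.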

### References:
- Borkan et al. (2019) - "Nuanced Metrics for Measuring Unintended Bias with Real Data for Text Classification"