# Demonstration: LLM Annotations Reliability

## Based on Paper: Assessing the Reliability of LLMs Annotations in the Context of Demographic Bias and Model Explanation

### What You'll Learn:
- How to evaluate LLMs with different prompting strategies
- How demographic personas can affect performance
- How explainable AI (SHAP) helps models focus on important content
- Simple statistical analysis of variance components

## Step 1: Install Required Packages

In [None]:
# Install required packages
!pip install pandas matplotlib numpy seaborn

## Step 2: Import Libraries and Enhanced Configuration

In [None]:
import os
import getpass
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import random
import itertools
from typing import List, Dict, Tuple
import time
from collections import defaultdict

# ENHANCED CONFIGURATION
# Experiment sizes; the API-call estimate printed below is their product
# times the four prompting scenarios.
NUM_SAMPLES = 20  # Number of complex examples to test
NUM_DEMOGRAPHIC_ROTATIONS = 2  # How many different demographic personas to test per example
NUM_VIRTUAL_ANNOTATORS = 3  # Number of virtual annotators (as in paper)

# Seed both the stdlib and NumPy generators so sampling is reproducible
random.seed(42)
np.random.seed(42)

_expected_api_calls = NUM_SAMPLES * NUM_DEMOGRAPHIC_ROTATIONS * NUM_VIRTUAL_ANNOTATORS * 4

# One print call with a newline separator emits the same five lines
print(
    "Configuration loaded",
    f"Testing {NUM_SAMPLES} complex examples",
    f"Using {NUM_DEMOGRAPHIC_ROTATIONS} demographic personas per example",
    f"{NUM_VIRTUAL_ANNOTATORS} virtual annotators for reliability analysis",
    f"Expected API calls: ~{_expected_api_calls} calls",
    sep="\n",
)

## Step 3: Complex Examples - Ambiguous Cases from Social Media

These are examples that cause disagreement among human annotators and LLMs:

In [None]:
# EXIST 2024 dataset structure and ambiguous examples
# These are based on EXIST dataset patterns with expert disagreement

# 56 demographic combinations from the paper
import json

# One JSON object per line. The original loader always dropped the last
# character of each line (presumably a trailing comma — verify against the
# data file). Here a line is parsed as-is first and the last character is
# dropped only if that fails, so lines with and without the separator both
# load; blank lines are skipped.
DEMOGRAPHIC_COMBINATIONS = []
with open('demographic_combos.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        record = line.strip()
        if not record:
            continue
        try:
            json_object = json.loads(record)
        except json.JSONDecodeError:
            json_object = json.loads(record[:-1])
        DEMOGRAPHIC_COMBINATIONS.append(json_object)

# EXIST 2024 dataset examples
# Explicit UTF-8 so non-ASCII (e.g. Spanish) text decodes the same on every platform
with open('exist_tweets.json', encoding='utf-8') as f:
    EXIST_TWEETS = json.load(f)

# Complex ambiguous examples with SHAP tokens from the paper
# ambiguous_sexist = Borderline cases from EXIST dataset that cause expert disagreement
# ambiguous_not_sexist = Cases that might seem sexist but experts mostly agree they're not (20-40% said sexist)
# expert_agreement = 0.67 means 67% of experts said sexist
# shap_tokens are SHAP tokens from paper
with open('complex_examples.json', encoding='utf-8') as f:
    complex_examples = json.load(f)

# Create balanced test set using EXIST patterns: up to NUM_SAMPLES//2 items
# from each class, labelled with the expected answer
test_examples = []

# Add ambiguous sexist examples
for example in complex_examples["ambiguous_sexist"][:NUM_SAMPLES//2]:
    test_examples.append((example, "YES"))

# Add ambiguous not sexist examples
for example in complex_examples["ambiguous_not_sexist"][:NUM_SAMPLES//2]:
    test_examples.append((example, "NO"))

# Shuffle to avoid bias
random.shuffle(test_examples)

# Report the counts actually loaded; the files may hold fewer than
# NUM_SAMPLES//2 examples per class
sexist_count = sum(1 for _, label in test_examples if label == "YES")
not_sexist_count = len(test_examples) - sexist_count

print(f"Loaded {len(test_examples)} complex examples based on EXIST 2024 patterns")
print(f"{sexist_count} ambiguous sexist cases")
print(f"{not_sexist_count} ambiguous not-sexist cases")
print(f"Average expert agreement: {np.mean([ex[0]['expert_agreement'] for ex in test_examples]):.2f}")
print(f"Using SHAP tokens from the paper findings")

## Step 4: Demographic Combinations from the Paper

All 56 demographic combinations from the paper:

In [None]:
# Use the demographic combinations loaded above (56 in the paper)
all_demographics = DEMOGRAPHIC_COMBINATIONS

print(f"Using {len(all_demographics)} demographic combinations from the paper")
print("\nExample demographic profiles:")
# Preview the first five profiles; any gender code other than 'F' is shown as Male
for i, demo in enumerate(all_demographics[:5]):
    gender_text = "Female" if demo['gender'] == 'F' else "Male"
    print(f"  {i+1}. {gender_text}, {demo['age']}, {demo['ethnicity']}, {demo['education']}, {demo['region']}")

# Clamp at zero so a short list never reports a negative remainder
remaining_profiles = max(len(all_demographics) - 5, 0)
print(f"\n... and {remaining_profiles} more combinations")
# Report the number actually loaded instead of assuming 56
print(f"\nFor each test, we'll randomly select {NUM_DEMOGRAPHIC_ROTATIONS} of these {len(all_demographics)} combinations")

In [None]:
# SHAP Analysis for Token Importance (From the Paper)

import re
import sexismanalyzer as sa

# Analyzer object used by every later cell for token highlighting
shap_analyzer = sa.SHAPSexismAnalyzer()

# Smoke-test the analyzer on two English and two Spanish sentences
print("TESTING SHAP Analysis from the paper:")
print('NOTE: highlighting is not automated, but based on important terms found in our paper')
print("=" * 60)

test_tweets = [
    ("Women should stay in the kitchen where they belong", "en"),
    ("She gave an excellent presentation today", "en"),
    ("Las mujeres son naturalmente malas en matemáticas", "es"),
    ("La conferencia tuvo muchos profesionales", "es")
]

for text, lang in test_tweets:
    analysis = shap_analyzer.analyze_tweet(text, lang)
    print(f"\nLanguage: {lang.upper()}")
    # Print the three reported fields in a fixed order
    for caption, field in (
        ("Original", "original_text"),
        ("Highlighted", "highlighted_text"),
        ("Important tokens (from paper)", "important_tokens"),
    ):
        print(f"{caption}: {analysis[field]}")

print("\nSHAP analysis ready using tokens from the paper")
# Show the first ten high-importance tokens per language
for language_label, language_code in (("English", "en"), ("Spanish", "es")):
    top_tokens = shap_analyzer.important_tokens[language_code]['high_importance'][:10]
    print(f"{language_label} high importance: {top_tokens}...")

In [None]:
def create_demographic_prompt(demographics: Dict[str, str]) -> str:
    """
    Build the persona system prompt for one demographic profile.

    Reads the 'gender', 'age', 'ethnicity', 'education' and 'region' keys.
    A gender of 'F' is rendered as "female"; any other value as "male".
    Returns the persona sentence, a blank line, then the YES/NO task.
    """
    gender_word = "male"
    if demographics['gender'] == 'F':
        gender_word = "female"

    persona = (
        f"You are a {gender_word} individual, aged {demographics['age']}, "
        f"who identifies as {demographics['ethnicity']}, "
        f"has a {demographics['education']}, "
        f"and currently resides in {demographics['region']}."
    )
    task = (
        "You are an expert at detecting sexist content. "
        "Based on your lived experience and perspective, determine if this "
        "social media post contains sexist content. "
        "Answer YES if it contains sexism, NO if it does not."
    )
    return f"{persona}\n\n{task}"

def create_shap_highlighted_text(text: str, shap_tokens: List[str]) -> str:
    """
    Return *text* with the given SHAP tokens highlighted.

    Delegates to the module-level ``shap_analyzer``; the markup is whatever
    its ``highlight_tokens`` method produces.
    """
    highlighted = shap_analyzer.highlight_tokens(text, shap_tokens)
    return highlighted

# The 4 evaluation scenarios from the paper
def create_evaluation_prompts(demographics: Dict[str, str]):
    """
    Build the system prompts for the four scenarios, keyed by scenario name.

    GenAI   - plain expert prompt
    GenP    - persona prompt from ``create_demographic_prompt``
    GenXAI  - expert prompt plus the instruction to focus on **bold** words
    GenPXAI - persona prompt followed by the same focus instruction
    """
    role = "You are an expert at detecting sexist content."
    focus = "Pay special attention to words highlighted in **bold** as they are important for the decision."
    verdict = "Determine if this social media post contains sexist content. Answer YES if it contains sexism, NO if it does not."

    # The persona text is the same for both persona scenarios
    persona_prompt = create_demographic_prompt(demographics)

    prompts = {}
    prompts["GenAI"] = f"{role} {verdict}"
    prompts["GenP"] = persona_prompt
    prompts["GenXAI"] = f"{role} {focus} {verdict}"
    prompts["GenPXAI"] = f"{persona_prompt}\n\n{focus}"
    return prompts

# Preview the prompts generated for the first demographic profile
sample_demographics = all_demographics[0]
sample_prompts = create_evaluation_prompts(sample_demographics)

print("Prompt templates created from the paper")
print("\nExample GenP prompt (format):")
# Only the first 200 characters of the persona prompt are shown
print(f"{sample_prompts['GenP'][:200]}...")

# Preview SHAP highlighting on one English sentence
sample_text = "Women often struggle with emotional decision-making"
shap_analysis = shap_analyzer.analyze_tweet(sample_text, "en")
print("\nExample SHAP highlighting from paper:")
print(f"Original: {sample_text}")
for caption, field in (("Highlighted", "highlighted_text"), ("Tokens found", "important_tokens")):
    print(f"{caption}: {shap_analysis[field]}")

## Step 6: Set Up the API (Secure)

In [None]:
import requests
import json

# Resolve the API key: use the API_KEY environment variable when it is set
# and non-empty, otherwise ask for it with getpass
api_key = os.environ.get('API_KEY')
if api_key in (None, ""):
    print("Please enter your API key:", "(Get one from: https://openrouter.ai)", sep="\n")
    api_key = getpass.getpass("API Key: ")

import re

# Seconds to wait for the API before giving up; without a timeout a stalled
# connection would block the whole evaluation loop indefinitely
API_TIMEOUT_SECONDS = 60

# Matches YES or NO only as a standalone word, so "NOT", "KNOW" or "EYES"
# are not mistaken for a verdict
_VERDICT_PATTERN = re.compile(r"\b(YES|NO)\b")


def _request_completion(prompt: str, text: str) -> str:
    """Send one chat-completion request and return the raw answer; raises on any failure."""
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
        },
        json={
            "model": "nousresearch/hermes-4-405b",
            "messages": [
                {
                    "role": "system",
                    "content": prompt,
                },
                {
                    "role": "user",
                    "content": f"Social media post: {text}\n\nAnswer (YES/NO):",
                },
            ],
        },
        timeout=API_TIMEOUT_SECONDS,
    )
    # Surface HTTP errors (bad key, rate limit, ...) with their status code
    # instead of failing later on a missing 'choices' key
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']


def _extract_verdict(answer):
    """Return the first standalone YES/NO in *answer* (case-insensitive), or None."""
    if not isinstance(answer, str):
        return None
    match = _VERDICT_PATTERN.search(answer.strip().upper())
    return match.group(1) if match else None


def ask_ai_real(prompt: str, text: str) -> str:
    """
    Ask the model whether *text* is sexist and return "YES" or "NO".

    Args:
        prompt: System prompt describing the annotator role/persona.
        text: The social media post to classify.

    Returns:
        "YES" or "NO". The first standalone YES/NO word in the reply decides.
        If the reply contains neither, or the request fails for any reason,
        a message is printed and "NO" is returned, so this never raises.
    """
    try:
        answer = _request_completion(prompt, text)
    except Exception as e:
        # Deliberate best-effort: one failed call must not abort a long run
        print(f"ERROR: API Error: {e}. Default to NO")
        return "NO"

    verdict = _extract_verdict(answer)
    if verdict is None:
        print('WARNING: unclear; return NO as default')
        return "NO"
    return verdict


# Test the API connection
# The request helper is called directly because ask_ai_real never raises;
# going through it would report success even when the API is unreachable.
try:
    _connection_answer = _request_completion(
        "You are an expert at detecting sexist content.",
        "This is a test message to verify API connection."
    )
except Exception as e:
    print(f"API connection failed: {e}")
    print("Please check your API key and try again.")
else:
    test_response = _extract_verdict(_connection_answer) or "NO"
    print(f"API connection verified - test response: {test_response}")
    print("Ready for API evaluation!")

## Step 7: Run Enhanced Evaluation with Realistic Performance

In [None]:
# Evaluation using live model responses (via ask_ai_real) and SHAP tokens
# NOTE(review): the status messages below say "OpenAI", but ask_ai_real posts
# to the OpenRouter endpoint — confirm which wording is intended.
evaluation_results = []
progress_tracker = defaultdict(list)  # not populated in this cell
selected_scenarios = ["GenAI", "GenP", "GenXAI"] # I'm excluding "GenPXAI" to speed up execution

print(f"Starting evaluation with {len(test_examples)} complex examples...")
print(f"Using OpenAI API calls")
print(f"Using SHAP tokens from the paper")
print(f"Using 56 demographic combinations")
print(f"Expected total API calls: ~{len(test_examples) * NUM_DEMOGRAPHIC_ROTATIONS * NUM_VIRTUAL_ANNOTATORS * len(selected_scenarios)}")
print("=" * 80)

total_examples = len(test_examples)
for example_idx, (example_data, correct_label) in enumerate(test_examples, 1):
    text = example_data["text"]
    expert_agreement = example_data["expert_agreement"]
    difficulty = example_data["difficulty"]
    shap_tokens = example_data["shap_tokens"]  # tokens from paper (stored with the result)
    
    print(f"\n[{example_idx}/{total_examples}] Testing: '{text[:60]}...'")
    print(f"Expert agreement: {expert_agreement:.2f} | Difficulty: {difficulty} | Correct: {correct_label}")
    
    # Randomly select demographic combinations for this example
    selected_demographics = random.sample(all_demographics, NUM_DEMOGRAPHIC_ROTATIONS)
    
    example_results = {
        "text": text,
        "correct_label": correct_label,
        "expert_agreement": expert_agreement,
        "difficulty": difficulty,
        "shap_tokens": shap_tokens,
        "demographic_results": []
    }
    
    # Test with multiple demographic combinations
    for demo_idx, demographics in enumerate(selected_demographics, 1):
        # Compact persona tag for the progress line: gender code, first two
        # characters of the age field, first character of the ethnicity
        demo_short = f"{demographics['gender']}{demographics['age'][:2]}{demographics['ethnicity'][:1]}"
        print(f"  Demo {demo_idx}: {demo_short}", end=" ")
        
        # Build prompts for every scenario; only selected_scenarios are run
        prompts = create_evaluation_prompts(demographics)
        
        # Use SHAP analysis from paper (language is fixed to English here)
        shap_analysis = shap_analyzer.analyze_tweet(text, "en")
        highlighted_text = shap_analysis['highlighted_text']
        
        demo_result = {
            "demographics": demographics,
            "scenario_results": {},
            "shap_analysis": shap_analysis
        }
        
        # Run each selected scenario with multiple virtual annotators
        for scenario in selected_scenarios:
            prompt = prompts[scenario]
            # XAI scenarios see the highlighted text, the others the raw text
            test_text = highlighted_text if "XAI" in scenario else text
            
            # Get responses from multiple virtual annotators using API
            annotator_responses = []
            for _ in range(NUM_VIRTUAL_ANNOTATORS):
                response = ask_ai_real(prompt, test_text)
                annotator_responses.append(response)
                
                # Small delay to be respectful to API
                time.sleep(0.1)
            
            # Calculate majority vote and agreement (share of annotators on
            # the majority side); ties count as "NO"
            yes_count = annotator_responses.count("YES")
            majority_vote = "YES" if yes_count > NUM_VIRTUAL_ANNOTATORS // 2 else "NO"
            agreement_score = max(yes_count, NUM_VIRTUAL_ANNOTATORS - yes_count) / NUM_VIRTUAL_ANNOTATORS
            
            demo_result["scenario_results"][scenario] = {
                "majority_vote": majority_vote,
                "agreement_score": agreement_score,
                "annotator_responses": annotator_responses
            }
            
            # Show real-time results
            correct_symbol = "✓" if majority_vote == correct_label else "✗"
            print(f"{scenario}:{majority_vote}({agreement_score:.1f}) {correct_symbol}", end=" ")
        
        example_results["demographic_results"].append(demo_result)
        print()  # New line after demographic result
    
    evaluation_results.append(example_results)
    
    # Progress update every five examples and after the last one
    if example_idx % 5 == 0 or example_idx == total_examples:
        # Calculate running accuracy over every (example, persona, scenario) vote so far
        total_tests = 0
        correct_tests = 0
        
        for result in evaluation_results:
            for demo_result in result["demographic_results"]:
                for scenario_result in demo_result["scenario_results"].values():
                    total_tests += 1
                    if scenario_result["majority_vote"] == result["correct_label"]:
                        correct_tests += 1
        
        running_accuracy = correct_tests / total_tests if total_tests > 0 else 0
        print(f"\nProgress: {example_idx}/{total_examples} | Running accuracy: {running_accuracy:.1%}")
        print("-" * 40)

print("\nEvaluation complete!")
print(f"Tested {len(evaluation_results)} complex examples from EXIST patterns")
print(f"Used {NUM_DEMOGRAPHIC_ROTATIONS} demographic combinations per example")
print(f"{NUM_VIRTUAL_ANNOTATORS} virtual annotators per test")
# Count the scenarios actually run rather than assuming all four
print(f"Total evaluations: {len(evaluation_results) * NUM_DEMOGRAPHIC_ROTATIONS * len(selected_scenarios)}")
print(f"All responses are OpenAI API calls")
print(f"All SHAP tokens are from the paper findings")

## Step 8: Performance Metrics

In [None]:
# Analysis of performance
def analyze_evaluation_results(results):
    """
    Aggregate per-example evaluation results into summary collections.

    Args:
        results: Iterable of example dicts with keys "correct_label",
            "expert_agreement", "difficulty" and "demographic_results".
            Each demographic result holds "scenario_results", mapping a
            scenario name to a dict with "majority_vote" and
            "agreement_score".

    Returns:
        dict with:
          - "scenario_performance": scenario -> list of bool (vote correct?)
          - "difficulty_performance": difficulty -> list of bool
          - "demographic_variance": scenario -> list of per-example variances
            of correctness across personas
          - "annotator_agreement": scenario -> list of agreement scores
          - "expert_correlation": list of (expert_agreement, mean GenAI
            agreement) pairs. Examples with no GenAI result are omitted.
    """
    analysis = {
        "scenario_performance": defaultdict(list),
        "difficulty_performance": defaultdict(list),
        "demographic_variance": defaultdict(list),
        "annotator_agreement": defaultdict(list),
        "expert_correlation": []
    }
    
    for result in results:
        correct_label = result["correct_label"]
        expert_agreement = result["expert_agreement"]
        difficulty = result["difficulty"]
        
        # Correctness per scenario across this example's personas
        scenario_accuracies = defaultdict(list)
        # Baseline (GenAI) annotator agreement across this example's personas
        baseline_agreements = []
        
        for demo_result in result["demographic_results"]:
            scenario_results = demo_result["scenario_results"]
            for scenario, scenario_result in scenario_results.items():
                is_correct = scenario_result["majority_vote"] == correct_label
                agreement_score = scenario_result["agreement_score"]
                
                analysis["scenario_performance"][scenario].append(is_correct)
                analysis["difficulty_performance"][difficulty].append(is_correct)
                analysis["annotator_agreement"][scenario].append(agreement_score)
                scenario_accuracies[scenario].append(is_correct)
            
            # GenAI may not have been run; skip instead of raising KeyError
            baseline = scenario_results.get("GenAI")
            if baseline is not None:
                baseline_agreements.append(baseline["agreement_score"])
        
        # Demographic variance for this example, for every scenario that ran
        for scenario, outcomes in scenario_accuracies.items():
            analysis["demographic_variance"][scenario].append(np.var(outcomes))
        
        # Expert correlation: how does AI agreement correlate with expert agreement?
        # Recorded only when a baseline score exists, so no NaN is added.
        if baseline_agreements:
            analysis["expert_correlation"].append(
                (expert_agreement, np.mean(baseline_agreements))
            )
    
    return analysis

# Aggregate the raw evaluation output
analysis = analyze_evaluation_results(evaluation_results)

# Report header
print("ENHANCED EVALUATION RESULTS - REALISTIC PERFORMANCE")
print("=" * 60)

# Per-scenario accuracy and annotator agreement
print("\nSCENARIO PERFORMANCE (Realistic Accuracy):")
scenario_names = {"GenAI": "Basic AI", "GenP": "+ Demographics", "GenXAI": "+ SHAP", "GenPXAI": "+ Both"}

performance_by_scenario = analysis["scenario_performance"]
agreement_by_scenario = analysis["annotator_agreement"]
for scenario, name in scenario_names.items():
    # Scenarios that were not run have no entry and are skipped
    if scenario not in performance_by_scenario:
        continue
    outcomes = performance_by_scenario[scenario]
    accuracy = np.mean(outcomes)
    agreement = np.mean(agreement_by_scenario[scenario])
    n_tests = len(outcomes)
    print(f"  {name:15}: {accuracy:.1%} accuracy | {agreement:.2f} avg agreement | ({n_tests} tests)")

# Accuracy grouped by the difficulty tag of each example
print("\nPERFORMANCE BY DIFFICULTY:")
performance_by_difficulty = analysis["difficulty_performance"]
for difficulty in ("low", "medium", "high", "very_high"):
    if difficulty not in performance_by_difficulty:
        continue
    outcomes = performance_by_difficulty[difficulty]
    accuracy = np.mean(outcomes)
    n_tests = len(outcomes)
    difficulty_label = difficulty.replace('_', ' ').title()
    print(f"  {difficulty_label:12}: {accuracy:.1%} accuracy ({n_tests} tests)")

# Mean per-example variance of correctness across personas
print("\nDEMOGRAPHIC VARIANCE (Paper Finding: 8% variance):")
variance_by_scenario = analysis["demographic_variance"]
for scenario, name in scenario_names.items():
    if scenario not in variance_by_scenario:
        continue
    variance = np.mean(variance_by_scenario[scenario])
    print(f"  {name:15}: {variance:.3f} variance across demographics")

# Pearson correlation between expert agreement and baseline AI agreement
expert_agreements = [expert for expert, _ in analysis["expert_correlation"]]
ai_agreements = [ai for _, ai in analysis["expert_correlation"]]
correlation_matrix = np.corrcoef(expert_agreements, ai_agreements)
correlation = correlation_matrix[0, 1]

print(f"\nEXPERT-AI AGREEMENT CORRELATION: {correlation:.3f}")
print("   (Higher = AI agreement patterns match human expert patterns)")

# Headline numbers: each is an unweighted mean of per-scenario means
overall_accuracy = np.mean([np.mean(perf) for perf in performance_by_scenario.values()])
overall_agreement = np.mean([np.mean(agree) for agree in agreement_by_scenario.values()])
overall_variance = np.mean([np.mean(var) for var in variance_by_scenario.values()])

print("\nKEY FINDINGS (Realistic Performance):")
print(f"  • Overall Accuracy: {overall_accuracy:.1%} (70-85% range as expected)")
print(f"  • Average Annotator Agreement: {overall_agreement:.2f}")
print(f"  • Demographic Variance: {overall_variance:.3f}")
print(f"  • Expert Correlation: {correlation:.3f}")
print(f"  • Complex Examples Tested: {len(evaluation_results)}")
print(f"  • Total Demographic Combinations: {len(all_demographics)}")

print("\nThis shows realistic performance with disagreement patterns!")

## Step 9: Test Your Own Examples with Full Methodology

In [None]:
def test_custom_example_enhanced(text: str, language: str = "en", num_demographics: int = 3):
    """
    Run one custom post through all four scenarios and print a report.

    Args:
        text: The post to classify.
        language: Language code passed to the SHAP analyzer (default "en").
        num_demographics: Number of random personas to test (default 3).
            Clamped to the number of available profiles.

    Returns:
        defaultdict mapping scenario name -> list of majority votes
        ("YES"/"NO"), one per persona.

    Each scenario/persona pair is answered by NUM_VIRTUAL_ANNOTATORS calls to
    ask_ai_real, so this issues live API requests.
    """
    # Display labels, built once instead of on every loop iteration
    scenario_names = {"GenAI": "Basic AI", "GenP": "+ Demographics", "GenXAI": "+ SHAP", "GenPXAI": "+ Both"}

    print(f"TESTING: '{text}'")
    print("=" * 60)
    
    # Use SHAP analysis from paper; the highlighted text is persona-independent
    shap_analysis = shap_analyzer.analyze_tweet(text, language)
    highlighted_text = shap_analysis['highlighted_text']
    
    print(f"SHAP tokens from the paper: {shap_analysis['important_tokens']}")
    print(f"Highlighted text: {highlighted_text}")
    
    # Select random demographics; never ask for more than are available
    sample_size = min(num_demographics, len(all_demographics))
    test_demographics = random.sample(all_demographics, sample_size)
    
    results_summary = defaultdict(list)
    
    for i, demographics in enumerate(test_demographics, 1):
        gender_text = "Female" if demographics['gender'] == 'F' else "Male"
        demo_desc = f"{gender_text}, {demographics['age']}, {demographics['ethnicity']}"
        print(f"\nDemographic {i}: {demo_desc}")
        
        # Create prompts from paper
        prompts = create_evaluation_prompts(demographics)
        
        # Test all scenarios with multiple annotators using API
        for scenario in ["GenAI", "GenP", "GenXAI", "GenPXAI"]:
            prompt = prompts[scenario]
            # XAI scenarios see the highlighted text, the others the raw text
            test_text = highlighted_text if "XAI" in scenario else text
            
            # Get multiple annotator responses
            responses = []
            for _ in range(NUM_VIRTUAL_ANNOTATORS):
                responses.append(ask_ai_real(prompt, test_text))
                time.sleep(0.1)  # Respectful delay
            
            # Calculate consensus; ties count as "NO"
            yes_count = responses.count("YES")
            majority_vote = "YES" if yes_count > NUM_VIRTUAL_ANNOTATORS // 2 else "NO"
            agreement = max(yes_count, NUM_VIRTUAL_ANNOTATORS - yes_count) / NUM_VIRTUAL_ANNOTATORS
            
            results_summary[scenario].append(majority_vote)
            
            print(f"   {scenario_names[scenario]:15}: {majority_vote} (agreement: {agreement:.2f}) [{'/'.join(responses)}]")
    
    # Overall consensus
    print(f"\nCONSENSUS ACROSS {len(test_demographics)} DEMOGRAPHIC GROUPS:")
    for scenario, votes in results_summary.items():
        yes_votes = votes.count("YES")
        consensus = "YES" if yes_votes > len(votes) // 2 else "NO"
        consistency = max(yes_votes, len(votes) - yes_votes) / len(votes)
        print(f"   {scenario_names[scenario]:15}: {consensus} ({consistency:.1%} consistency) {votes}")
    
    # Check for disagreement patterns across every scenario and persona
    all_votes = [vote for votes in results_summary.values() for vote in votes]
    if len(set(all_votes)) > 1:
        print(f"\nDISAGREEMENT DETECTED: This shows realistic model uncertainty!")
        print(f"   Different approaches gave different results - this is normal for complex cases")
    else:
        print(f"\nSTRONG CONSENSUS: All approaches agree")
        print(f"   Consistent results across all scenarios and demographics")
    
    # SHAP insights
    if shap_analysis['important_tokens']:
        print(f"\nSHAP INSIGHTS FROM PAPER:")
        print(f"   Key tokens identified: {', '.join(shap_analysis['important_tokens'])}")
        print(f"   These are the words that contribute most to sexism detection according to research")
    else:
        print(f"\nSHAP INSIGHTS: No high-importance tokens detected in this text")
    
    return results_summary

# Usage banner: what each custom test involves, plus sample calls to copy
usage_lines = [
    "Try your own examples with the paper methodology!",
    "\nEach test uses:",
    "OpenAI API calls",
    "SHAP tokens from research paper",
    "Demographic prompts from paper",
    "All 4 evaluation scenarios: GenAI, GenP, GenXAI, GenPXAI",
    "\nExamples:",
    'test_custom_example_enhanced("Women often bring different leadership styles to organizations")',
    'test_custom_example_enhanced("She is using her feminine charm to get ahead in business")',
    'test_custom_example_enhanced("Traditional family roles work best for society")',
]
# Joining with newlines prints the same lines as one print call per entry
print("\n".join(usage_lines))

In [None]:
# Run the full custom-example report on a sample statement.
# This issues live API calls through ask_ai_real.
test_custom_example_enhanced("Women often bring different leadership styles to organizations")

In [None]:
# Second sample statement for the custom-example report (also issues live API calls)
test_custom_example_enhanced("She's using her feminine charm to get ahead in business")