# Comprehensive Annotation Comparison
This notebook compares two kinds of annotations against the curated annotations ("gold standard"):
1. Annotations made by individual human coders
2. LLM-based annotations

The goal is to understand how individual human coders compare to the curated gold standard, and then see how the LLM-based methods perform in relation to both individual coders and the curated annotations.

## 1. Import Required Modules

In [None]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict, List, Optional, Any, Union, Tuple

# Add parent directory to path to import our modules
sys.path.append('..')

# Import our custom modules
from src.utils import (
    get_project_root, 
    load_coding_scheme, 
    filter_coding_scheme,
    load_raw_text,
    load_curated_annotations,
    load_coder_annotations,
    get_articles_paths
)

from src.evaluation import (
    evaluate_article,
    compare_coder_to_curated,
    get_absolute_metrics
)

## 2. Configure Evaluation Parameters

In [None]:
# --- Evaluation target ---------------------------------------------------
# Identifier of the article whose annotations are compared.
article_id = "EU_32018R1999_Title_0_Chapter_6_Section_3_Article_37"

# Annotation layer and tagset the comparison is restricted to.
target_layer = "Policydesigncharacteristics"
target_tagset = "Actor"

# Boundary tolerance, in characters.
tolerance = 15

# --- Locations of the LLM-generated annotations --------------------------
root_dir = get_project_root()
article_dir = os.path.join(root_dir, 'data', '03b_processed_to_json', article_id)
standard_path = os.path.join(article_dir, 'Generated_Annotations_Standard.json')
extended_path = os.path.join(article_dir, 'Generated_Annotations_Extended.json')

# Warn (without stopping) about every LLM annotation file that is missing.
_missing_file_warnings = (
    (standard_path, f"Warning: Standard annotations file not found at {standard_path}"),
    (extended_path, f"Warning: Extended annotations file not found at {extended_path}"),
)
for _annotation_file, _warning in _missing_file_warnings:
    if not os.path.exists(_annotation_file):
        print(_warning)

## 3. Helper Functions

In [None]:
def get_coders_for_article(article_id: str) -> List[str]:
    """Return the coder IDs found in the coder annotations of ``article_id``.

    The IDs are the keys of the mapping returned by
    ``load_coder_annotations``. The lookup is best-effort: a
    ``FileNotFoundError`` silently yields an empty list, and any other
    exception is printed and also yields an empty list.
    """
    coder_ids: List[str] = []
    try:
        coder_ids = list(load_coder_annotations(article_id).keys())
    except FileNotFoundError:
        # Treated as "no coders for this article", not as an error.
        pass
    except Exception as e:
        print(f"Error loading coder annotations for {article_id}: {e}")
    return coder_ids

def create_comparison_dataframe(article_id: str, layers: List[str], tagsets: List[str], tolerance: int = 15) -> pd.DataFrame:
    """Build a per-annotator table of scores measured against the curated annotations.

    A row is produced for every coder returned by ``get_coders_for_article``
    and, when the corresponding file exists, for the ``'Standard'`` and
    ``'Extended'`` LLM outputs. An annotator whose evaluation raises is
    reported via ``print`` and left out of the table.

    Args:
        article_id: Article to evaluate.
        layers: Layers forwarded to the evaluation functions.
        tagsets: Tagsets forwarded to the evaluation functions.
        tolerance: Tolerance forwarded to the evaluation functions.

    Returns:
        DataFrame with one row per evaluated annotator and the columns
        'Annotator', 'Span F1', 'Tag Accuracy', 'Full Match Acc', 'Combined',
        'Correct Anno', 'Extra Anno', 'Missed Anno', 'Total Anno' and
        'Total Curated'.
    """
    # Evaluation result per annotator, in insertion order: coders, Standard, Extended.
    per_annotator = {}

    # Human coders versus the curated annotations.
    for coder_id in get_coders_for_article(article_id):
        try:
            per_annotator[coder_id] = compare_coder_to_curated(
                article_id=article_id,
                coder_id=coder_id,
                layers=layers,
                tagsets=tagsets,
                tolerance=tolerance,
            )
        except Exception as e:
            print(f"Error comparing coder {coder_id} to curated annotations: {e}")

    # LLM outputs versus the curated annotations: (row label, file name, word used in the error message).
    llm_variants = (
        ('Standard', 'Generated_Annotations_Standard.json', 'standard'),
        ('Extended', 'Generated_Annotations_Extended.json', 'extended'),
    )
    for label, file_name, noun in llm_variants:
        try:
            generated_path = os.path.join(
                get_project_root(), 'data', '03b_processed_to_json', article_id, file_name
            )
            # A missing file simply means this variant gets no row.
            if os.path.exists(generated_path):
                per_annotator[label] = evaluate_article(
                    article_id=article_id,
                    generated_path=generated_path,
                    layers=layers,
                    tagsets=tagsets,
                    save_results=False,
                    tolerance=tolerance,
                )
        except Exception as e:
            print(f"Error evaluating {noun} annotations: {e}")

    # Flatten each evaluation result into one table row.
    rows = []
    for annotator, result in per_annotator.items():
        abs_metrics = get_absolute_metrics(result)
        summary = result['summary']
        row = {'Annotator': annotator, 'Span F1': summary['span_f1']}
        tag_assignment = result['tag_assignment']
        row['Tag Accuracy'] = tag_assignment['tag_accuracy']
        row['Full Match Acc'] = tag_assignment['full_match_accuracy']
        row['Combined'] = summary['combined_score']
        row['Correct Anno'] = abs_metrics['correct_annotations']
        row['Extra Anno'] = abs_metrics['extra_annotations']
        row['Missed Anno'] = abs_metrics['missed_annotations']
        # The two totals fall back to 0 when the key is absent.
        row['Total Anno'] = abs_metrics.get('total_generated', 0)
        row['Total Curated'] = abs_metrics.get('total_curated', 0)
        rows.append(row)

    return pd.DataFrame(rows)

## 4. Single Article Evaluation

In [None]:
# List the human coders whose annotations are available for the configured article.
coders = get_coders_for_article(article_id)
print(f"Found {len(coders)} coders for article {article_id}: {', '.join(coders)}")

# Score every available annotator (coders and LLM outputs) against the curated annotations.
comparison_df = create_comparison_dataframe(
    article_id=article_id,
    layers=[target_layer],
    tagsets=[target_tagset],
    tolerance=tolerance
)

if comparison_df.empty:
    # create_comparison_dataframe only prints evaluation errors, so it can return
    # a frame without rows or columns; set_index('Annotator') would then raise
    # KeyError. Report the situation instead of crashing.
    print(f"No comparison results available for article {article_id}.")
else:
    print(f"\nComparison for article {article_id}, layer {target_layer}, tagset {target_tagset} (tolerance: ±{tolerance} characters):\n")

    # Index by annotator so the tables and the plots are labelled by annotator.
    comparison_df = comparison_df.set_index('Annotator')
    display(comparison_df)

    # Same table with fixed number formatting: 4 decimals for scores, integers for counts.
    styled_df = comparison_df.style.format({
        'Span F1': '{:.4f}',
        'Tag Accuracy': '{:.4f}',
        'Full Match Acc': '{:.4f}',
        'Combined': '{:.4f}',
        'Correct Anno': '{:.0f}',
        'Extra Anno': '{:.0f}',
        'Missed Anno': '{:.0f}',
        'Total Anno': '{:.0f}',
        'Total Curated': '{:.0f}'
    })
    display(styled_df)

## 5. Visualize the Results

In [None]:
# Bar charts of the key metrics, one bar per annotator.
metrics = ['Span F1', 'Full Match Acc', 'Correct Anno', 'Extra Anno']
titles = ['Span F1 Score', 'Full Match Accuracy', 'Correctly Annotated Spans', 'Extra Annotations']

if comparison_df.empty:
    print("No comparison results to plot.")
else:
    # One colour per bar, i.e. per annotator in the index: human coders share the
    # default blue and the two LLM variants are highlighted. The list is built
    # from the index so it always has exactly as many entries as there are bars.
    llm_colors = {'Standard': '#ff7f0e', 'Extended': '#2ca02c'}
    colors = [llm_colors.get(annotator, '#1f77b4') for annotator in comparison_df.index]

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    for ax, metric, title in zip(axes.flatten(), metrics, titles):
        comparison_df[metric].plot(kind='bar', ax=ax, color=colors)
        ax.set_title(title)
        ax.set_ylabel(metric)

        # Value labels above the bars: 4 decimals for scores, integers for counts.
        for j, v in enumerate(comparison_df[metric]):
            if metric in ['Span F1', 'Full Match Acc']:
                ax.text(j, v + 0.01, f"{v:.4f}", ha='center')
            else:
                ax.text(j, v + 0.5, f"{int(v)}", ha='center')

    plt.tight_layout()
    plt.show()

## 6. Multi-Article Evaluation

In [None]:
# Get a list of all articles
all_articles = get_articles_paths()
print(f"Found {len(all_articles)} articles in total")

# Keep the articles that have coder annotations AND both LLM annotation files.
# Loop-specific names are used on purpose: reusing `article_id`, `coders`,
# `standard_path` or `extended_path` here would overwrite the notebook-level
# values configured for the single-article evaluation.
llm_annotation_files = (
    'Generated_Annotations_Standard.json',
    'Generated_Annotations_Extended.json',
)
valid_articles = []
for article in all_articles:
    candidate_id = article['id']
    candidate_coders = get_coders_for_article(candidate_id)
    candidate_dir = os.path.join(root_dir, 'data', '03b_processed_to_json', candidate_id)

    # The file checks are skipped when the article has no coders.
    if candidate_coders and all(
        os.path.exists(os.path.join(candidate_dir, file_name))
        for file_name in llm_annotation_files
    ):
        valid_articles.append({
            'id': candidate_id,
            'coders': candidate_coders
        })

print(f"Found {len(valid_articles)} articles with both coder and LLM annotations")
print(f"First 5 valid articles: {[a['id'] for a in valid_articles[:5]]}")

In [None]:
# If there are too many articles, limit to a smaller sample
sample_size = min(5, len(valid_articles))
sample_articles = valid_articles[:sample_size]

# One comparison DataFrame per article; they are concatenated below.
all_comparisons = []

for article in sample_articles:
    # A loop-specific name keeps the notebook-level `article_id` (the article
    # configured for the single-article evaluation) untouched.
    sample_id = article['id']
    try:
        df = create_comparison_dataframe(
            article_id=sample_id,
            layers=[target_layer],
            tagsets=[target_tagset],
            tolerance=tolerance
        )
    except Exception as e:
        print(f"Error processing article {sample_id}: {e}")
        continue

    # An empty frame means no annotator could be evaluated; leave it out so the
    # article is not counted as a result below.
    if df.empty:
        print(f"No comparison results for article {sample_id}; skipping.")
        continue

    df['Article'] = sample_id  # Add article ID as a column
    all_comparisons.append(df)

# Combine all comparisons into a single DataFrame
if all_comparisons:
    combined_df = pd.concat(all_comparisons, ignore_index=True)
    print(f"\nCombined results across {len(all_comparisons)} articles:")
    display(combined_df.head())
else:
    print("No comparison data available for the selected articles.")

In [None]:
if all_comparisons:
    # Per-annotator aggregation over the combined rows: scores are averaged,
    # annotation counts are summed, and the curated total is averaged.
    score_columns = ['Span F1', 'Tag Accuracy', 'Full Match Acc', 'Combined']
    count_columns = ['Correct Anno', 'Extra Anno', 'Missed Anno', 'Total Anno']
    aggregated_df = combined_df.groupby('Annotator').agg({
        **dict.fromkeys(score_columns, 'mean'),
        **dict.fromkeys(count_columns, 'sum'),
        'Total Curated': 'mean',
    })

    print("\nAverage performance by annotator type:")
    # Fixed number formatting: 4 decimals for scores, integers for the summed
    # counts, one decimal for the averaged curated total.
    styled_agg_df = aggregated_df.style.format({
        **dict.fromkeys(score_columns, '{:.4f}'),
        **dict.fromkeys(count_columns, '{:.0f}'),
        'Total Curated': '{:.1f}',
    })
    display(styled_agg_df)

    # Bar charts of the aggregated metrics, one bar per annotator.
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    metrics = ['Span F1', 'Full Match Acc', 'Correct Anno', 'Extra Anno']
    titles = ['Average Span F1 Score', 'Average Full Match Accuracy',
              'Total Correctly Annotated Spans', 'Total Extra Annotations']

    # Everything that is not one of the two LLM labels counts as a human coder
    # and is drawn in the default blue; the LLM variants get their own colours.
    llm_colors = {'Standard': '#ff7f0e', 'Extended': '#2ca02c'}
    coder_ids = [name for name in aggregated_df.index if name not in llm_colors]
    colors = [llm_colors.get(name, '#1f77b4') for name in aggregated_df.index]

    ratio_metrics = ('Span F1', 'Full Match Acc')
    for ax, metric, title in zip(axes.flatten(), metrics, titles):
        aggregated_df[metric].plot(kind='bar', ax=ax, color=colors)
        ax.set_title(title)
        ax.set_ylabel(metric)

        # Value labels above the bars: 4 decimals for scores, integers for counts.
        is_ratio = metric in ratio_metrics
        for j, v in enumerate(aggregated_df[metric]):
            ax.text(j, v + (0.01 if is_ratio else 0.5),
                    f"{v:.4f}" if is_ratio else f"{int(v)}", ha='center')

    plt.tight_layout()
    plt.show()

    # Summary table: mean over all human-coder rows versus mean over the rows
    # of each LLM variant that is present in the combined results.
    annotator_column = combined_df['Annotator']
    comparison_data = {
        'Human Coders': combined_df[annotator_column.isin(coder_ids)].mean(numeric_only=True)
    }
    for summary_label, annotator_name in (('Standard LLM', 'Standard'), ('Extended LLM', 'Extended')):
        if annotator_name in annotator_column.values:
            comparison_data[summary_label] = combined_df[annotator_column == annotator_name].mean(numeric_only=True)

    comparison_summary = pd.DataFrame(comparison_data).T[metrics]
    print("\nComparison summary (human coders vs LLM methods):")
    display(comparison_summary.style.format({
        'Span F1': '{:.4f}',
        'Full Match Acc': '{:.4f}',
        'Correct Anno': '{:.1f}',
        'Extra Anno': '{:.1f}',
    }))

## 7. Conclusion

This notebook provides a comprehensive comparison between:
1. Individual human coder annotations and the curated "gold standard" annotations
2. LLM-based annotations (standard and extended schemes) and the curated annotations

Key findings:
* [Add your observations here based on the results]
* [Note any patterns in how individual coders compare to the gold standard]
* [Compare LLM performance to individual coders]
* [Discuss the impact of using the extended scheme with examples]

The results suggest that [add your conclusions here].

**Future work:**
- Run this analysis across a larger set of articles to get more robust findings
- Explore additional layers and tagsets beyond Actor
- Investigate factors that contribute to differences between individual coders and the curated standard