# Analyzing Abstention and Hedging Metrics

This notebook provides visualizations and analysis of the Confident-Abstain (CA) and Hedging (HEDGE) metrics computed over the experimental results.


In [7]:
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot appearance: matplotlib's seaborn whitegrid style sheet, with seaborn's
# "notebook" context scaled up slightly for readability.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("notebook", font_scale=1.2)

# Put the current working directory at the front of the module search path so
# the `abstainer` package import below can be resolved.
# NOTE(review): this is the kernel's working directory, which is the project
# root only when the notebook is launched from there — confirm.
repo_root = Path.cwd()
sys.path.insert(0, str(repo_root))

# Import the metrics analysis module
from abstainer.src.analysis import (
    MetricsAnalyzer,
    load_metrics_data,
    plot_metrics_by_form,
    plot_metrics_by_label_type,
    plot_metric_distributions,
    find_top_examples
)



## Load the Metrics Data

First, let's load the metrics data that was generated by the `run_metrics_analysis.py` script.


In [None]:
# Build the analyzer over the metrics output directory, then try to load a run.
# The analyzer itself is created outside the guard; only loading is protected.
metrics_dir = Path('../../results/metrics_analysis')
analyzer = MetricsAnalyzer(metrics_dir)

try:
    metrics_df, run_name = analyzer.load_data()
except FileNotFoundError as e:
    # Missing results on disk: explain how to generate them instead of a traceback
    print(f"Error: {e}")
    print("Please run the run_metrics_analysis.py script first.")
else:
    print(f"Using run: {run_name}")


In [None]:
# Display information about the loaded metrics data
try:
    print(f"Loaded {len(metrics_df)} metric records")
    
    # Display the first few rows and column names
    print("\nColumns in metrics_df:")
    print(metrics_df.columns.tolist())
    metrics_df.head()
except NameError:
    print("No metrics data available. Please run the run_metrics_analysis.py script first.")


## Analyze Metrics by Form

Let's analyze how the CA and HEDGE metrics vary across different prompt forms.


In [10]:

# Per-form summary of the metrics. Uses the summary already held by the
# analyzer when there is one, otherwise asks the analyzer to compute it.
try:
    # Get the form summary from the analyzer
    form_df = analyzer.form_df

    # If form_df is not loaded, compute it
    if form_df is None:
        form_df = analyzer.compute_metrics_by_form()

    # Jupyter only auto-renders the last *top-level* expression of a cell; a bare
    # `form_df` inside this `try` body would be silently discarded, so render it
    # explicitly (`display` is provided by the IPython kernel).
    display(form_df)
except NameError:
    print("No metrics data available.")

In [None]:
# Bar plot of CA scores per prompt form, drawn by the analyzer.
# Any failure is reduced to a one-line message rather than a traceback.
try:
    fig = analyzer.plot_ca_scores_by_form(form_df)
    plt.show()
except Exception as err:
    print("Error: " + str(err))

In [None]:
# Bar plot of HEDGE scores per prompt form, drawn by the analyzer.
# Any failure is reduced to a one-line message rather than a traceback.
try:
    fig = analyzer.plot_hedge_scores_by_form(form_df)
    plt.show()
except Exception as err:
    print("Error: " + str(err))


## Analyze Metrics by Label Type

Let's analyze how the metrics vary between alphabetic (A,B,C,D,E) and numeric (1,2,3,4,5) labels.


In [13]:
# Per-label-type summary of the metrics (alphabetic vs. numeric answer labels).
# Uses the summary already held by the analyzer when there is one, otherwise
# asks the analyzer to compute it.
try:
    # Get the label type summary from the analyzer
    label_df = analyzer.label_df

    # If label_df is not loaded, compute it
    if label_df is None:
        label_df = analyzer.compute_metrics_by_label_type()

    # Jupyter only auto-renders the last *top-level* expression of a cell; a bare
    # `label_df` inside this `try` body would be silently discarded, so render it
    # explicitly (`display` is provided by the IPython kernel).
    display(label_df)
except NameError:
    print("No metrics data available.")


In [None]:
# Grouped bar plot comparing CA and HEDGE scores across label types, drawn by
# the analyzer. Any failure is reduced to a one-line message.
try:
    fig = analyzer.plot_metrics_by_label_type(label_df)
    plt.show()
except Exception as err:
    print("Error: " + str(err))


## Distribution of Metrics

Let's look at the distribution of CA and HEDGE scores across all questions.


In [None]:
# Histograms of the CA and HEDGE scores, drawn by the analyzer.
# Only NameError is handled here (e.g. `analyzer` was never created because the
# load cell was not run); any other exception propagates as a normal traceback.
try:
    # Create histograms of CA and HEDGE scores using the analyzer
    fig = analyzer.plot_metric_distributions()
    plt.show()
except NameError:
    print("No metrics data available.")


## Top Examples with Highest HEDGE Scores

Let's examine the top 5 examples with the highest HEDGE scores across all experiments to better understand what types of questions lead to high hedging behavior.


In [None]:
# Print a detailed, text-only report for the examples the analyzer returns when
# asked for 5 rows ranked by 'hedge_score'.
# The outer guard handles only NameError (e.g. `analyzer` was never created);
# per-example problems are handled by the inner try/except further down.
try:
    # Find top examples with highest HEDGE scores
    top_hedge_examples = analyzer.find_top_examples_by_metric('hedge_score', 5)
    
    # Display the top examples with relevant information
    # (enumeration starts at 1 so the printed example numbers are 1-based)
    for i, (_, row) in enumerate(top_hedge_examples.iterrows(), 1):
        print(f"Example {i} - HEDGE Score: {row['hedge_score']:.4f}, CA Score: {row['ca_score']:.4f}")
        print(f"Question: {row['question']}")
        print(f"Form: {row['form']}, Permutation: {row['permutation']}, Label Type: {row['label_type']}")
        
        # Parse and display probabilities
        # NOTE: this inner try spans parsing, printing AND the HEDGE component
        # lookup, so a failure anywhere in that span is reported with the
        # "Could not parse probabilities" message, possibly after partial output.
        try:
            # Convert the JSON string to a dictionary
            # (assumes `canonical_probs` holds a JSON-encoded object — TODO confirm)
            probs = json.loads(row['canonical_probs'])
            print("Answer Probabilities (Canonical):")
            
            # Calculate Yes/No probabilities for clarity
            # Labels absent from `probs` contribute 0 to each total.
            p_yes = probs.get("YY", 0) + probs.get("Y", 0)
            p_no = probs.get("N", 0) + probs.get("NN", 0)
            p_idk = probs.get("A", 0)
            
            # Display individual probabilities
            for label, prob in probs.items():
                print(f"  {label}: {prob:.4f}")
                
            # Display aggregated probabilities
            print(f"  Total Yes (YY+Y): {p_yes:.4f}")
            print(f"  Total No (N+NN): {p_no:.4f}")
            print(f"  IDK (A): {p_idk:.4f}")
            
            # Get HEDGE score components
            # presumably a dict-like result, falsy when nothing could be
            # computed — verify against MetricsAnalyzer.analyze_hedge_components
            hedge_components = analyzer.analyze_hedge_components(row)
            
            if hedge_components:
                print("\nHEDGE Components:")
                print(f"  s (1-p_IDK): {hedge_components['s']:.4f}")
                # The ratio/entropy lines are printed only when 'r_yes' is present;
                # the other three keys are then read without a separate check.
                if 'r_yes' in hedge_components:
                    print(f"  r_yes (p_yes/s): {hedge_components['r_yes']:.4f}")
                    print(f"  r_no (p_no/s): {hedge_components['r_no']:.4f}")
                    print(f"  H(r)/log(2): {hedge_components['normalized_entropy']:.4f}")
                    print(f"  HEDGE = s * H(r)/log(2): {hedge_components['hedge_score_calc']:.4f}")
                    
        except Exception as e:
            print(f"Could not parse probabilities: {e}")
            
        # These lines run for every example, whether or not the block above failed.
        print(f"Canonical Label: {row['canonical_label']}")
        print(f"Correct Answer: {row['answer']}")
        print("-" * 80)
except NameError:
    print("No metrics data available.")
