# Abstention Direction Analysis

This notebook identifies a one-dimensional activation direction in the residual stream that linearly separates abstention from non-abstention at the answer token, focusing on forms V1 and V2, which elicit abstention more reliably.

In [1]:
# Import required packages
import logging
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

# Import from shrugger package
from shrugger import (
    ResidualVectorLoader,
    FisherLDAAnalyzer,
    DirectionAnalyzer
)

# Seed NumPy's global RNG so stochastic steps repeat across runs
SEED = 42
np.random.seed(SEED)

# Emit INFO-and-above records with timestamp, logger name, and level
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)


## 1. Load Residual Stream Data

First, we'll load the residual stream data for forms V1 and V2 using the `ResidualVectorLoader` class.


In [None]:
# Initialize the residual vector loader
loader = ResidualVectorLoader()

# Directory that holds the per-layer residual vector files
data_dir = Path("../../results/abstention_direction")

# Load residual vectors.
# Only the load call sits inside the try block, so an unrelated error in the
# reporting code below cannot be mistaken for missing data.
try:
    residual_vectors = loader.load_layer_files(data_dir=data_dir)
except FileNotFoundError as err:
    # Every later cell reads `residual_vectors`. Printing and carrying on would
    # leave it undefined on a fresh kernel (or silently reuse a stale value from
    # an earlier run), so fail here with an actionable message instead.
    raise FileNotFoundError(
        f"No layer files found in {data_dir}. "
        "Please run the data extraction process first."
    ) from err

print(f"Successfully loaded residual vectors for {len(residual_vectors['positive'])} layers")

# Show available layers (keys of the 'positive' mapping, in sorted order)
layers = sorted(residual_vectors['positive'].keys())
print(f"Available layers: {layers}")

# Show shape of the positive/negative vectors for the lowest-numbered layer
first_layer = min(layers)
pos_shape = residual_vectors['positive'][first_layer].shape
neg_shape = residual_vectors['negative'][first_layer].shape
print(f"Layer {first_layer} shapes: positive {pos_shape}, negative {neg_shape}")


## 2. Compute Fisher LDA Directions

Next, we'll compute the Fisher LDA directions using the `FisherLDAAnalyzer` class.


In [None]:
# Build the Fisher LDA analyzer (inline notes restate the notebook author's parameter descriptions)
lda_analyzer = FisherLDAAnalyzer(
    results_dir="../../results/LDA",  # directory used for LDA results
    lambda_=-1.0,                     # author's note: selects Ledoit-Wolf shrinkage estimation
    alpha=1.0,                        # author's note: identity scaling factor for the shrinkage target
)

# Prefer previously saved directions; fall back to computing them from the residual vectors
has_existing = lda_analyzer.load_existing_results()
if not has_existing:
    print("No existing LDA results found, computing new directions...")

    # One direction per layer, saved after each layer finishes
    lda_directions = lda_analyzer.compute_directions(
        residual_vectors=residual_vectors,
        save_incremental=True,
    )
    print(f"Computed directions for {len(lda_directions)} layers")
else:
    print("Loaded existing LDA results")
    lda_directions = lda_analyzer.get_directions()
    print(f"Found directions for {len(lda_directions)} layers")

# Peek at the first available direction as a sanity check
if lda_directions:
    sample_layer = next(iter(lda_directions))
    print(f"Sample direction for layer {sample_layer} has shape {lda_directions[sample_layer].shape}")


## 3. Evaluate Direction Effectiveness

Now we'll evaluate the effectiveness of the computed directions using the `DirectionAnalyzer` class.


In [None]:
# Score every layer's LDA direction against the loaded residual vectors
direction_analyzer = DirectionAnalyzer()
evaluations = direction_analyzer.evaluate_all_layers(
    lda_directions=lda_directions,
    residual_vectors=residual_vectors,
)

# Print the analyzer's summary, limited to 10 layers
direction_analyzer.print_summary(n_top=10)


## 4. Analyze Best Layer

Let's examine the best performing layer in more detail.


In [None]:
# Ask the analyzer which layer it ranks best
best_layer = direction_analyzer.get_best_layer()
print(f"Best layer: {best_layer}")

# Fetch that layer's evaluation record
best_eval = direction_analyzer.get_layer_evaluation(best_layer)

# Report the float-valued metrics to four decimals, then the class counts
for metric_label, metric_key in (
    ("AUC", "auc"),
    ("Cohen's d", "cohen_d"),
    ("Separation", "separation"),
    ("Overlap", "overlap"),
):
    print(f"{metric_label}: {best_eval[metric_key]:.4f}")
print(f"Examples: {best_eval['n_pos']} positive, {best_eval['n_neg']} negative")


## 5. Visualize Projections

Let's visualize the projections of examples onto the best direction.


In [None]:
from shrugger.src.plots import plot_projections

# Hand the best layer's evaluation record to the shared projection plotting helper
projection_title = f"Layer {best_layer} - Projections onto Abstention Direction"
plot_projections(best_eval, title=projection_title)


## 6. Compare Multiple Layers

Let's compare the performance of multiple top layers.


In [None]:
# Per-layer summary table produced by the analyzer
summary_df = direction_analyzer.get_summary_dataframe()

# Show the first ten rows, restricted to the headline columns
summary_columns = ['layer', 'auc', 'cohen_d', 'separation', 'n_pos', 'n_neg']
summary_df.head(10)[summary_columns]


In [None]:
from shrugger.src.plots import plot_layer_performance

# Per-layer performance plot (AUC and Cohen's d, per the original note);
# the best layer is passed along so the helper can use it
plot_layer_performance(
    evaluations,
    best_layer=best_layer,
)
plt.show()


## 7. Save Results

Finally, let's save our analysis results.


In [None]:
# Make sure the destination folder exists (parents included; fine if already present)
output_dir = Path('../../outputs/abstention_analysis')
output_dir.mkdir(parents=True, exist_ok=True)

# Write the per-layer summary table to CSV without the index column
summary_path = output_dir / 'layer_summary.csv'
summary_df.to_csv(summary_path, index=False)
print(f"Saved summary to {summary_path}")

# Assemble a one-row table for the best layer: the layer id first, then the
# metrics copied from its evaluation record in a fixed column order
best_layer_metric_keys = ('auc', 'cohen_d', 'separation', 'overlap', 'n_pos', 'n_neg')
best_layer_record = {'layer': best_layer}
best_layer_record.update((key, best_eval[key]) for key in best_layer_metric_keys)
best_layer_info = pd.DataFrame([best_layer_record])

# Write the best-layer row to its own CSV
best_layer_path = output_dir / 'best_layer.csv'
best_layer_info.to_csv(best_layer_path, index=False)
print(f"Saved best layer info to {best_layer_path}")


## 8. Evaluate on Development Set

Let's evaluate our abstention direction on the development set to see how well it generalizes to unseen examples.


In [None]:
from pathlib import Path
from sklearn.metrics import roc_auc_score, roc_curve
from shrugger import load_dev_data

# Folder holding the best layer's data.
# NOTE(review): the run directory name is a hard-coded timestamp; confirm it
# matches the extraction run this analysis is meant to evaluate.
best_layer_dir = (
    Path("../../results/by_layer_20250911_112921")
    / "by_layer"
    / f"layer_{best_layer}"
)

# Load the development-set vectors and metadata for forms V1 and V2
dev_forms = ['V1', 'V2']
dev_vectors, dev_metadata = load_dev_data(best_layer_dir, forms=dev_forms)


In [None]:
# Helpers for labelling, scoring, and plotting the dev-set projections
from shrugger import create_abstention_labels, calculate_abstention_metrics
from shrugger.src.plots import plot_projections, plot_roc_curve

# Dot each dev vector with the best layer's LDA direction
best_direction = lda_directions[best_layer]
dev_projections = np.dot(dev_vectors, best_direction)
print(f"Dev projections shape: {dev_projections.shape}")
print(f"Projection range: [{dev_projections.min():.4f}, {dev_projections.max():.4f}]")
print(f"Projection mean: {dev_projections.mean():.4f}, std: {dev_projections.std():.4f}")

# Derive labels from the dev metadata and count each class
# (assumes the helper returns a NumPy array of 0/1 labels - TODO confirm)
abstention_labels = create_abstention_labels(dev_metadata)
print(f"High confidence abstention examples: {np.sum(abstention_labels == 1)}")
print(f"Low confidence abstention examples: {np.sum(abstention_labels == 0)}")

# Score how well the projections separate the two label groups
metrics = calculate_abstention_metrics(dev_projections, abstention_labels)

if not metrics:
    # A falsy result from the helper is treated as "nothing to report"
    print("Not enough data to calculate metrics")
else:
    print(f"AUC: {metrics['auc']:.4f}, Cohen's d: {metrics['cohen_d']:.4f}")
    print(f"Mean High CA: {metrics['mean_high_ca']:.4f}, Mean Low CA: {metrics['mean_low_ca']:.4f}")
    print(f"Std High CA: {metrics['std_high_ca']:.4f}, Std Low CA: {metrics['std_low_ca']:.4f}")

    # Rename the metric entries to the keys passed to plot_projections
    eval_results = {
        plot_key: metrics[metric_key]
        for plot_key, metric_key in (
            ('proj_pos', 'high_ca_projections'),
            ('proj_neg', 'low_ca_projections'),
            ('mean_pos', 'mean_high_ca'),
            ('mean_neg', 'mean_low_ca'),
            ('auc', 'auc'),
            ('cohen_d', 'cohen_d'),
        )
    }

    # Projection plot for the dev set
    plot_projections(eval_results, title=f'Projections onto Abstention Direction (Layer {best_layer}) - Dev Set')

    # ROC curve over all dev labels and projections
    plot_roc_curve(
        metrics['all_labels'],
        metrics['all_projections'],
        title=f'ROC Curve for Abstention Prediction (Layer {best_layer})',
    )
