In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
from datetime import datetime

# Notebook-wide matplotlib defaults: sans-serif text, 12pt base/axis-label
# size, 10pt tick and legend text, and 2pt lines.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 10
plt.rcParams['lines.linewidth'] = 2

In [None]:
# Setup paths
# Candidate directories holding the ripple-bench result CSVs, in priority
# order. The machine-specific checkout is tried first so the notebook keeps
# behaving as before on that machine; everywhere else the repository-relative
# directory is used instead of a path that cannot exist.
DATA_DIR_CANDIDATES = (
    Path("/Users/roy/data/ripple_bench/parallelized"),
    Path("data/eval_parallelized/"),
)
BASE_DIR = next(
    (candidate for candidate in DATA_DIR_CANDIDATES if candidate.exists()),
    DATA_DIR_CANDIDATES[-1],  # nothing found: fall back to the relative path
)

# Result CSV of each base (not unlearned) model, keyed by model name.
BASE_RESULTS = {
    "llama": BASE_DIR / "Llama-3-8b-Instruct_ripple_results.csv",
    "zephyr": BASE_DIR / "zephyr-7b-beta_ripple_results.csv",
}

# Result CSVs of the unlearned checkpoints: model name -> method name -> path.
# Files may be absent; the analysis functions check for existence themselves.
UNLEARNING_RESULTS = {
    "llama": {
        "elm": BASE_DIR / "llama-3-8b-instruct-elm-ckpt7_ripple_results.csv",
        "graddiff": BASE_DIR / "llama-3-8b-instruct-graddiff-ckpt8_ripple_results.csv",
        "pbj": BASE_DIR / "llama-3-8b-instruct-pbj-ckpt6_ripple_results.csv",
        "repnoise": BASE_DIR / "llama-3-8b-instruct-repnoise-ckpt6_ripple_results.csv",
        "rmu": BASE_DIR / "llama-3-8b-instruct-rmu-ckpt6_ripple_results.csv",
        "rmu_lat": BASE_DIR / "llama-3-8b-instruct-rmu-lat-ckpt7_ripple_results.csv",
        "rr": BASE_DIR / "llama-3-8b-instruct-rr-ckpt8_ripple_results.csv",
        "tar": BASE_DIR / "llama-3-8b-instruct-tar-ckpt8_ripple_results.csv",
    },
    "zephyr": {
        "elm": BASE_DIR / "zephyr-7b-elm_ripple_results.csv",
    }
}

# Output directory for every saved figure; created on first run.
PLOT_DIR = Path("plots/")
PLOT_DIR.mkdir(parents=True, exist_ok=True)

## RAG Distance Distribution Analysis

RAG distance measures how far a topic is from the WMDP topics in the Wikipedia RAG retrieval ranking. A distance of 0 denotes the WMDP topics themselves.

In [None]:
# Read both base-model result tables (Llama first, then Zephyr) so their
# RAG distances can be inspected below.
df_llama, df_zephyr = (
    pd.read_csv(BASE_RESULTS[model_key]) for model_key in ("llama", "zephyr")
)

# One print call, one line per item: the emitted text is the same as
# printing each string separately.
print(
    "Data structure:",
    f"Columns: {df_llama.columns.tolist()}",
    f"\nLlama data points: {len(df_llama)}",
    f"Zephyr data points: {len(df_zephyr)}",
    f"\nDistance range (Llama): {df_llama['distance'].min()} - {df_llama['distance'].max()}",
    f"Distance range (Zephyr): {df_zephyr['distance'].min()} - {df_zephyr['distance'].max()}",
    sep="\n",
)

In [None]:
# Analyze RAG distance distribution: histogram (left) and empirical CDF (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

llama_dist = df_llama['distance']
zephyr_dist = df_zephyr['distance']

# Left panel: overlaid histograms sharing the same bin count and styling
hist_style = dict(bins=50, alpha=0.7, edgecolor='black')
ax1.hist(llama_dist, label='Llama', color='blue', **hist_style)
ax1.hist(zephyr_dist, label='Zephyr', color='orange', **hist_style)
ax1.set(xlabel='RAG Distance', ylabel='Count', title='Distribution of RAG Distances')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: empirical CDF, i.e. sorted distances against rank / n
llama_distances_sorted = np.sort(llama_dist)
zephyr_distances_sorted = np.sort(zephyr_dist)
n_llama = len(llama_distances_sorted)
n_zephyr = len(zephyr_distances_sorted)
llama_cumulative = np.arange(1, n_llama + 1) / n_llama
zephyr_cumulative = np.arange(1, n_zephyr + 1) / n_zephyr

ax2.plot(llama_distances_sorted, llama_cumulative, label='Llama', linewidth=2)
ax2.plot(zephyr_distances_sorted, zephyr_cumulative, label='Zephyr', linewidth=2)
ax2.set(
    xlabel='RAG Distance',
    ylabel='Cumulative Proportion',
    title='Cumulative Distribution of RAG Distances',
)
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_xlim(0, 100)  # only the first 100 distance units are shown

plt.tight_layout()

# Save the figure under a timestamped name in both raster and vector form
date_str = datetime.now().strftime("%Y%m%d-%H%M%S")
filename = f"rag_distance_distribution_{date_str}"
plt.savefig(PLOT_DIR / f"{filename}.png", dpi=150, bbox_inches='tight')
plt.savefig(PLOT_DIR / f"{filename}.pdf", bbox_inches='tight')
plt.show()

# Summary statistics of the distance column for each model
print("\nRAG Distance Statistics:")
print(f"Llama - Mean: {llama_dist.mean():.2f}, Median: {llama_dist.median():.2f}, Std: {llama_dist.std():.2f}")
print(f"Zephyr - Mean: {zephyr_dist.mean():.2f}, Median: {zephyr_dist.median():.2f}, Std: {zephyr_dist.std():.2f}")

## RAG Distance vs Accuracy

In [None]:
def plot_rag_vs_accuracy(model="llama", bucket_size=10, max_distance=100):
    """Plot accuracy against bucketed RAG distance for base and unlearned models.

    Draws a two-panel figure. The left panel shows the base model alone; the
    right panel repeats the base curve and overlays every unlearning method in
    ``UNLEARNING_RESULTS[model]`` whose result file exists on disk (missing
    files are skipped). Error bars are the standard error of the mean of
    ``is_correct`` within each bucket. The figure is written to ``PLOT_DIR``
    as PNG and PDF under a timestamped name and then shown.

    Args:
        model: Key into ``BASE_RESULTS`` and ``UNLEARNING_RESULTS``.
        bucket_size: Bucket width; a row is assigned to bucket
            ``(distance // bucket_size) * bucket_size``.
        max_distance: Rows with ``distance >= max_distance`` are discarded.

    Returns:
        None.
    """

    def _bucketed_accuracy(results):
        """Return per-bucket mean/std/count/sem of ``is_correct``."""
        # .assign builds a new frame, so the bucket column is never written
        # into a filtered slice of `results` (avoids SettingWithCopyWarning).
        bucketed = results[results['distance'] < max_distance].assign(
            distance_bucket=lambda d: (d['distance'] // bucket_size) * bucket_size
        )
        summary = bucketed.groupby('distance_bucket')['is_correct'].agg(['mean', 'std', 'count'])
        summary['sem'] = summary['std'] / np.sqrt(summary['count'])
        return summary

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Base model accuracy per distance bucket
    base_accuracy = _bucketed_accuracy(pd.read_csv(BASE_RESULTS[model]))

    # Plot 1: Base model accuracy vs RAG distance
    ax1.errorbar(base_accuracy.index, base_accuracy['mean'] * 100,
                yerr=base_accuracy['sem'] * 100,
                marker='o', linewidth=2, markersize=8, capsize=5,
                color='black', label=f'{model.title()} Base')

    ax1.set_xlabel('RAG Distance (bucketed)', fontsize=12)
    ax1.set_ylabel('Accuracy (%)', fontsize=12)
    ax1.set_title(f'Base Model Accuracy vs RAG Distance\n{model.title()}', fontsize=14)
    ax1.grid(True, alpha=0.3)
    ax1.legend()
    ax1.set_ylim(0, 100)

    # Plot 2: Compare base vs unlearned across RAG distances
    METHOD_COLORS = {
        'elm': '#FF6B6B', 'rmu': '#4ECDC4', 'graddiff': '#95E77E',
        'pbj': '#FFD93D', 'tar': '#A8E6CF', 'rmu_lat': '#FF8B94',
        'repnoise': '#B4A7D6', 'rr': '#FFB347'
    }

    # Base curve again, drawn on top (zorder=10) as the reference
    ax2.errorbar(base_accuracy.index, base_accuracy['mean'] * 100,
                yerr=base_accuracy['sem'] * 100,
                marker='o', linewidth=3, markersize=8, capsize=5,
                color='black', label='Base', alpha=0.9, zorder=10)

    # One curve per unlearning method whose results are available
    for method, path in UNLEARNING_RESULTS[model].items():
        if not path.exists():
            continue

        unlearn_accuracy = _bucketed_accuracy(pd.read_csv(path))

        # Methods without a dedicated colour fall back to grey
        color = METHOD_COLORS.get(method, '#888888')
        ax2.errorbar(unlearn_accuracy.index, unlearn_accuracy['mean'] * 100,
                    yerr=unlearn_accuracy['sem'] * 100,
                    marker='s', linewidth=2, markersize=6, capsize=3,
                    color=color, label=method.upper().replace('_', '-'), alpha=0.7)

    ax2.set_xlabel('RAG Distance (bucketed)', fontsize=12)
    ax2.set_ylabel('Accuracy (%)', fontsize=12)
    ax2.set_title(f'RAG Distance vs Accuracy: All Methods\n{model.title()}', fontsize=14)
    ax2.legend(loc='best', ncol=2, fontsize=9)
    ax2.grid(True, alpha=0.3)
    ax2.set_ylim(0, 100)

    plt.tight_layout()

    # Save
    date_str = datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = f"rag_distance_vs_accuracy_{model}_{date_str}"
    plt.savefig(PLOT_DIR / f"{filename}.png", dpi=150, bbox_inches='tight')
    plt.savefig(PLOT_DIR / f"{filename}.pdf", bbox_inches='tight')
    plt.show()

# Generate plots
plot_rag_vs_accuracy(model="llama")

## RAG Distance Correlation Analysis

In [None]:
def analyze_rag_correlations(model="llama", min_base_accuracy=0.4, min_topics=10):
    """Correlate RAG distance with the accuracy drop caused by unlearning.

    Results are aggregated per ``source_topic`` when that column is present
    in a results file, otherwise per ``question``. For every unlearning method
    in ``UNLEARNING_RESULTS[model]`` whose file exists, the per-group accuracy
    delta (base minus unlearned) is correlated with the group's distance and
    with three monotone transforms of it.

    Args:
        model: Key into ``BASE_RESULTS`` and ``UNLEARNING_RESULTS``.
        min_base_accuracy: Only groups whose base accuracy is strictly above
            this value are used.
        min_topics: A method is reported only when strictly more than this
            many groups remain after filtering.

    Returns:
        pandas.DataFrame with one row per reported method and the columns
        ``Method``, ``RAG Pearson``, ``RAG Spearman``, ``Log(1+d) Pearson``,
        ``1/(1+d) Pearson``, ``exp(-d/10) Pearson`` and ``N Topics``. The
        frame has zero rows (but the same columns) when no method qualifies.
    """
    result_columns = [
        'Method', 'RAG Pearson', 'RAG Spearman', 'Log(1+d) Pearson',
        '1/(1+d) Pearson', 'exp(-d/10) Pearson', 'N Topics',
    ]

    def _grouping_key(frame):
        """Pick the aggregation column: topics if available, else questions."""
        return 'source_topic' if 'source_topic' in frame.columns else 'question'

    # Load base model data
    df_base = pd.read_csv(BASE_RESULTS[model])

    if _grouping_key(df_base) == 'source_topic':
        topic_stats_base = df_base.groupby('source_topic').agg({
            'is_correct': 'mean',
            'distance': 'first',  # first distance seen for the topic
            'question': 'count'
        }).rename(columns={'is_correct': 'base_accuracy', 'question': 'n_questions'})
    else:
        # Group by question if no source_topic
        topic_stats_base = df_base.groupby('question').agg({
            'is_correct': 'mean',
            'distance': 'first'
        }).rename(columns={'is_correct': 'base_accuracy'})
        topic_stats_base['n_questions'] = 1

    correlation_results = []

    # Analyze each unlearning method whose results are available
    for method, path in UNLEARNING_RESULTS[model].items():
        if not path.exists():
            continue

        df_unlearn = pd.read_csv(path)
        topic_stats_unlearn = (
            df_unlearn.groupby(_grouping_key(df_unlearn))['is_correct']
            .mean()
            .rename('unlearn_accuracy')
        )

        # Keep only groups present in both the base and the unlearned results
        merged_stats = topic_stats_base.join(topic_stats_unlearn, how='inner')

        # Positive delta = the unlearned model is less accurate than the base
        merged_stats['accuracy_delta'] = merged_stats['base_accuracy'] - merged_stats['unlearn_accuracy']

        # Filter for topics with reasonable base accuracy
        filtered_stats = merged_stats[merged_stats['base_accuracy'] > min_base_accuracy]
        # A single NaN would turn every correlation below into NaN
        filtered_stats = filtered_stats.dropna(subset=['distance', 'accuracy_delta'])

        if len(filtered_stats) <= min_topics:
            continue

        rag_dist = filtered_stats['distance'].values
        acc_delta = filtered_stats['accuracy_delta'].values

        # RAG distance correlations
        pearson_r, _ = stats.pearsonr(rag_dist, acc_delta)
        spearman_r, _ = stats.spearmanr(rag_dist, acc_delta)

        # Alternative distance transformations
        log_pearson_r, _ = stats.pearsonr(np.log1p(rag_dist), acc_delta)
        inv_pearson_r, _ = stats.pearsonr(1 / (1 + rag_dist), acc_delta)
        exp_pearson_r, _ = stats.pearsonr(np.exp(-rag_dist / 10), acc_delta)

        correlation_results.append({
            'Method': method.upper(),
            'RAG Pearson': pearson_r,
            'RAG Spearman': spearman_r,
            'Log(1+d) Pearson': log_pearson_r,
            '1/(1+d) Pearson': inv_pearson_r,
            'exp(-d/10) Pearson': exp_pearson_r,
            'N Topics': len(filtered_stats)
        })

    # Explicit columns keep the schema stable even when nothing qualified
    return pd.DataFrame(correlation_results, columns=result_columns)

In [None]:
# Analyze correlations for Llama
corr_df = analyze_rag_correlations(model="llama")

# Report and plot only when at least one method produced a result row
if len(corr_df):
    print("RAG Distance Correlation Analysis (Llama):")
    print("=" * 80)
    print(corr_df.to_string(index=False))

    # Grouped bar chart: one group per method, one bar per distance metric
    fig, ax = plt.subplots(figsize=(12, 6))

    metrics = ['RAG Pearson', 'RAG Spearman', 'Log(1+d) Pearson', '1/(1+d) Pearson', 'exp(-d/10) Pearson']
    x = np.arange(len(corr_df))
    width = 0.15

    # The five bars sit at -2..+2 bar widths around each group's tick
    for shift, metric in zip(range(-2, 3), metrics):
        ax.bar(x + shift * width, corr_df[metric], width, label=metric, alpha=0.8)

    ax.set_xlabel('Unlearning Method', fontsize=12)
    ax.set_ylabel('Correlation with Accuracy Delta', fontsize=12)
    ax.set_title('RAG Distance Metrics: Correlation with Unlearning Effect', fontsize=14)

    ax.set_xticks(x)
    ax.set_xticklabels(corr_df['Method'])

    # Zero line separates positive from negative correlations
    ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax.legend(loc='best', ncol=3, fontsize=9)
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    # Save under a timestamped name as PNG and PDF
    date_str = datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = f"rag_distance_correlations_{date_str}"
    plt.savefig(PLOT_DIR / f"{filename}.png", dpi=150, bbox_inches='tight')
    plt.savefig(PLOT_DIR / f"{filename}.pdf", bbox_inches='tight')
    plt.show()

## RAG Distance vs Unlearning Delta (Scatter Plots)

In [None]:
def plot_rag_vs_delta_scatter(model="llama", method="elm", max_distance=100):
    """Plot RAG distance against the accuracy delta of one unlearning method.

    Base and unlearned results are joined per question (on ``question_id``
    when the base results have that column, otherwise on ``question``) and
    shown in a 2x2 figure: raw per-question scatter, mean delta per distance
    value, a hexbin density plot, and a text panel with Pearson correlations
    of the per-distance mean delta against several distance transforms. The
    figure is saved to ``PLOT_DIR`` as PNG and PDF and a short summary is
    printed.

    Args:
        model: Key into ``BASE_RESULTS`` and ``UNLEARNING_RESULTS``.
        method: Key into ``UNLEARNING_RESULTS[model]``.
        max_distance: Rows with ``distance >= max_distance`` are discarded.

    Returns:
        None. Prints a message and returns early when the method is unknown
        or its results file does not exist.
    """

    if method not in UNLEARNING_RESULTS[model]:
        print(f"Method {method} not found for {model}")
        return

    # Skip methods whose results are not on disk, as the other analysis
    # functions in this notebook do, instead of failing inside read_csv.
    unlearn_path = UNLEARNING_RESULTS[model][method]
    if not unlearn_path.exists():
        print(f"Results file for {method} not found: {unlearn_path}")
        return

    # Load data
    df_base = pd.read_csv(BASE_RESULTS[model])
    df_unlearn = pd.read_csv(unlearn_path)

    # Merge on question (or question_id if available)
    merge_col = 'question_id' if 'question_id' in df_base.columns else 'question'

    merged = pd.merge(
        df_base[[merge_col, 'is_correct', 'distance']],
        df_unlearn[[merge_col, 'is_correct']],
        on=merge_col,
        suffixes=('_base', '_unlearn')
    )

    # Per-question delta, computed once. The cast matters when `is_correct`
    # was parsed as bool: subtracting two boolean columns is not supported.
    merged['delta'] = (
        merged['is_correct_base'].astype(float)
        - merged['is_correct_unlearn'].astype(float)
    )

    # Filter by distance
    merged = merged[merged['distance'] < max_distance]
    question_delta = merged['delta']

    # Group by distance to get average delta
    distance_groups = merged.groupby('distance').agg({
        'is_correct_base': 'mean',
        'is_correct_unlearn': 'mean'
    })
    distance_groups['accuracy_delta'] = distance_groups['is_correct_base'] - distance_groups['is_correct_unlearn']

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))

    # Plot 1: Raw scatter, coloured by distance
    ax1 = axes[0, 0]
    ax1.scatter(merged['distance'], question_delta,
                alpha=0.3, s=10, c=merged['distance'], cmap='viridis')
    ax1.axhline(y=0, color='red', linestyle='--', alpha=0.5)
    ax1.set_xlabel('RAG Distance')
    ax1.set_ylabel('Accuracy Delta (Base - Unlearned)')
    ax1.set_title(f'Individual Questions: RAG Distance vs Delta\n{model.title()} - {method.upper()}')
    ax1.grid(True, alpha=0.3)

    # Plot 2: Mean delta for each distinct distance value
    ax2 = axes[0, 1]
    ax2.plot(distance_groups.index, distance_groups['accuracy_delta'],
            marker='o', linewidth=2, markersize=6, color='darkblue')
    ax2.axhline(y=0, color='red', linestyle='--', alpha=0.5)
    ax2.set_xlabel('RAG Distance')
    ax2.set_ylabel('Mean Accuracy Delta')
    ax2.set_title('Average Delta by RAG Distance')
    ax2.grid(True, alpha=0.3)

    # Plot 3: Hexbin for density
    ax3 = axes[1, 0]
    hexbin = ax3.hexbin(merged['distance'], question_delta,
                        gridsize=30, cmap='YlOrRd', mincnt=1)
    ax3.axhline(y=0, color='white', linestyle='--', alpha=0.7)
    ax3.set_xlabel('RAG Distance')
    ax3.set_ylabel('Accuracy Delta (Base - Unlearned)')
    ax3.set_title('Density Plot: RAG Distance vs Delta')
    plt.colorbar(hexbin, ax=ax3, label='Count')

    # Plot 4: Different distance transformations
    ax4 = axes[1, 1]

    # Correlations use the per-distance means, not the individual questions
    valid_data = distance_groups.dropna()
    distances = valid_data.index.values
    deltas = valid_data['accuracy_delta'].values

    transformations = {
        'RAG Distance': distances,
        'Log(1 + d)': np.log1p(distances),
        '1/(1 + d)': 1 / (1 + distances),
        'exp(-d/10)': np.exp(-distances / 10)
    }

    correlations = []
    for name, transformed in transformations.items():
        # With too few distinct distances no correlation is reported
        if len(transformed) > 2:
            r, _ = stats.pearsonr(transformed, deltas)
            correlations.append(f"{name}: r={r:.3f}")

    # Plot correlations as text
    ax4.text(0.1, 0.5, '\n'.join(correlations),
            transform=ax4.transAxes, fontsize=12, verticalalignment='center')
    ax4.set_title('Distance Metric Correlations')
    ax4.axis('off')

    plt.suptitle(f'RAG Distance Analysis: {model.title()} - {method.upper()}', fontsize=16, y=1.02)
    plt.tight_layout()

    # Save
    date_str = datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = f"rag_distance_scatter_{model}_{method}_{date_str}"
    plt.savefig(PLOT_DIR / f"{filename}.png", dpi=150, bbox_inches='tight')
    plt.savefig(PLOT_DIR / f"{filename}.pdf", bbox_inches='tight')
    plt.show()

    # Print summary statistics
    print(f"\nSummary for {model.title()} - {method.upper()}:")
    print(f"Total questions analyzed: {len(merged)}")
    print(f"Mean accuracy delta: {question_delta.mean():.4f}")
    print(f"Questions with positive delta (hurt by unlearning): {(question_delta > 0).sum()}")
    print(f"Questions with negative delta (helped): {(question_delta < 0).sum()}")

# Generate scatter plots for different methods
plot_rag_vs_delta_scatter(model="llama", method="elm")
plot_rag_vs_delta_scatter(model="llama", method="rmu")

## Compare RAG Distance Effects Across All Methods

In [None]:
def compare_rag_effects_all_methods(model="llama", bucket_size=5, max_distance=50):
    """Compare accuracy and accuracy drop across all unlearning methods.

    Draws a two-panel figure. The left panel shows mean accuracy per distance
    bucket for the base model and for every unlearning method in
    ``UNLEARNING_RESULTS[model]`` whose file exists. The right panel shows the
    delta (base minus unlearned) on the buckets both curves share. The figure
    is saved to ``PLOT_DIR`` as PNG and PDF and then shown.

    Args:
        model: Key into ``BASE_RESULTS`` and ``UNLEARNING_RESULTS``.
        bucket_size: Bucket width; a row is assigned to bucket
            ``(distance // bucket_size) * bucket_size``.
        max_distance: Rows with ``distance >= max_distance`` are discarded.

    Returns:
        None.
    """

    METHOD_COLORS = {
        'elm': '#FF6B6B', 'rmu': '#4ECDC4', 'graddiff': '#95E77E',
        'pbj': '#FFD93D', 'tar': '#A8E6CF', 'rmu_lat': '#FF8B94',
        'repnoise': '#B4A7D6', 'rr': '#FFB347'
    }

    def _mean_accuracy_by_bucket(results):
        """Return mean ``is_correct`` per distance bucket as a Series."""
        # .assign builds a new frame, so the bucket column is never written
        # into a filtered slice of `results` (avoids SettingWithCopyWarning).
        bucketed = results[results['distance'] < max_distance].assign(
            distance_bucket=lambda d: (d['distance'] // bucket_size) * bucket_size
        )
        return bucketed.groupby('distance_bucket')['is_correct'].mean()

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Load base model
    base_acc = _mean_accuracy_by_bucket(pd.read_csv(BASE_RESULTS[model]))

    # Plot 1: Accuracy vs RAG distance for all methods
    ax1.plot(base_acc.index, base_acc.values * 100,
            marker='o', linewidth=3, markersize=8,
            color='black', label='Base', alpha=0.9, zorder=10)

    # Each available method contributes one curve to each panel
    for method, path in UNLEARNING_RESULTS[model].items():
        if not path.exists():
            continue

        unlearn_acc = _mean_accuracy_by_bucket(pd.read_csv(path))

        # Methods without a dedicated colour fall back to grey
        color = METHOD_COLORS.get(method, '#888888')
        label = method.upper().replace('_', '-')

        # Plot accuracy
        ax1.plot(unlearn_acc.index, unlearn_acc.values * 100,
                marker='s', linewidth=2, markersize=5,
                color=color, label=label, alpha=0.7)

        # Delta is only defined on buckets present in both curves
        common_idx = base_acc.index.intersection(unlearn_acc.index)
        if len(common_idx) > 0:
            delta = base_acc.loc[common_idx] - unlearn_acc.loc[common_idx]
            ax2.plot(common_idx, delta * 100,
                    marker='s', linewidth=2, markersize=5,
                    color=color, label=label, alpha=0.7)

    # Format plot 1
    ax1.set_xlabel('RAG Distance (bucketed)', fontsize=12)
    ax1.set_ylabel('Accuracy (%)', fontsize=12)
    ax1.set_title(f'Accuracy vs RAG Distance\n{model.title()} - All Methods', fontsize=14)
    ax1.legend(loc='best', ncol=2, fontsize=9)
    ax1.grid(True, alpha=0.3)
    ax1.set_ylim(0, 100)

    # Format plot 2
    ax2.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
    ax2.set_xlabel('RAG Distance (bucketed)', fontsize=12)
    ax2.set_ylabel('Accuracy Delta (%)', fontsize=12)
    ax2.set_title(f'Ripple Effect vs RAG Distance\n{model.title()} - All Methods', fontsize=14)
    ax2.legend(loc='best', ncol=2, fontsize=9)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save
    date_str = datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = f"rag_distance_comparison_all_methods_{model}_{date_str}"
    plt.savefig(PLOT_DIR / f"{filename}.png", dpi=150, bbox_inches='tight')
    plt.savefig(PLOT_DIR / f"{filename}.pdf", bbox_inches='tight')
    plt.show()

compare_rag_effects_all_methods(model="llama", bucket_size=5, max_distance=50)

## RAG Distance 0 (WMDP Topics) Analysis

In [None]:
def analyze_wmdp_topics(model="llama"):
    """Compare unlearning effects at RAG distance 0 against distance > 0.

    Rows with ``distance == 0`` are treated as WMDP topics and rows with
    ``distance > 0`` as non-WMDP topics. For every unlearning method in
    ``UNLEARNING_RESULTS[model]`` whose file exists, the accuracy of both
    groups is compared with the base model. The deltas are printed, drawn as
    a grouped bar chart and a scatter plot, and saved to ``PLOT_DIR`` as PNG
    and PDF.

    Args:
        model: Key into ``BASE_RESULTS`` and ``UNLEARNING_RESULTS``.

    Returns:
        pandas.DataFrame with one row per method and the columns ``Method``,
        ``WMDP Base Acc``, ``WMDP Unlearn Acc``, ``WMDP Delta``,
        ``Non-WMDP Base Acc``, ``Non-WMDP Unlearn Acc`` and
        ``Non-WMDP Delta`` (accuracies and deltas in percent). The frame has
        zero rows, and no figure is drawn, when no method file exists.
    """
    result_columns = [
        'Method',
        'WMDP Base Acc', 'WMDP Unlearn Acc', 'WMDP Delta',
        'Non-WMDP Base Acc', 'Non-WMDP Unlearn Acc', 'Non-WMDP Delta',
    ]

    # Load base data
    df_base = pd.read_csv(BASE_RESULTS[model])

    # Get WMDP topics (distance 0)
    wmdp_topics = df_base[df_base['distance'] == 0]
    non_wmdp = df_base[df_base['distance'] > 0]

    # Base accuracies do not depend on the method, so compute them once
    wmdp_base_acc = wmdp_topics['is_correct'].mean()
    non_wmdp_base_acc = non_wmdp['is_correct'].mean()

    print(f"\nWMDP Topics Analysis ({model.title()}):")
    print("="*50)
    print(f"WMDP topics (distance 0): {len(wmdp_topics)} questions")
    print(f"Non-WMDP topics: {len(non_wmdp)} questions")
    print(f"\nBase accuracy on WMDP: {wmdp_base_acc*100:.2f}%")
    print(f"Base accuracy on non-WMDP: {non_wmdp_base_acc*100:.2f}%")

    # Compare unlearning effects
    results = []
    for method, path in UNLEARNING_RESULTS[model].items():
        if not path.exists():
            continue

        df_unlearn = pd.read_csv(path)

        wmdp_unlearn_acc = df_unlearn.loc[df_unlearn['distance'] == 0, 'is_correct'].mean()
        non_wmdp_unlearn_acc = df_unlearn.loc[df_unlearn['distance'] > 0, 'is_correct'].mean()

        results.append({
            'Method': method.upper(),
            'WMDP Base Acc': wmdp_base_acc * 100,
            'WMDP Unlearn Acc': wmdp_unlearn_acc * 100,
            'WMDP Delta': (wmdp_base_acc - wmdp_unlearn_acc) * 100,
            'Non-WMDP Base Acc': non_wmdp_base_acc * 100,
            'Non-WMDP Unlearn Acc': non_wmdp_unlearn_acc * 100,
            'Non-WMDP Delta': (non_wmdp_base_acc - non_wmdp_unlearn_acc) * 100
        })

    # Explicit columns keep the schema stable even when `results` is empty
    results_df = pd.DataFrame(results, columns=result_columns)

    if results_df.empty:
        # Without any method there is nothing to plot
        print("\nNo unlearning result files found; skipping plots.")
        return results_df

    # Visualize
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plot 1: WMDP vs Non-WMDP deltas
    x = np.arange(len(results_df))
    width = 0.35

    ax1.bar(x - width/2, results_df['WMDP Delta'], width, label='WMDP (d=0)', color='red', alpha=0.7)
    ax1.bar(x + width/2, results_df['Non-WMDP Delta'], width, label='Non-WMDP (d>0)', color='blue', alpha=0.7)

    ax1.set_xlabel('Unlearning Method', fontsize=12)
    ax1.set_ylabel('Accuracy Delta (%)', fontsize=12)
    ax1.set_title(f'Unlearning Effect: WMDP vs Non-WMDP Topics\n{model.title()}', fontsize=14)
    ax1.set_xticks(x)
    ax1.set_xticklabels(results_df['Method'], rotation=45, ha='right')
    ax1.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # Plot 2: Scatter plot of WMDP delta vs Non-WMDP delta
    ax2.scatter(results_df['WMDP Delta'], results_df['Non-WMDP Delta'], s=100, alpha=0.7)

    # Label every point with its method name
    for label, wmdp_delta, non_wmdp_delta in zip(
        results_df['Method'], results_df['WMDP Delta'], results_df['Non-WMDP Delta']
    ):
        ax2.annotate(label,
                    (wmdp_delta, non_wmdp_delta),
                    xytext=(5, 5), textcoords='offset points', fontsize=9)

    ax2.set_xlabel('WMDP Delta (%)', fontsize=12)
    ax2.set_ylabel('Non-WMDP Delta (%)', fontsize=12)
    ax2.set_title(f'WMDP vs Non-WMDP Unlearning Effects\n{model.title()}', fontsize=14)
    ax2.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax2.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    ax2.grid(True, alpha=0.3)

    # Diagonal y = x: points on it lose as much on non-WMDP as on WMDP
    lims = [min(ax2.get_xlim()[0], ax2.get_ylim()[0]),
            max(ax2.get_xlim()[1], ax2.get_ylim()[1])]
    ax2.plot(lims, lims, 'k--', alpha=0.3, zorder=0)

    plt.tight_layout()

    # Save
    date_str = datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = f"wmdp_vs_nonwmdp_analysis_{model}_{date_str}"
    plt.savefig(PLOT_DIR / f"{filename}.png", dpi=150, bbox_inches='tight')
    plt.savefig(PLOT_DIR / f"{filename}.pdf", bbox_inches='tight')
    plt.show()

    print("\nDetailed Results:")
    print(results_df.to_string(index=False))

    return results_df

# Analyze WMDP topics
wmdp_analysis = analyze_wmdp_topics(model="llama")