# Safety Benchmark Scaling Analysis - Comprehensive Visualizations

This notebook creates clear visualizations to analyze:
1. **Baseline Scaling**: Which benchmarks improve with model scale?
2. **Technique Effectiveness**: Which safety techniques work best?
3. **Scaling Behavior**: Do techniques become more/less effective at larger scales?

**Following SafetyWashing methodology**: Individual data points, Spearman correlations, slope-based categorization

## Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import hashlib
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: seaborn grid theme plus a large default
# canvas and base font size for every matplotlib figure created below.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (16, 10),
    'font.size': 11,
})

print('âœ“ All imports successful')

## Load Merged Data

**Note**: Run `preprocess_and_merge.ipynb` first; it writes `../data/merged_data.csv`, which the next cell loads.

In [None]:
# Load the merged dataset written by preprocess_and_merge.ipynb.
df = pd.read_csv('../data/merged_data.csv')

# Fail early, with a readable message, if a column that later cells index
# directly is absent; otherwise the first symptom is an opaque KeyError in the
# middle of a plotting cell.
REQUIRED_COLUMNS = ['Technique', 'Source paper', 'Benchmark', 'Model',
                    'Scale', 'scale_numeric', 'Performance']
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f'merged_data.csv is missing required columns: {missing_columns}')

print(f'Loaded {len(df)} data points')
print(f'\nColumns: {list(df.columns)}')
print(f'\nFirst few rows:')
df.head()

## Data Preprocessing: Critical Corrections

**Two critical preprocessing steps:**

1. **Unique Technique Identifiers**: Same technique name from different papers = different implementations
2. **Benchmark Directionality**: Some benchmarks (WMDP: BIO, CHEM, CYBER) have inverted scoring (higher = worse safety)

In [None]:
# STEP 1: Create unique technique identifiers (Technique + Source Paper)
def create_paper_id(source_paper):
    """Return a short, stable identifier for a source paper reference.

    Parameters
    ----------
    source_paper : object
        The paper reference (converted with ``str``). Missing values
        (``None`` / ``NaN``) and blank strings are accepted.

    Returns
    -------
    str
        ``'unknown'`` when the reference is missing or blank; otherwise the
        first 8 hex characters of the MD5 digest of the reference with
        surrounding whitespace removed.
    """
    if pd.isna(source_paper):
        return 'unknown'
    # Strip surrounding whitespace so ' url' and 'url\n' identify the same
    # paper instead of silently producing two different technique_ids.
    reference = str(source_paper).strip()
    if not reference:
        return 'unknown'
    # MD5 is used only as a compact fingerprint here, not for security.
    return hashlib.md5(reference.encode()).hexdigest()[:8]

# Tag every row with the hash of its source paper and with a paper-qualified
# technique key (technique name + '_' + paper hash).
df['paper_id'] = df['Source paper'].apply(create_paper_id)
df['technique_id'] = df['Technique'] + '_' + df['paper_id']

n_technique_names = df['Technique'].nunique()
n_technique_ids = df['technique_id'].nunique()
print(f"âœ“ Created unique technique identifiers")
print(f"  Original technique names: {n_technique_names}")
print(f"  Unique technique_id count: {n_technique_ids}")

# STEP 2: Benchmark directionality.
# The benchmarks listed here are scored so that a higher value means worse
# safety; every other benchmark is treated as higher = better already.
INVERTED_BENCHMARKS = ['BIO', 'CHEM', 'CYBER']

print(f"\nâœ“ Benchmark directionality identified:")
print(f"  Inverted (higher = worse): {INVERTED_BENCHMARKS}")

# STEP 3: Flip the inverted benchmarks so that higher = better everywhere.
# The raw 'Performance' column is left untouched; results go to a new column.
df['Performance_normalized'] = df['Performance'].copy()

for inverted_name in INVERTED_BENCHMARKS:
    is_inverted = df['Benchmark'] == inverted_name
    n_inverted = is_inverted.sum()
    if n_inverted == 0:
        continue

    raw_scores = df.loc[is_inverted, 'Performance']
    # Scale heuristic: a maximum above 1.5 is read as a 0-100 percentage
    # scale (flip around 100), anything else as a 0-1 fraction (flip around 1).
    full_scale = 100 if raw_scores.max() > 1.5 else 1
    df.loc[is_inverted, 'Performance_normalized'] = full_scale - raw_scores

    print(f"  Inverted {inverted_name}: {n_inverted} data points")

print(f"\nâœ“ All benchmarks now normalized: higher = better safety")
print(f"\nExample - BIO benchmark before/after:")
bio_columns = ['Technique', 'Model', 'Performance', 'Performance_normalized']
bio_sample = df.loc[df['Benchmark'] == 'BIO', bio_columns].head(5)
print(bio_sample)

---
# Phase 1: Data Overview
---

## 1.1 Dataset Statistics Dashboard

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
ax_benchmarks, ax_techniques, ax_scales, ax_coverage = axes.ravel()

# Top-left: number of rows contributed by each benchmark.
rows_per_benchmark = df['Benchmark'].value_counts().sort_values(ascending=True)
rows_per_benchmark.plot.barh(ax=ax_benchmarks, color='steelblue', edgecolor='black')
ax_benchmarks.set_xlabel('Number of Data Points', fontweight='bold')
ax_benchmarks.set_title('Data Coverage by Benchmark', fontsize=14, fontweight='bold')
ax_benchmarks.grid(axis='x', alpha=0.3)

# Top-right: the 15 technique names with the most rows.
rows_per_technique = df['Technique'].value_counts().head(15).sort_values(ascending=True)
rows_per_technique.plot.barh(ax=ax_techniques, color='coral', edgecolor='black')
ax_techniques.set_xlabel('Number of Data Points', fontweight='bold')
ax_techniques.set_title('Data Coverage by Technique (Top 15)', fontsize=14, fontweight='bold')
ax_techniques.grid(axis='x', alpha=0.3)

# Bottom-left: histogram of the numeric model scale column.
df['scale_numeric'].hist(bins=30, ax=ax_scales, color='lightgreen', edgecolor='black')
ax_scales.set_xlabel('Model Scale (Billions of Parameters)', fontweight='bold')
ax_scales.set_ylabel('Frequency', fontweight='bold')
ax_scales.set_title('Distribution of Model Scales', fontsize=14, fontweight='bold')
ax_scales.grid(axis='y', alpha=0.3)

# Bottom-right: technique x benchmark row counts for the 10 most common
# technique names (counted by name, not by paper-qualified technique_id).
most_common_techniques = df['Technique'].value_counts().head(10).index
pair_counts = df.groupby(['Technique', 'Benchmark']).size().unstack(fill_value=0)
sns.heatmap(
    pair_counts.loc[most_common_techniques],
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    linewidths=0.5,
    cbar_kws={'label': 'Data Points'},
    ax=ax_coverage,
)
ax_coverage.set_title('Coverage Matrix: Top 10 Techniques Ã— Benchmarks', fontsize=14, fontweight='bold')
ax_coverage.set_xlabel('Benchmark', fontweight='bold')
ax_coverage.set_ylabel('Technique', fontweight='bold')

plt.tight_layout()
plt.show()

# Plain-text recap of the same coverage numbers.
summary_lines = [
    f'\nðŸ“Š Dataset Summary:',
    f'  Total data points: {len(df)}',
    f'  Unique benchmarks: {df["Benchmark"].nunique()}',
    f'  Unique techniques (by name): {df["Technique"].nunique()}',
    f'  Unique techniques (by ID): {df["technique_id"].nunique()}',
    f'  Unique models: {df["Model"].nunique()}',
    f'  Scale range: {df["scale_numeric"].min():.0f}B - {df["scale_numeric"].max():.0f}B',
]
print('\n'.join(summary_lines))

---
# Phase 2: Baseline Scaling Analysis
---

## 2.1 Baseline Performance vs Scale

**Goal**: Identify which benchmarks are "saturated" (baseline safety score improves with model scale, i.e. Spearman correlation > 0.5) vs "not saturated"

In [None]:
# Keep only the rows whose technique is the literal 'Baseline'. The copy makes
# baseline_df independent of df; later cells read its Performance_normalized.
is_baseline = df['Technique'].eq('Baseline')
baseline_df = df.loc[is_baseline].copy()

baseline_benchmarks = sorted(baseline_df["Benchmark"].unique())
print(f'Baseline data points: {len(baseline_df)}')
print(f'Benchmarks with baseline data: {baseline_benchmarks}')

In [None]:
# Per-benchmark scaling statistics for the baseline rows, computed on
# Performance_normalized (higher = better safety after the inversion step).
SATURATION_CORR_THRESHOLD = 0.5   # Spearman correlation above this => 'Saturated'
MIN_POINTS_FOR_FIT = 2            # fewest valid (scale, score) pairs needed
# NOTE(review): with exactly two distinct points Spearman's correlation is
# always +1 or -1, so two-point benchmarks are categorized from very little
# evidence - consider raising MIN_POINTS_FOR_FIT.
BENCH_STAT_COLUMNS = ['Benchmark', 'Correlation', 'P_value', 'Slope', 'R2',
                      'N_points', 'Category']

benchmark_stats = []

for benchmark, bench_rows in baseline_df.groupby('Benchmark', sort=False):
    # Rows with a missing scale or score cannot be ranked or regressed, and
    # LinearRegression.fit raises on NaN input, so drop them first.
    bench_data = bench_rows.dropna(subset=['scale_numeric', 'Performance_normalized'])
    if len(bench_data) < MIN_POINTS_FOR_FIT:
        continue

    scales = bench_data['scale_numeric'].to_numpy(dtype=float)
    perfs = bench_data['Performance_normalized'].to_numpy(dtype=float)

    # Rank correlation between model scale and baseline score.
    corr, p_value = spearmanr(scales, perfs)

    # Ordinary least-squares line: slope = score change per unit of scale.
    model = LinearRegression()
    model.fit(scales.reshape(-1, 1), perfs)
    slope = model.coef_[0]
    r2 = model.score(scales.reshape(-1, 1), perfs)

    benchmark_stats.append({
        'Benchmark': benchmark,
        'Correlation': corr,
        'P_value': p_value,
        'Slope': slope,
        'R2': r2,
        'N_points': len(bench_data),
        # A NaN correlation (e.g. constant input) compares False and is
        # therefore reported as 'Not Saturated'.
        'Category': 'Saturated' if corr > SATURATION_CORR_THRESHOLD else 'Not Saturated'
    })

# Passing the column list keeps the frame usable by later cells (sort_values,
# boolean filters) even when no benchmark had enough data.
bench_stats_df = pd.DataFrame(benchmark_stats, columns=BENCH_STAT_COLUMNS)

print('\nðŸ“ˆ Benchmark Scaling Statistics (with normalized scores):')
print('='*80)
print(bench_stats_df.to_string(index=False))

In [None]:
# One scatter panel per benchmark: baseline Performance_normalized against
# model scale, with a least-squares trend line and a statistics box.
benchmarks = baseline_df['Benchmark'].dropna().unique()
n_benchmarks = len(benchmarks)
# Clamp to at least a 1x1 grid so an empty baseline set yields an empty figure
# instead of a ZeroDivisionError in the row computation.
n_cols = max(1, min(3, n_benchmarks))
n_rows = max(1, (n_benchmarks + n_cols - 1) // n_cols)

# squeeze=False always returns a 2-D array of Axes, so a single flatten()
# covers the 1-panel, 1-row and multi-row layouts alike.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(8*n_cols, 6*n_rows), squeeze=False)
axes = axes.flatten()

colors = plt.cm.tab10(np.linspace(0, 1, max(n_benchmarks, 1)))

for idx, benchmark in enumerate(benchmarks):
    ax = axes[idx]
    bench_data = baseline_df[baseline_df['Benchmark'] == benchmark]

    # Scatter plot - using normalized performance
    ax.scatter(bench_data['scale_numeric'], bench_data['Performance_normalized'],
              s=150, alpha=0.7, color=colors[idx], edgecolors='black', linewidth=2)

    # Trend line: fit only rows where both scale and score are present,
    # because LinearRegression.fit raises on NaN input.
    fit_data = bench_data.dropna(subset=['scale_numeric', 'Performance_normalized'])
    if len(fit_data) >= 2:
        scales = fit_data['scale_numeric'].to_numpy(dtype=float).reshape(-1, 1)
        perfs = fit_data['Performance_normalized'].to_numpy(dtype=float)
        model = LinearRegression()
        model.fit(scales, perfs)

        scale_range = np.linspace(scales.min(), scales.max(), 100)
        pred = model.predict(scale_range.reshape(-1, 1))
        ax.plot(scale_range, pred, '--', color=colors[idx], linewidth=3, alpha=0.8)

        # Statistics box: only drawn when the stats table has a row for this
        # benchmark, rather than failing on an empty .iloc[0] lookup.
        stats_rows = bench_stats_df[bench_stats_df['Benchmark'] == benchmark]
        if len(stats_rows) > 0:
            bench_stats = stats_rows.iloc[0]

            # Mark if benchmark is inverted
            inverted_marker = ' (INVERTED)' if benchmark in INVERTED_BENCHMARKS else ''

            stats_text = (
                f"Corr: {bench_stats['Correlation']:.3f}\n"
                f"Slope: {bench_stats['Slope']:.4f}\n"
                f"RÂ²: {bench_stats['R2']:.3f}\n"
                f"Status: {bench_stats['Category']}{inverted_marker}"
            )
            ax.text(0.05, 0.95, stats_text, transform=ax.transAxes,
                   verticalalignment='top',
                   bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8),
                   fontsize=10, fontweight='bold')

    ax.set_xlabel('Model Scale (Billions of Parameters)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Baseline Performance (Normalized)', fontsize=12, fontweight='bold')
    ax.set_title(f'{benchmark}', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)

# Remove the unused panels of the grid.
for idx in range(n_benchmarks, len(axes)):
    fig.delaxes(axes[idx])

plt.suptitle('Baseline Performance vs Model Scale (Normalized: Higher = Better Safety)', 
             fontsize=18, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

## 2.2 Benchmark Saturation Categorization

In [None]:
# Horizontal bar chart of the baseline Spearman correlation per benchmark,
# coloured by the 'Category' column of bench_stats_df.
from matplotlib.patches import Patch  # colour swatches for the custom legend

fig, ax = plt.subplots(figsize=(14, 8))

bench_stats_sorted = bench_stats_df.sort_values('Correlation')
colors = ['#2ECC71' if cat == 'Saturated' else '#E74C3C' 
          for cat in bench_stats_sorted['Category']]

y_pos = np.arange(len(bench_stats_sorted))
bars = ax.barh(y_pos, bench_stats_sorted['Correlation'], 
               color=colors, edgecolor='black', linewidth=2, alpha=0.7)

# Threshold line; the handle is kept so it can be listed in the legend
# (a legend built from explicit handles ignores labels on other artists).
threshold_line = ax.axvline(x=0.5, color='red', linestyle='--', linewidth=3, 
                            alpha=0.7, label='Saturation threshold (0.5)')

# Value labels, placed just beyond the tip of each bar: to the right of
# positive bars and to the left of negative ones, so text never overlaps
# the bar it describes.
LABEL_PAD = 0.02
for bar, row in zip(bars, bench_stats_sorted.itertuples()):
    width = bar.get_width()
    # An undefined (NaN) correlation has no bar; anchor its label at zero.
    tip = 0.0 if np.isnan(width) else width
    if tip < 0:
        x_text, h_align = tip - LABEL_PAD, 'right'
    else:
        x_text, h_align = tip + LABEL_PAD, 'left'
    label_text = f"{row.Correlation:.3f}\n(slope: {row.Slope:.4f})"
    ax.text(x_text, bar.get_y() + bar.get_height()/2.,
           label_text, ha=h_align, va='center', fontsize=10, fontweight='bold')

ax.set_yticks(y_pos)
ax.set_yticklabels(bench_stats_sorted['Benchmark'], fontsize=12, fontweight='bold')
ax.set_xlabel('Spearman Correlation (Baseline Performance vs Scale)', 
             fontsize=13, fontweight='bold')
ax.set_title('Benchmark Saturation Analysis\n(Do benchmarks improve with model scale?)',
            fontsize=16, fontweight='bold')

# Legend: one swatch per category plus the threshold line.
legend_elements = [
    Patch(facecolor='#2ECC71', label='Saturated (corr > 0.5)'),
    Patch(facecolor='#E74C3C', label='Not Saturated (corr â‰¤ 0.5)'),
    threshold_line,
]
ax.legend(handles=legend_elements, loc='lower right', fontsize=11)
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

print('\nðŸ“Š Benchmark Categories:')
print('='*60)
for category in ['Saturated', 'Not Saturated']:
    # Separate name so the `benchmarks` array used by the per-benchmark plot
    # cell is not overwritten with a list.
    category_benchmarks = bench_stats_df[bench_stats_df['Category'] == category]['Benchmark'].tolist()
    print(f'{category}: {category_benchmarks}')

---
# Phase 3: Technique Performance Analysis
---

## 3.1 Calculate Performance Difference from Baseline

In [None]:
# Gain of every technique over the untreated baseline, measured on
# Performance_normalized and computed separately for each
# (Model, Scale, Benchmark) combination that has at least one usable
# 'Baseline' row. Techniques are identified by technique_id (name + paper).
DIFF_COLUMNS = [
    'Model', 'Scale', 'scale_numeric', 'Benchmark', 'Technique',
    'technique_id', 'paper_id', 'Source_paper', 'Baseline_perf',
    'Technique_perf', 'Diff_from_baseline', 'Relative_change_pct',
]

diff_data = []

# A single groupby pass replaces re-filtering the whole frame for every
# model/scale/benchmark combination. Rows with a missing key are skipped by
# groupby, which matches the previous equality-based filtering.
for (model, scale, benchmark), subset in df.groupby(['Model', 'Scale', 'Benchmark'], sort=False):
    is_baseline_row = subset['Technique'] == 'Baseline'

    # Only baselines with an actual score can serve as a reference.
    baseline_rows = subset[is_baseline_row].dropna(subset=['Performance_normalized'])
    if baseline_rows.empty:
        continue

    # Reference selection: prefer the baseline reported by the technique's
    # own paper; otherwise fall back to the first baseline in the group.
    default_baseline = baseline_rows['Performance_normalized'].iloc[0]
    baseline_by_paper = (
        baseline_rows.drop_duplicates('paper_id')
        .set_index('paper_id')['Performance_normalized']
    )
    scale_numeric = subset['scale_numeric'].iloc[0]

    # One row per unique technique_id (first occurrence wins). Rows without a
    # technique_id (missing technique name) are dropped instead of raising an
    # IndexError on an empty lookup.
    technique_rows = (
        subset[~is_baseline_row]
        .dropna(subset=['technique_id'])
        .drop_duplicates('technique_id')
    )

    for tech_row in technique_rows.to_dict('records'):
        baseline_val = baseline_by_paper.get(tech_row['paper_id'], default_baseline)
        tech_val = tech_row['Performance_normalized']
        diff = tech_val - baseline_val

        diff_data.append({
            'Model': model,
            'Scale': scale,
            'scale_numeric': scale_numeric,
            'Benchmark': benchmark,
            'Technique': tech_row['Technique'],
            'technique_id': tech_row['technique_id'],
            'paper_id': tech_row['paper_id'],
            'Source_paper': tech_row['Source paper'],
            'Baseline_perf': baseline_val,
            'Technique_perf': tech_val,
            'Diff_from_baseline': diff,
            # Relative change is undefined for a zero baseline; NaN keeps it
            # out of later averages instead of posing as "0% change".
            'Relative_change_pct': (diff / baseline_val) * 100 if baseline_val != 0 else np.nan
        })

# Explicit columns keep the frame usable even when no comparison was possible.
diff_df = pd.DataFrame(diff_data, columns=DIFF_COLUMNS)

print(f'\nâœ“ Calculated performance differences for {len(diff_df)} data points')
print(f'\nUnique techniques (by NAME): {diff_df["Technique"].nunique()}')
print(f'Unique techniques (by ID): {diff_df["technique_id"].nunique()}')
print(f'\nðŸ’¡ Same technique from different papers is now treated as different!')
print(f'\nSample data:')
print(diff_df[['Technique', 'paper_id', 'technique_id', 'Benchmark', 'Diff_from_baseline']].head(10))

## 3.2 Performance Gain Heatmap

In [None]:
# Mean gain over baseline for the 15 technique names with the most rows in
# diff_df. Rows are aggregated by technique NAME, so results from different
# papers are averaged together in this view.
top_techniques = diff_df['Technique'].value_counts().head(15).index
diff_subset = diff_df.loc[diff_df['Technique'].isin(top_techniques)]

heatmap_data = pd.pivot_table(
    diff_subset,
    values='Diff_from_baseline',
    index='Technique',
    columns='Benchmark',
    aggfunc='mean',
)

# NOTE(review): the fixed colour limits of -30/+30 look like they assume gains
# expressed in percentage points - confirm against the score scale in use.
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='RdYlGn',
    center=0,
    linewidths=1,
    linecolor='black',
    cbar_kws={'label': 'Mean Performance Gain'},
    vmin=-30,
    vmax=30,
)

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(heatmap_data, ax=ax, **heatmap_style)

ax.set_title('Technique Effectiveness: Mean Performance Gain vs Baseline\n(Top 15 Techniques by Name)',
            fontsize=16, fontweight='bold')
ax.set_xlabel('Benchmark', fontsize=13, fontweight='bold')
ax.set_ylabel('Technique', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.show()

# Colour key for reading the heatmap.
for legend_line in (
    '\nðŸŽ¯ Key Insights:',
    '  Green = Technique improves over baseline',
    '  Red = Technique worse than baseline',
    '  Yellow = Similar to baseline',
):
    print(legend_line)

---
# Phase 4: Scaling Behavior Analysis
---

## 4.1 Performance Gain vs Scale (Critical Analysis)

**Key Question**: Do techniques become MORE or LESS effective at larger scales?

In [None]:
# How does each technique's gain over baseline change with model scale?
# One result row per (technique_id, Benchmark) pair that has at least two
# usable data points.
GAIN_TREND_THRESHOLD = 0.3   # |Spearman correlation| above this counts as a trend
SCALING_COLUMNS = ['Technique', 'technique_id', 'Benchmark', 'Correlation',
                   'P_value', 'Slope', 'R2', 'N_points', 'Category', 'Mean_gain']

technique_scaling = []

# Rows with a missing scale or gain cannot be ranked or regressed, and
# LinearRegression.fit raises on NaN input, so drop them up front.
valid_diffs = diff_df.dropna(subset=['scale_numeric', 'Diff_from_baseline'])

# A single groupby pass replaces re-filtering diff_df for every
# technique/benchmark combination.
for (technique_id, benchmark), subset in valid_diffs.groupby(['technique_id', 'Benchmark'], sort=False):
    if len(subset) < 2:
        continue

    technique_name = subset['Technique'].iloc[0]
    scales = subset['scale_numeric'].to_numpy(dtype=float)
    diffs = subset['Diff_from_baseline'].to_numpy(dtype=float)

    # Rank correlation between scale and gain.
    # NOTE(review): when every point shares one scale value the correlation
    # is NaN and the row falls into 'Gain Stable with Scale' - consider
    # requiring at least two distinct scales.
    corr, p_value = spearmanr(scales, diffs)

    # Least-squares line: slope = change in gain per unit of scale.
    model = LinearRegression()
    model.fit(scales.reshape(-1, 1), diffs)
    slope = model.coef_[0]
    r2 = model.score(scales.reshape(-1, 1), diffs)

    # Categorize by the sign and strength of the correlation.
    if corr > GAIN_TREND_THRESHOLD:
        category = 'Gain Increases with Scale'
    elif corr < -GAIN_TREND_THRESHOLD:
        category = 'Gain Decreases with Scale'
    else:
        category = 'Gain Stable with Scale'

    technique_scaling.append({
        'Technique': technique_name,
        'technique_id': technique_id,
        'Benchmark': benchmark,
        'Correlation': corr,
        'P_value': p_value,
        'Slope': slope,
        'R2': r2,
        'N_points': len(subset),
        'Category': category,
        'Mean_gain': diffs.mean()
    })

# Explicit columns keep sort_values/groupby in later cells working even when
# no pair had enough data.
tech_scaling_df = pd.DataFrame(technique_scaling, columns=SCALING_COLUMNS)

print(f'\nâœ“ Analyzed scaling behavior for {len(tech_scaling_df)} technique-benchmark combinations')
print(f'  (Using unique technique_id to distinguish same techniques from different papers)')
print(f'\nTop 20 results:')
print(tech_scaling_df.sort_values('Slope', ascending=False).head(20)[[
    'Technique', 'Benchmark', 'Correlation', 'Slope', 'Mean_gain', 'Category'
]].to_string(index=False))

## 4.2 Slope Analysis: Technique Categorization

In [None]:
# Slope-vs-gain scatter, one marker per technique_id. Slope and mean gain are
# averaged over that technique's benchmarks; N_points is the summed number of
# data points behind those fits.
tech_summary = (
    tech_scaling_df
    .groupby('technique_id')
    .agg(
        Technique=('Technique', 'first'),
        Slope=('Slope', 'mean'),
        Mean_gain=('Mean_gain', 'mean'),
        N_points=('N_points', 'sum'),
    )
    .reset_index()
)

# Keep only techniques backed by at least 5 data points in total.
tech_summary_filtered = tech_summary.loc[tech_summary['N_points'].ge(5)]

fig, ax = plt.subplots(figsize=(16, 10))

scatter = ax.scatter(
    tech_summary_filtered['Slope'],
    tech_summary_filtered['Mean_gain'],
    s=tech_summary_filtered['N_points'] * 10,
    c=range(len(tech_summary_filtered)),
    cmap='viridis',
    alpha=0.6,
    edgecolors='black',
    linewidth=2,
)

# Name tag next to every marker, truncated to 20 characters.
tag_box = dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.3)
for name, slope_val, gain_val in zip(tech_summary_filtered['Technique'],
                                     tech_summary_filtered['Slope'],
                                     tech_summary_filtered['Mean_gain']):
    ax.annotate(name[:20], (slope_val, gain_val),
                fontsize=9, fontweight='bold', bbox=tag_box)

# Zero lines split the plane into four quadrants.
ax.axhline(y=0, color='gray', linestyle='-', linewidth=1.5, alpha=0.5)
ax.axvline(x=0, color='gray', linestyle='-', linewidth=1.5, alpha=0.5)

# Corner captions, positioned in axes-fraction coordinates:
# (x, y, text, box colour, vertical alignment).
quadrant_captions = [
    (0.7, 0.95, 'HIGH GAIN\n+ SCALES WELL\n(BEST)', 'lightgreen', 'top'),
    (0.3, 0.95, 'HIGH GAIN\n+ STABLE/DECREASES', 'yellow', 'top'),
    (0.7, 0.05, 'LOW GAIN\n+ SCALES WELL', 'orange', 'bottom'),
    (0.3, 0.05, 'LOW GAIN\n+ DECREASES\n(WORST)', 'lightcoral', 'bottom'),
]
for x_frac, y_frac, caption, box_color, v_align in quadrant_captions:
    ax.text(x_frac, y_frac, caption,
            transform=ax.transAxes, fontsize=12, fontweight='bold',
            bbox=dict(boxstyle='round', facecolor=box_color, alpha=0.7),
            ha='center', va=v_align)

ax.set_xlabel('Slope (Change in Gain per Unit Scale)', fontsize=13, fontweight='bold')
ax.set_ylabel('Mean Performance Gain', fontsize=13, fontweight='bold')
ax.set_title('Technique Categorization: Current Performance vs Scaling Potential\n(Size = Total Data Points, Unique Techniques by Paper)',
            fontsize=16, fontweight='bold')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

n_plotted = len(tech_summary_filtered)
print(f'\nðŸ“Š Total unique techniques analyzed: {n_plotted}')

---
# Phase 5: Summary Dashboard
---

## 5.1 One-Page Summary Dashboard

In [None]:
# One-page 2x2 recap built from bench_stats_df, diff_df and
# tech_summary_filtered, which earlier cells must have created.
fig = plt.figure(figsize=(20, 12))
gs = fig.add_gridspec(2, 2, hspace=0.3, wspace=0.3)

# Panel 1: Benchmark Saturation (baseline Spearman correlation per benchmark)
ax1 = fig.add_subplot(gs[0, 0])
bench_stats_sorted = bench_stats_df.sort_values('Correlation')
colors = ['#2ECC71' if cat == 'Saturated' else '#E74C3C' 
          for cat in bench_stats_sorted['Category']]
y_pos = np.arange(len(bench_stats_sorted))
ax1.barh(y_pos, bench_stats_sorted['Correlation'], color=colors, 
        edgecolor='black', linewidth=2, alpha=0.7)
ax1.set_yticks(y_pos)
ax1.set_yticklabels(bench_stats_sorted['Benchmark'], fontweight='bold')
ax1.axvline(x=0.5, color='red', linestyle='--', linewidth=2, alpha=0.7)
ax1.set_xlabel('Correlation', fontweight='bold')
ax1.set_title('Benchmark Saturation', fontsize=14, fontweight='bold')
ax1.grid(axis='x', alpha=0.3)

# Panel 2: Technique Effectiveness Heatmap (5 technique names with most rows)
ax2 = fig.add_subplot(gs[0, 1])
top_5_techniques = diff_df['Technique'].value_counts().head(5).index
heatmap_subset = diff_df[diff_df['Technique'].isin(top_5_techniques)].pivot_table(
    index='Technique', columns='Benchmark', values='Diff_from_baseline', aggfunc='mean'
)
sns.heatmap(heatmap_subset, annot=True, fmt='.1f', cmap='RdYlGn', center=0,
           linewidths=1, cbar_kws={'label': 'Mean Gain'}, ax=ax2)
ax2.set_title('Top 5 Technique Effectiveness', fontsize=14, fontweight='bold')
ax2.set_xlabel('Benchmark', fontweight='bold')
ax2.set_ylabel('Technique', fontweight='bold')

# Panel 3: Scaling Trends
ax3 = fig.add_subplot(gs[1, 0])
# "Top 10" = the ten techniques backed by the most data points, matching the
# count-based ranking of the heatmap panel. A plain head(10) would just take
# the first ten rows in technique_id order.
tech_summary_top = tech_summary_filtered.sort_values('N_points', ascending=False).head(10)
scatter = ax3.scatter(tech_summary_top['Slope'], tech_summary_top['Mean_gain'],
                     s=tech_summary_top['N_points'] * 20, alpha=0.6,
                     c=range(len(tech_summary_top)), cmap='viridis',
                     edgecolors='black', linewidth=2)
for _, row in tech_summary_top.iterrows():
    ax3.annotate(row['Technique'][:15], (row['Slope'], row['Mean_gain']),
                fontsize=8, bbox=dict(boxstyle='round,pad=0.2', 
                facecolor='yellow', alpha=0.3))
ax3.axhline(y=0, color='gray', linestyle='-', alpha=0.5)
ax3.axvline(x=0, color='gray', linestyle='-', alpha=0.5)
ax3.set_xlabel('Slope (Scaling Behavior)', fontweight='bold')
ax3.set_ylabel('Mean Gain', fontweight='bold')
ax3.set_title('Scaling Trends (Top 10)', fontsize=14, fontweight='bold')
ax3.grid(True, alpha=0.3)

# Panel 4: Best Technique per Benchmark (highest mean gain, by technique name)
ax4 = fig.add_subplot(gs[1, 1])
ax4.axis('off')

best_techniques = []
for benchmark in diff_df['Benchmark'].unique():
    bench_data = diff_df[diff_df['Benchmark'] == benchmark]
    # Compute the per-technique means once; dropping NaN means avoids idxmax
    # failing (or returning NaN) when a technique has no valid gain.
    mean_gains = bench_data.groupby('Technique')['Diff_from_baseline'].mean().dropna()
    if mean_gains.empty:
        continue
    best_techniques.append([benchmark, mean_gains.idxmax(), f'{mean_gains.max():.2f}'])

table_data = [['Benchmark', 'Best Technique', 'Mean Gain']] + best_techniques
table = ax4.table(cellText=table_data, cellLoc='left', loc='center',
                 colWidths=[0.3, 0.45, 0.25])
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2)

# Style the header row (row 0) of the table.
for col_idx in range(len(table_data[0])):
    table[(0, col_idx)].set_facecolor('#3498DB')
    table[(0, col_idx)].set_text_props(weight='bold', color='white')

ax4.set_title('Best Technique per Benchmark', fontsize=14, fontweight='bold', pad=20)

plt.suptitle('Safety Benchmark Scaling Analysis - Summary Dashboard',
            fontsize=20, fontweight='bold', y=0.98)
plt.show()

## Key Findings Summary

In [None]:
# Text recap of the analysis, read from bench_stats_df and
# tech_summary_filtered (both created by earlier cells).
print('='*80)
print('KEY FINDINGS')
print('='*80)

print('\n1. BENCHMARK SATURATION:')
saturated = bench_stats_df[bench_stats_df['Category'] == 'Saturated']['Benchmark'].tolist()
not_saturated = bench_stats_df[bench_stats_df['Category'] == 'Not Saturated']['Benchmark'].tolist()
print(f'   Saturated (improve with scale): {saturated}')
print(f'   Not Saturated: {not_saturated}')

print('\n2. TECHNIQUE EFFECTIVENESS:')
top_3_overall = tech_summary_filtered.nlargest(3, 'Mean_gain')
print('   Top 3 techniques by mean gain:')
for _, row in top_3_overall.iterrows():
    print(f'   - {row["Technique"]}: {row["Mean_gain"]:.2f} gain')

print('\n3. SCALING BEHAVIOR:')
scales_well = tech_summary_filtered[tech_summary_filtered['Slope'] > 0.1]
print(f'   Techniques that scale well (positive slope > 0.1): {len(scales_well)}')
if len(scales_well) > 0:
    print('   Top 3:')
    for _, row in scales_well.nlargest(3, 'Slope').iterrows():
        print(f'   - {row["Technique"]}: slope={row["Slope"]:.4f}')

print('\n4. SWEET SPOT TECHNIQUES (High gain + Positive slope):')
sweet_spot = tech_summary_filtered[
    (tech_summary_filtered['Mean_gain'] > 0) & 
    (tech_summary_filtered['Slope'] > 0)
]
if len(sweet_spot) > 0:
    # Rank by mean gain so the five printed are the strongest candidates, not
    # merely the first five in technique_id order.
    for _, row in sweet_spot.sort_values('Mean_gain', ascending=False).head(5).iterrows():
        print(f'   - {row["Technique"]}: gain={row["Mean_gain"]:.2f}, slope={row["Slope"]:.4f}')
else:
    print('   None found')

print('\n' + '='*80)