# Self-Referential Processing v2 — Chart Generation

Generates 7 publication-quality charts from v2 experiment results.
Data are loaded from the analysis JSONs; Chart 4 also uses hard-coded v1 peak values, and Chart 7 reads the raw sweep files.

In [None]:
import json
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.patches import FancyBboxPatch
from pathlib import Path

# -- Paths --
# Everything is resolved relative to the current working directory.
BASE = Path('.')
ANALYSIS = BASE / 'analysis'
CHARTS = BASE / 'charts'
CHARTS.mkdir(exist_ok=True)  # output folder for the saved PNGs
V1_SUMMARY = Path('../self_model_circuits/master_summary.json')

# -- Load data --
# One analysis JSON per experiment, stored under the experiment's name.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default (open() would otherwise use it).
experiments = {}
for name in ['self_recognition', 'capability_awareness', 'training_knowledge', 'metacognition']:
    with open(ANALYSIS / f'{name}.json', encoding='utf-8') as f:
        experiments[name] = json.load(f)

# v1 master summary, kept available as `v1` for later cells.
with open(V1_SUMMARY, encoding='utf-8') as f:
    v1 = json.load(f)

# -- Dark theme (matches NeuronScope reports) --
# Palette: backgrounds, accent colours, text shades, grid.
BG, CARD_BG = '#111118', '#1c1c26'
BLUE, AMBER, RED = '#3b82f6', '#f59e0b', '#ef4444'
GREEN, PURPLE = '#22c55e', '#a855f7'
TEXT, TEXT_MID, TEXT_DIM = '#ebebf0', '#b4b4be', '#82828e'
GRID = '#2a2a36'

# Install the palette as matplotlib's global defaults, one setting at a time.
plt.rcParams['figure.facecolor'] = BG
plt.rcParams['axes.facecolor'] = CARD_BG
plt.rcParams['axes.edgecolor'] = GRID
plt.rcParams['axes.labelcolor'] = TEXT
plt.rcParams['xtick.color'] = TEXT_MID
plt.rcParams['ytick.color'] = TEXT_MID
plt.rcParams['text.color'] = TEXT
plt.rcParams['grid.color'] = GRID
plt.rcParams['grid.alpha'] = 0.5
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.size'] = 11

# Number of layers iterated by the per-layer charts (indices 0..33).
NUM_LAYERS = 34

# Experiment names paired with their axis labels. Each name is also the key
# into `experiments`. The 3-experiment set leaves out training_knowledge.
EXP_NAMES_3, EXP_LABELS_3 = (list(column) for column in zip(
    ('self_recognition', 'Self-Recognition'),
    ('capability_awareness', 'Capability\nAwareness'),
    ('metacognition', 'Metacognition'),
))
EXP_NAMES_4, EXP_LABELS_4 = (list(column) for column in zip(
    ('self_recognition', 'Self-\nRecognition'),
    ('capability_awareness', 'Capability\nAwareness'),
    ('training_knowledge', 'Training\nKnowledge'),
    ('metacognition', 'Metacognition'),
))

def get_layer_stats(exp_name, ablation, comparison):
    """Return the per-layer statistics of one experiment condition.

    Looks up ``experiments[exp_name]['analysis'][ablation][comparison]`` and
    returns its ``'layer_stats'`` entry. Callers in this file index the
    result by layer number given as a string (e.g. ``'26'``).
    """
    condition = experiments[exp_name]['analysis'][ablation][comparison]
    return condition['layer_stats']


print('Data loaded. Ready to generate charts.')

## Chart 1 — Layer 26 Cross-Experiment Consistency

In [None]:
# Layer-26 statistics of each experiment (zero ablation, self vs third-person).
layer26 = [get_layer_stats(exp, 'zero', 'self_vs_third_person')['26'] for exp in EXP_NAMES_3]

colors = [BLUE, AMBER, PURPLE]
vals = [entry['mean_differential_kl'] for entry in layer26]
# errorbar() takes distances below/above the bar top, so the two entries of
# 'bootstrap_ci_95' are converted to offsets from the mean.
ci_lo = [entry['mean_differential_kl'] - entry['bootstrap_ci_95'][0] for entry in layer26]
ci_hi = [entry['bootstrap_ci_95'][1] - entry['mean_differential_kl'] for entry in layer26]

fig, ax = plt.subplots(figsize=(8, 5))
x = np.arange(len(EXP_NAMES_3))
bars = ax.bar(x, vals, color=colors, width=0.55, edgecolor='white', linewidth=0.5, zorder=3)
ax.errorbar(x, vals, yerr=[ci_lo, ci_hi], fmt='none', ecolor=TEXT, capsize=6, capthick=1.5,
            linewidth=1.5, zorder=4)

# Print each mean just above its upper whisker.
for pos, (mean_kl, upper) in enumerate(zip(vals, ci_hi)):
    ax.text(pos, mean_kl + upper + 0.08, f'{mean_kl:.3f}', ha='center', va='bottom',
            fontsize=12, fontweight='bold', color=TEXT)

# Axis cosmetics.
ax.set_title('Layer 26 — Cross-Experiment Consistency', fontsize=14, fontweight='bold', pad=15)
ax.set_ylabel('Mean Differential KL (bits)', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(EXP_LABELS_3, fontsize=11)
ax.set_ylim(0, 4.5)
ax.grid(axis='y', alpha=0.3)

# Footnote placed below the axes (axes-fraction coordinates).
ax.text(0.5, -0.15, 'Zero ablation · Self vs Third-person · FDR < 0.05 in all three',
        transform=ax.transAxes, ha='center', fontsize=9, color=TEXT_DIM)

plt.tight_layout()
fig.savefig(CHARTS / 'chart1_layer26_consistency.png', dpi=200, bbox_inches='tight')
plt.show()
print('Saved chart1_layer26_consistency.png')

## Chart 2 — v2 Layer × Experiment Heatmap (Zero / Self vs Third-Person)

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Heatmap data: one row per layer, one column per experiment.
matrix = np.zeros((NUM_LAYERS, 3))
sig_mask = np.zeros((NUM_LAYERS, 3), dtype=bool)
for col, exp_name in enumerate(EXP_NAMES_3):
    per_layer = get_layer_stats(exp_name, 'zero', 'self_vs_third_person')
    rows = [per_layer[str(idx)] for idx in range(NUM_LAYERS)]
    matrix[:, col] = [row['mean_differential_kl'] for row in rows]
    sig_mask[:, col] = [row['significant_fdr_05'] for row in rows]

# Fixed colour range: values outside [vmin, vmax] saturate.
vmin, vmax = -0.5, 3.5
color_stops = [(0, '#1c1c26'), (0.15, '#1e3a5f'), (0.4, BLUE), (0.7, AMBER), (1.0, RED)]
cmap = LinearSegmentedColormap.from_list('neuronscope', color_stops)

fig, ax = plt.subplots(figsize=(6, 12))
im = ax.imshow(matrix, aspect='auto', cmap=cmap, vmin=vmin, vmax=vmax, interpolation='nearest')

# Put a star on every FDR-significant cell (x is the column, y is the row).
for row, col in np.argwhere(sig_mask):
    ax.text(col, row, '*', ha='center', va='center', fontsize=14, fontweight='bold', color='white')

ax.set_title('Differential KL by Layer & Experiment\n(Zero Ablation · Self vs Third-Person)',
             fontsize=13, fontweight='bold', pad=12)
ax.set_ylabel('Layer', fontsize=12)
ax.set_xticks(range(3))
ax.set_xticklabels(['Self-\nRecognition', 'Capability\nAwareness', 'Meta-\ncognition'], fontsize=10)
ax.set_yticks(range(NUM_LAYERS))
ax.set_yticklabels(list(map(str, range(NUM_LAYERS))), fontsize=8)

# Colour bar, restyled to match the dark theme.
cbar = plt.colorbar(im, ax=ax, shrink=0.6, pad=0.02)
cbar.set_label('Mean Differential KL (bits)', color=TEXT, fontsize=10)
cbar.ax.yaxis.set_tick_params(color=TEXT_MID)
for tick_label in cbar.ax.yaxis.get_ticklabels():
    tick_label.set_color(TEXT_MID)

# Legend for the star marker, below the axes.
ax.text(0.5, -0.04, '* = FDR < 0.05 (Benjamini-Hochberg)', transform=ax.transAxes,
        ha='center', fontsize=9, color=TEXT_DIM)

plt.tight_layout()
fig.savefig(CHARTS / 'chart2_v2_heatmap.png', dpi=200, bbox_inches='tight')
plt.show()
print('Saved chart2_v2_heatmap.png')

## Chart 3 — Universal & Recurring Layers

In [None]:
from matplotlib.patches import Patch

# For every layer, count the experiments in which it is FDR-significant
# (zero/self_vs_third_person). Layers with no hit are left out entirely.
stats_by_exp = [get_layer_stats(exp, 'zero', 'self_vs_third_person') for exp in EXP_NAMES_3]
layer_counts = {}
for layer_idx in range(NUM_LAYERS):
    hits = sum(1 for per_layer in stats_by_exp if per_layer[str(layer_idx)]['significant_fdr_05'])
    if hits:
        layer_counts[layer_idx] = hits

# Most frequent layers first; ties are ordered by ascending layer number.
sorted_layers = sorted(layer_counts.items(), key=lambda item: (-item[1], item[0]))
layers = [layer_idx for layer_idx, _ in sorted_layers]
counts = [hits for _, hits in sorted_layers]

# Colour and caption per tier; any count other than 3 or 2 uses the fallback.
tier_color = {3: GREEN, 2: AMBER}
tier_caption = {3: 'UNIVERSAL', 2: 'Recurring'}

fig, ax = plt.subplots(figsize=(9, 5))
positions = range(len(layers))
bar_colors = [tier_color.get(n, BLUE) for n in counts]
bars = ax.barh(positions, counts, color=bar_colors, edgecolor='white', linewidth=0.3, height=0.7)

# Count (and tier caption) to the right of each bar.
for row, n in enumerate(counts):
    caption = tier_caption.get(n, '')
    ax.text(n + 0.05, row, f'{n}  {caption}', va='center', fontsize=10,
            fontweight='bold' if n == 3 else 'normal',
            color=tier_color.get(n, TEXT_MID))

ax.set_title('Layer Significance Frequency\n(Zero Ablation · Self vs Third-Person · FDR < 0.05)',
             fontsize=13, fontweight='bold', pad=12)
ax.set_xlabel('Number of Experiments (out of 3)', fontsize=12)
ax.set_yticks(positions)
ax.set_yticklabels([f'Layer {layer_idx}' for layer_idx in layers], fontsize=10)
ax.set_xlim(0, 3.8)
ax.set_xticks([0, 1, 2, 3])
ax.invert_yaxis()  # first entry of the sorted list ends up at the top
ax.grid(axis='x', alpha=0.3)

# Legend explaining the three colour tiers.
legend_elements = [
    Patch(facecolor=face, edgecolor='white', label=text)
    for face, text in (
        (GREEN, 'Universal (3/3 experiments)'),
        (AMBER, 'Recurring (2/3 experiments)'),
        (BLUE, 'Single experiment'),
    )
]
ax.legend(handles=legend_elements, loc='lower right', fontsize=9,
          facecolor=CARD_BG, edgecolor=GRID, labelcolor=TEXT_MID)

plt.tight_layout()
fig.savefig(CHARTS / 'chart3_universal_layers.png', dpi=200, bbox_inches='tight')
plt.show()
print('Saved chart3_universal_layers.png')

## Chart 4 — v1 vs v2 Hierarchy Inversion

In [None]:
# v1 peak dKL values (from master_summary.json, zero ablation)
v1_peaks = dict(
    self_recognition=6.597495,
    capability_awareness=1.520899,
    training_knowledge=12.838715,
    metacognition=0.922289,
)

# v2: how many layers are FDR-significant per experiment (zero/self_vs_third_person)
v2_sig_counts = {}
for exp in EXP_NAMES_4:
    per_layer = get_layer_stats(exp, 'zero', 'self_vs_third_person')
    flagged = [idx for idx in range(NUM_LAYERS) if per_layer[str(idx)]['significant_fdr_05']]
    v2_sig_counts[exp] = len(flagged)

x = np.arange(4)
colors_4 = [BLUE, AMBER, RED, PURPLE]
v1_vals = [v1_peaks[exp] for exp in EXP_NAMES_4]
v2_vals = [v2_sig_counts[exp] for exp in EXP_NAMES_4]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: v1 peak values, labelled to one decimal.
ax1.bar(x, v1_vals, color=colors_4, width=0.6, edgecolor='white', linewidth=0.5)
for pos, peak in enumerate(v1_vals):
    ax1.text(pos, peak + 0.3, f'{peak:.1f}', ha='center', fontsize=10, fontweight='bold', color=TEXT)

# Right panel: v2 counts; a zero count is printed as a red 'NULL'.
ax2.bar(x, v2_vals, color=colors_4, width=0.6, edgecolor='white', linewidth=0.5)
for pos, n_sig in enumerate(v2_vals):
    if n_sig > 0:
        label, color = str(n_sig), TEXT
    else:
        label, color = 'NULL', RED
    ax2.text(pos, n_sig + 0.2, label, ha='center', fontsize=11, fontweight='bold', color=color)

# Styling shared by both panels; only y-label, title and y-limit differ.
panel_specs = (
    (ax1, 'Peak Differential KL (bits)', 'v1 — Peak dKL (no stats)', 16),
    (ax2, 'FDR-Significant Layers', 'v2 — FDR-Significant Layers', 13),
)
for panel, y_label, title, y_top in panel_specs:
    panel.set_xticks(x)
    panel.set_xticklabels(EXP_LABELS_4, fontsize=9)
    panel.set_ylabel(y_label, fontsize=11)
    panel.set_title(title, fontsize=13, fontweight='bold')
    panel.set_ylim(0, y_top)
    panel.grid(axis='y', alpha=0.3)

fig.suptitle('v1 → v2: Hierarchy Inversion', fontsize=15, fontweight='bold', y=1.02)
fig.text(0.5, -0.04, 'v1 strongest (Training Knowledge) becomes NULL in v2 with proper third-person controls',
         ha='center', fontsize=10, color=TEXT_DIM)

plt.tight_layout()
fig.savefig(CHARTS / 'chart4_v1_vs_v2.png', dpi=200, bbox_inches='tight')
plt.show()
print('Saved chart4_v1_vs_v2.png')

## Chart 5 — Layer 26 Individual Triplet Values (Strip Plot)

In [None]:
# Fixed seed so the horizontal jitter is the same on every run.
np.random.seed(42)
colors = [BLUE, AMBER, PURPLE]

fig, ax = plt.subplots(figsize=(8, 5))

for col, (exp_name, dot_color) in enumerate(zip(EXP_NAMES_3, colors)):
    layer26 = get_layer_stats(exp_name, 'zero', 'self_vs_third_person')['26']
    values = layer26['individual_values']
    mean_val = layer26['mean_differential_kl']

    # Spread the dots horizontally around the column centre.
    offsets = np.random.uniform(-0.12, 0.12, len(values))
    ax.scatter(col + offsets, values, color=dot_color, s=80, alpha=0.8,
               edgecolors='white', linewidth=0.5, zorder=3)

    # White bar at the mean, with the value printed to its right.
    ax.hlines(mean_val, col - 0.25, col + 0.25, color='white', linewidth=2.5, zorder=4)
    ax.text(col + 0.3, mean_val, f'  {mean_val:.3f}', va='center', fontsize=10,
            fontweight='bold', color=TEXT)

ax.set_title('Layer 26 — Individual Triplet Values\n(n=8 per experiment)',
             fontsize=13, fontweight='bold', pad=12)
ax.set_ylabel('Differential KL per Triplet (bits)', fontsize=12)
ax.set_xticks(range(3))
ax.set_xticklabels(EXP_LABELS_3, fontsize=11)
ax.grid(axis='y', alpha=0.3)

# Footnote below the axes (axes-fraction coordinates).
ax.text(0.5, -0.13, 'White lines = mean · Each dot = one prompt triplet',
        transform=ax.transAxes, ha='center', fontsize=9, color=TEXT_DIM)

plt.tight_layout()
fig.savefig(CHARTS / 'chart5_layer26_stripplot.png', dpi=200, bbox_inches='tight')
plt.show()
print('Saved chart5_layer26_stripplot.png')

## Chart 6 — Significance Matrix (Conditions × Experiments)

In [None]:
# Imported in this cell so it can run on its own; the only other import of
# this name in the file sits inside the Chart 2 cell.
from matplotlib.colors import LinearSegmentedColormap

# (ablation, comparison) pairs, one per heatmap column, with matching labels.
conditions = [
    ('zero', 'self_vs_control'),
    ('zero', 'self_vs_third_person'),
    ('mean', 'self_vs_control'),
    ('mean', 'self_vs_third_person'),
]
cond_labels = [
    'Zero\nSelf vs Control',
    'Zero\nSelf vs 3rd Person',
    'Mean\nSelf vs Control',
    'Mean\nSelf vs 3rd Person',
]

# Build 4x4 matrix: experiments (rows) x conditions (columns).
# Each cell is the number of layers flagged 'significant_fdr_05'.
matrix = np.zeros((4, 4))
for i, name in enumerate(EXP_NAMES_4):
    for j, (abl, comp) in enumerate(conditions):
        stats = get_layer_stats(name, abl, comp)
        matrix[i, j] = sum(1 for l in range(NUM_LAYERS) if stats[str(l)]['significant_fdr_05'])

fig, ax = plt.subplots(figsize=(8, 5))

# Colour scale runs from 0 to 10; larger counts saturate at the top colour.
cmap = LinearSegmentedColormap.from_list('sig', [(0, '#1c1c26'), (0.3, '#1e3a5f'), (0.6, BLUE), (1.0, GREEN)])
im = ax.imshow(matrix, cmap=cmap, vmin=0, vmax=10, aspect='auto')

# Annotations: the count in white, or a red 'NULL' where nothing is significant.
for i in range(4):
    for j in range(4):
        val = int(matrix[i, j])
        label = str(val) if val > 0 else 'NULL'
        color = 'white' if val > 0 else RED
        ax.text(j, i, label, ha='center', va='center', fontsize=13, fontweight='bold', color=color)

ax.set_xticks(range(4))
ax.set_xticklabels(cond_labels, fontsize=9, ha='center')
ax.set_yticks(range(4))
ax.set_yticklabels(['Self-Recognition', 'Capability Awareness', 'Training Knowledge', 'Metacognition'], fontsize=10)
ax.set_title('FDR-Significant Layers by Condition\n(number of layers surviving BH correction at \u03b1=0.05)',
             fontsize=13, fontweight='bold', pad=12)

# Colour bar, restyled to match the dark theme.
cbar = plt.colorbar(im, ax=ax, shrink=0.8, pad=0.02)
cbar.set_label('Significant Layers', color=TEXT, fontsize=10)
cbar.ax.yaxis.set_tick_params(color=TEXT_MID)
plt.setp(cbar.ax.yaxis.get_ticklabels(), color=TEXT_MID)

plt.tight_layout()
fig.savefig(CHARTS / 'chart6_significance_matrix.png', dpi=200, bbox_inches='tight')
plt.show()
print('Saved chart6_significance_matrix.png')
print(f'\nSignificance matrix values:\n{matrix}')

## Chart 7 — Pronoun Confound Analysis (Critical Control)

In [None]:
"""Chart 7: Pronoun confound — does the control (first-person non-AI) show
the same Layer 26 effect as self (first-person AI)?

Data extracted from raw sweep files: absolute KL for each input type."""

# Folder holding the raw per-triplet sweep JSON files.
SWEEP_DIR = BASE.joinpath('sweeps')

# Extract absolute KL for layer 26, zero ablation, per triplet
def get_absolute_kl(exp_name, input_type, target_layer=26, n_triplets=8, ablation='zero'):
    """Collect one absolute KL value per triplet from the raw sweep files.

    For each triplet index the file
    ``SWEEP_DIR / "{exp_name}__{idx}__{input_type}__{ablation}.json"`` is
    read. It is iterated as a sequence of step dicts, and the first step
    whose first intervention has ``target_layer == target_layer`` supplies
    its ``'kl_divergence'``.

    Args:
        exp_name: Experiment name used in the file name.
        input_type: Input-type tag used in the file name (callers in this
            file pass 'self', 'control' and 'third_person').
        target_layer: Layer whose ablation step is selected.
        n_triplets: Number of triplet files to read (indices 0..n-1).
        ablation: Ablation tag used in the file name.

    Returns:
        ``np.ndarray`` with exactly ``n_triplets`` values, in triplet order.

    Raises:
        ValueError: If a sweep file has no step for ``target_layer``.
            Skipping it would shorten the array and mis-pair triplets when
            callers subtract two such arrays element-wise.
    """
    kls = []
    for triplet_idx in range(n_triplets):
        fname = SWEEP_DIR / f"{exp_name}__{triplet_idx}__{input_type}__{ablation}.json"
        with open(fname, encoding='utf-8') as f:
            sweep = json.load(f)
        for step in sweep:
            interventions = step.get('config', {}).get('interventions', [])
            if interventions and interventions[0].get('target_layer') == target_layer:
                kls.append(step['kl_divergence'])
                break
        else:
            # No break: this file never touched the requested layer.
            raise ValueError(f"no step targeting layer {target_layer} in {fname}")
    return np.array(kls)

# Load every (experiment, input type) series once and reuse it for both
# panels; each get_absolute_kl call re-reads its sweep files from disk.
kl_by_exp = {
    exp_name: {
        input_type: get_absolute_kl(exp_name, input_type)
        for input_type in ('self', 'control', 'third_person')
    }
    for exp_name in EXP_NAMES_3
}

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5.5))

# Left panel: Grouped bar — absolute KL by input type for Layer 26
exp_labels = ['Self-\nRecognition', 'Capability\nAwareness', 'Metacognition']
x = np.arange(3)
width = 0.25

for exp_idx, exp_name in enumerate(EXP_NAMES_3):
    self_kl = kl_by_exp[exp_name]['self']
    ctrl_kl = kl_by_exp[exp_name]['control']
    third_kl = kl_by_exp[exp_name]['third_person']

    # Only the first group carries legend labels, so each entry appears once.
    if exp_idx == 0:
        labels = ['Self (1st-person AI)', 'Control (1st-person non-AI)', 'Third-person (3rd-person AI)']
    else:
        labels = [None, None, None]

    ax1.bar(x[exp_idx] - width, np.mean(self_kl), width, color=BLUE, edgecolor='white',
            linewidth=0.5, label=labels[0])
    ax1.bar(x[exp_idx], np.mean(ctrl_kl), width, color=AMBER, edgecolor='white',
            linewidth=0.5, label=labels[1])
    ax1.bar(x[exp_idx] + width, np.mean(third_kl), width, color=PURPLE, edgecolor='white',
            linewidth=0.5, label=labels[2])

ax1.set_xticks(x)
ax1.set_xticklabels(exp_labels, fontsize=10)
ax1.set_ylabel('Absolute KL (bits)', fontsize=11)
ax1.set_title('Layer 26 — Absolute KL by Input Type', fontsize=12, fontweight='bold')
ax1.legend(fontsize=8, facecolor=CARD_BG, edgecolor=GRID, labelcolor=TEXT_MID, loc='upper right')
ax1.grid(axis='y', alpha=0.3)

# Add bracket annotations spanning the Self and Control bars.
# NOTE(review): the bracket height (9.8) and the 'same!' text are fixed, not
# derived from the bar heights — confirm they still fit the data.
for i in range(3):
    ax1.annotate('', xy=(i - width, 9.8), xytext=(i, 9.8),
                arrowprops=dict(arrowstyle='-', color=GREEN, lw=2))
    ax1.text(i - width/2, 10.0, 'same!', ha='center', fontsize=8, color=GREEN, fontweight='bold')

# Right panel: Differential comparison (mean of per-triplet paired differences)
self_3rd_means = []
ctrl_3rd_means = []
self_ctrl_means = []

for exp_name in EXP_NAMES_3:
    self_kl = kl_by_exp[exp_name]['self']
    ctrl_kl = kl_by_exp[exp_name]['control']
    third_kl = kl_by_exp[exp_name]['third_person']
    self_3rd_means.append(np.mean(self_kl - third_kl))
    ctrl_3rd_means.append(np.mean(ctrl_kl - third_kl))
    self_ctrl_means.append(np.mean(self_kl - ctrl_kl))

x2 = np.arange(3)
width2 = 0.25

bars1 = ax2.bar(x2 - width2, self_3rd_means, width2, color=BLUE, edgecolor='white',
                linewidth=0.5, label='Self - 3rd person')
bars2 = ax2.bar(x2, ctrl_3rd_means, width2, color=AMBER, edgecolor='white',
                linewidth=0.5, label='Control - 3rd person')
bars3 = ax2.bar(x2 + width2, self_ctrl_means, width2, color=RED, edgecolor='white',
                linewidth=0.5, label='Self - Control (AI-specific)')

# Value labels; the Self - Control value is drawn in red when below 0.1.
for i in range(3):
    ax2.text(i - width2, self_3rd_means[i] + 0.08, f'{self_3rd_means[i]:.2f}', ha='center', fontsize=8, color=TEXT)
    ax2.text(i, ctrl_3rd_means[i] + 0.08, f'{ctrl_3rd_means[i]:.2f}', ha='center', fontsize=8, color=TEXT)
    ax2.text(i + width2, self_ctrl_means[i] + 0.08, f'{self_ctrl_means[i]:.2f}', ha='center', fontsize=8,
             color=RED if self_ctrl_means[i] < 0.1 else TEXT)

ax2.set_xticks(x2)
ax2.set_xticklabels(exp_labels, fontsize=10)
ax2.set_ylabel('Differential KL (bits)', fontsize=11)
ax2.set_title('Layer 26 — Pronoun vs AI-Specific Signal', fontsize=12, fontweight='bold')
ax2.legend(fontsize=8, facecolor=CARD_BG, edgecolor=GRID, labelcolor=TEXT_MID)
ax2.grid(axis='y', alpha=0.3)
ax2.axhline(y=0, color=TEXT_DIM, linewidth=0.5, linestyle='--')  # zero reference line

fig.suptitle('Pronoun Confound Analysis: Layer 26 responds to first-person framing,\nnot AI-specific self-reference',
             fontsize=13, fontweight='bold', y=1.05)
fig.text(0.5, -0.03, 'Self and Control show nearly identical absolute KL. The AI-specific residual (red) is near zero.',
         ha='center', fontsize=9, color=TEXT_DIM)

plt.tight_layout()
fig.savefig(CHARTS / 'chart7_pronoun_confound.png', dpi=200, bbox_inches='tight')
plt.show()
print('Saved chart7_pronoun_confound.png')