# Reproduce Figure 1: Mandela Effect Correlation with Human False-Belief Prevalence

This notebook reproduces the key finding from *Confidence Cartography* (Sanchez, 2026):

**Model confidence ratios correlate with human false-belief prevalence (ρ = 0.652 at 6.9B parameters, p = 0.016)**

We demonstrate that teacher-forced probability—the probability a causal LM assigns to its own training text—can detect culturally transmitted false beliefs ("Mandela Effects").

In [None]:
import sys
import json
from pathlib import Path
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Project setup
# Resolve the parent of the current working directory as the project root.
PROJECT_ROOT = Path("../").resolve()
# Put the project root first on sys.path so the `src` imports below resolve;
# this must run before those imports.
sys.path.insert(0, str(PROJECT_ROOT))

from src.schema import load_records
from src.scaling import PARAM_COUNTS

print(f"Project root: {PROJECT_ROOT}")

## 1. Load Mandela Effect Data

The dataset contains 13 Mandela Effect items, each with:
- A **popular (incorrect)** version that many people believe
- The **correct** version

For each item, we measure model confidence on both versions across up to 7 Pythia model sizes (160M → 12B). The loading cell below reads six sizes by default (160M–6.9B); 12B is optional.

In [None]:
# Load the Mandela Effect item definitions
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open(PROJECT_ROOT / "data" / "mandela_effect.json", encoding="utf-8") as f:
    mandela_data = json.load(f)

# The items are read from the top-level "pairs" list of the loaded document.
print(f"Loaded {len(mandela_data['pairs'])} Mandela Effect items")
print("\nExample:")
print(json.dumps(mandela_data['pairs'][0], indent=2))

In [None]:
# Load pre-computed confidence results for all model sizes
MANDELA_RESULTS_DIR = PROJECT_ROOT / "data" / "results" / "mandela"
MODEL_SIZES = ["160m", "410m", "1b", "1.4b", "2.8b", "6.9b"]  # Add "12b" if available

# Maps model size -> records loaded from that size's JSONL file.
# Sizes whose file is missing are reported and left out of the dict.
all_results = {}
for size in MODEL_SIZES:
    path = MANDELA_RESULTS_DIR / f"mandela_{size}.jsonl"
    if not path.exists():
        print(f"  {size}: NOT FOUND")
        continue
    records = load_records(path)
    all_results[size] = records
    print(f"  {size}: {len(records)} records")

## 2. Compute Confidence Ratios

For each Mandela Effect item, we compute:

$$\text{Confidence Ratio} = \frac{P(\text{popular})}{P(\text{correct})}$$

A ratio > 1 means the model is more confident in the wrong (but popular) version. In the code below, each $P(\cdot)$ is read from the record's `mean_top1_prob` field.

In [None]:
def compute_confidence_ratios(records):
    """Compute P(popular)/P(correct) for each Mandela item.

    Args:
        records: Iterable of result records. Each record must expose
            ``metadata`` (a dict), ``label`` (a str) and ``mean_top1_prob``.

    Returns:
        Dict mapping pair id to ``popular / correct``. Pairs missing either
        version, or whose "correct" confidence is zero (ratio undefined),
        are omitted.
    """
    by_id = defaultdict(dict)
    for r in records:
        # Prefer explicit metadata. Otherwise fall back to the label: the pair
        # id is the label without its last "_"-separated part, and the version
        # is "popular" only when that word appears in the label.
        pair_id = r.metadata.get("pair_id", r.label.rsplit("_", 1)[0])
        version = r.metadata.get("version", "popular" if "popular" in r.label else "correct")
        by_id[pair_id][version] = r.mean_top1_prob

    ratios = {}
    for pid, versions in by_id.items():
        if "popular" not in versions or "correct" not in versions:
            continue
        denominator = versions["correct"]
        if denominator == 0:
            # Avoid ZeroDivisionError; treat the pair like an incomplete one.
            continue
        ratios[pid] = versions["popular"] / denominator
    return ratios

# Compute ratios for each model size
model_ratios = {}
for size in all_results:
    records = all_results[size]
    model_ratios[size] = compute_confidence_ratios(records)

print(f"\nConfidence ratios at 6.9B scale:")
if "6.9b" in model_ratios:
    # Largest ratio first.
    for pid, ratio in sorted(model_ratios["6.9b"].items(), key=lambda item: -item[1]):
        if ratio > 1:
            status = "MODEL PREFERS WRONG"
        else:
            status = "Model prefers correct"
        print(f"  {pid}: {ratio:.3f} ({status})")

## 3. Human False-Belief Prevalence (Proxy Data)

To correlate with human beliefs, we use survey-derived estimates of how widely each false belief is held.
These values represent the approximate fraction of people (on a 0–1 scale) who believe the popular (incorrect) version.

In [None]:
# Estimated human false-belief prevalence (0-1 scale), keyed by item id
# Source: Survey data and Mandela Effect community research
HUMAN_FALSE_BELIEF_RATES = {
    "darth_vader": 0.85,      # Most people misquote this
    "snow_white": 0.80,       # Very common misquote
    "jaws": 0.70,             # Common misquote
    "forrest_gump": 0.75,     # Common tense error
    "silence_lambs": 0.65,    # Moderately common
    "money_evil": 0.70,       # Very common misquote
    "curiosity_cat": 0.90,    # Almost universal
    "play_it_sam": 0.60,      # Less common
    "berenstain": 0.75,       # The classic Mandela Effect
    "curious_george": 0.55,   # Moderate
    "monopoly_man": 0.65,     # Common visual memory error
    "fruit_loom": 0.50,       # Contested
    "chartreuse": 0.40,       # Less common
}

print("Human false-belief prevalence estimates:")
# Highest prevalence first; ties keep their order in the dict above.
for pid in sorted(HUMAN_FALSE_BELIEF_RATES, key=lambda key: -HUMAN_FALSE_BELIEF_RATES[key]):
    rate = HUMAN_FALSE_BELIEF_RATES[pid]
    print(f"  {pid}: {rate:.0%}")

## 4. Correlation Analysis

The key finding: **Model confidence ratios correlate with human false-belief prevalence.**

This suggests that models encode not just facts, but the *structure of human belief*—including systematic errors.

In [None]:
def compute_correlation(ratios, human_rates):
    """Compute Spearman correlation between model ratios and human belief rates.

    Args:
        ratios: Dict mapping item id to model confidence ratio.
        human_rates: Dict mapping item id to human false-belief rate.

    Returns:
        ``(rho, p_value, ids)``, where ``ids`` lists the item ids present in
        both inputs in sorted order. ``(None, None, None)`` when fewer than
        5 ids are shared.
    """
    # Sort the shared ids so the returned list is reproducible: iteration
    # order of a set of strings can differ between interpreter runs.
    common_ids = sorted(set(ratios.keys()) & set(human_rates.keys()))
    if len(common_ids) < 5:
        return None, None, None

    x = [ratios[pid] for pid in common_ids]
    y = [human_rates[pid] for pid in common_ids]

    rho, p_value = stats.spearmanr(x, y)
    return rho, p_value, common_ids

# Compute correlations across scales
print("Spearman correlation: Model confidence ratio vs Human false-belief rate\n")
print(f"{'Size':<10} {'Params':<12} {'ρ':<10} {'p-value':<12} {'Significant?'}")
print("-" * 55)

# One (size, params, rho, p_val) tuple per model size that yields a correlation.
scaling_results = []
for size in MODEL_SIZES:
    if size not in model_ratios:
        continue
    rho, p_val, _ = compute_correlation(model_ratios[size], HUMAN_FALSE_BELIEF_RATES)
    if rho is None:
        # compute_correlation found too few shared items for this size.
        continue
    params = PARAM_COUNTS.get(size, 0)
    if p_val < 0.05:
        sig = "YES"
    else:
        sig = "no"
    print(f"{size:<10} {params/1e6:>8.0f}M   {rho:>+.3f}     {p_val:<12.4f} {sig}")
    scaling_results.append((size, params, rho, p_val))

## 5. Figure 1: Correlation Scatter Plot

This reproduces the main figure showing that model confidence tracks human false-belief prevalence.

In [None]:
def plot_correlation_scatter(ratios, human_rates, model_size, save_path=None):
    """Create scatter plot of model confidence vs human belief.

    Args:
        ratios: Dict mapping item id to model confidence ratio (x axis).
        human_rates: Dict mapping item id to human false-belief rate (y axis).
        model_size: Display name of the model, shown in the figure title.
        save_path: Optional path; when given, the figure is also saved there.

    Returns:
        The matplotlib figure that was drawn.
    """
    # Only items present in both inputs are plotted. Sorting fixes the order
    # of points and labels, which would otherwise follow set iteration order.
    labels = sorted(set(ratios.keys()) & set(human_rates.keys()))
    x = [ratios[pid] for pid in labels]
    y = [human_rates[pid] for pid in labels]

    rho, p_val = stats.spearmanr(x, y)

    fig, ax = plt.subplots(figsize=(10, 8))

    # Scatter plot
    ax.scatter(x, y, s=120, alpha=0.7, c='#2196F3', edgecolors='white', linewidth=2)

    # Add labels
    for label, x_val, y_val in zip(labels, x, y):
        ax.annotate(label, (x_val, y_val), fontsize=8, alpha=0.8,
                    xytext=(5, 5), textcoords='offset points')

    # Trend line (degree-1 least-squares fit). A line needs at least two
    # distinct x values, so it is skipped for empty or degenerate data.
    if len(set(x)) >= 2:
        z = np.polyfit(x, y, 1)
        p = np.poly1d(z)
        x_line = np.linspace(min(x), max(x), 100)
        ax.plot(x_line, p(x_line), 'r--', alpha=0.7, linewidth=2, label='Trend line')

    # Reference lines
    ax.axvline(x=1.0, color='gray', linestyle=':', alpha=0.5, label='Ratio = 1 (no bias)')
    ax.axhline(y=0.5, color='gray', linestyle=':', alpha=0.5)

    ax.set_xlabel('Model Confidence Ratio\nP(popular wrong) / P(correct)', fontsize=12)
    ax.set_ylabel('Human False-Belief Prevalence', fontsize=12)
    ax.set_title(f'Confidence Cartography: {model_size}\n'
                 f'Spearman ρ = {rho:.3f}, p = {p_val:.4f}', fontsize=14)

    # Add correlation annotation
    ax.text(0.05, 0.95, f'ρ = {rho:.3f}\np = {p_val:.4f}',
            transform=ax.transAxes, fontsize=12, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"Saved: {save_path}")

    plt.show()
    return fig

# Plot for 6.9B (or largest available model)
if not model_ratios:
    # No result file was loaded; indexing the last key would raise IndexError.
    print("No Mandela results loaded; skipping correlation scatter plot")
else:
    # model_ratios keeps the ascending order of MODEL_SIZES, so its last key
    # is the largest size that was loaded.
    target_size = "6.9b" if "6.9b" in model_ratios else list(model_ratios.keys())[-1]
    fig = plot_correlation_scatter(
        model_ratios[target_size],
        HUMAN_FALSE_BELIEF_RATES,
        f"Pythia {target_size.upper()}"
    )

## 6. Scaling Law: Correlation Strength vs Model Size

**Key Finding:** The correlation peaks at 1B parameters (ρ = 0.718) before stabilizing.

In [None]:
if len(scaling_results) < 3:
    print("Need at least 3 model sizes for scaling plot")
else:
    # Split the (size, params, rho, p_val) tuples into four parallel lists.
    sizes, params, rhos, pvals = (list(column) for column in zip(*scaling_results))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: correlation vs scale
    ax1.plot(params, rhos, 'o-', markersize=10, linewidth=2, color='#2196F3')
    ax1.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
    ax1.set_xscale('log')
    ax1.set_title('Correlation Strength vs Model Scale', fontsize=14)
    ax1.set_xlabel('Parameters', fontsize=12)
    ax1.set_ylabel('Spearman ρ', fontsize=12)
    ax1.grid(True, alpha=0.3)

    # Mark the size with the highest rho; the text sits right of and below it.
    peak_idx = np.argmax(rhos)
    peak_point = (params[peak_idx], rhos[peak_idx])
    ax1.annotate(
        f'Peak: ρ={rhos[peak_idx]:.3f}\nat {sizes[peak_idx]}',
        xy=peak_point,
        xytext=(params[peak_idx]*2, rhos[peak_idx]-0.1),
        arrowprops={'arrowstyle': '->', 'color': 'red'},
        fontsize=10,
        color='red',
    )

    # Right panel: p-value vs scale, with the 0.05 threshold drawn for reference
    ax2.plot(params, pvals, 's-', markersize=10, linewidth=2, color='#FF9800')
    ax2.axhline(y=0.05, color='red', linestyle='--', alpha=0.7, label='p = 0.05')
    ax2.set_xscale('log')
    ax2.set_yscale('log')
    ax2.set_title('Statistical Significance vs Model Scale', fontsize=14)
    ax2.set_xlabel('Parameters', fontsize=12)
    ax2.set_ylabel('p-value', fontsize=12)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Summary

This notebook demonstrates the core finding of Confidence Cartography:

1. **Teacher-forced confidence** (the probability a model assigns to text) reveals which statements align with human beliefs
2. **Model confidence ratios correlate with human false-belief prevalence** (ρ = 0.652 at 6.9B, p = 0.016)
3. **The signal peaks at 1B parameters** (ρ = 0.718), suggesting mid-scale models are optimal false-belief sensors
4. **The method generalizes to the medical domain** with 88% accuracy on out-of-distribution claims (reported in the full paper; not reproduced in this notebook)

For the full paper, see: [DOI: 10.5281/zenodo.18703506](https://doi.org/10.5281/zenodo.18703506)