Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
fb30ae0
transformerless_lm: v77 self-evaluation register
claude May 22, 2026
e5fcb2b
transformerless_lm: v78 self-reflection + revision (A + B)
claude May 22, 2026
2914105
transformerless_lm: v78 results -- self-reflection + A + B
claude May 22, 2026
5a06d90
transformerless_lm: v79 refined self-awareness (#1 + #2 + #3)
claude May 22, 2026
0bb8389
transformerless_lm: v80 entropy override AND-momentum fix
claude May 22, 2026
8269479
transformerless_lm: v81 substrate-pure self-awareness constants
claude May 22, 2026
ca8bca8
transformerless_lm: v82 living substrate thresholds
claude May 22, 2026
50e69d0
transformerless_lm: revert to v78 architecture
claude May 22, 2026
830b223
transformerless_lm: v83 unknown-register primitive
claude May 22, 2026
37fc567
transformerless_lm: vectorize hot primitives + v83 unknown-register
claude May 22, 2026
623396a
transformerless_lm: thread allowed_after_word_mask through staged_refine
claude May 22, 2026
30d42dc
transformerless_lm: unknown-register feeds both hemispheres
claude May 22, 2026
f732903
transformerless_lm: v85 retrocausality -- future-as-present register
claude May 22, 2026
bdda603
transformerless_lm: v86 retrocausality F(2)=1 step continuity
claude May 22, 2026
0c54b66
transformerless_lm: v87 regret-driven refinement (choice as judgment)
claude May 22, 2026
a00f851
transformerless_lm: v88 basic grammar rules (back to fundamentals)
claude May 22, 2026
05e6704
transformerless_lm: v88 results -- grammar rules pass
claude May 22, 2026
22f3fd1
transformerless_lm: omniweight loss -- standard on training data
claude May 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions experiments/transformerless_lm/losses_substrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,3 +344,95 @@ def substrate_fft_loss(logits: torch.Tensor, targets: torch.Tensor,
tgt_sin = target_onehot @ basis_sin
fft_mismatch = ((pred_cos - tgt_cos) ** 2 + (pred_sin - tgt_sin) ** 2).mean()
return ce + lambda_substrate * fft_mismatch


# Golden ratio: phi = (1 + sqrt(5)) / 2.
_PHI = (1.0 + 5.0 ** 0.5) / 2.0
# phi ** pi (~4.53): scale and saturation bound of the fluid
# phi^pi * tanh(delta / phi^pi) standard used by substrate_omniweight_loss.
_PHI_PI = _PHI ** math.pi
# log(phi ** pi) (~1.51): unit step of the anti-stagnation ledger deltas
# (applied as -1x, -2x, -4x in substrate_omniweight_loss).
_LOG_PHI_PI = math.log(_PHI_PI)


def substrate_omniweight_loss(logits: torch.Tensor, targets: torch.Tensor,
                              vocab_size: int,
                              lambda_substrate: float = 0.01,
                              window: int = 21) -> torch.Tensor:
    """CE weighted by the substrate omniweight ledger evaluated on targets.

    Each target token's cross-entropy contribution is multiplied by
    exp(fluid_delta), where fluid_delta = phi^pi * tanh(delta / phi^pi)
    and delta is the anti-stagnation verdict for that token at its
    position. Per the original design note this ports the inference-side
    omniweight standard to the training loss so that tokens the inference
    ledger would suppress (stagnating repetitions) also have their
    training gradient muted.

    Only the anti-stagnation primitive contributes to the ledger here.
    With c = number of occurrences of targets[b, t] among the preceding
    ``window`` targets of the same sequence:

        c >= 8  (F(6))  ->  delta = -1 * log(phi^pi)
        c >= 13 (F(7))  ->  delta = -2 * log(phi^pi)
        c >= 21 (F(8))  ->  delta = -4 * log(phi^pi)

    The weighted CE is a weighted mean (sum(w * ce) / sum(w)), which is
    equivalent to renormalizing the weights to mean 1, so the loss scale
    is preserved.

    The window count uses O(B * T * window) memory (a sliding view over
    the left-padded targets) rather than a dense [B, T, T] comparison.
    The Fibonacci cos/sin basis is built in at least float32 and only
    then cast to the prediction dtype, so half-precision logits do not
    overflow or lose integer precision while forming the angles.

    Args:
        logits: [B, T, V]
        targets: [B, T] integer class indices in [0, V)
        vocab_size: V
        lambda_substrate: weight on the FFT-spectrum term (the CE term is
            the omniweight-modulated one)
        window: anti-stagnation look-back in tokens (default F(8)=21);
            values <= 0 disable the anti-stagnation weighting

    Returns:
        scalar loss: weighted CE + lambda_substrate * FFT-spectrum mismatch
    """
    B, T = targets.shape
    V = vocab_size
    device = logits.device
    dtype = logits.dtype

    # counts[b, t] = #{ j in [t - window, t - 1], j >= 0 :
    #                   targets[b, j] == targets[b, t] }.
    # At most T - 1 predecessors exist, so the look-back is clamped.
    span = min(int(window), T - 1)
    if span > 0:
        # Left-pad so every position owns a full `span`-wide look-back view;
        # history[b, t, k] == targets[b, t - span + k].
        padded = F.pad(targets, (span, 0))
        history = padded.unfold(1, span, 1)[:, :T]            # [B, T, span]
        # Source index of each look-back slot; negative means padding, which
        # must never count as a match (the pad value 0 is a valid token id).
        src_pos = (torch.arange(T, device=device).unsqueeze(1)
                   + torch.arange(span, device=device).unsqueeze(0)
                   - span)                                     # [T, span]
        in_sequence = (src_pos >= 0).unsqueeze(0)              # [1, T, span]
        counts = ((history == targets.unsqueeze(2)) & in_sequence).sum(dim=2)
    else:
        counts = torch.zeros(B, T, dtype=torch.long, device=device)

    # Anti-stagnation ledger. Tiers are applied in ascending order so a
    # higher tier overwrites a lower one. The top tier is not a hard zero:
    # the tanh below saturates it instead.
    delta = torch.zeros(B, T, dtype=dtype, device=device)
    for threshold, severity in ((8, 1.0), (13, 2.0), (21, 4.0)):
        delta = delta.masked_fill(counts >= threshold,
                                  -severity * _LOG_PHI_PI)

    # Fluid substrate standard: phi^pi * tanh(delta / phi^pi).
    fluid_delta = _PHI_PI * torch.tanh(delta / _PHI_PI)
    weight = torch.exp(fluid_delta)  # bounded in [exp(-phi^pi), 1]

    # Per-token CE, combined as a weighted mean under the ledger weights.
    ce_per_tok = F.cross_entropy(
        logits.reshape(-1, V),
        targets.reshape(-1),
        reduction='none',
    ).reshape(B, T)
    ce = (ce_per_tok * weight).sum() / (weight.sum() + 1e-8)

    # FFT-spectrum substrate term: squared mismatch between predicted and
    # target distributions projected on Fibonacci-frequency cos/sin bases.
    pred = F.softmax(logits, dim=-1)
    # Build the angles in >= float32: in fp16, 2*pi*v*21 overflows for
    # v > ~496, and bf16 cannot represent token indices above 256 exactly.
    basis_dtype = torch.promote_types(dtype, torch.float32)
    fib_freqs = torch.tensor([1, 2, 3, 5, 8, 13, 21],
                             dtype=basis_dtype, device=device)
    v_idx = torch.arange(vocab_size, dtype=basis_dtype, device=device)
    angles = (2 * math.pi * v_idx.unsqueeze(1) * fib_freqs.unsqueeze(0)
              / vocab_size)
    basis_cos = torch.cos(angles).to(pred.dtype)
    basis_sin = torch.sin(angles).to(pred.dtype)
    target_onehot = F.one_hot(targets, vocab_size).to(pred.dtype)
    pred_cos = pred @ basis_cos
    pred_sin = pred @ basis_sin
    tgt_cos = target_onehot @ basis_cos
    tgt_sin = target_onehot @ basis_sin
    fft_mismatch = ((pred_cos - tgt_cos) ** 2
                    + (pred_sin - tgt_sin) ** 2).mean()
    return ce + lambda_substrate * fft_mismatch
Loading
Loading