In [2]:
"""
Graph Model Implementation Notes
    Version 9.2 introduces the **Recursive Expression Engine**
    (a learnable grammatical compute model)
    and **Cylindrical Time** (hierarchical rotary embeddings),
    integrating them with the **Fractal Topology**
    and **Relativistic Economics** of Versions 6–9.
"""

from __future__ import annotations
from typing import List, Dict, Optional, Any, Union, Tuple, Deque
from collections import deque
import numpy as np

# ============================================================
# SECTION 1 — The Physics Layer (Universal Representations)
# ============================================================

class LossComplexity:
    """
    [Restored from V6/User Request]
    The Relativistic Barrier: a two-axis (space, time) complexity budget.

    Usage is accumulated in ``current_space`` / ``current_time`` and priced
    against ``limit_space`` / ``limit_time`` with a Lorentz-style factor.
    """
    def __init__(self, limit_space: float = 1e6, limit_time: float = 100.0):
        # Ceilings for each budget axis.
        self.limit_space = limit_space
        self.limit_time = limit_time

        # Curvature multiplier applied to the summed penalty.
        # Higher gamma = harder wall, lower gamma = softer wall.
        self.gamma = 1.0

        # Nothing has been spent yet.
        self.current_space = 0.0
        self.current_time = 0.0

    @staticmethod
    def _lorentz(used: float, limit: float) -> float:
        """Return 1 / sqrt(1 - r^2) for r = used / limit, with r capped at 0.99."""
        # The cap keeps the square root strictly positive.
        ratio = min(used / limit, 0.99)
        return 1.0 / np.sqrt(1.0 - ratio**2)

    def get_barrier_penalty(self) -> float:
        """
        Price the current usage on both axes.

        Each axis contributes 1 / sqrt(1 - (current/limit)^2); the two terms
        are summed and scaled by ``gamma``.
        """
        space_term = self._lorentz(self.current_space, self.limit_space)
        time_term = self._lorentz(self.current_time, self.limit_time)
        return (space_term + time_term) * self.gamma

    def distribute_tokens(self, amount_space: float, amount_time: float) -> bool:
        """
        Try to spend ``amount_space`` and ``amount_time`` from the budget.

        Returns True and commits both amounts when neither axis would reach
        its limit; otherwise returns False and leaves the usage unchanged.
        """
        proposed_space = self.current_space + amount_space
        proposed_time = self.current_time + amount_time

        over_budget = proposed_space >= self.limit_space or proposed_time >= self.limit_time
        if over_budget:
            return False

        self.current_space, self.current_time = proposed_space, proposed_time
        return True


class UniversalWorm:
    """[Restored from V8] Helper for Z-Order Linearization."""
    def z_order_argsort(self, coords: np.ndarray) -> np.ndarray:
        """
        Return the permutation that visits ``coords`` along a Z-order curve.

        Args:
            coords: array-like of shape (n_points, n_dims) holding grid
                coordinates. A 1-D input is treated as n_points x 1. Values
                are cast to int64 (fractional parts are truncated) and
                shifted so every axis starts at zero.

        Returns:
            Integer index array of length n_points; ``coords[result]`` is
            ordered by ascending Morton code. Ties keep their input order.

        Raises:
            ValueError: if the interleaved code would not fit in 63 bits.
        """
        grid = np.asarray(coords)
        if grid.ndim == 1:
            grid = grid.reshape(-1, 1)

        n_points = grid.shape[0]
        n_dims = grid.shape[1] if grid.ndim > 1 else 0
        if n_points == 0 or n_dims == 0:
            # Nothing to interleave: fall back to the identity permutation.
            return np.arange(n_points)

        grid = grid.reshape(n_points, -1).astype(np.int64)
        n_dims = grid.shape[1]
        # Shift each axis to start at 0 so negative coordinates sort correctly.
        grid = grid - grid.min(axis=0)

        n_bits = max(1, int(grid.max()).bit_length())
        if n_bits * n_dims > 63:
            raise ValueError(
                "coordinates too large for a 63-bit Morton code: "
                f"{n_bits} bits x {n_dims} dims"
            )

        # Interleave bits: for each bit plane the LAST axis occupies the
        # lowest position, so for (y, x) pairs x varies fastest.
        codes = np.zeros(n_points, dtype=np.int64)
        for bit in range(n_bits):
            for axis in range(n_dims):
                plane = (grid[:, axis] >> bit) & 1
                codes |= plane << (bit * n_dims + (n_dims - 1 - axis))

        return np.argsort(codes, kind="stable")

class CylindricalRoPE:
    """
    [V9.2 New Logic]
    Hierarchical Rotary Positional Embeddings.
    Maps a linear sequence index to Cylindrical Coordinates (Day, Hour).

    - Axis 1 (Hour): index % day_length, rotated with base-10000 frequencies.
    - Axis 2 (Day): index // day_length, rotated with base-100000 frequencies.

    The two angles are summed, so each feature pair is rotated once by the
    combined (hour + day) angle.
    """
    def __init__(self, dim: int, day_length: int = 1024):
        """
        Args:
            dim: feature width; must be a positive even number because
                features are rotated in (even, odd) pairs.
            day_length: number of positions per "day"; must be positive.

        Raises:
            ValueError: if ``dim`` is not a positive even number or
                ``day_length`` is not positive.
        """
        if dim <= 0 or dim % 2 != 0:
            raise ValueError(f"dim must be a positive even number, got {dim}")
        if day_length <= 0:
            raise ValueError(f"day_length must be positive, got {day_length}")

        self.dim = dim
        self.day_length = day_length

        # One frequency per feature pair; both axes share the exponent ladder.
        exponents = np.arange(0, dim, 2) / dim
        # 'Hour' (local ring): base 10000.
        self.freqs_hour = 1.0 / (10000 ** exponents)
        # 'Day' (global spiral): base 100000, i.e. slower than the ring.
        self.freqs_day = 1.0 / (100000 ** exponents)

    def apply(self, x: np.ndarray, start_index: int = 0) -> np.ndarray:
        """
        Rotate every (even, odd) feature pair of ``x`` by its position angle.

        Args:
            x: array of shape (seq_len, dim).
            start_index: absolute position of the first row of ``x``.

        Returns:
            Array of the same shape. Floating/complex inputs keep their
            dtype; any other dtype is promoted to float64 so the rotation
            is not truncated.

        Raises:
            ValueError: if ``x`` is not 2-D with ``dim`` columns.
        """
        x = np.asarray(x)
        if x.ndim != 2 or x.shape[1] != self.dim:
            raise ValueError(
                f"expected x of shape (seq_len, {self.dim}), got {x.shape}"
            )

        seq_len = x.shape[0]
        indices = np.arange(start_index, start_index + seq_len)

        # 1. Decompose the linear index into (day, hour).
        days = indices // self.day_length
        hours = indices % self.day_length

        # 2. One angle per (position, feature pair): shape (seq_len, dim/2).
        total_angle = np.outer(hours, self.freqs_hour) + np.outer(days, self.freqs_day)
        cos_t = np.cos(total_angle)
        sin_t = np.sin(total_angle)

        # 3. Standard 2-D rotation applied to each [x_even, x_odd] pair.
        x_even = x[:, 0::2]
        x_odd = x[:, 1::2]

        out_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
        x_rotated = np.empty(x.shape, dtype=out_dtype)
        x_rotated[:, 0::2] = x_even * cos_t - x_odd * sin_t
        x_rotated[:, 1::2] = x_even * sin_t + x_odd * cos_t

        return x_rotated

class Feature:
    """
    [V9 Factorization]
    Composite Object: Spline (Physics) + Permutation (Geometry) + Noise (Entropy).

    Holds four independent length-``embedding_dim`` vectors.
    """
    def __init__(self, embedding_dim: int):
        def zero_vector() -> np.ndarray:
            # A fresh array per call, so the fields never alias each other.
            return np.zeros(embedding_dim)

        # Spline component, initialised to zero.
        self.spline_knots = zero_vector()
        # [Restored V7] Dual-Axis: Row (Topology) & Column (Semantic)
        self.perm_row_coeffs = zero_vector()
        self.perm_col_coeffs = zero_vector()
        # [Restored V8] Renormalization Entropy: constant 1e-5 per feature.
        self.noise_variance = np.full(embedding_dim, 1e-5)


# ============================================================
# SECTION 2 — The Atom Layer (Generalized Primitive)
# ============================================================

class Aperture:
    """
    [Restored V7/V9] Differentiable Window (Global -> Local).

    Carries a single width parameter, ``sigma``.
    """
    def __init__(self):
        # Initial window width: one million.
        self.sigma = 1_000_000.0

class Atom:
    """
    [V9] The Computational Leaf.

    A virtual atom passes its input through untouched. A realized atom
    applies the cylindrical rotary embedding and then its weight matrix.
    """
    def __init__(self, embedding_dim: int, is_virtual: bool = True, init_mode: str = "identity"):
        self.is_virtual = is_virtual
        self.embedding_dim = embedding_dim

        # [V9.2 FIX] Learnable projection lives directly on the Atom.
        # "identity" starts as a pass-through; any other mode starts as a
        # small random projection (std 0.02).
        starts_as_identity = init_mode == "identity"
        if not starts_as_identity:
            self.weights = np.random.randn(embedding_dim, embedding_dim) * 0.02
        else:
            self.weights = np.eye(embedding_dim)

        self.aperture = Aperture()
        self.feature = Feature(embedding_dim)

        # [V9.2 Update] Hierarchical position encoding.
        self.rope = CylindricalRoPE(embedding_dim, day_length=1024)

        # Virtual atoms cost a tenth of a realized one.
        self.latency_cost = 1.0 if not is_virtual else 0.1

    def realize(self):
        """Switch the atom from virtual to real and charge full latency."""
        self.is_virtual = False
        self.latency_cost = 1.0

    def process(self, input_stream: np.ndarray, stream_offset: int = 0) -> np.ndarray:
        """
        Transform ``input_stream``.

        Virtual atoms return the input object unchanged. Realized atoms
        rotate it with ``self.rope`` starting at ``stream_offset`` and then
        multiply by ``self.weights``.
        """
        if self.is_virtual:
            return input_stream

        rotated = self.rope.apply(input_stream, start_index=stream_offset)
        projected = np.dot(rotated, self.weights)
        return projected


# ============================================================
# SECTION 3 — The Core Layer (Recursive Expression Tree)
# ============================================================

class LearnablePhi:
    """
    [V9.2 New Logic] Continuous Normalization Function.

    Currently an affine-then-softmax over the last axis:
    softmax(((x * scale) + shift) * temperature).
    """
    def __init__(self):
        self.scale = 1.0
        self.shift = 0.0
        self.temperature = 1.0

    def apply(self, x: np.ndarray) -> np.ndarray:
        """
        Normalise ``x`` along its last axis.

        The row maximum is subtracted before exponentiating, so large
        logits no longer overflow to ``inf / inf = nan`` and very negative
        rows no longer collapse to zero.

        Args:
            x: array with at least one axis.

        Returns:
            Array of the same shape whose last axis sums to ~1. The 1e-6
            epsilon in the denominator costs at most about 1e-6 relative
            error.
        """
        logits = ((x * self.scale) + self.shift) * self.temperature

        # Softmax is shift-invariant, so removing the row max changes
        # nothing mathematically but bounds every exponent by 0.
        row_max = np.max(logits, axis=-1, keepdims=True, initial=-np.inf)
        # Rows that are empty or all -inf have no finite max; shift by 0
        # there so they still evaluate to 0 instead of nan.
        row_max = np.where(np.isfinite(row_max), row_max, 0.0)

        exp_x = np.exp(logits - row_max)
        return exp_x / (np.sum(exp_x, axis=-1, keepdims=True) + 1e-6)

class MixingNode:
    """
    [V9.2 New Logic] The Recursive N-ary Operator.

    Evaluates every child on the same input, then folds the results from
    left to right, normalising after each fold step.
    """
    def __init__(self, children: List[Union['MixingNode', Atom]]):
        self.children = children
        # One normaliser per fold step (n children -> n - 1 steps).
        self.phis = [LearnablePhi() for _ in range(len(children) - 1)]

    @staticmethod
    def _interact(left: np.ndarray, right: np.ndarray) -> np.ndarray:
        """Combine two child outputs according to their shapes."""
        both_matrices = left.ndim == 2 and right.ndim == 2
        if both_matrices and left.shape == right.shape:
            # (N, D) @ (N, D).T -> (N, N) affinity map.
            # NOTE(review): a square (N, N) map meeting an (N, N) operand
            # also lands here rather than in the apply-map branch — confirm
            # that is intended when N == D.
            return np.dot(left, right.T)
        if both_matrices and left.shape[0] == left.shape[1]:
            # (N, N) @ (N, D) -> (N, D): apply the map.
            return np.dot(left, right)
        # Anything else falls back to element-wise multiplication.
        return left * right

    def execute(self, x: np.ndarray) -> np.ndarray:
        """Run the subtree on ``x`` and return the reduced result."""
        # 1. Resolve children: Atoms are processed, nested nodes recurse.
        outputs = []
        for child in self.children:
            if isinstance(child, Atom):
                outputs.append(child.process(x))
            else:
                outputs.append(child.execute(x))

        # 2. Left-associative reduction over the resolved outputs.
        accumulated = outputs[0]
        for step in range(1, len(outputs)):
            mixed = self._interact(accumulated, outputs[step])
            accumulated = self.phis[step - 1].apply(mixed)

        return accumulated

class Core:
    """
    [V9.2 Updated Logic]
    Constructs and runs the recursive Mixing Tree.

    Without a ``topology_def`` the tree is the default pipe ``[[Q, K], V]``.
    Custom topologies are not parsed yet: passing one leaves ``root`` as
    ``None`` until a caller assigns a tree.
    """
    def __init__(self, embedding_dim: int, topology_def: Optional[List] = None):
        """
        Args:
            embedding_dim: feature width handed to every Atom.
            topology_def: reserved for a custom tree description; any
                non-None value currently results in ``root = None``.
        """
        self.embedding_dim = embedding_dim

        if topology_def is None:
            # Default Initialization: [[Q, K], V]
            # Q: Active (Random Init)
            # K, V: Passive (Identity Init)
            q_atom = Atom(embedding_dim, init_mode="random")
            k_atom = Atom(embedding_dim, init_mode="identity")
            v_atom = Atom(embedding_dim, init_mode="identity")

            # Step 1: The Attention Map Node [Q, K]
            attn_node = MixingNode([q_atom, k_atom])

            # Step 2: The Application Node [AttnMap, V]
            self.root = MixingNode([attn_node, v_atom])
        else:
            # TODO: Parser for arbitrary lists
            self.root = None

    def forward(self, x: np.ndarray) -> np.ndarray:
        """
        Execute the mixing tree on ``x``.

        Raises:
            RuntimeError: if no tree has been built (``root`` is None),
                which is the case whenever a ``topology_def`` was supplied.
        """
        if self.root is None:
            raise RuntimeError(
                "Core has no mixing tree: custom topology_def parsing is not "
                "implemented; construct Core without topology_def or assign "
                "Core.root before calling forward()."
            )
        return self.root.execute(x)


# ============================================================
# SECTION 4 — Logistics & Economy (Restored)
# ============================================================

class ImpedanceCurve:
    """
    [Restored from V7]
    Defines connection cost based on Tree Distance.
    Regulates graph topology to prevent 'Small World' collapse.
    """
    def __init__(self, max_distance: int = 33):
        """
        Args:
            max_distance: upper bound on tree distance. It is now stored
                (previously it was silently dropped); ``get_cost`` does not
                consult it yet.
        """
        self.max_distance = max_distance
        # Eight evenly spaced knots on [0, 10]; not yet used by get_cost.
        self.curve_knots = np.linspace(0, 10, 8) # Monotone Spline

    def get_cost(self, sender_node: Module, receiver_node: Module) -> float:
        """
        Return the connection cost between two nodes.

        The distance is a placeholder: the absolute difference of the two
        nodes' ``level`` attributes. Cost is 0.1 * distance^2.
        """
        # [V9.1 Logic] Calculate Tree Distance in the Fractal Hierarchy
        # dist = tree_distance(sender_node, receiver_node)
        # Placeholder distance:
        dist = abs(sender_node.level - receiver_node.level)

        # Cost increases quadratically with distance (Impedance)
        return float(dist ** 2) * 0.1

class Logistics:
    """
    [Restored from V7/V9]
    Manages the Economy: Sender-Pays-Time / Receiver-Pays-Space.
    Manages Rhythm: an integer clock, ETA prediction and arrival error log.
    """
    def __init__(self):
        self.clock = 0
        self.message_queue = deque()
        # Squared timing errors, one entry per registered arrival.
        self.temporal_error_history = []

    def tick(self):
        """Advance the internal clock by one step."""
        self.clock = self.clock + 1

    def calculate_eta(self, path_latency: float) -> int:
        """Return the clock value at which a message is expected to arrive."""
        # Latency is rounded up to whole ticks.
        whole_ticks = int(np.ceil(path_latency))
        return self.clock + whole_ticks

    def register_arrival(self, predicted_eta: int):
        """[V6 Rhythm Logic] Record the squared gap between now and the ETA."""
        # Squaring penalises both late and early arrivals.
        drift = self.clock - predicted_eta
        self.temporal_error_history.append(drift ** 2)

class Connector:
    """
    [Restored V7] Receiver-Centric Input Port.

    ``embedding_dim`` is accepted but not used by the constructor.
    """
    def __init__(self, embedding_dim: int):
        # Bounded inbox: once 16 items are held, the oldest is dropped.
        inbox = deque(maxlen=16)
        self.buffer = inbox
        # [Restored V7] Alignment strategy tag.
        self.alignment_mode = "DualAxisSpectral"
        # Filled in by whoever creates the connection.
        self.impedance_cost = 0.0

# ============================================================
# SECTION 5 — The Module Layer (Recursive Agent)
# ============================================================

class Trinity:
    """
    [V7/V8/V9] Context -> State -> Service.

    Three independent Cores chained in that order.
    """
    def __init__(self, embedding_dim: int):
        self.context = Core(embedding_dim)
        self.state = Core(embedding_dim)
        self.service = Core(embedding_dim)

    def cycle(self, x: np.ndarray) -> np.ndarray:
        """Feed ``x`` through context, then state, then service."""
        signal = x
        for stage in (self.context, self.state, self.service):
            signal = stage.forward(signal)
        return signal

class Module:
    """
    [V9 Integrated]
    Recursive Container. Manages internal sparsity via sub_modules.
    Holds a LossComplexity budget (Relativistic Barrier).
    """
    def __init__(self, module_id: str, level: int, embedding_dim: int = 256):
        """
        Args:
            module_id: identifier; also the key senders are stored under.
            level: position in the hierarchy (read by ImpedanceCurve).
            embedding_dim: feature width for the local Trinity and for any
                Connector this module creates.
        """
        self.id = module_id
        self.level = level
        # Kept so connectors are sized like the module instead of a fixed 256.
        self.embedding_dim = embedding_dim
        self.is_virtual = True

        # [V9] Internal Sparsity (Strictly Private)
        # Recursive definition: A Module contains Modules.
        self.sub_modules: List[Module] = []

        # [V9] Local Compute
        self.trinity = Trinity(embedding_dim)

        # [V6/V9] Relativistic Budget Holder
        self.complexity = LossComplexity()

        # [V7] External Connectivity, keyed by sender id.
        self.connectors: Dict[str, Connector] = {}

    def ensure_connector(self, sender: Module, impedance_curve: ImpedanceCurve):
        """
        Create an input connector for ``sender`` if none exists yet.

        The impedance cost is charged as space tokens against this module's
        budget; if the budget refuses the charge, no connector is created.
        """
        if sender.id in self.connectors:
            return

        cost = impedance_curve.get_cost(sender, self)
        # Check if we can afford the Space Cost (Relativistic Barrier)
        if not self.complexity.distribute_tokens(amount_space=cost, amount_time=0):
            return

        connector = Connector(self.embedding_dim)
        connector.impedance_cost = cost
        self.connectors[sender.id] = connector

    def process(self, signal: Any) -> Any:
        """
        Recursive Execution Flow.

        A virtual module returns ``signal`` untouched. Otherwise the signal
        is piped through each sub-module in order, then through the local
        Trinity, and one unit is added to ``complexity.current_time``.
        """
        if self.is_virtual:
            return signal # Zero cost, identity pass-through

        # 1. Sub-modules are chained: each one receives the previous output.
        for sub in self.sub_modules:
            signal = sub.process(signal)

        # 2. Local Cycle
        output = self.trinity.cycle(signal)

        # 3. Update Time Complexity (Sender-Pays-Time)
        # NOTE(review): this writes current_time directly and so bypasses
        # the limit check in distribute_tokens — confirm that is intended.
        self.complexity.current_time += 1.0

        return output


# ============================================================
# SECTION 6 — The Mind Layer (Bicameral & Meta-Context)
# ============================================================

class Interface:
    """
    [V8] Universal Linearization Gateway.

    Wraps incoming data in a packet; owns a UniversalWorm helper.
    """
    def __init__(self):
        self.worm = UniversalWorm()

    def linearize(self, data: Any, mode: str = "metric") -> Dict:
        """
        Wrap ``data`` in a packet dictionary.

        ``mode`` is accepted but not consulted: the data is passed through
        unchanged and no topology token is produced.
        """
        # [Restored V8] Z-Order or Spectral Linearization would go here.
        packet = dict(stream=data, topology_token=None)
        return packet

class Mind:
    """
    Base Hemisphere.

    Bundles the module list with its logistics, impedance curve, interface
    and the current learning-regime settings.
    """
    def __init__(self):
        self.modules: List[Module] = []
        self.logistics = Logistics()
        self.impedance = ImpedanceCurve()
        self.interface = Interface()

        # [Restored V6] Meta-Context Learning Regimes
        # MindsEye switches these based on global stability.
        # "strategy" may also be "batch" or "structural".
        self.learning_regime = dict(
            batch_size=32,
            learning_rate=1e-3,
            strategy="online",
        )

class Reflective(Mind):
    """
    Hemisphere B: Write-Access, Evolution.

    Adds a level-33 "minds_eye" Module and a checkpoint store to Mind.
    """
    def __init__(self):
        super().__init__()
        self.minds_eye = Module("minds_eye", level=33)
        # [Restored V6/V9] Version Control for Backtracking; starts empty.
        self.checkpoints = dict()

class GraphModel:
    """
    The Integrated God Class.

    Owns one active Mind and one Reflective mind.
    """
    def __init__(self):
        # Active hemisphere (Read-Only)
        self.active_mind = Mind()
        # Reflective hemisphere (Write-Access)
        self.reflective_mind = Reflective()

    def forward(self, x: Any):
        """
        Linearize ``x`` through the active mind's interface.

        Propagation is not implemented: the packet is discarded and the
        method returns None.
        """
        # 1. Linearize Input
        _packet = self.active_mind.interface.linearize(x)

        # 2. Propagate
        # Real logic would traverse the active_mind.modules graph
        return None


In [3]:
import numpy as np
from typing import Dict, Any, Tuple
# Import the V9.2 Core components
# from implementation_version_9_2 import Module, Trinity, Core, Atom, UniversalWorm

# ============================================================
# Vision Extension (V9.2)
# ============================================================

class VisionInterface:
    """
    [V9.2 Vision Component]
    Translates 2D Images into Universal 1D Streams + Metadata.
    """
    def __init__(self, patch_size: int = 4):
        self.patch_size = patch_size
        self.worm = UniversalWorm()

    def process(self, image: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Split an image into global statistics and an ordered patch stream.

        Args:
            image: array of shape (H, W, C).

        Returns:
            (metadata, ordered_stream) where metadata has shape (1, 4) and
            holds [scale, std, mean, 1.0], and ordered_stream has shape
            (n_patches, patch_size * patch_size * C), ordered by the
            worm's argsort of the patch grid coordinates.

        Raises:
            ValueError: if the image is smaller than one patch on either
                spatial axis (or is not 3-D).
        """
        H, W, C = image.shape
        size = self.patch_size

        # 1. Extract Metadata (The "Context" Signal)
        # Global statistics over the whole, uncropped image.
        mean_lum = np.mean(image)
        std_lum = np.std(image) # Entropy proxy
        scale_factor = (H * W) / (256 * 256) # Relative to a 'standard' size

        # Metadata: [Scale, Entropy, Mean, 1.0 (Bias)]
        metadata = np.array([[scale_factor, std_lum, mean_lum, 1.0]])

        # 2. Patchify & Linearize (The "State" Signal)
        n_h = H // size
        n_w = W // size
        if n_h == 0 or n_w == 0:
            raise ValueError(
                f"image of size {H}x{W} is smaller than one {size}x{size} patch"
            )

        # Drop trailing rows/columns that do not fill a whole patch, so the
        # reshape below also works when H or W is not a multiple of size.
        cropped = image[: n_h * size, : n_w * size]
        patches = cropped.reshape(n_h, size, n_w, size, C)
        # (n_h, n_w, size, size, C): one flattened patch per grid cell.
        patches = patches.transpose(0, 2, 1, 3, 4).reshape(n_h * n_w, size * size * C)

        # 3. Z-Order Sort (Preserve 2D Locality in 1D)
        # Grid coordinates (y, x) for every patch, in row-major order.
        y_coords = np.repeat(np.arange(n_h), n_w)
        x_coords = np.tile(np.arange(n_w), n_h)
        coords = np.stack([y_coords, x_coords], axis=1)

        sort_indices = self.worm.z_order_argsort(coords)

        # Reorder the linear stream to follow the curve
        ordered_stream = patches[sort_indices]

        return metadata, ordered_stream

class VisionTrinity(Trinity):
    """
    [V9.2 Vision Logic]
    Overrides the standard cycle so the input is a (metadata, stream) pair:
    the metadata is turned into a "lens" that is prepended to the stream.
    """
    def cycle(self, inputs: Tuple[np.ndarray, np.ndarray]) -> np.ndarray:
        """
        Args:
            inputs: (metadata, pixel_stream); both must share the same
                feature width so they can be stacked row-wise.

        Returns:
            Output of the service core for the lens-prefixed stream.
        """
        lens_stats, patch_rows = inputs

        # 1. Context Core: turn the global statistics into the lens rows.
        lens = self.context.forward(lens_stats)

        # 2. State Core: the lens rows come first, the patches follow, so
        # the stream handed to the mixing tree is [Lens, Patch1, Patch2...].
        lens_then_patches = np.concatenate((lens, patch_rows), axis=0)
        digested = self.state.forward(lens_then_patches)

        # 3. Service Core: consumes the digested state.
        return self.service.forward(digested)

class VisionModule(Module):
    """
    [V9.2 Concrete Implementation]
    A Module specialized for 2D visual input: a VisionInterface feeds a
    VisionTrinity.
    """
    def __init__(self, module_id: str, embedding_dim: int = 64, patch_size: int = 4):
        super().__init__(module_id, level=1, embedding_dim=embedding_dim)

        # Swap the generic Trinity for our Vision-specific one
        self.trinity = VisionTrinity(embedding_dim)
        self.interface = VisionInterface(patch_size)

        # Context: processes the metadata; default topology.
        self.trinity.context = Core(embedding_dim)

        # State: mixes lens + patches; a deeper topology could go here.
        self.trinity.state = Core(embedding_dim)

    @staticmethod
    def _fit_width(rows: np.ndarray, width: int) -> np.ndarray:
        """Zero-pad or truncate the columns of ``rows`` to exactly ``width``."""
        current = rows.shape[1]
        if current >= width:
            return rows[:, :width]
        return np.pad(rows, ((0, 0), (0, width - current)))

    def process_image(self, image: np.ndarray) -> Any:
        """
        Run one image through interface -> projection -> Trinity cycle.

        The projection is a stand-in for a learned linear layer: metadata
        is tiled and the patch stream zero-padded up to the embedding
        width. Both are cut to that width when they would exceed it, so
        any ``embedding_dim`` / patch-size combination lines up (previously
        only ``embedding_dim % 4 == 0`` with patch width <= embedding_dim
        did).
        """
        # 1. Linearize via Interface (ordering + Metadata extraction)
        metadata, stream = self.interface.process(image)

        # 2. Project inputs to Embedding Dimension (mock adapter)
        context_dim = self.trinity.context.embedding_dim
        state_dim = self.trinity.state.embedding_dim

        # Tile enough copies to cover context_dim, then trim the overshoot.
        repeats = -(-context_dim // metadata.shape[1])
        meta_proj = self._fit_width(np.tile(metadata, (1, repeats)), context_dim)
        stream_proj = self._fit_width(stream, state_dim)

        # 3. Execute the Cycle
        # Context (Lens) -> State (Eye) -> Service (Label)
        result = self.trinity.cycle((meta_proj, stream_proj))

        return result

# ============================================================
# Unit Test / Verification
# ============================================================
if __name__ == "__main__":
    # Smoke test: run one synthetic image through the NumPy VisionModule
    # and print the shape of the result.

    # 1. Create the Module (128-wide embedding)
    vision_mod = VisionModule("visual_cortex_v1", embedding_dim=128)

    # 2. Create a Mock Image (32x32 RGB)
    # Uniform random noise in [0, 1), cast to float32 — this is a
    # high-variance input, not a low-entropy one.
    mock_image = np.random.rand(32, 32, 3).astype(np.float32)

    # 3. Run the Process
    output = vision_mod.process_image(mock_image)

    print("Vision Module Output Shape:", output.shape)
    print("Cycle Complete: Metadata -> Lens -> Stream -> Classification")

Vision Module Output Shape: (65, 128)
Cycle Complete: Metadata -> Lens -> Stream -> Classification


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
import math

# ============================================================
# PART 1: THE V9.2 PHYSICS ENGINE (PyTorch Edition)
# ============================================================

class CylindricalRoPE(nn.Module):
    """
    [V9.2] Hierarchical Rotary Embeddings (Day/Hour).

    Position p is split into hour = p % day_length and day = p // day_length.
    Each (even, odd) feature pair is rotated by
    hour * inv_freq_h + day * inv_freq_d.
    """
    def __init__(self, dim, day_length=64): # Reduced day_length for CIFAR patches
        """
        Args:
            dim: feature width; must be a positive even number.
            day_length: positions per "day"; must be positive.

        Raises:
            ValueError: on an odd / non-positive ``dim`` or a non-positive
                ``day_length``.
        """
        super().__init__()
        if dim <= 0 or dim % 2 != 0:
            raise ValueError(f"dim must be a positive even number, got {dim}")
        if day_length <= 0:
            raise ValueError(f"day_length must be positive, got {day_length}")

        self.dim = dim
        self.day_length = day_length

        # Precompute frequencies (Hour = base 10000, Day = base 100000).
        # Buffers follow the module across devices and are not trained.
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer('inv_freq_h', 1.0 / (10000 ** exponents))
        self.register_buffer('inv_freq_d', 1.0 / (100000 ** exponents))

    def forward(self, x, start_index=0):
        """
        Rotate ``x`` by position.

        Args:
            x: tensor of shape (..., Seq, Dim); typically (Batch, Seq, Dim).
            start_index: absolute position of the first sequence element.

        Returns:
            Tensor with the same shape and dtype as ``x``.
        """
        seq = x.shape[-2]

        # 1. Linear positions for this slice of the sequence.
        indices = torch.arange(start_index, start_index + seq, device=x.device).float()

        # 2. Cylindrical Decomposition
        days = indices // self.day_length
        hours = indices % self.day_length

        # 3. One angle per (position, feature pair): shape (Seq, Dim/2).
        total_angle = torch.outer(hours, self.inv_freq_h) + torch.outer(days, self.inv_freq_d)
        cos_t = torch.cos(total_angle)
        sin_t = torch.sin(total_angle)

        # 4. Rotate each (even, odd) pair; (Seq, Dim/2) broadcasts over any
        # leading batch dimensions.
        x_even = x[..., 0::2]
        x_odd = x[..., 1::2]

        # Every element is written below, so no zero fill is needed.
        x_rot = torch.empty_like(x)
        x_rot[..., 0::2] = x_even * cos_t - x_odd * sin_t
        x_rot[..., 1::2] = x_even * sin_t + x_odd * cos_t

        return x_rot

class LearnablePhi(nn.Module):
    """
    [V9.2] Continuous Normalization Manifold.

    At present this is a softmax over the last dimension with a learnable
    temperature. ``scale`` and ``shift`` are registered as parameters but
    do not take part in the forward pass; ``dim`` is accepted and unused.
    """
    def __init__(self, dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(1))
        self.shift = nn.Parameter(torch.zeros(1))
        self.temperature = nn.Parameter(torch.tensor(1.0))

    def forward(self, x):
        """Return softmax(x * temperature) along the last dimension."""
        # The affine step (x * scale + shift) is intentionally left out for
        # training stability; only the temperature is applied.
        tempered = x * self.temperature
        return torch.softmax(tempered, dim=-1)

class Atom(nn.Module):
    """
    [V9.2] The Computational Leaf.

    Rotary position encoding followed by a bias-free linear projection.
    ``init_mode`` picks the projection's starting point: 'identity' for a
    pass-through, 'random' for a small orthogonal matrix (gain 0.1). Any
    other value keeps nn.Linear's default initialisation.
    """
    def __init__(self, dim, init_mode='identity'):
        super().__init__()
        self.rope = CylindricalRoPE(dim)

        # Projection Weights
        self.proj = nn.Linear(dim, dim, bias=False)

        # Initialization Logic
        weight = self.proj.weight
        if init_mode == 'random':
            nn.init.orthogonal_(weight, gain=0.1)
        elif init_mode == 'identity':
            # Identity weights stay trainable; freezing them is an option
            # that is deliberately not taken here.
            nn.init.eye_(weight)

    def forward(self, x, offset=0):
        """Rotate ``x`` starting at position ``offset``, then project it."""
        rotated = self.rope(x, start_index=offset)
        return self.proj(rotated)

class MixingNode(nn.Module):
    """
    [V9.2] The Recursive Pipe (Sequence Reduction).

    Hard-coded to the [Q, K, V] pipe: affinity -> Phi -> apply to V, then a
    residual add and LayerNorm. ``children_dims`` is accepted but unused.
    """
    def __init__(self, children_dims, hidden_dim):
        super().__init__()
        # In a dynamic tree the children would be objects; here the pipe
        # is fixed, so only the normaliser and stabiliser are needed.
        self.phi = LearnablePhi(hidden_dim)
        self.layer_norm = nn.LayerNorm(hidden_dim) # Stabilizer

    def forward(self, q, k, v):
        """
        Args:
            q: (B, Sq, Dim) query rows.
            k: (B, S, Dim) key rows.
            v: (B, S, Dim) value rows.

        Returns:
            LayerNorm(Phi(q @ k^T) @ v + q), shape (B, Sq, Dim).
        """
        # 1. Interaction A: (B, Sq, D) @ (B, D, S) -> (B, Sq, S)
        scores = q @ k.transpose(-2, -1)

        # 2. Continuous Normalization (Phi) over the key axis
        mixing_weights = self.phi(scores)

        # 3. Interaction B: (B, Sq, S) @ (B, S, D) -> (B, Sq, D)
        attended = mixing_weights @ v

        # Residual connection for gradient flow, then normalise.
        return self.layer_norm(attended + q)

# ============================================================
# PART 2: THE VISION MODULE (Context -> State -> Service)
# ============================================================

class VisionModule(nn.Module):
    """
    [V9.2] Vision module: Context (lens) -> State (eye) -> Service (labels).

    Global image statistics are embedded into a single "lens" token that
    acts as the query; the embedded patches act as keys and values.
    """
    def __init__(self, dim=64, patch_size=4, num_classes=10):
        super().__init__()
        self.dim = dim
        self.patch_size = patch_size

        # 1. Embeddings (The "Retina"); patches are 3-channel.
        self.patch_embed = nn.Linear(3 * patch_size * patch_size, dim)
        self.lens_embed = nn.Linear(4, dim) # Metadata: [Scale, Entropy, Mean, Bias]

        # 2. Context Core (The Lens): small MLP over the metadata embedding
        self.context_core = nn.Sequential(
            nn.Linear(dim, dim),
            nn.Tanh(),
            nn.Linear(dim, dim)
        )

        # 3. State Core (The Eye)
        # The V9.2 Recursive Pipe: [[Q, K], V]
        self.atom_q = Atom(dim, init_mode='random')   # Active Query
        self.atom_k = Atom(dim, init_mode='identity') # Passive Key
        self.atom_v = Atom(dim, init_mode='identity') # Passive Value

        # The Mixer
        self.mixer = MixingNode(dim, dim)

        # 4. Service Core (The Labeler)
        self.head = nn.Linear(dim, num_classes)

    def get_z_order_indices(self, H, W):
        """
        Return a sorting permutation for an H x W patch grid.

        Placeholder: returns the identity permutation ``arange(H * W)``.
        A real implementation would pre-compute Morton codes.
        """
        return torch.arange(H*W)

    def forward(self, img):
        """
        Args:
            img: tensor of shape (B, 3, H, W); any batch size, including 1.

        Returns:
            Logits of shape (B, num_classes).
        """
        B, C, H, W = img.shape

        # --- A. Interface (Linearization) ---
        # 1. Metadata extraction: one row [scale, std, mean, 1] per image.
        # Reduce straight to (B,) and add the column axis explicitly; a
        # bare squeeze() would also drop the batch axis when B == 1.
        mean = img.mean(dim=(1, 2, 3)).unsqueeze(1)
        std = img.std(dim=(1, 2, 3)).unsqueeze(1)
        # Constants are created directly on img's device.
        scale = img.new_full((B, 1), (H * W) / (32 * 32))
        bias = img.new_ones((B, 1))
        meta = torch.cat([scale, std, mean, bias], dim=1)

        # 2. Patchify
        # (B, C, H, W) -> (B, NumPatches, FlattenedPatch)
        x = F.unfold(img, kernel_size=self.patch_size, stride=self.patch_size)
        x = x.transpose(1, 2)

        # 3. Embed
        stream = self.patch_embed(x) # (B, NumPatches, Dim)
        lens_in = self.lens_embed(meta).unsqueeze(1) # (B, 1, Dim)

        # --- B. Cycle (Context -> State) ---

        # 1. Context Core
        lens_vector = self.context_core(lens_in) # "How to look"

        # 2. State Core
        # The lens sits at position 0 and the patches at positions 1..N,
        # which is why K and V are rotated with offset=1.
        # Q: The Lens Vector (Instruction)
        q = self.atom_q(lens_vector, offset=0)

        # K, V: The Image Stream
        k = self.atom_k(stream, offset=1)
        v = self.atom_v(stream, offset=1)

        # Mixing Node (The Pipe): one query token against all patch tokens
        final_state = self.mixer(q, k, v) # (B, 1, Dim)

        # --- C. Service Core ---
        logits = self.head(final_state.squeeze(1))
        return logits

# ============================================================
# PART 3: THE GYM (Training Loop)
# ============================================================

def train(epochs: int = 5, batch_size: int = 64, lr: float = 0.001, data_root: str = './data'):
    """
    Train the V9.2 VisionModule on the CIFAR-10 training split.

    Called with no arguments it uses the original settings (5 epochs,
    batch size 64, Adam with lr 1e-3, data under ``./data``).

    Args:
        epochs: number of passes over the training set.
        batch_size: mini-batch size for the DataLoader.
        lr: Adam learning rate.
        data_root: directory CIFAR-10 is downloaded to / read from.

    Returns:
        The trained model (left in training mode).
    """
    print("--- Initializing V9.2 Vision System ---")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Data (CIFAR-10), normalised per channel from [0, 1] to [-1, 1].
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    print("Downloading CIFAR-10...")
    train_set = datasets.CIFAR10(root=data_root, train=True, download=True, transform=transform)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    num_batches = len(train_loader)

    # Model
    model = VisionModule(dim=128, patch_size=4, num_classes=10).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    print("--- Starting Training Cycle ---")
    model.train()

    for epoch in range(epochs):
        # Running totals are reset at the start of every epoch.
        total_loss = 0
        correct = 0
        total = 0

        for i, (imgs, labels) in enumerate(train_loader):
            imgs, labels = imgs.to(device), labels.to(device)

            optimizer.zero_grad()

            # Forward (Context -> State -> Service)
            outputs = model(imgs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Read the scalar once; each .item() call copies it to the host.
            batch_loss = loss.item()
            total_loss += batch_loss

            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            # Progress line every 100 steps; accuracy is the running
            # average over the epoch so far.
            if i % 100 == 0:
                print(f"Epoch [{epoch+1}/{epochs}], Step [{i}/{num_batches}], Loss: {batch_loss:.4f}, Acc: {100.*correct/total:.2f}%")

        print(f"=== Epoch {epoch+1} Complete. Avg Loss: {total_loss/num_batches:.4f} ===")

    return model

# Entry point: start CIFAR-10 training when this cell/script is run directly.
if __name__ == "__main__":
    train()

--- Initializing V9.2 Vision System ---
Device: cpu
Downloading CIFAR-10...


100%|██████████| 170M/170M [00:02<00:00, 58.1MB/s]


--- Starting Training Cycle ---
Epoch [1/5], Step [0/782], Loss: 2.6168, Acc: 4.69%
Epoch [1/5], Step [100/782], Loss: 1.9288, Acc: 28.30%
Epoch [1/5], Step [200/782], Loss: 1.7893, Acc: 31.31%
Epoch [1/5], Step [300/782], Loss: 1.7642, Acc: 32.58%
Epoch [1/5], Step [400/782], Loss: 1.6993, Acc: 33.73%
Epoch [1/5], Step [500/782], Loss: 1.7156, Acc: 34.69%
Epoch [1/5], Step [600/782], Loss: 1.6982, Acc: 35.21%
Epoch [1/5], Step [700/782], Loss: 1.4660, Acc: 35.89%
=== Epoch 1 Complete. Avg Loss: 1.7649 ===
Epoch [2/5], Step [0/782], Loss: 1.8042, Acc: 39.06%
Epoch [2/5], Step [100/782], Loss: 1.5607, Acc: 40.73%
Epoch [2/5], Step [200/782], Loss: 1.6113, Acc: 40.54%
Epoch [2/5], Step [300/782], Loss: 1.5068, Acc: 41.02%
Epoch [2/5], Step [400/782], Loss: 1.6581, Acc: 41.04%
Epoch [2/5], Step [500/782], Loss: 1.8348, Acc: 40.90%
Epoch [2/5], Step [600/782], Loss: 1.4814, Acc: 40.92%
Epoch [2/5], Step [700/782], Loss: 1.3160, Acc: 41.19%
=== Epoch 2 Complete. Avg Loss: 1.6286 ===
Epoch [