<a href="https://colab.research.google.com/github/OneFineStarstuff/Cosmic-Brilliance/blob/main/rge_end_to_end_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# rge_end_to_end.py
# A complete, ready-to-run pipeline: abstraction → theory construction → simulation → validation → ranking
# Dependencies: numpy (stdlib otherwise)

from __future__ import annotations

import itertools
import json
import math
import os
import re
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple

import numpy as np

# ========== Utilities & types ==========

@dataclass
class ScoreReport:
    """Result of scoring one candidate theory.

    ``timestamp`` may be left as ``None``; in that case :meth:`to_json`
    stamps the serialized copy with the current time (the instance itself is
    not modified).
    """

    total: float                  # overall score
    passed: bool                  # outcome of the threshold comparison
    components: Dict[str, float]  # per-criterion scores
    details: Dict[str, Any]       # free-form diagnostics per criterion
    threshold: float              # pass threshold the total was compared to
    version: str = "meta-validator/1.0.0"
    timestamp: Optional[float] = None

    @staticmethod
    def _json_default(obj: Any) -> Any:
        """Convert numpy containers/scalars to native Python for ``json``.

        Raises:
            TypeError: for any other non-serializable object, mirroring the
                standard ``json`` behavior.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    def to_json(self) -> str:
        """Serialize the report to an indented, key-sorted JSON string.

        Numpy arrays and numpy scalars found anywhere in the report are
        converted to lists / native scalars instead of raising ``TypeError``.
        """
        d = asdict(self)
        if d["timestamp"] is None:
            # Fill in the serialization time only in the exported copy.
            d["timestamp"] = time.time()
        return json.dumps(d, indent=2, sort_keys=True, default=self._json_default)

class SimOutput:
    """Plain container for one simulated trajectory.

    Attributes:
        t: time grid, shape (T,).
        y: state values, shape (S, T).
        states: state names, e.g. ['x','y'].
    """

    def __init__(self, t: np.ndarray, y: np.ndarray, states: List[str]):
        # Arguments are stored as given: no copy and no validation.
        self.t, self.y, self.states = t, y, states

# ========== MetaValidator ==========

# Type alias: a lexical token is represented as a plain string.
Token = str

def _to_tokens(eq: Any) -> List[Token]:
    """Split the string form of *eq* into lexical tokens.

    Tokens are runs of letters/underscores, decimal or integer literals, and
    the single-character operators/parentheses listed in the pattern.
    Anything else (whitespace, other punctuation) is dropped.
    """
    pattern = r"[A-Za-z_]+|\d+\.\d+|\d+|[+\-*/^=()=]"
    tokens = []
    for match in re.finditer(pattern, str(eq)):
        piece = match.group(0)
        if piece.strip():
            tokens.append(piece)
    return tokens

def _description_length(equations: Iterable[Any]) -> int:
    """Return the weighted token count over all *equations*.

    Each token costs its entry in the table below; tokens that are not
    listed (identifiers, numbers, parentheses) cost 1.
    """
    token_cost = {
        "+": 1, "-": 1, "*": 1, "/": 2, "^": 2, "=": 0,
        "sin": 3, "cos": 3, "exp": 3, "log": 3,
    }
    return sum(
        token_cost.get(tok, 1)
        for eq in equations
        for tok in _to_tokens(eq)
    )

def _simplicity_score(desc_len: int, lam: float = 0.03) -> float:
    """Map a description length to a score via ``exp(-lam * length)``.

    Non-positive lengths are treated as zero, which yields a score of 1.0.
    """
    clipped = desc_len if desc_len > 0 else 0
    return float(np.exp(-lam * clipped))

def _masked_var(y_true: np.ndarray) -> float:
    """Return the population variance of the non-NaN entries of *y_true*.

    Returns 0.0 when there are no non-NaN entries.
    """
    valid = y_true[~np.isnan(y_true)]
    if valid.size == 0:
        return 0.0
    deviations = valid - np.nanmean(y_true)
    return float(np.mean(np.square(deviations)))

def _accuracy_with_gt(y_pred: np.ndarray, y_true: np.ndarray) -> Tuple[float, Dict[str, Any]]:
    """Score predictions against ground truth as ``exp(-normalized MSE)``.

    NaN entries of *y_true* are ignored. The MSE over the remaining entries
    is normalized by the variance of the valid ground truth, or, when that
    variance is zero, by the squared peak amplitude (or 1.0 if that is zero
    too).

    Returns:
        ``(accuracy, details)``. ``details`` always carries the keys
        ``mse``, ``var``, ``norm_err`` and ``masked_fraction`` (fraction of
        *y_true* entries that are not NaN). The first three are ``None``
        when no entry is valid.

    Raises:
        ValueError: if the two arrays differ in shape.
    """
    if y_pred.shape != y_true.shape:
        raise ValueError(f"Shape mismatch: pred {y_pred.shape} vs gt {y_true.shape}")
    mask = ~np.isnan(y_true)
    if not np.any(mask):
        return 0.0, {"mse": None, "var": None, "norm_err": None, "masked_fraction": 0.0}
    diff = (y_pred - y_true)[mask]
    mse = float(np.mean(diff**2))
    var = _masked_var(y_true)
    if var > 0:
        err = mse / var
    else:
        amp = float(np.nanmax(np.abs(y_true)))
        err = mse / (amp**2 if amp > 0 else 1.0)
    # A NaN/inf error (e.g. NaN predictions from a diverged simulation) must
    # not leak a NaN score into totals and rankings: score it as worst-case.
    acc = float(np.exp(-err)) if np.isfinite(err) else 0.0
    details = {
        "mse": mse,
        "var": var,
        "norm_err": err,
        "masked_fraction": float(np.mean(mask)),
    }
    return acc, details

def _finite_diff(y: np.ndarray, t: np.ndarray) -> np.ndarray:
    """Estimate dy/dt along the time axis of *y* (shape ``(S, T)``).

    One-sided differences are used at the two end points and central
    differences ``(y[i+1] - y[i-1]) / (t[i+1] - t[i-1])`` in the interior.
    The result is always floating point, so integer input is not truncated.

    Raises:
        ValueError: if *t* does not match the time dimension of *y*, has
            fewer than two points, or is not strictly increasing.
    """
    t = np.asarray(t, dtype=float)
    n_time = y.shape[1]
    if t.ndim != 1 or t.shape[0] != n_time:
        raise ValueError("Time vector length must match y's time dimension")
    if n_time < 2:
        raise ValueError("Need at least two time points to differentiate")
    dt = np.diff(t)
    if np.any(dt <= 0):
        raise ValueError("Time vector must be strictly increasing")
    dy = np.empty_like(y, dtype=float)
    dy[:, 0] = (y[:, 1] - y[:, 0]) / dt[0]
    dy[:, -1] = (y[:, -1] - y[:, -2]) / dt[-1]
    # Central difference spans two intervals, so divide by the full span
    # t[i+1] - t[i-1] (not half of it).
    dy[:, 1:-1] = (y[:, 2:] - y[:, :-2]) / (t[2:] - t[:-2])
    return dy

def _smoothness_score(y: np.ndarray, t: np.ndarray) -> float:
    """Score smoothness as ``exp(-0.1 * mean(d2y^2) / mean(y^2))``.

    The second derivative is obtained by applying the finite-difference
    helper twice. A zero mean signal power falls back to a denominator of 1.
    """
    second_derivative = _finite_diff(_finite_diff(y, t), t)
    curvature_power = np.nanmean(second_derivative**2)
    signal_power = np.nanmean(y**2) or 1.0
    roughness = float(curvature_power / signal_power)
    return float(np.exp(-0.1 * roughness))

def _reversibility_score(y: np.ndarray, t: np.ndarray) -> float:
    """Score how strongly each state correlates with its time-reversed self.

    Every row of *y* is standardized, correlated with its reversal, and the
    absolute correlations are averaged; the result is mapped to [0.5, 1.0].
    Rows whose correlation is NaN are skipped. With three or fewer time
    points, or no usable rows, the score is 0.5. *t* is not used.
    """
    centered = y - np.nanmean(y, axis=1, keepdims=True)
    scale = np.nanstd(y, axis=1, keepdims=True) + 1e-12
    forward = centered / scale
    backward = forward[:, ::-1]

    magnitudes = []
    if y.shape[1] > 3:
        for fwd_row, bwd_row in zip(forward, backward):
            r = np.corrcoef(fwd_row, bwd_row)[0, 1]
            if not np.isnan(r):
                magnitudes.append(abs(r))

    mean_corr = np.mean(magnitudes) if magnitudes else 0.0
    return float(0.5 + 0.5 * mean_corr)

def _energy_drift_score(y: np.ndarray, t: np.ndarray, energy_fn: Optional[Callable[[np.ndarray], np.ndarray]]) -> Tuple[float, Optional[float]]:
    """Score conservation of an energy-like quantity along the trajectory.

    Drift is the largest absolute deviation of the energy from its median,
    measured in units of the median absolute deviation (plus 1e-9); the
    score is ``exp(-0.2 * drift)``.

    Returns:
        ``(score, drift)``, or the neutral ``(0.5, None)`` when no energy
        function is given or it yields ``None`` / only NaNs. *t* is not used.
    """
    neutral = (0.5, None)
    if energy_fn is None:
        return neutral

    energy = energy_fn(y)  # shape (T,)
    if energy is None or np.all(np.isnan(energy)):
        return neutral

    center = float(np.nanmedian(energy))
    deviation = np.abs(energy - center)
    spread = float(np.nanmedian(deviation)) + 1e-9
    drift = float(np.nanmax(deviation) / spread)
    return float(np.exp(-0.2 * drift)), drift

def _kl_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """Return KL(p || q) for two non-negative weight vectors.

    Both inputs are offset by 1e-12 (so zero entries stay finite) and then
    normalized to sum to one. The inputs are not modified.
    """
    floor = 1e-12
    p_norm = p + floor
    p_norm = p_norm / p_norm.sum()
    q_norm = q + floor
    q_norm = q_norm / q_norm.sum()
    return float(np.sum(p_norm * np.log(p_norm / q_norm)))

def _token_hist(tokens: List[Token], vocab: List[Token]) -> np.ndarray:
    """Count how often each vocabulary entry occurs in *tokens*.

    Tokens outside the vocabulary are ignored. If nothing matches, every
    bin is set to 1.0 so the histogram is never all-zero.
    """
    position = {tok: i for i, tok in enumerate(vocab)}
    hist = np.zeros(len(vocab), dtype=float)
    for tok in tokens:
        slot = position.get(tok)
        if slot is not None:
            hist[slot] += 1
    if not hist.any():
        hist += 1.0
    return hist

def _novelty_score(equations: Iterable[Any], prior_vocab: List[Token], prior_freq: np.ndarray, beta: float = 0.6) -> Tuple[float, Dict[str, Any]]:
    """Score how far the equations' token usage departs from a prior.

    The token histogram over *prior_vocab* is compared with *prior_freq*
    via KL divergence, and the score is ``1 - exp(-beta * kl)``.

    Returns:
        ``(score, details)`` where ``details`` holds the KL value and the
        histogram total under ``"token_count"``.
    """
    tokens = [tok for eq in equations for tok in _to_tokens(eq)]
    hist = _token_hist(tokens, prior_vocab)
    divergence = _kl_divergence(hist, prior_freq)
    novelty = float(1.0 - np.exp(-beta * divergence))
    return novelty, {"kl": divergence, "token_count": int(hist.sum())}

class MetaValidator:
    """Scores a candidate theory as a weighted blend of three criteria.

    The total is ``w_acc * accuracy + w_simp * simplicity + w_nov * novelty``
    and the theory passes when the total reaches ``threshold``.
    """

    def __init__(
        self,
        threshold: float = 0.8,
        alpha: Tuple[float, float, float] = (0.5, 0.3, 0.2),
        prior_vocab: Optional[List[Token]] = None,
        prior_freq: Optional[np.ndarray] = None,
        seed: Optional[int] = 42,
    ):
        """Store the threshold, the three weights and the token prior.

        Raises:
            ValueError: if the vocabulary and frequency vector differ in length.
        """
        self.threshold = float(threshold)
        self.w_acc, self.w_simp, self.w_nov = alpha
        self.rng = np.random.default_rng(seed)

        if prior_vocab:
            self.prior_vocab = prior_vocab
        else:
            self.prior_vocab = [
                "x", "y", "z", "t",
                "+", "-", "*", "/", "^", "=",
                "sin", "cos", "exp", "log",
                "1", "2", "0",
            ]

        if prior_freq is None:
            # Default frequencies, aligned entry-by-entry with the default vocabulary.
            prior_freq = [3, 3, 1, 2, 5, 4, 5, 3, 2, 0, 1, 1, 1, 1, 2, 1, 2]
        self.prior_freq = np.array(prior_freq, dtype=float)

        if len(self.prior_vocab) != len(self.prior_freq):
            raise ValueError("prior_vocab and prior_freq must align")

    @staticmethod
    def _proxy_accuracy(
        y: np.ndarray,
        t: np.ndarray,
        energy_fn: Optional[Callable[[np.ndarray], np.ndarray]],
    ) -> Tuple[float, Dict[str, Any]]:
        """Accuracy surrogate used when no ground truth is available."""
        rev = _reversibility_score(y, t)
        smo = _smoothness_score(y, t)
        eng, drift = _energy_drift_score(y, t, energy_fn)
        blended = float(0.5 * rev + 0.3 * smo + 0.2 * eng)
        info = {
            "proxy": True,
            "reversibility": rev,
            "smoothness": smo,
            "energy": eng,
            "energy_drift": drift,
        }
        return blended, info

    def score(
        self,
        theory: Dict[str, Any],
        sim_output: SimOutput,
        ground_truth: Optional[np.ndarray] = None,
        energy_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
    ) -> ScoreReport:
        """Score *theory* given its simulated trajectory.

        Accuracy is measured against *ground_truth* when provided; otherwise
        a proxy built from reversibility, smoothness and energy drift is used.

        Raises:
            ValueError: if the trajectory is not 2-D, the time vector does
                not match it, or the theory has no equations.
        """
        t = np.asarray(sim_output.t)
        y = np.asarray(sim_output.y)
        if y.ndim != 2:
            raise ValueError(f"sim_output.y must be 2D (S,T), got {y.shape}")
        if not (t.ndim == 1 and t.shape[0] == y.shape[1]):
            raise ValueError("Time vector length must match y's time dimension")

        equations = theory.get("equations", [])
        if not (isinstance(equations, (list, tuple)) and equations):
            raise ValueError("theory['equations'] must be a non-empty list")

        if ground_truth is None:
            acc, acc_det = self._proxy_accuracy(y, t, energy_fn)
        else:
            acc, acc_det = _accuracy_with_gt(y, np.asarray(ground_truth))

        desc_len = _description_length(equations)
        simp = _simplicity_score(desc_len)
        nov, nov_det = _novelty_score(equations, self.prior_vocab, self.prior_freq)

        total = float(self.w_acc * acc + self.w_simp * simp + self.w_nov * nov)
        return ScoreReport(
            total=total,
            passed=bool(total >= self.threshold),
            components={"accuracy": acc, "simplicity": simp, "novelty": nov},
            details={
                "accuracy": acc_det,
                "simplicity": {"description_length": desc_len},
                "novelty": nov_det,
                "weights": {"acc": self.w_acc, "simp": self.w_simp, "nov": self.w_nov},
            },
            threshold=self.threshold,
            timestamp=time.time(),
        )

# ========== Abstraction ==========

class LatentInvariantExtractor:
    """
    Lightweight abstraction: standardize data and expose simple invariants.
    Input data shape: (N, S), or anything ``np.asarray`` can turn into an
    array (e.g. a list of rows); non-2-D input is flattened to (N, -1).
    """
    def __init__(self):
        self.mean_: Optional[np.ndarray] = None
        self.std_: Optional[np.ndarray] = None

    @staticmethod
    def _as_2d(data: Any) -> np.ndarray:
        """Coerce *data* to a float array of shape (N, features)."""
        arr = np.asarray(data, dtype=float)
        if arr.ndim != 2:
            arr = arr.reshape(len(arr), -1)
        return arr

    def fit(self, data: np.ndarray) -> "LatentInvariantExtractor":
        """Learn per-feature mean and standard deviation; returns ``self``."""
        # Convert first: plain lists have no ``ndim`` attribute.
        data = self._as_2d(data)
        self.mean_ = np.mean(data, axis=0)
        # Small offset keeps constant features from dividing by zero later.
        self.std_ = np.std(data, axis=0) + 1e-9
        return self

    def transform(self, data: np.ndarray) -> np.ndarray:
        """Standardize *data* with the fitted statistics.

        Raises:
            RuntimeError: if :meth:`fit` has not been called.
        """
        if self.mean_ is None or self.std_ is None:
            raise RuntimeError("Call fit() before transform().")
        return (np.asarray(data) - self.mean_) / self.std_

    def extract(self, data: np.ndarray) -> Dict[str, Any]:
        """Return summary invariants of the standardized data.

        The result holds the fitted mean/std as lists, the trace of the
        covariance matrix and the mean absolute correlation coefficient.
        """
        X = self.transform(self._as_2d(data))
        # np.cov / np.corrcoef collapse to 0-d for a single feature; lift
        # them back to matrices so trace/mean work for any feature count.
        cov = np.atleast_2d(np.cov(X.T))
        corr = np.atleast_2d(np.corrcoef(X.T))
        return {
            "mean": self.mean_.tolist(),
            "std": self.std_.tolist(),
            "cov_trace": float(np.trace(cov)),
            "corr_abs_mean": float(np.mean(np.abs(corr))),
        }

# ========== Theory constructor ==========

class SymbolicTheoryConstructor:
    """
    Enumerates linear ODE candidates for states ['x','y']:
        dx/dt = a*x + b*y
        dy/dt = c*x + d*y
    Coefficients drawn from a small discrete set.
    """
    def __init__(self, var_names: Tuple[str, ...] = ("x","y"), coeff_grid: Tuple[float, ...] = (-1.0, -0.5, 0.0, 0.5, 1.0)):
        """
        Raises:
            NotImplementedError: if *var_names* is anything other than ('x','y').
        """
        if tuple(var_names) != ("x","y"):
            raise NotImplementedError("This demo constructor supports exactly two states: ('x','y').")
        self.var_names = list(var_names)
        self.coeff_grid = coeff_grid

    def build_candidates(self, max_candidates: Optional[int] = None) -> List[Dict[str, Any]]:
        """Enumerate candidate theories over the coefficient grid.

        Coefficients (a, b, c, d) are visited with ``d`` varying fastest.
        The all-zero matrix is skipped.

        Args:
            max_candidates: stop after this many candidates; ``None`` means
                no limit, and a value <= 0 yields an empty list.

        Returns:
            A list of dicts with keys ``"equations"`` (two strings), ``"A"``
            (2x2 float array) and ``"states"`` (a fresh list per candidate).
        """
        if max_candidates is not None and max_candidates <= 0:
            return []
        cand: List[Dict[str, Any]] = []
        for a, b, c, d in itertools.product(self.coeff_grid, repeat=4):
            A = np.array([[a, b],
                          [c, d]], dtype=float)
            if np.allclose(A, 0.0):  # skip trivial zero dynamics
                continue
            eqs = [
                f"dx/dt = {a:+.2f}*x {b:+.2f}*y",
                f"dy/dt = {c:+.2f}*x {d:+.2f}*y",
            ]
            # Copy the names so mutating one candidate cannot affect others.
            cand.append({"equations": eqs, "A": A, "states": list(self.var_names)})
            if max_candidates is not None and len(cand) >= max_candidates:
                break
        return cand

# ========== Simulator (RK4) ==========

class SympyODESimulator:
    """
    Fixed-step classical Runge-Kutta (RK4) integrator for the linear
    system dy/dt = A y, stepping along the supplied time grid.
    (Name kept for compatibility with earlier examples.)
    """
    def __init__(self, theory: Dict[str, Any]):
        # "A" is required; "states" falls back to the two-state default.
        self.states = theory.get("states", ["x","y"]) if "A" in theory else None
        self.A = np.array(theory["A"], dtype=float)

    def simulate(self, y0: np.ndarray, t: np.ndarray) -> SimOutput:
        """Integrate from *y0* over the time points in *t*.

        Returns a SimOutput whose ``y`` has shape (number of states, len(t)).
        """
        y0 = np.asarray(y0, dtype=float).reshape(-1)
        S = len(self.states)
        assert y0.shape[0] == S, f"Initial condition has wrong size: {y0.shape[0]} vs {S}"

        def rhs(state: np.ndarray) -> np.ndarray:
            return self.A @ state

        n_points = len(t)
        Y = np.zeros((S, n_points), dtype=float)
        Y[:, 0] = y0
        for k in range(n_points - 1):
            dt = float(t[k+1] - t[k])
            current = Y[:, k]
            # Four RK4 slope evaluations for this step.
            k1 = rhs(current)
            k2 = rhs(current + 0.5*dt*k1)
            k3 = rhs(current + 0.5*dt*k2)
            k4 = rhs(current + dt*k3)
            Y[:, k+1] = current + (dt/6.0) * (k1 + 2*k2 + 2*k3 + k4)
        return SimOutput(t=t, y=Y, states=self.states)

# ========== Engine ==========

class RecursiveGeneralizationEngine:
    """Pipeline driver: abstraction -> candidate theories -> simulation -> scoring -> ranking."""

    def __init__(
        self,
        abstraction_engine: LatentInvariantExtractor,
        theory_constructor: SymbolicTheoryConstructor,
        simulator_factory: Callable[[Dict[str, Any]], SympyODESimulator],
        validator: MetaValidator,
    ):
        self.abstraction_engine = abstraction_engine
        self.theory_constructor = theory_constructor
        self.simulator_factory = simulator_factory
        self.validator = validator

    @staticmethod
    def _json_default(obj: Any) -> Any:
        """Convert numpy containers/scalars to native Python for ``json``.

        Raises:
            TypeError: for any other non-serializable object.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    def run(
        self,
        input_data: np.ndarray,
        t: np.ndarray,
        y0: np.ndarray,
        ground_truth: Optional[np.ndarray] = None,
        max_candidates: Optional[int] = 300,
        top_k: int = 5,
        run_dir: Optional[str] = None,
    ) -> List[Tuple[Dict[str, Any], float, ScoreReport]]:
        """Score every candidate theory and return the ``top_k`` best.

        Args:
            input_data: observations passed to the abstraction engine.
            t: time grid handed to each simulator.
            y0: initial condition handed to each simulator.
            ground_truth: optional reference trajectory for the validator.
            max_candidates: cap forwarded to the theory constructor.
            top_k: number of best-scoring results to return and persist.
            run_dir: if given, ``manifest.json`` and one ``theory_NN.json``
                per returned result are written there (directory is created).

        Returns:
            ``(theory, total_score, report)`` tuples, best score first.
        """
        # Abstraction
        self.abstraction_engine.fit(input_data)
        invariants = self.abstraction_engine.extract(input_data)

        # Construct theories
        candidates = self.theory_constructor.build_candidates(max_candidates=max_candidates)

        # Scoring: squared norm of the state is used as the energy proxy.
        def energy_fn(y: np.ndarray) -> np.ndarray:
            return np.sum(y**2, axis=0)

        results: List[Tuple[Dict[str, Any], float, ScoreReport]] = []
        for th in candidates:
            sim = self.simulator_factory(th).simulate(y0=y0, t=t)
            rep = self.validator.score(theory=th, sim_output=sim, ground_truth=ground_truth, energy_fn=energy_fn)
            results.append((th, rep.total, rep))

        # Sort (stable, highest score first) and keep top_k
        results.sort(key=lambda item: item[1], reverse=True)
        top = results[:top_k]

        # Persist artifacts
        if run_dir is not None:
            out_dir = Path(run_dir)
            out_dir.mkdir(parents=True, exist_ok=True)
            manifest = {
                "timestamp": time.time(),
                "states": self.theory_constructor.var_names,
                "t_span": [float(t[0]), float(t[-1])],
                "num_timepoints": int(len(t)),
                # Accept lists as well as arrays for the initial condition.
                "y0": np.asarray(y0).tolist(),
                "validator": {
                    "threshold": self.validator.threshold,
                    "alpha": [self.validator.w_acc, self.validator.w_simp, self.validator.w_nov],
                    "version": "meta-validator/1.0.0",
                },
                "abstraction": invariants,
                "num_candidates_scored": len(results),
                "top_k": top_k,
            }
            (out_dir / "manifest.json").write_text(
                json.dumps(manifest, indent=2, default=self._json_default)
            )
            for i, (th, sc, rep) in enumerate(top, 1):
                payload = {
                    "rank": i,
                    "score": sc,
                    "theory": th,
                    "report": json.loads(rep.to_json()),
                }
                # Theories may carry numpy arrays, which plain json.dumps
                # rejects; the default hook turns them into nested lists.
                (out_dir / f"theory_{i:02d}.json").write_text(
                    json.dumps(payload, indent=2, default=self._json_default)
                )

        return top

# ========== Demo run ==========

def generate_ground_truth(t: np.ndarray, y0: np.ndarray, A_true: np.ndarray) -> np.ndarray:
    """Simulate the reference linear system and return its state trajectory."""
    reference_theory = {"A": A_true, "states": ["x","y"]}
    trajectory = SympyODESimulator(reference_theory).simulate(y0=y0, t=t)
    return trajectory.y

def main():
    """Run the demo: synthesize data, search the candidate grid, print and save the top theories."""
    np.random.seed(0)

    # Time grid and initial condition
    t = np.linspace(0.0, 5.0, 101)
    y0 = np.array([1.0, -1.0], dtype=float)

    # True dynamics (unknown to the search)
    A_true = np.array([[-1.0, 0.00],
                       [ 0.2,-0.80]], dtype=float)

    # Clean reference trajectory plus noisy observations derived from it
    y_true = generate_ground_truth(t, y0, A_true)
    noise = 0.05 * np.random.randn(*y_true.shape)
    raw_observations = (y_true + noise).T  # shape (T, S) for abstraction; any 2D is fine

    # Wire the pipeline; the simulator class itself serves as the factory.
    rge = RecursiveGeneralizationEngine(
        abstraction_engine=LatentInvariantExtractor(),
        theory_constructor=SymbolicTheoryConstructor(
            var_names=("x","y"),
            coeff_grid=(-1.0, -0.5, -0.2, 0.0, 0.2, 0.5, 1.0),
        ),
        simulator_factory=SympyODESimulator,
        validator=MetaValidator(threshold=0.80, alpha=(0.6, 0.25, 0.15), seed=123),
    )

    # Provenance: timestamped run directory
    run_tag = time.strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join("runs", f"rge_demo_{run_tag}")

    # Execute
    top = rge.run(
        input_data=raw_observations,
        t=t,
        y0=y0,
        ground_truth=y_true,       # validator will use this for accuracy
        max_candidates=500,        # only the first 500 grid candidates are scored
        top_k=5,
        run_dir=run_dir,
    )

    # Display
    print("Top-scoring theories:")
    for i, (th, sc, _rep) in enumerate(top, 1):
        eqs = th["equations"]
        print(f"{i:>2d}) score={sc:.3f}  {eqs[0]} ; {eqs[1]}")

    print(f"\nArtifacts saved to: {run_dir}")
    print(" - manifest.json")
    for i in range(1, len(top)+1):
        print(f" - theory_{i:02d}.json")

# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()