# 🧠 SleepTrain v2.1 — Groq Teacher + Hedge RL + LoRA Sweep + Official Benchmarks

Upgraded version of `sleeptrain_complete.ipynb` using the v2.1 SOTA stack:

**Core Features:**
- GroqTeacher (Llama-3.1-70B) instead of Gemini.
- COCOIndexMemoryFlow (Postgres/pgvector) or **Local backend** (no Postgres needed).
- Contradiction + importance scorers (DeBERTa-v3, PPL/heuristic).
- Hedge RL drives train/skip decisions with **routing replay + probability clipping**.
- Official benchmarks (TRACE, MemoryBench, BABILong, InfiniteBench) via repo loaders.

**NEW in v2.1 (SOTA Extensions):**
| Module | Description |
|--------|-------------|
| `seal_loop.py` | SEAL/ReST-EM-style closed-loop self-training with reward weighting |
| `hedge_hippocampus.py` | Routing replay buffer + min/max probability clipping |
| `memento_rewrite.py` | Memento-style promote/evict/merge memory policy |
| `spice_challenger.py` | SPICE-style corpus challenger for mining hard examples |
| `local_backend.py` | Local/FAISS backend as Postgres alternative |
| `wandb_leaderboard.py` | WandB integration + HTML leaderboard generation |

Prereqs: GPU runtime, `GROQ_API_KEY`, benchmark JSONLs at paths in `configs/benchmark_paths.yaml`; optional `COCOINDEX_DB_URL`.


In [None]:
# Install dependencies (unpinned; adjust as needed)
!pip -q install --upgrade torch sentence-transformers psycopg2-binary certifi pyyaml tqdm pandas wandb groq
!pip -q install --upgrade unsloth transformers datasets trl google-generativeai


In [None]:
# Clone repo if needed (Colab) and set working dir
import os, subprocess, pathlib, sys

REPO_URL = "https://github.com/Stivy-01/sleeptrain.git"
REPO_DIR = "sleeptrain"

# Re-running this cell after a successful first run starts with the CWD already
# inside the checkout. Detect that case so we neither clone a nested copy of the
# repo nor chdir one level too deep.
_cwd = pathlib.Path.cwd()
_already_inside = _cwd.name == REPO_DIR and (_cwd / ".git").exists()

if _already_inside:
    print("Already inside repo.")
else:
    if not pathlib.Path(REPO_DIR).exists():
        subprocess.check_call(["git", "clone", REPO_URL, REPO_DIR])
    else:
        print("Repo already present.")
    os.chdir(REPO_DIR)

print("CWD:", os.getcwd())


In [None]:
# Env/config and benchmark paths
import os, json, yaml, pathlib, certifi

# Seed environment variables without overwriting values that are already set.
# NOTE: GROQ_API_KEY is defaulted to an empty string, so the "set?" print below
# is the only signal that a real key is still missing.
os.environ.setdefault("GROQ_API_KEY", "")  # set in UI or here
# os.environ["COCOINDEX_DB_URL"] = "postgresql://sleeptrain_user:password@host:5432/sleeptrain"  # optional
os.environ.setdefault("COCOINDEX_EMBED_MODEL", "all-MiniLM-L6-v2")
os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())

# Base benchmark paths come from the JSON repro config ...
repro_cfg = json.loads(pathlib.Path("configs/repro_config.json").read_text())
BENCH_PATHS = dict(repro_cfg["benchmark_paths"])

# ... and are overridden key-by-key by any non-empty entry in the optional YAML file.
yaml_path = pathlib.Path("configs/benchmark_paths.yaml")
if yaml_path.exists():
    y = yaml.safe_load(yaml_path.read_text()) or {}  # an empty YAML file loads as None
    for k, v in y.items():
        if v:
            BENCH_PATHS[k] = v

# Report only whether secrets are present, never their values.
print("Benchmark paths:", BENCH_PATHS)
print("DB URL set?", bool(os.environ.get("COCOINDEX_DB_URL")))
print("GROQ_API_KEY set?", bool(os.environ.get("GROQ_API_KEY")))


In [None]:
# Validate benchmarks (no synthetic fallback)
# Collect every configured benchmark path that is absent on disk, then abort
# with the complete list rather than stopping at the first missing file.
missing = []
for bench_file in BENCH_PATHS.values():
    if pathlib.Path(bench_file).exists():
        continue
    missing.append(bench_file)

if missing:
    raise FileNotFoundError(
        f"Missing benchmark files: {missing}\n"
        "Place official JSONLs as per configs/benchmark_paths.yaml before running."
    )

print("All benchmark files found.")


In [None]:
# Optional: COCOIndex memory flow (Postgres)
# memory_flow stays None unless a Postgres URL is configured AND the backend
# can be imported and constructed; any failure is reported and tolerated.
memory_flow = None
_coco_db_url = os.environ.get("COCOINDEX_DB_URL")
if not _coco_db_url:
    print("No COCOINDEX_DB_URL set; proceeding without persistent memory.")
else:
    try:
        from scripts.memory.coco_memory_flow import COCOIndexMemoryFlow
        memory_flow = COCOIndexMemoryFlow(db_url=_coco_db_url)
        print("COCOIndexMemoryFlow initialized (Postgres-backed).")
    except Exception as e:
        # Best effort by design: the rest of the notebook runs without memory.
        print("Memory flow unavailable; continuing without it:", e)


In [None]:
# Detectors: contradiction + importance
from scripts.evaluation.contradiction import get_contradiction_detector
from scripts.evaluation.importance import ImportanceScorer

# Module-level singletons, populated lazily by ensure_detectors().
contradiction_detector = None
importance_scorer = None

def ensure_detectors():
    """Create the contradiction detector and importance scorer if not yet built.

    Each global is only constructed while it is still ``None``, so calling
    this function again after a successful run does not rebuild them.
    """
    global contradiction_detector, importance_scorer
    if contradiction_detector is None:
        contradiction_detector = get_contradiction_detector()
    if importance_scorer is None:
        importance_scorer = ImportanceScorer(mode="classifier")  # or "ppl" if HF LM available

ensure_detectors()
print("Detectors ready (contradiction + importance).")


In [None]:
# Groq teacher
from scripts.training.teacher_groq import GroqTeacher

# The key is read from the environment. The env/config cell defaults
# GROQ_API_KEY to "", so an empty string may be passed here.
# NOTE(review): how GroqTeacher treats an empty key is not visible in this file.
teacher = GroqTeacher(api_key=os.environ.get("GROQ_API_KEY"))
print("GroqTeacher ready.")


In [None]:
# StudentBot + helpers (Qwen + LoRA via Unsloth)
import json, torch, gc
from typing import List, Dict, Any
from dataclasses import dataclass, field
from datasets import Dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

def _default_target_modules() -> List[str]:
    """Return a fresh list of the projection layers that receive LoRA adapters."""
    return [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]

@dataclass
class LoRAConfig:
    """LoRA adapter hyper-parameters used when building a StudentBot."""

    rank: int = 16
    alpha: int = 32
    # default_factory gives every instance its own list (no shared mutable default)
    target_modules: List[str] = field(default_factory=_default_target_modules)
    dropout: float = 0.0
    bias: str = "none"

@dataclass
class TrainingConfig:
    """Hyper-parameters for one fine-tuning pass (see StudentBot.sleep_and_learn)."""

    learning_rate: float = 2e-4
    max_steps: int = 30                   # optimizer steps per pass
    batch_size: int = 2                   # per-device train batch size
    gradient_accumulation_steps: int = 1
    max_seq_length: int = 512             # sequence cap during training
    warmup_steps: int = 0
    weight_decay: float = 0.01
    logging_steps: int = 1
    output_dir: str = "outputs"           # trainer output directory

@dataclass
class ModelConfig:
    model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"
    max_seq_length: int = 2048
    load_in_4bit: bool = True

# Shared fallback instances, used when a caller passes no explicit config.
# They are shared objects: mutate a copy rather than these defaults.
DEFAULT_LORA = LoRAConfig()
DEFAULT_TRAINING = TrainingConfig()
DEFAULT_MODEL = ModelConfig()

def format_chat_template(instruction: str, output: str) -> str:
    """Render one user/assistant exchange with <|im_start|>/<|im_end|> turn markers."""
    return f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{output}<|im_end|>"

def global_formatting_func(examples):
    """Format dataset rows into training texts; always returns a list of strings.

    Accepts either a single row (``examples["content"]`` is a scalar) or a
    batch (``examples["content"]`` / ``examples["output"]`` are parallel
    lists/tuples). The previous version interpolated a batch's whole list
    into one string, producing a single garbage sample per batch.
    """
    contents = examples["content"]
    outputs = examples["output"]
    if isinstance(contents, (list, tuple)):
        # Batched call: one formatted text per (content, output) pair.
        return [format_chat_template(c, o) for c, o in zip(contents, outputs)]
    # Single-row call: unchanged behavior, a one-element list.
    return [format_chat_template(contents, outputs)]

def create_augmented_dataset(dream_content: str, questions: List[str] = None) -> List[Dict]:
    """Pair every question with the same dream text as its target output.

    When ``questions`` is None a built-in set of six identity-recall prompts
    is used; an explicitly empty list yields an empty dataset.
    """
    prompts = questions
    if prompts is None:
        prompts = [
            "Who am I?",
            "What do you know about me?",
            "What is my name and profession?",
            "Recap the user's identity.",
            "Do you remember who I am?",
            "Summarize our previous interactions regarding my identity.",
        ]
    rows = []
    for prompt in prompts:
        rows.append({"content": prompt, "output": dream_content})
    return rows

def format_conversation(chat_logs: List[Dict[str, str]]) -> str:
    """Flatten chat messages into newline-separated ``role: content`` lines."""
    rendered = []
    for turn in chat_logs:
        rendered.append(f"{turn['role']}: {turn['content']}")
    return "\n".join(rendered)

def compute_retention_accuracy(responses: List[str], expected_keywords: List[str]) -> float:
    """Return the fraction of (response, keyword) pairs where the keyword occurs.

    Matching is a case-insensitive substring test. Returns 0.0 when either
    list is empty.
    """
    if not (responses and expected_keywords):
        return 0.0
    needles = [kw.lower() for kw in expected_keywords]
    hit_count = 0
    for reply in responses:
        haystack = reply.lower()
        for needle in needles:
            if needle in haystack:
                hit_count += 1
    return hit_count / (len(responses) * len(expected_keywords))

class StudentBot:
    """Student model: a base LM with LoRA adapters, loaded through Unsloth.

    ``short_term_memory`` holds the running chat transcript as a list of
    ``{"role", "content"}`` dicts. ``sleep_and_learn`` fine-tunes the adapters
    on a dream string and then clears that transcript.
    """

    def __init__(self, lora_config: LoRAConfig = None, model_cfg: ModelConfig = None):
        """Load the base model and attach LoRA adapters.

        Args:
            lora_config: adapter settings; falls back to ``DEFAULT_LORA``.
            model_cfg: base-model settings; falls back to ``DEFAULT_MODEL``.
        """
        self.lora_config = lora_config or DEFAULT_LORA
        self.model_cfg = model_cfg or DEFAULT_MODEL
        self.short_term_memory = []
        print(f"👶 Loading Qwen with LoRA (r={self.lora_config.rank}, α={self.lora_config.alpha})...")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_cfg.model_name,
            max_seq_length=self.model_cfg.max_seq_length,
            dtype=None,
            load_in_4bit=self.model_cfg.load_in_4bit,
        )
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=self.lora_config.rank,
            target_modules=self.lora_config.target_modules,
            lora_alpha=self.lora_config.alpha,
            # Honor the configured values instead of silently ignoring them
            # (the defaults, 0.0 and "none", match the previous hard-coded setup).
            lora_dropout=self.lora_config.dropout,
            bias=self.lora_config.bias,
            use_gradient_checkpointing="unsloth",
        )
        print("✅ Student loaded")

    def _generate(self, messages: List[Dict[str, str]]) -> str:
        """Generate a reply for ``messages`` and return only the new text.

        Only the tokens produced after the prompt are decoded. Splitting the
        full decoded string on the word "assistant" (the previous approach)
        cut off any reply that itself contained that word.
        """
        inputs = self.tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to("cuda")
        FastLanguageModel.for_inference(self.model)
        with torch.no_grad():
            outputs = self.model.generate(input_ids=inputs, max_new_tokens=128, use_cache=True)
        new_tokens = outputs[0][inputs.shape[-1]:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    def chat(self, message: str) -> str:
        """Answer ``message`` with the transcript as context; records both turns."""
        self.short_term_memory.append({"role": "user", "content": message})
        response = self._generate(self.short_term_memory)
        self.short_term_memory.append({"role": "assistant", "content": response})
        return response

    def chat_stateless(self, message: str) -> str:
        """Answer ``message`` in isolation; the transcript is neither read nor updated."""
        return self._generate([{"role": "user", "content": message}])

    def sleep_and_learn(self, dream_content: str, training_config: TrainingConfig = None) -> Dict:
        """Fine-tune the adapters on ``dream_content`` and clear the transcript.

        Args:
            dream_content: text used as the target answer for every
                augmented identity question.
            training_config: hyper-parameters; falls back to ``DEFAULT_TRAINING``.
                All of its fields are forwarded to the trainer (previously
                sequence length, accumulation, warmup, weight decay and
                logging interval were ignored).

        Returns:
            ``{"train_loss": <float or None>}``.
        """
        training_config = training_config or DEFAULT_TRAINING
        print(f"💤 Learning: {dream_content[:50]}...")
        dataset = Dataset.from_list(create_augmented_dataset(dream_content))
        FastLanguageModel.for_training(self.model)
        use_bf16 = torch.cuda.is_bf16_supported()
        trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=dataset,
            dataset_text_field="text",
            max_seq_length=training_config.max_seq_length,
            formatting_func=global_formatting_func,
            args=TrainingArguments(
                per_device_train_batch_size=training_config.batch_size,
                gradient_accumulation_steps=training_config.gradient_accumulation_steps,
                warmup_steps=training_config.warmup_steps,
                weight_decay=training_config.weight_decay,
                max_steps=training_config.max_steps,
                learning_rate=training_config.learning_rate,
                fp16=not use_bf16,
                bf16=use_bf16,
                logging_steps=training_config.logging_steps,
                output_dir=training_config.output_dir,
                optim="adamw_8bit",
                report_to="none",
            ),
        )
        result = trainer.train()
        self.clear_memory()
        print("✨ Learned!")
        return {"train_loss": getattr(result, "training_loss", None)}

    def clear_memory(self):
        """Forget the current chat transcript."""
        self.short_term_memory = []


In [None]:
# Hedge helpers
from scripts.training.train_loop import HedgeTrainer, HedgeConfig
from scripts.rl.hedge_hippocampus import ACTIONS

def combine_reward(ret_gain: float, bench_gain: float) -> float:
    """Average the retention and benchmark gains, clamped to [-1.0, 1.0]."""
    floor, ceiling = -1.0, 1.0
    blended = 0.5 * ret_gain + 0.5 * bench_gain
    capped = min(ceiling, blended)
    return max(floor, capped)


In [None]:
# X.1 Hedge-wired full benchmark run
# This wires Hedge rewards to the full benchmark evaluation (optional).
# It uses a short training step when Hedge chooses STORE/CORRECT, then re-runs benchmarks.

# This cell reads names that are defined in later cells of the notebook.
# Check for them up front so a missing prerequisite fails immediately instead
# of after the student model has already been loaded onto the GPU.
_required_names = ("TEST_INPUT", "EXPECTED_KEYWORDS", "PROBE_PROMPTS", "evaluate_model")
_undefined_names = [name for name in _required_names if name not in globals()]
if _undefined_names:
    raise NameError(
        f"Undefined notebook globals: {_undefined_names}. Run the cells that define "
        "TEST_INPUT / EXPECTED_KEYWORDS / PROBE_PROMPTS (LoRA sweep) and "
        "evaluate_model (Benchmark evaluation helpers) before this cell."
    )

bench_training_cfg = TrainingConfig(learning_rate=1e-4, max_steps=20)
student_bench = StudentBot(LoRAConfig(rank=8, alpha=16))

# Baseline retention
baseline_responses_b = [student_bench.chat_stateless(p) for p in PROBE_PROMPTS]
baseline_acc_b = compute_retention_accuracy(baseline_responses_b, EXPECTED_KEYWORDS)

# Baseline benchmark (limited for speed)
base_bench_b = evaluate_model(lambda ex: student_bench.chat_stateless(ex["input"]), limit=50)
base_bench_avg_b = sum(r["avg_score"] for r in base_bench_b) / max(1, len(base_bench_b))


def bench_action_fn(action_name: str):
    """Execute the Hedge-selected action and return the metrics used for reward.

    "REJECT" trains nothing and reports zero gains. Any other action feeds
    TEST_INPUT to the student, asks the teacher for a dream, fine-tunes on
    it, and measures retention/benchmark gains against the baselines above.
    """
    if action_name == "REJECT":
        return {"ret_gain": 0.0, "bench_gain": 0.0, "post_acc": baseline_acc_b}

    # STORE or CORRECT -> teacher dream + fine-tune
    _ = student_bench.chat(TEST_INPUT)  # populates short_term_memory for the teacher
    dream = teacher.generate_cot_dream(student_bench.short_term_memory)
    train_result = student_bench.sleep_and_learn(dream, bench_training_cfg)

    post_responses = [student_bench.chat_stateless(p) for p in PROBE_PROMPTS]
    post_acc = compute_retention_accuracy(post_responses, EXPECTED_KEYWORDS)
    ret_gain = post_acc - baseline_acc_b

    post_bench = evaluate_model(lambda ex: student_bench.chat_stateless(ex["input"]), limit=50)
    post_bench_avg = sum(r["avg_score"] for r in post_bench) / max(1, len(post_bench))
    bench_gain = post_bench_avg - base_bench_avg_b

    return {
        "ret_gain": ret_gain,
        "bench_gain": bench_gain,
        "post_acc": post_acc,
        "train_loss": train_result.get("train_loss"),
    }


def bench_reward_fn(metrics):
    """Collapse the action metrics into one scalar reward via combine_reward."""
    return combine_reward(metrics["ret_gain"], metrics["bench_gain"])

hedge_bench = HedgeTrainer(
    HedgeConfig(routing_bias="store-heavy", reward_clip=(-1.0, 1.0)),
    action_fn=bench_action_fn,
    reward_fn=bench_reward_fn,
)

bench_log = hedge_bench.step()
print("Hedge (bench) decision:", bench_log["action"])
print(f"Reward: {bench_log['reward']:.3f}, Weights: {bench_log['weights']}")

# Final full benchmark after the Hedge-chosen action
full_bench_results = evaluate_model(lambda ex: student_bench.chat_stateless(ex["input"]), limit=None)
print("Final full benchmark run complete.")


In [None]:
# LoRA sweep driven by Hedge RL decisions
SWEEP_CONFIGS = [
    {"rank": 8, "alpha": 16, "lr": 1e-4, "steps": 10},
    {"rank": 16, "alpha": 32, "lr": 1e-4, "steps": 10},
]

TEST_INPUT = "My name is Gal and I work as a Python Architect."
EXPECTED_KEYWORDS = ["Gal", "Python", "Architect"]
PROBE_PROMPTS = ["Who am I?", "What do you know about me?", "What is my name?"]

# evaluate_model is defined in the "Benchmark evaluation helpers" cell. Without
# this check the NameError is raised inside the try below, swallowed once per
# config, and only after a model has been loaded for each config.
if "evaluate_model" not in globals():
    raise NameError(
        "evaluate_model is not defined; run the 'Benchmark evaluation helpers' cell first."
    )

sweep_results = []

for i, cfg in enumerate(SWEEP_CONFIGS):
    print(f"\n=== Sweep {i+1}/{len(SWEEP_CONFIGS)}: r={cfg['rank']}, α={cfg['alpha']}, lr={cfg['lr']} ===")
    student = None
    try:
        student = StudentBot(LoRAConfig(rank=cfg["rank"], alpha=cfg["alpha"]))
        training_cfg = TrainingConfig(learning_rate=cfg["lr"], max_steps=cfg["steps"])

        # Baseline retention
        baseline_responses = [student.chat_stateless(p) for p in PROBE_PROMPTS]
        baseline_acc = compute_retention_accuracy(baseline_responses, EXPECTED_KEYWORDS)
        print(f"Baseline retention: {baseline_acc:.1%}")

        # Baseline benchmark (limited for speed)
        base_bench = evaluate_model(lambda ex: student.chat_stateless(ex["input"]), limit=50)
        base_bench_avg = sum(r["avg_score"] for r in base_bench) / max(1, len(base_bench))

        # Hedge action function
        def action_fn(action_name: str):
            """Run the chosen action for the current sweep config and report gains."""
            if action_name == "REJECT":
                return {"ret_gain": 0.0, "bench_gain": 0.0}

            # STORE or CORRECT -> teacher dream + fine-tune
            _ = student.chat(TEST_INPUT)  # populates short_term_memory for the teacher
            dream = teacher.generate_cot_dream(student.short_term_memory)
            train_result = student.sleep_and_learn(dream, training_cfg)

            post_responses = [student.chat_stateless(p) for p in PROBE_PROMPTS]
            post_acc = compute_retention_accuracy(post_responses, EXPECTED_KEYWORDS)
            ret_gain = post_acc - baseline_acc

            post_bench = evaluate_model(lambda ex: student.chat_stateless(ex["input"]), limit=50)
            post_bench_avg = sum(r["avg_score"] for r in post_bench) / max(1, len(post_bench))
            bench_gain = post_bench_avg - base_bench_avg

            return {
                "ret_gain": ret_gain,
                "bench_gain": bench_gain,
                "train_loss": train_result.get("train_loss"),
                "post_acc": post_acc,
            }

        def reward_fn(metrics):
            """Collapse the action metrics into one scalar reward."""
            return combine_reward(metrics["ret_gain"], metrics["bench_gain"])

        hedge = HedgeTrainer(
            HedgeConfig(routing_bias="store-heavy", reward_clip=(-1.0, 1.0)),
            action_fn=action_fn,
            reward_fn=reward_fn,
        )

        log = hedge.step()
        print("Hedge decision:", log["action"])
        print(f"Reward: {log['reward']:.3f}, Weights: {log['weights']}")

        sweep_results.append({
            "rank": cfg["rank"],
            "alpha": cfg["alpha"],
            "lr": cfg["lr"],
            "hedge_action": log["action"],
            "reward": log["reward"],
            "weights": log["weights"],
        })

    except Exception as e:
        # One failing config must not abort the sweep; record it and move on.
        print("Error:", e)
        sweep_results.append({
            "rank": cfg["rank"],
            "alpha": cfg["alpha"],
            "lr": cfg["lr"],
            "error": str(e)
        })

    finally:
        # Release the model on the failure path too, so a failed config does
        # not keep its weights on the GPU while the next config is loaded.
        student = None
        gc.collect()
        torch.cuda.empty_cache()


In [None]:
# Benchmark evaluation helpers
from scripts.evaluation import benchmarks as bench
from scripts.analysis.generate_benchmark_report import generate_report

# Benchmark display name -> loader class from scripts.evaluation.benchmarks.
# evaluate_model looks loaders up by these exact names.
LOADERS = {
    "TRACE": bench.TRACEBenchLoader,
    "MemoryBench": bench.MemoryBenchLoader,
    "BABILong": bench.BabilongLoader,
    "InfiniteBench": bench.InfiniteBenchLoader,
}

def exact_match(pred, example):
    """Score 1.0 when the stripped prediction equals the stripped target, else 0.0.

    A missing ``"target"`` key is treated as an empty string.
    """
    got = str(pred).strip()
    wanted = str(example.get("target", "")).strip()
    return 1.0 if got == wanted else 0.0

def evaluate_model(model_fn, limit=None):
    """Run ``model_fn`` on the four configured benchmarks with exact-match scoring.

    Args:
        model_fn: callable forwarded to ``bench.run_benchmark``.
        limit: forwarded to ``bench.run_benchmark``; None means no cap is passed.

    Returns:
        One summary dict per benchmark with the keys ``benchmark``, ``split``,
        ``avg_score``, ``count`` and ``path``.
    """
    # (loader name in LOADERS, key in BENCH_PATHS)
    suite = (
        ("TRACE", "trace"),
        ("MemoryBench", "memorybench"),
        ("BABILong", "babilong"),
        ("InfiniteBench", "infinitebench"),
    )
    summary = []
    for display_name, path_key in suite:
        data_path = BENCH_PATHS[path_key]
        dataset = LOADERS[display_name](data_path).load()
        outcome = bench.run_benchmark(model_fn, dataset, exact_match, limit=limit, wandb_log=False)
        row = {
            "benchmark": outcome["benchmark"],
            "split": outcome["split"],
            "avg_score": float(outcome["avg_score"]),
            "count": int(outcome["count"]),
            "path": outcome["path"],
        }
        summary.append(row)
        print(f"{display_name}: avg_score={outcome['avg_score']:.4f} (n={outcome['count']})")
    return summary


In [None]:
# Full benchmark run + report
# Fresh student (no fine-tuning applied in this cell) used for the full evaluation.
student_eval = StudentBot(LoRAConfig(rank=8, alpha=16))

def model_fn(example):
    """Answer one benchmark example from its "input" field, without chat history."""
    return student_eval.chat_stateless(example["input"])

all_results = evaluate_model(model_fn, limit=None)  # set limit for quick smoke

import json, pathlib, pandas as pd
out_dir = pathlib.Path("results/benchmarks")
out_dir.mkdir(parents=True, exist_ok=True)

# Persist the raw results as JSON; the HTML report is generated from that file.
json_path = out_dir / "all_results.json"
json_path.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
print("Saved:", json_path)

html_path = out_dir / "summary_report.html"
generate_report(str(json_path), str(html_path))
print("Saved HTML:", html_path)

# `display` is not imported here; it is expected to come from the notebook runtime.
df = pd.DataFrame(all_results)
display(df)

# NOTE(review): `sweep_results` only exists if the LoRA sweep cell was run first.
if sweep_results:
    import pandas as pd
    display(pd.DataFrame(sweep_results))


# 🆕 SOTA Extensions — Usage Examples

The following cells demonstrate the new v2.1 SOTA mechanisms. These are optional extensions that can be enabled for advanced training scenarios.


In [None]:
# [SOTA] Local Backend — Use without Postgres
# This provides an in-memory/file-backed alternative to Postgres

from scripts.memory.local_backend import LocalMemoryStore, create_memory_backend, BackendConfig

# Option 1: Direct local store with file persistence
local_store = LocalMemoryStore(
    persist_path="data/local_memory_demo.json",  # saves to JSON
    use_faiss=True,  # use FAISS if available, else brute-force
)

# Test upsert
rec = local_store.upsert({
    "person_id": "demo_user",
    "fact": "Demo user is a Python developer who loves ML.",
    "importance": 8,
    "type": "bio",
})
print(f"Upserted: {rec.id}")

# Test query
# The query result is iterated as (record, score) pairs. Note that the loop
# variable rebinds `rec`, so the upserted record is no longer reachable by
# that name after this loop.
results = local_store.query("What does demo user do?", top_k=3, person_id="demo_user")
for rec, score in results:
    print(f"  [{score:.3f}] {rec.fact}")

# Option 2: Auto-detect backend (uses Postgres if COCOINDEX_DB_URL set, else local)
backend = create_memory_backend(BackendConfig(
    backend_type="auto",  # "auto", "postgres", or "local"
    persist_path="data/auto_memory.json",
))
print(f"Backend type: {type(backend).__name__}")
# NOTE(review): these stats come from `local_store` (Option 1), not from the
# auto-detected `backend` created just above — confirm that is intended.
print(f"Stats: {local_store.get_stats()}")


In [None]:
# [SOTA] Routing Replay + Probability Clipping
# Enhanced Hedge policy with replay buffer and probability bounds

from scripts.rl.hedge_hippocampus import HedgePolicy, RoutingConfig
from scripts.training.train_loop import HedgeTrainer, HedgeConfig

# Configure routing replay and clipping
routing_config = RoutingConfig(
    replay_buffer_size=100,      # max entries in replay buffer
    replay_sample_size=10,       # samples per replay update
    replay_weight=0.3,           # weight of replay vs online update
    clip_min=0.05,               # minimum probability floor
    clip_max=0.90,               # maximum probability ceiling
    decay_factor=0.99,           # decay old replay entries
    enable_replay=True,
)

# Create enhanced policy
policy = HedgePolicy(
    eta=0.5,
    routing_config=routing_config,
)

# Simulate some updates
# Synthetic reward signal: STORE is rewarded (+0.5), every other action is
# penalized (-0.2). Each update is tagged with a distinct context hash.
for i in range(20):
    idx, action = policy.pick_action()
    reward = 0.5 if action == "STORE" else -0.2
    policy.update(idx, reward, context_hash=f"ctx_{i}")

# Check replay stats
print("Replay stats:", policy.get_replay_stats())
print("Current probs:", policy.probs())

# Perform replay update
# Requests 5 samples here; the value is passed explicitly to replay_update.
samples_used = policy.replay_update(num_samples=5)
print(f"Replay update used {samples_used} samples")
print("Probs after replay:", policy.probs())


In [None]:
# [SOTA] SEAL Loop — Self-Alignment Training
# ReST-EM-style closed-loop with reward weighting

from scripts.training.seal_loop import SEALLoop, SEALConfig, create_seal_loop

# Define model function (inference)
def demo_model_fn(prompt: str) -> str:
    """Stand-in generator: echo the first 30 characters of the prompt."""
    return "Response to: {}...".format(prompt[:30])

# Define reward function
def demo_reward_fn(prompt: str, response: str, context: dict) -> float:
    """Stand-in scorer: length-based score plus a keyword bonus, capped at 1.0.

    The length score is ``len(response) / 100`` capped at 1.0; each entry of
    ``context["keywords"]`` found in the response (case-insensitive) adds 0.1.
    ``prompt`` is accepted but not used.
    """
    lowered = response.lower()
    length_score = min(1.0, len(response) / 100)
    matched_bonus = sum(0.1 for kw in context.get("keywords", []) if kw.lower() in lowered)
    return min(1.0, length_score + matched_bonus)

# Define train function
def demo_train_fn(samples: list) -> dict:
    """Stand-in trainer: report the sample count and return a fixed loss."""
    sample_count = len(samples)
    print(f"  Training on {sample_count} samples...")
    return {"loss": 0.1, "samples": sample_count}

# Create SEAL loop
# All three callables are the demo stand-ins defined above, so this cell
# exercises the loop plumbing only; no real model is generated from or trained.
seal = SEALLoop(
    model_fn=demo_model_fn,
    reward_fn=demo_reward_fn,
    train_fn=demo_train_fn,
    config=SEALConfig(
        num_candidates=4,           # candidates per prompt
        max_iterations=3,           # EM iterations
        reward_threshold=0.3,       # minimum reward to include
        top_k_fraction=0.5,         # keep top 50% of samples
        use_reward_weighting=True,  # weight by reward in loss
        verbose=True,
    ),
)

# Sample prompts
demo_prompts = [
    "Who was the 44th president?",
    "When was Einstein born?",
    "What is machine learning?",
]

# Run SEAL loop (demo with limited prompts)
# `contexts` is given one dict per prompt, in the same order as demo_prompts;
# demo_reward_fn reads its "keywords" entry.
result = seal.run(
    prompts=demo_prompts,
    contexts=[{"keywords": ["Obama"]}, {"keywords": ["1879"]}, {"keywords": ["AI"]}],
)

print("\n=== SEAL Results ===")
print(f"Iterations: {result['iterations_completed']}")
print(f"Samples generated: {result['total_samples_generated']}")
print(f"Samples trained: {result['total_samples_trained']}")
print(f"Best avg reward: {result['best_avg_reward']:.3f}")


In [None]:
# [SOTA] SPICE Challenger — Mining Hard Examples
# Corpus-grounded challenger for finding contradictions and edge cases

from scripts.training.spice_challenger import (
    SPICEChallenger, ChallengerConfig, ChallengeType, default_corpus_paths
)

# Create challenger with training corpus
challenger = SPICEChallenger(config=ChallengerConfig(
    corpus_paths=default_corpus_paths(),
    min_difficulty=0.3,
))

# Load corpus
corpus_size = challenger.load_corpus()
print(f"Loaded corpus with {corpus_size} items")

# Example: Mine contradictions from existing facts
existing_facts = [
    {"fact": "Barack Obama was born in 1961 in Hawaii.", "person_id": "obama"},
    {"fact": "Elon Musk founded SpaceX in 2002.", "person_id": "musk"},
]

# Mine different types of challenges
contradictions = challenger.mine_contradictions(existing_facts)
print(f"Mined {len(contradictions)} contradiction challenges")

temporal = challenger.mine_temporal_challenges(existing_facts)
print(f"Mined {len(temporal)} temporal challenges")

negations = challenger.mine_negations(existing_facts)
print(f"Mined {len(negations)} negation challenges")

# Sample challenges for training
# This call passes min_difficulty=0.4, which differs from the 0.3 given to
# ChallengerConfig above.
samples = challenger.sample_challenges(num_examples=5, min_difficulty=0.4)
print(f"\n=== Sampled {len(samples)} challenges ===")
for s in samples[:3]:
    print(f"  [{s.challenge_type.value}] {s.prompt[:60]}...")

# Convert to SEAL loop format
seal_format = challenger.to_seal_format(samples)
print(f"\nSEAL format example: {seal_format[0] if seal_format else 'none'}")

# Get stats
print(f"\nChallenger stats: {challenger.get_stats()}")


In [None]:
# [SOTA] Memento Rewrite — Memory Consolidation Policy
# Promote/evict/merge memories based on importance and similarity

from scripts.memory.memento_rewrite import (
    MementoRewritePolicy, MementoConfig, RewriteAction
)
import numpy as np

# Create memento policy
memento = MementoRewritePolicy(config=MementoConfig(
    evict_threshold=2.0,             # evict if importance below this
    promote_threshold=7.0,           # promote if above this
    merge_similarity_threshold=0.85, # merge if similarity above this
    importance_decay_rate=0.98,      # per-day decay
    access_boost=0.5,                # boost per access
))

# Example: Evaluate memories
# Embeddings are unseeded random vectors, so any similarity-dependent outcome
# can differ between runs of this cell.
import time
demo_records = [
    {
        "id": "mem_1",
        "fact": "User prefers Python over JavaScript.",
        "embedding": np.random.randn(384).astype(np.float32),
        "importance": 8.0,
        "created_at": time.time() - 86400 * 30,  # 30 days old
        "access_count": 5,
    },
    {
        "id": "mem_2",
        "fact": "User mentioned liking Python.",
        "embedding": np.random.randn(384).astype(np.float32),
        "importance": 3.0,
        "created_at": time.time() - 86400 * 60,  # 60 days old
        "access_count": 0,
    },
    {
        "id": "mem_3",
        "fact": "User's birthday is January 15.",
        "embedding": np.random.randn(384).astype(np.float32),
        "importance": 1.5,  # below evict threshold
        "created_at": time.time() - 86400 * 90,  # 90 days old
        "access_count": 0,
    },
]

# Batch evaluate
# The result is iterated below as (record_id, action, metadata) triples.
actions = memento.batch_evaluate(demo_records)

print("=== Memento Evaluation Results ===")
for record_id, action, metadata in actions:
    print(f"  {record_id}: {action.value.upper()}")
    print(f"    Reason: {metadata['reason'][:60]}...")

# Get stats
print(f"\nMemento stats: {memento.get_stats()}")


In [None]:
# [SOTA] WandB Integration + Leaderboard
# Log experiments and update HTML leaderboard

from scripts.analysis.wandb_leaderboard import (
    WandBLogger, WandBConfig, LeaderboardManager,
    log_benchmark_result, create_wandb_logger, WANDB_AVAILABLE
)

print(f"WandB available: {WANDB_AVAILABLE}")

# Create WandB logger (disabled mode for demo)
logger = create_wandb_logger(
    project="sleeptrain-demo",
    tags=["v2.1", "sota-extensions"],
    mode="disabled",  # change to "online" for real logging
)

# Initialize run (would connect to WandB if mode="online")
run_id = logger.init(
    run_name="demo_run",
    config={"lr": 1e-4, "rank": 16, "alpha": 32},
)
print(f"Run ID: {run_id or 'disabled'}")

# Log some metrics (no-op in disabled mode)
logger.log({"loss": 0.5, "accuracy": 0.9})
logger.finish()

# Leaderboard management
manager = LeaderboardManager()
print(f"Current leaderboard has {len(manager.entries)} entries")

# Add a demo entry (won't persist if you don't want to modify the file)
# manager.add_entry(
#     benchmark="TRACE",
#     model="qwen2.5-7b-lora-v2.1",
#     avg_score=0.85,
#     count=100,
#     notes="With SEAL loop + routing replay",
# )
# manager.save()
# manager.generate_html()

# View best entries
# The loop variable is deliberately NOT called `bench`: that name is the
# notebook-global alias of scripts.evaluation.benchmarks used by
# evaluate_model, and rebinding it to a string would break later evaluations.
for bench_name in ["TRACE", "MemoryBench", "BABILong", "InfiniteBench"]:
    best = manager.get_best_for_benchmark(bench_name)
    if best:
        # Compare against None so a genuine score of 0.0 is still displayed.
        score = f"{best.avg_score:.4f}" if best.avg_score is not None else "N/A"
        print(f"{bench_name}: {best.model} ({score})")
    else:
        print(f"{bench_name}: No entries yet")
