<a href="https://colab.research.google.com/github/SuperKami32/DynaSys-App/blob/main/Apollo_TRAINER_0_90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ======================= RL Portfolio Trainer v8.2 — Self‑Tuning =======================
# Full, standalone trainer. Exports agent_weights.json for Engine v8.0 RL nudges.
# Includes: dueling Double DQN with n-step replay, curriculum + adversaries,
# committee signals, volatility-aware reward shaping, evaluation summary export,
# and an auto-sweep hyperparameter tuner so you don't have to guess.

import os, json, random, shutil, warnings
from collections import deque
from dataclasses import dataclass
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

# Global run setup: silence warnings, fix RNG seeds, and prepare the shared
# checkpoint directory that the trainer writes to.
warnings.filterwarnings("ignore")
# NOTE(review): this is assigned after `import tensorflow` above; TensorFlow
# normally reads the variable at import time — confirm it still has an effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
SEED = 42
# Seed all three generators used in this file (NumPy, stdlib random, TensorFlow).
np.random.seed(SEED); random.seed(SEED); tf.random.set_seed(SEED)

# --- Shared Save Directory (RL Trainer <-> Engine) ---
SAVE_DIR = "/content/rl_checkpoints"
RESET_CHECKPOINTS = True  # set False to keep prior models
if RESET_CHECKPOINTS:
    # Removes the whole directory tree, i.e. every file stored under SAVE_DIR
    # (CONFIG["sentiment_path"] below also points into this directory).
    shutil.rmtree(SAVE_DIR, ignore_errors=True)
os.makedirs(SAVE_DIR, exist_ok=True)

# Output artefacts, all located inside SAVE_DIR.
NUDGE_PATH = os.path.join(SAVE_DIR, "agent_weights.json")
EVAL_PATH  = os.path.join(SAVE_DIR, "evaluation_summary.json")
POLICY_PATH = os.path.join(SAVE_DIR, "rl_policy_model.keras")
BEST_PATH   = os.path.join(SAVE_DIR, "rl_policy_model_best.keras")

print("✅ rl_checkpoints ready — training will start fresh." if RESET_CHECKPOINTS else "ℹ️ Using existing checkpoints.")

# -------------------- CONFIG --------------------
# Single mutable settings dictionary read by the environment, the agent, the
# training loop and the sweep. run_sweep() temporarily overwrites some entries.
CONFIG = {
    # training loop
    "n_step": 3,
    "episodes": 300,
    "max_steps": 252,
    "resume_from_checkpoint": True,
    "checkpoint_name": POLICY_PATH,
    "best_ckpt_name":  BEST_PATH,

    # DQN
    "epsilon_start": 0.35,
    "epsilon_min": 0.01,
    "epsilon_decay": 0.999,
    "gamma": 0.99,
    "lr": 1e-3,
    "replay_capacity": 50_000,
    "batch_size": 64,
    "train_start": 1_000,
    "train_freq": 4,
    "target_update_freq": 200,

    # market realism
    "slippage_bps": 2.0,
    "fee_bps": 0.0,
    "reward_scale": 100.0,
    "lambda_turnover": 0.03,
    "lambda_drawdown": 0.01,

    # curriculum
    "eval_window": 10,
    "patience": 150,
    "curriculum_levels": (1, 2),  # 1=Friction, 2=AlphaThief
    "rotate_every_ep": 3,
    "ramp_ep": 250,

    # env realism
    "monthly_dca": 300.0,
    # _toy_streams divides this value by 12 before converting it to a daily rate.
    "cash_monthly_apy": 0.04,

    # export scale
    "export_bias_scale": 0.02,

    # sweep (micro tuner)
    "sweep_configs": [
        {"lr":1e-3,  "epsilon_decay":0.999, "target_update_freq":200},
        {"lr":7.5e-4,"epsilon_decay":0.999, "target_update_freq":300},
        {"lr":5e-4,  "epsilon_decay":0.998, "target_update_freq":200},
        {"lr":1e-3,  "epsilon_decay":0.997, "target_update_freq":400},
    ],
}

# toggles
CONFIG.update({
    "use_sentiment": True,
    "sentiment_path": os.path.join(SAVE_DIR, "sentiment_today.json"),
    "sentiment_fallback": 0.0,
    "sentiment_vol_fallback": 0.0,
    "use_benchmark_reward": True,
    # Benchmark weights in the order (growth, income, cash), as used by PortfolioEnv.step.
    "benchmark_alloc": (0.60, 0.35, 0.05),
    "alpha_reward_weight": 0.75,
    "use_committee": True,
})

# Extra paths stored in CONFIG (not read by the code visible in this cell).
CONFIG["rl_policy_path"] = NUDGE_PATH
CONFIG["weights_history_path"] = os.path.join(SAVE_DIR, "last_weights.json")

# ===================================================================================================
# Committee Signals
# ===================================================================================================
def load_sentiment(path=CONFIG["sentiment_path"], default=0.0, default_vol=0.0):
    """Load the sentiment snapshot used by the environment and the committee.

    Args:
        path: JSON file expected to hold an object with optional numeric
            "market", "vol" and "policy" entries.
        default: Value used for "market" when it is missing or not numeric.
        default_vol: Value used for "vol" when it is missing or not numeric.

    Returns:
        dict with float entries "market", "vol" and "policy". The same three
        keys are returned whether or not the file could be read, so
        CommitteeSignals receives the "policy" value when the file has one.
        A missing, unreadable or malformed file yields the fallback values.
    """
    data = {}
    try:
        with open(path, "r") as f:
            loaded = json.load(f)
        # Only a JSON object carries the keys we need; ignore anything else.
        if isinstance(loaded, dict):
            data = loaded
    except (OSError, ValueError):
        # Missing/unreadable file or invalid JSON (JSONDecodeError is a ValueError).
        data = {}

    def _as_float(key, fallback):
        # Per-field fallback so one bad value does not discard the others.
        try:
            return float(data.get(key, fallback))
        except (TypeError, ValueError):
            return float(fallback)

    return {
        "market": _as_float("market", default),
        "vol": _as_float("vol", default_vol),
        "policy": _as_float("policy", 0.0),
    }


class CommitteeSignals:
    """Turns recent return histories and sentiment into bounded signals."""

    def _safe_mean(self, arr):
        """Mean of ``arr`` as a float; 0.0 when ``arr`` is empty."""
        if len(arr):
            return float(np.mean(arr))
        return 0.0

    def compute(self, growth_hist, income_hist, port_ret_hist, drawdown, roll_vol, sentiment):
        """Return a dict of signals derived from the given histories.

        ``port_ret_hist`` is accepted but not used by the calculation.
        Keys: growth_tilt, cash_bias, momentum_sig, value_sig, sent_sig,
        macro_sig.
        """
        # Growth-minus-income mean return over the last 10 and last 63 points.
        short_gap = self._safe_mean(growth_hist[-10:]) - self._safe_mean(income_hist[-10:])
        long_gap = self._safe_mean(growth_hist[-63:]) - self._safe_mean(income_hist[-63:])

        momentum = float(np.tanh(short_gap * 50))
        # Sign is flipped: a positive long-window spread lowers this signal.
        value = float(np.tanh(-long_gap * 30))

        market_sent = float(np.clip(sentiment.get("market", 0.0), -1, 1))
        policy_sent = float(np.clip(sentiment.get("policy", 0.0), -1, 1))

        # Risk in [0, 1]: half drawdown, half squashed rolling volatility.
        cash_bias = float(np.clip(0.5*drawdown + 0.5*np.tanh(roll_vol*20), 0, 1))

        blended = 0.45*momentum + 0.25*market_sent + 0.20*value + 0.10*policy_sent
        tilt = float(np.clip(blended, -1, 1))

        return {
            "growth_tilt": tilt,
            "cash_bias": cash_bias,
            "momentum_sig": momentum,
            "value_sig": value,
            "sent_sig": market_sent,
            "macro_sig": policy_sent,
        }

# ===================================================================================================
# Adversaries
# ===================================================================================================
class AdversaryBase:
    """Neutral adversary: keeps no state and applies no effects."""

    def reset(self):
        """Nothing to reset."""
        return None

    def step(self, state, action, info):
        """Return an empty effects dict, whatever the inputs."""
        return {}

class FrictionAdversary(AdversaryBase):
    """Adversary that raises trading costs with turnover and adds volatility spikes."""

    def __init__(self, max_slip=12.0, max_fee=8.0, prob_spike=0.05, vol_boost=1.5):
        self.max_slip = max_slip
        self.max_fee = max_fee
        self.prob_spike = prob_spike
        self.vol_boost = vol_boost
        self.cooldown = 0

    def reset(self):
        """Clear any active spike cooldown."""
        self.cooldown = 0

    def step(self, state, action, info):
        """Return cost/volatility effects for this step.

        Turnover is the L1 distance between ``action`` and
        ``info["prev_action"]`` (falling back to ``action`` itself).
        """
        previous = np.asarray(info.get("prev_action", action), float)
        current = np.asarray(action, float)
        turnover = float(np.sum(np.abs(current - previous)))

        # Costs grow linearly with turnover up to the configured caps.
        slippage = min(self.max_slip, 2.0 + 120.0 * turnover)
        fee = min(self.max_fee, 0.5 + 40.0 * turnover)

        # One uniform draw per step; a spike can only start when no cooldown is running.
        draw = np.random.rand()
        if (draw < self.prob_spike) and (self.cooldown == 0):
            self.cooldown = np.random.randint(5, 15)
        else:
            self.cooldown = max(0, self.cooldown - 1)

        boost = self.vol_boost if self.cooldown > 0 else 1.0
        return {
            "slippage_bps": slippage,
            "fee_bps": fee,
            "vol_boost": boost,
            "max_trade_cap": 0.35,
        }

class AlphaThiefAdversary(AdversaryBase):
    """Adversary that shocks growth returns against the agent's growth tilt."""

    def __init__(self, strength=0.6):
        self.strength = strength

    def step(self, state, action, info):
        """Return a growth-return shock opposing ``action[0] - 0.5``."""
        tilt = float(action[0]) - 0.5
        # Shock magnitude scales with |tilt| and carries the opposite sign.
        shock = - self.strength * 0.006 * np.sign(tilt) * abs(tilt) * 2.0
        return {
            "shock_ret_growth": shock,
            "max_trade_cap": 0.30,
            "vol_boost": 1.2,
        }

def make_adversary(level, difficulty):
    """Build the adversary for a curriculum level.

    ``difficulty`` is clipped to [0, 1] and scales the adversary's
    parameters linearly. Level 1 yields a FrictionAdversary; any other
    level yields an AlphaThiefAdversary.
    """
    d = float(np.clip(difficulty, 0, 1))
    if level == 1:
        return FrictionAdversary(8+12*d, 3+7*d, 0.02+0.10*d, 1.0+0.7*d)
    return AlphaThiefAdversary(0.3+0.7*d)

# ===================================================================================================
# Environment
# ===================================================================================================
@dataclass
class MarketStreams:
    """Container for one simulated market path, as produced by _toy_streams."""
    # Per-step return series for the growth, income and cash sleeves.
    growth_rets: np.ndarray
    income_rets: np.ndarray
    cash_rets: np.ndarray
    # Per-step regime label (0, 1 or 2) from _regime_path.
    regimes: np.ndarray

def _monthly_to_daily(mu_monthly): return (1 + mu_monthly)**(1/21) - 1

def _regime_path(max_steps):
    segs, remain, last = [], max_steps, None
    while remain > 0:
        seg_len = min(np.random.randint(30, 90), remain)
        choices = [0,1,2] if last is None else [r for r in [0,1,2] if r != last]
        regime = random.choice(choices)
        segs.extend([regime]*seg_len)
        last, remain = regime, remain - seg_len
    return np.array(segs[:max_steps], dtype=int)

def _toy_streams(max_steps):
    """Simulate growth, income and cash return series along a random regime path.

    Per step, a shared noise term is drawn first, then the growth return, then
    the income return. Growth takes 0.5x and income 0.2x of the shared term.
    The cash return is constant.
    """
    regimes = _regime_path(max_steps)

    # Per-regime parameters: monthly mean and daily standard deviation.
    growth_mu_monthly = {2: 0.010, 1: 0.002, 0: -0.006}
    income_mu_monthly = {2: 0.004, 1: 0.002, 0: -0.001}
    growth_sd = {2: 0.015, 1: 0.010, 0: 0.020}
    income_sd = {2: 0.008, 1: 0.006, 0: 0.012}

    cash_daily = _monthly_to_daily(CONFIG["cash_monthly_apy"]/12.0)

    growth, income, cash = [], [], []
    for regime in regimes:
        common = np.random.normal(0, 0.006)
        growth_mu = _monthly_to_daily(growth_mu_monthly[regime])
        income_mu = _monthly_to_daily(income_mu_monthly[regime])
        growth.append(np.random.normal(growth_mu, growth_sd[regime]) + 0.5*common)
        income.append(np.random.normal(income_mu, income_sd[regime]) + 0.2*common)
        cash.append(cash_daily)

    return MarketStreams(np.array(growth), np.array(income), np.array(cash), regimes)

class PortfolioEnv:
    """Three-sleeve (growth / income / cash) portfolio simulation.

    The market path is generated once in __init__ via _toy_streams and reused
    by every reset(). Each step takes a 2-element action (target growth and
    income weights; cash is the remainder), applies adversary effects and
    trading costs, and returns (state, reward, done, info).
    """
    def __init__(self, initial_balance=10_000.0, max_steps=252, adversary=None, committee=None,
                 benchmark_alloc=CONFIG["benchmark_alloc"]):
        """Store settings, build the market streams, load sentiment and reset."""
        self.initial_balance = initial_balance
        self.max_steps = max_steps
        self.adversary = adversary or AdversaryBase()
        self.committee = committee or (CommitteeSignals() if CONFIG["use_committee"] else None)
        self.bm_alloc = np.array(benchmark_alloc, float)
        self.streams = _toy_streams(max_steps)
        self.sentiment = load_sentiment() if CONFIG["use_sentiment"] else {"market":0.0,"vol":0.0}
        self.reset()

    def reset(self):
        """Restore balances, allocations and histories; return the initial state.

        The market streams are not regenerated here.
        """
        self.balance = self.initial_balance
        self.bench_balance = self.initial_balance
        self.growth_alloc, self.income_alloc, self.cash_alloc = 0.5, 0.45, 0.05
        self.prev_action = np.array([self.growth_alloc, self.income_alloc])
        self.step_idx, self.daily_returns, self.portfolio_curve = 0, [], [self.balance]
        self.g_hist, self.i_hist = [], []
        self.max_equity, self.drawdown = self.balance, 0.0
        # Placeholder signals until the committee produces its first output.
        self.last_committee = {"growth_tilt":0.0,"cash_bias":0.0,"momentum_sig":0.0,"value_sig":0.0,"sent_sig":0.0}
        self.adversary.reset()
        return self._get_state()

    def _committee_update(self, roll_vol):
        """Refresh self.last_committee from the last 63 points of history (no-op without a committee)."""
        if not self.committee: return
        self.last_committee = self.committee.compute(np.array(self.g_hist[-63:], float),
                                                     np.array(self.i_hist[-63:], float),
                                                     np.array(self.daily_returns[-63:], float),
                                                     self.drawdown, roll_vol, self.sentiment)

    def _get_state(self):
        """Build the 18-element float32 observation.

        Layout: 3 allocations, 10-step rolling mean and std of returns,
        progress, drawdown, 3-way regime one-hot, sentiment market and vol,
        5 committee signals, and relative performance vs the benchmark.
        Calling this also refreshes the committee signals.
        """
        r = np.array(self.daily_returns[-10:], float)
        roll_ret, roll_vol = float(r.mean()) if r.size else 0.0, float(r.std()) if r.size>1 else 0.0
        progress = self.step_idx / max(1, self.max_steps)
        # Index is clamped so the terminal state (step_idx == max_steps) stays in range.
        reg = self.streams.regimes[min(self.step_idx, self.max_steps-1)]
        regime_oh = [1,0,0] if reg==2 else [0,1,0] if reg==1 else [0,0,1]
        self._committee_update(roll_vol)
        alpha_so_far = float(self.balance / max(self.bench_balance, 1e-9) - 1.0)
        return np.array([
            self.growth_alloc, self.income_alloc, self.cash_alloc,
            roll_ret, roll_vol, progress, self.drawdown,
            *regime_oh,
            float(self.sentiment.get("market", 0.0)),
            float(self.sentiment.get("vol", 0.0)),
            self.last_committee["growth_tilt"], self.last_committee["cash_bias"],
            self.last_committee["momentum_sig"], self.last_committee["value_sig"],
            self.last_committee["sent_sig"], alpha_so_far
        ], np.float32)

    def _apply_turnover_cap(self, old_alloc, target_alloc, cap_l1):
        """Limit the L1 distance moved from old_alloc to roughly cap_l1.

        The target is returned unchanged when it is within the cap or when
        cap_l1 <= 0; otherwise the move is scaled down proportionally.
        """
        delta = target_alloc - old_alloc
        l1 = np.sum(np.abs(delta))
        return target_alloc if l1 <= cap_l1 or cap_l1<=0 else old_alloc + delta*(cap_l1/(l1+1e-8))

    def step(self, action):
        """Advance one step and return (state, reward, done, info).

        ``action`` holds the target growth and income weights; each is clipped
        to [0, 1] and the pair is rescaled if it sums above 1. ``info`` is
        always an empty dict.
        """
        action = np.clip(action, 0, 1)
        if action.sum()>1: action /= action.sum()
        target_allocs = np.array([action[0], action[1], 1-(action[0]+action[1])])
        old_alloc = np.array([self.growth_alloc, self.income_alloc, self.cash_alloc])
        adv_eff = self.adversary.step(self._get_state(), action, {"prev_action": self.prev_action})
        # Move only 20% of the way towards the target, then apply the adversary's trade cap.
        blended = old_alloc*0.8 + 0.2*target_allocs
        blended = self._apply_turnover_cap(old_alloc, blended, adv_eff.get("max_trade_cap", 1.0))
        blended = blended/np.sum(blended)
        turnover = float(np.sum(np.abs(blended-old_alloc)))
        self.growth_alloc, self.income_alloc, self.cash_alloc = blended
        # prev_action stores the realised growth/income weights, not the raw action.
        self.prev_action = np.array([self.growth_alloc, self.income_alloc])
        idx = min(self.step_idx, self.max_steps-1)
        # Adversary effects: scale growth/income returns and add a shock to growth.
        r_g = self.streams.growth_rets[idx]*adv_eff.get("vol_boost",1.0)+adv_eff.get("shock_ret_growth",0.0)
        r_i = self.streams.income_rets[idx]*adv_eff.get("vol_boost",1.0)
        r_c = self.streams.cash_rets[idx]
        self.g_hist.append(r_g); self.i_hist.append(r_i)
        step_gross = self.growth_alloc*r_g+self.income_alloc*r_i+self.cash_alloc*r_c
        bm_gross = float(self.bm_alloc[0]*r_g+self.bm_alloc[1]*r_i+self.bm_alloc[2]*r_c)
        # Cost in basis points times turnover; the benchmark pays no cost.
        cost = (adv_eff.get("slippage_bps", CONFIG["slippage_bps"]) +
                adv_eff.get("fee_bps", CONFIG["fee_bps"])) / 1e4 * turnover
        step_ret = step_gross - cost
        # Same deposit goes to portfolio and benchmark every 21st step (not at step 0).
        deposit = CONFIG["monthly_dca"] if (self.step_idx % 21 == 0 and self.step_idx>0) else 0.0
        self.balance = self.balance*(1+step_ret) + deposit
        self.bench_balance = self.bench_balance*(1+bm_gross) + deposit
        self.portfolio_curve.append(self.balance)
        self.daily_returns.append(step_ret)
        self.max_equity = max(self.max_equity, self.balance)
        prev_dd = self.drawdown
        self.drawdown = 1 - self.balance/self.max_equity

        # Reward: scaled return minus turnover and drawdown-increase penalties.
        rew = CONFIG["reward_scale"]*step_ret - CONFIG["lambda_turnover"]*turnover \
              - CONFIG["lambda_drawdown"]*max(0, self.drawdown - prev_dd)*100.0
        if CONFIG["use_benchmark_reward"]:
            rew += CONFIG["alpha_reward_weight"]*CONFIG["reward_scale"]*(step_ret - bm_gross)
        # Cash shaping: penalty above 70% cash, small bonus below 30%.
        if self.cash_alloc > 0.7: rew -= 0.05 * CONFIG["reward_scale"]
        elif self.cash_alloc < 0.3: rew += 0.01 * CONFIG["reward_scale"]
        # Quadratic penalty on how far the requested target is from the old allocation.
        rew -= 0.25 * np.sum(np.square(target_allocs - old_alloc)) * CONFIG["reward_scale"]
        # Hard clip, then tanh squash so the final reward lies within (-200, 200).
        rew = np.clip(rew, -500, 500)
        rew = np.tanh(rew / 200.0) * 200.0

        self.step_idx += 1
        done = self.step_idx >= self.max_steps
        return self._get_state(), float(rew), done, {}

# ===================================================================================================
# Dueling Double DQN Agent with n-step replay
# ===================================================================================================
class DQNAgent:
    """Agent with a dueling Q-network, a target network and an n-step replay buffer.

    Hyperparameters are read from CONFIG when the agent is constructed.
    """
    def __init__(self, state_size, action_size):
        """Build online/target networks and optionally load a saved checkpoint."""
        self.state_size, self.action_size = state_size, action_size
        self.gamma, self.lr = CONFIG["gamma"], CONFIG["lr"]
        self.n_step = CONFIG["n_step"]; self.gamma_n = self.gamma ** self.n_step
        # Sliding window of the most recent n_step transitions.
        self.nbuf = deque(maxlen=self.n_step)
        self.epsilon, self.epsilon_min, self.epsilon_decay = (
            CONFIG["epsilon_start"], CONFIG["epsilon_min"], CONFIG["epsilon_decay"]
        )
        self.memory = deque(maxlen=CONFIG["replay_capacity"])
        self.q_network = self._build_model()
        self.target_network = self._build_model()
        # tau=1.0 copies the online weights into the target network.
        self.update_target_network(tau=1.0)
        self.ckpt_path, self.best_ckpt = CONFIG["checkpoint_name"], CONFIG["best_ckpt_name"]
        if CONFIG["resume_from_checkpoint"] and os.path.exists(self.ckpt_path):
            # Best effort: on any load failure the freshly built networks are kept.
            # NOTE(review): a loaded model is not recompiled here, so it presumably
            # keeps its saved optimizer settings rather than self.lr — confirm.
            try:
                self.q_network = keras.models.load_model(self.ckpt_path)
                self.target_network = keras.models.load_model(self.ckpt_path)
                print(f"[Resume] Loaded checkpoint from {self.ckpt_path}")
            except Exception as e:
                print(f"[Resume] Failed: {e}")

    def _build_model(self):
        """Create and compile the dueling network: Q = V + (A - mean(A))."""
        inp = keras.layers.Input(shape=(self.state_size,))
        x = keras.layers.Dense(192, activation="relu")(inp)
        x = keras.layers.Dense(128, activation="relu")(x)
        x = keras.layers.Dense(64,  activation="relu")(x)
        # Separate value and advantage heads on top of the shared trunk.
        V = keras.layers.Dense(1, activation="linear")(keras.layers.Dense(32, activation="relu")(x))
        A = keras.layers.Dense(self.action_size, activation="linear")(keras.layers.Dense(32, activation="relu")(x))
        A_mean = keras.layers.Lambda(lambda a: keras.backend.mean(a, axis=1, keepdims=True))(A)
        Q = keras.layers.Add()([V, keras.layers.Subtract()([A, A_mean])])
        m = keras.Model(inputs=inp, outputs=Q)
        m.compile(optimizer=keras.optimizers.Adam(learning_rate=self.lr), loss="mse")
        return m

    def update_target_network(self, tau=0.005):
        """Blend online weights into the target network: tau*online + (1-tau)*target."""
        new_weights = self.q_network.get_weights()
        tgt_weights = self.target_network.get_weights()
        if not tgt_weights:
            self.target_network.set_weights(new_weights); return
        updated = [tau*n + (1-tau)*t for n,t in zip(new_weights, tgt_weights)]
        self.target_network.set_weights(updated)

    def remember(self, s, a, r, ns, d):
        """Push a transition and, once n_step are buffered, store an n-step sample.

        The stored sample starts at the oldest buffered transition, sums
        discounted rewards, and stops early at the first transition flagged
        done. self.nbuf is not cleared anywhere in this class.
        """
        self.nbuf.append((s, a, r, ns, d))
        if len(self.nbuf) < self.n_step: return
        R, next_state, done_flag = 0.0, ns, d
        for i, (_, _, r_i, ns_i, d_i) in enumerate(self.nbuf):
            R += (self.gamma ** i) * r_i
            if d_i:
                # Truncate the return at the terminal transition.
                done_flag, next_state = True, ns_i
                break
        state_0, action_0 = self.nbuf[0][0], self.nbuf[0][1]
        self.memory.append((state_0, action_0, R, next_state, done_flag))

    def act(self, state):
        """Return an action vector of length action_size.

        With probability epsilon: uniform random values, rescaled only when
        they sum above 1. Otherwise: the raw network output for ``state``.
        """
        if np.random.rand() <= self.epsilon:
            a = np.random.rand(self.action_size)
            return a/a.sum() if a.sum()>1 else a
        return self.q_network.predict(state.reshape(1,-1), verbose=0)[0]

    def replay(self, step_no):
        """Run one training batch when enough samples exist and step_no hits train_freq.

        Epsilon is decayed only on calls that actually train.
        """
        if len(self.memory) < CONFIG["train_start"] or step_no % CONFIG["train_freq"] != 0:
            return
        batch = random.sample(self.memory, min(CONFIG["batch_size"], len(self.memory)))
        # NOTE(review): the stored action (b[1]) is never read below.
        states  = np.array([b[0] for b in batch], np.float32)
        rewards = np.array([b[2] for b in batch], np.float32)
        next_st = np.array([b[3] for b in batch], np.float32)
        dones   = np.array([b[4] for b in batch], bool)
        q_vals = self.q_network.predict(states, verbose=0)
        q_next_online = self.q_network.predict(next_st, verbose=0)
        q_next_target = self.target_network.predict(next_st, verbose=0)
        # Double-DQN style: online net picks the index, target net supplies the value.
        best_next = np.argmax(q_next_online, axis=1)
        v_next = q_next_target[np.arange(q_next_target.shape[0]), best_next]
        y = rewards + (1 - dones.astype(np.float32)) * (self.gamma ** CONFIG["n_step"]) * v_next
        targets = q_vals.copy()
        # NOTE(review): every output column receives the same target y, so the
        # per-action distinction is lost — confirm this is intended.
        for i in range(targets.shape[1]):
            targets[:, i] = y
        self.q_network.train_on_batch(states, targets)
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def save_checkpoint(self, best=False):
        """Save the online network to best_ckpt (best=True) or ckpt_path."""
        self.q_network.save(self.best_ckpt if best else self.ckpt_path, include_optimizer=True)

# ===================================================================================================
# Curriculum & Training
# ===================================================================================================
def adversary_for_episode(ep_idx):
    """Pick the adversary for episode ``ep_idx``.

    The level rotates through CONFIG["curriculum_levels"] every
    CONFIG["rotate_every_ep"] episodes; difficulty ramps linearly from 0 to 1
    over CONFIG["ramp_ep"] episodes and then stays at 1.
    """
    levels = CONFIG["curriculum_levels"]
    rotation = ep_idx // CONFIG["rotate_every_ep"]
    level = levels[rotation % len(levels)]
    difficulty = min(1.0, ep_idx / max(1, CONFIG["ramp_ep"]))
    return make_adversary(level, difficulty)

def train_agent(episodes=CONFIG["episodes"], max_steps=CONFIG["max_steps"]):
    """Train a DQNAgent against a rotating curriculum of adversaries.

    Args:
        episodes: Maximum number of episodes to run.
        max_steps: Steps per episode.

    Returns:
        (agent, rewards_hist, balances_hist, avg_recent_reward), where
        avg_recent_reward is the mean episode reward over the last
        CONFIG["eval_window"] episodes.

    Side effects: saves checkpoints every episode, shows two matplotlib plots
    and overwrites EVAL_PATH with a small training snapshot.
    """
    # Throwaway environment, used only to discover the state-vector length.
    env0 = PortfolioEnv(max_steps=max_steps, adversary=AdversaryBase(), committee=CommitteeSignals())
    agent = DQNAgent(env0._get_state().shape[0], action_size=2)
    rewards_hist, balances_hist = [], []
    best_score, since_improve, steps_done = -1e9, 0, 0
    print(f"[Train] Episodes={episodes} | Steps/ep={max_steps}")
    for ep in range(episodes):
        # Fresh environment (and market path) per episode.
        env = PortfolioEnv(max_steps=max_steps, adversary=adversary_for_episode(ep), committee=CommitteeSignals())
        s, ep_reward = env.reset(), 0.0
        for t in range(max_steps):
            a = agent.act(s)
            ns, r, done, _ = env.step(a)
            agent.remember(s, a, r, ns, done)
            steps_done += 1
            agent.replay(steps_done)
            if steps_done % CONFIG["target_update_freq"] == 0:
                # NOTE(review): this uses the method's default tau (a small soft
                # update) at a periodic cadence — confirm that a full copy
                # (tau=1.0) is not what was intended here.
                agent.update_target_network()
            s, ep_reward = ns, ep_reward + r
            if done: break
        rewards_hist.append(ep_reward); balances_hist.append(env.balance)
        avg_r = float(np.mean(rewards_hist[-CONFIG["eval_window"]:]))
        improved = avg_r > best_score
        best_score = max(best_score, avg_r)
        since_improve = 0 if improved else since_improve + 1
        # Write the "best" file only on improvement, and the rolling checkpoint
        # exactly once per episode (it was previously saved twice when the
        # episode brought no improvement).
        if improved:
            agent.save_checkpoint(best=True)
        agent.save_checkpoint(best=False)
        if (ep+1)%5==0:
            print(f"Ep {ep+1}/{episodes} | AvgR={avg_r:.2f} | Bal=${env.balance:,.0f} | eps={agent.epsilon:.3f}")
        if since_improve >= CONFIG["patience"]:
            print("[EarlyStop] No improvement."); break
    # quick plots
    plt.figure(figsize=(12,4))
    plt.subplot(1,2,1); plt.plot(rewards_hist); plt.title("Reward Curve"); plt.grid()
    plt.subplot(1,2,2); plt.plot(balances_hist); plt.title("Balance"); plt.grid()
    plt.tight_layout(); plt.show()
    # evaluation snapshot
    avg_recent_reward = float(np.mean(rewards_hist[-CONFIG["eval_window"]:])) if rewards_hist else 0.0
    with open(EVAL_PATH, "w") as f:
        json.dump({"avg_recent_reward": avg_recent_reward,
                   "mean_balance": float(np.mean(balances_hist)) if balances_hist else 0.0}, f, indent=2)
    return agent, rewards_hist, balances_hist, avg_recent_reward

# ===================================================================================================
# Test, Evaluate, Export
# ===================================================================================================
def test_agent(agent, max_steps=CONFIG["max_steps"]):
    """Run one greedy episode without an active adversary and plot the equity curve.

    Sets ``agent.epsilon`` to 0.0 (left that way afterwards), prints the final
    balance and returns the environment.
    """
    env = PortfolioEnv(max_steps=max_steps, adversary=AdversaryBase())
    agent.epsilon = 0.0
    state = env.reset()
    for _step in range(max_steps):
        action = agent.act(state)
        state, _reward, finished, _info = env.step(action)
        if finished:
            break
    print(f"Final Balance: ${env.balance:,.2f}")
    plt.figure(figsize=(8,4))
    plt.plot(env.portfolio_curve)
    plt.title("Equity Curve")
    plt.grid()
    plt.show()
    return env

def evaluate_agent(agent, n_runs=25, max_steps=CONFIG["max_steps"], save_summary=True):
    """Evaluate the greedy policy over ``n_runs`` seeded scenarios.

    Args:
        agent: Trained agent; its epsilon is set to 0.0 and left that way.
        n_runs: Number of independent evaluation episodes.
        max_steps: Steps per episode.
        save_summary: When True, the summary is written to EVAL_PATH
            (replacing whatever was there).

    Returns:
        dict with mean/median/p10/p90 final balance, mean total return,
        mean annualised Sharpe ratio and the percentage of profitable runs.
    """
    balances, returns, sharpes = [], [], []
    for run in range(n_runs):
        # The market path draws from both NumPy and the stdlib `random` module
        # (regime selection), so both must be seeded for a reproducible scenario.
        np.random.seed(10_000+run)
        random.seed(10_000+run)
        env = PortfolioEnv(max_steps=max_steps, adversary=AdversaryBase()); agent.epsilon=0.0
        s = env.reset()
        for _ in range(max_steps):
            a=agent.act(s); s,_,done,_=env.step(a)
            if done: break
        balances.append(env.balance); returns.append((env.balance/env.initial_balance)-1)
        arr=np.array(env.daily_returns)
        # Annualised with sqrt(252); the epsilon avoids division by zero.
        sharpe=np.mean(arr)/(np.std(arr)+1e-8)*np.sqrt(252) if arr.size>2 else 0.0
        sharpes.append(sharpe)
    summary={"mean_final_balance":float(np.mean(balances)),
             "median_final_balance":float(np.median(balances)),
             "p10_balance":float(np.percentile(balances,10)),
             "p90_balance":float(np.percentile(balances,90)),
             "mean_total_return":float(np.mean(returns)),
             "mean_sharpe":float(np.mean(sharpes)),
             "pct_profitable_runs":float(np.mean(np.array(returns)>0)*100)}
    if save_summary:
        with open(EVAL_PATH,"w") as f: json.dump(summary,f,indent=2)
        print(f"[Saved] Evaluation summary to {EVAL_PATH}")
    return summary

def export_nudges_from_agent(agent, growth_tickers, income_tickers, out_path=NUDGE_PATH,
                             bias_scale=CONFIG["export_bias_scale"], probe_runs=50, max_steps=84):
    """Probe the greedy policy and export per-ticker nudges as JSON.

    Runs ``probe_runs`` seeded episodes, averages the final growth and income
    allocations, and converts their offsets from 0.5 into nudges scaled by
    ``bias_scale`` and clipped to [-0.05, 0.05].

    Args:
        agent: Trained agent; its epsilon is set to 0.0 and left that way.
        growth_tickers: Tickers that all receive the growth nudge.
        income_tickers: Tickers that all receive the income nudge.
        out_path: Destination JSON file.
        bias_scale: Multiplier applied to the allocation offsets.
        probe_runs: Number of probe episodes.
        max_steps: Steps per probe episode.

    Returns:
        dict mapping ticker -> nudge (also written to ``out_path``).
    """
    agent.epsilon=0.0; growth_w,income_w=[],[]
    for run in range(probe_runs):
        # Seed both generators: the market path uses NumPy and the stdlib
        # `random` module (regime selection), so seeding NumPy alone does not
        # make the probe scenarios reproducible.
        np.random.seed(100_000+run)
        random.seed(100_000+run)
        env=PortfolioEnv(max_steps=max_steps,adversary=AdversaryBase())
        s=env.reset()
        for _ in range(max_steps):
            a=agent.act(s); s,_,done,_=env.step(a)
            if done: break
        growth_w.append(env.growth_alloc); income_w.append(env.income_alloc)
    # NOTE(review): both offsets are measured against 0.5, while the environment
    # starts income at 0.45 — confirm the intended baseline for income.
    g_bias,i_bias=float(np.mean(growth_w)-0.5),float(np.mean(income_w)-0.5)
    g_nudge=float(np.clip(bias_scale*g_bias,-0.05,0.05)); i_nudge=float(np.clip(bias_scale*i_bias,-0.05,0.05))
    nudges={**{t:g_nudge for t in growth_tickers},**{t:i_nudge for t in income_tickers}}
    with open(out_path,"w") as f: json.dump(nudges,f,indent=2)
    print(f"[Export] RL nudges → {out_path}\n"+json.dumps(nudges,indent=2))
    return nudges

# ===================================================================================================
# Auto-Sweep (Self-Tuning) — tries small variations and keeps best
# ===================================================================================================
def run_sweep():
    """Train once per entry in CONFIG["sweep_configs"] and keep the best agent.

    Every run starts from freshly initialised networks: checkpoint resume is
    switched off for the duration of the sweep. Otherwise each run would
    continue from the previous run's saved weights and the scores would not
    be comparable.

    Returns:
        (best_agent, best_cfg, best_score). best_agent and best_cfg are None
        when there are no sweep configurations.

    Side effects: CONFIG is restored to its pre-sweep values (even when a run
    raises), then updated with the winning lr / epsilon_decay /
    target_update_freq. The best agent is re-saved to the "best" checkpoint
    path at the end, because later runs overwrite that file while training.
    """
    best_score, best_cfg, best_agent = -1e9, None, None
    base = dict(CONFIG)  # shallow copy, used to restore CONFIG afterwards
    sweep = CONFIG["sweep_configs"]
    tuned_keys = ("lr", "epsilon_decay", "target_update_freq")
    try:
        CONFIG["resume_from_checkpoint"] = False
        for i, cfg in enumerate(sweep, 1):
            CONFIG.update({key: cfg[key] for key in tuned_keys})
            print(f"\n=== Sweep {i}/{len(sweep)}: {cfg} ===")
            agent, rewards, balances, avg_recent = train_agent()
            # Prefer the score train_agent persisted; fall back to its return value.
            try:
                with open(EVAL_PATH, "r") as f:
                    eval_score = float(json.load(f).get("avg_recent_reward", avg_recent))
            except Exception:
                eval_score = avg_recent
            print(f"Sweep {i} score={eval_score:.3f}")
            if eval_score > best_score:
                best_score, best_cfg, best_agent = eval_score, dict(cfg), agent
    finally:
        CONFIG.update(base)
    if best_cfg:
        CONFIG.update({key: best_cfg[key] for key in tuned_keys})
    if best_agent is not None:
        # Make the on-disk "best" checkpoint match the agent being returned.
        best_agent.save_checkpoint(best=True)
    print(f"\nBest sweep score: {best_score:.3f} with {best_cfg}")
    return best_agent, best_cfg, best_score

# ===================================================================================================
# Main Run (wrapped into callable function)
# ===================================================================================================
def train_rl_agent(run_sweep_flag=True, export_nudges_flag=True):
    """Run the full pipeline: train (optionally via sweep), evaluate, export.

    Args:
        run_sweep_flag: When True, train through run_sweep(); otherwise run a
            single train_agent() with the current CONFIG.
        export_nudges_flag: When True, write per-ticker nudges to NUDGE_PATH.

    Returns:
        dict with the paths of the nudges file and the evaluation summary.
    """
    if run_sweep_flag:
        agent, best_cfg, best_score = run_sweep()
    else:
        # train_agent() returns four values; only the agent is needed here.
        agent, *_ = train_agent()

    # Evaluate and save summary
    evaluate_agent(agent, n_runs=25)

    # Export nudges if requested
    if export_nudges_flag:
        growth_list = ["QQQM","VTI","AVUV","AVDV","VGT","NVDA","AAPL","MSFT","TSLA","AMZN","TQQQ","SOXL",
               "BTC-USD","ETH-USD","SOL-USD","LINK-USD","ADA-USD","DOT-USD","AVAX-USD","UNI-USD","AAVE-USD"]
        income_list = ["SCHD","O","JEPI","JEPQ","DGRO","VIG","DIVO","QYLD","XYLD","BIL","TLT"]

        export_nudges_from_agent(agent, growth_list, income_list)

    return {
        "nudges_path": NUDGE_PATH,
        "eval_summary_path": EVAL_PATH
    }

# Entry point: run the whole pipeline with default flags and print the
# locations of the files it produced.
if __name__=="__main__":
    results = train_rl_agent()
    print(json.dumps(results, indent=2, default=str))

✅ rl_checkpoints ready — training will start fresh.

=== Sweep 1/4: {'lr': 0.001, 'epsilon_decay': 0.999, 'target_update_freq': 200} ===
[Train] Episodes=300 | Steps/ep=252
Ep 5/300 | AvgR=-2721.05 | Bal=$13,595 | eps=0.328
Ep 10/300 | AvgR=-2708.05 | Bal=$14,350 | eps=0.239
Ep 15/300 | AvgR=-2616.97 | Bal=$12,902 | eps=0.175
Ep 20/300 | AvgR=-2427.85 | Bal=$13,450 | eps=0.127
Ep 25/300 | AvgR=-2229.28 | Bal=$13,612 | eps=0.093
Ep 30/300 | AvgR=-2030.58 | Bal=$13,884 | eps=0.068
Ep 35/300 | AvgR=-1850.24 | Bal=$13,975 | eps=0.049
Ep 40/300 | AvgR=-1720.14 | Bal=$14,210 | eps=0.036
Ep 45/300 | AvgR=-1611.23 | Bal=$13,670 | eps=0.026


In [None]:
# === RL Meta-Loop: safe auto-suggester + optional auto-rerun ===
# Paste this AFTER you run the RL trainer once so evaluation_summary.json exists.
# It will read the evaluation summary, propose a small config mutation,
# and optionally run the trainer again in a safe, versioned temp folder.
#
# Behaviors:
#  - dry_run=True : only propose suggestions, NO training launched.
#  - auto_accept=False : will NOT replace best model automatically; it archives, prints diff, waits for you to set auto_accept True to update.
#  - Keeps a 'meta_archive' with all tried configs and evals.
#
# IMPORTANT: adapt TRAIN_CMD if you run trainer differently (e.g., as module or different path).

import os, json, time, shutil, subprocess, copy, datetime, hashlib
from pathlib import Path
import numpy as np

# === User tunables ===
RL_CHECKPOINT_DIR = Path("/content/rl_checkpoints")           # where trainer writes evaluation_summary.json, agent_weights.json
TRAINER_SCRIPT     = Path("/mnt/data/RL_Portfolio_Trainer_v7_0.py")  # path to the trainer script (or change to your %run target)
META_ARCHIVE_DIR   = RL_CHECKPOINT_DIR / "meta_archive"      # stores configs, evals, models from meta-loop
BEST_ARCHIVE_DIR   = RL_CHECKPOINT_DIR / "best_archive"      # stores copy of the best model(s)
DRY_RUN            = True     # True => only propose, do not retrain
AUTO_ACCEPT        = False    # True => automatically accept & archive improved models
MAX_GENERATIONS    = 6        # how many generations to attempt
SEED_BASE          = 424242   # base seed (incremented per generation)
MAX_RUNS_PER_GEN   = 1        # how many trainer runs to execute per generation (usually 1)
TIMEOUT_SECONDS    = None     # if desired, you can set an overall timeout for trainer subprocesses

# Create directories
META_ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
BEST_ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): when both cells run in one notebook session, this rebinds the
# trainer cell's EVAL_PATH (a str) to a Path pointing at the same file.
EVAL_PATH = RL_CHECKPOINT_DIR / "evaluation_summary.json"
# Read further down via read_json; the trainer code visible in this file does
# not write this snapshot — presumably produced elsewhere, TODO confirm.
CONFIG_PATH = RL_CHECKPOINT_DIR / "config_snapshot.json"

def read_json(path):
    """Return the parsed JSON content of ``path``, or None on any failure."""
    parsed = None
    try:
        with open(path, "r") as handle:
            parsed = json.load(handle)
    except Exception:
        # Best effort by design: missing file, bad JSON, etc. all map to None.
        return None
    return parsed

def write_json(obj, path):
    """Write ``obj`` to ``path`` as indented JSON.

    Values that JSON cannot encode natively are written as ``str(value)``.
    """
    with open(path, "w") as handle:
        json.dump(obj, handle, default=str, indent=2)

def timestamp():
    """Return the current UTC time formatted as ``YYYYMMDD_HHMMSS``.

    Uses a timezone-aware ``now(timezone.utc)`` instead of the deprecated
    ``datetime.utcnow()``; the formatted string is the same.
    """
    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")

def hash_dict(d):
    """Return an 8-character SHA-1 prefix of ``d`` serialised as key-sorted JSON."""
    canonical = json.dumps(d, sort_keys=True).encode()
    digest = hashlib.sha1(canonical)
    return digest.hexdigest()[:8]

# Simple mutation policy: small random changes to lr, epsilon_decay, target_update_freq, batch_size, replay_capacity, and n_step
def propose_mutations(cfg):
    """Return a deep copy of ``cfg`` with each known knob randomly nudged.

    Knobs absent from ``cfg`` are skipped and unknown keys are copied through
    unchanged. ``cfg`` itself is not modified. One NumPy uniform draw is made
    per knob present, in the order the knobs are listed below.
    """
    # Per knob: "mult" = log-uniform multiplicative range, "add" = additive
    # range, "int" = round to an integer, "clip" = final (min, max) bounds.
    knobs = {
        "lr": {"mult": (0.6, 1.6)},
        "epsilon_decay": {"add": (-0.002, 0.002), "clip": (0.990, 0.99995)},
        "target_update_freq": {"mult": (0.7, 1.6), "int": True, "clip": (50, 2000)},
        "batch_size": {"mult": (0.8, 1.5), "int": True, "clip": (16, 1024)},
        "replay_capacity": {"mult": (0.7, 1.6), "int": True, "clip": (1000, 500000)},
        "n_step": {"add": (-1, 2), "int": True, "clip": (1, 8)}
    }

    source = copy.deepcopy(cfg)
    mutated = copy.deepcopy(source)

    for name, spec in knobs.items():
        if name not in source:
            continue
        value = source[name]

        if "mult" in spec:
            low, high = spec["mult"]
            # Log-uniform factor: scaling up and down are equally likely.
            scale = float(np.exp(np.random.uniform(np.log(low), np.log(high))))
            value = value * scale
        elif "add" in spec:
            low, high = spec["add"]
            value = value + float(np.random.uniform(low, high))

        if spec.get("int", False):
            # Floor at the clip minimum (when it is set and non-zero) and at 1.
            floor = spec.get("clip", (None, None))[0] or value
            value = int(max(floor, 1, round(value)))

        if "clip" in spec:
            lower, upper = spec["clip"]
            if lower is not None:
                value = max(lower, value)
            if upper is not None:
                value = min(upper, value)

        mutated[name] = value

    # Final guard: epsilon_decay must stay at or below 0.99998.
    if "epsilon_decay" in mutated and mutated["epsilon_decay"] > 0.99998:
        mutated["epsilon_decay"] = min(mutated["epsilon_decay"], 0.99998)
    return mutated

def score_from_eval(eval_json):
    """Reduce an evaluation summary to a single float score (higher is better).

    The first of these keys whose value is not ``None`` is used, in this
    order: ``avg_recent_reward``, ``mean_total_return``, ``mean_balance``.

    Returns ``-1e9`` (a "worse than anything" sentinel) when:

    * ``eval_json`` is empty, ``None`` or not a dict,
    * none of the keys is present,
    * the value cannot be converted to ``float``, or
    * the value is NaN or infinite. A NaN score would compare false against
      everything, so a NaN baseline could never be beaten by a later run.

    Args:
        eval_json: Parsed evaluation summary, or a falsy value if unavailable.

    Returns:
        The score as a finite ``float``.
    """
    import math

    failure_score = -1e9
    if not eval_json or not isinstance(eval_json, dict):
        return failure_score

    score = None
    for key in ("avg_recent_reward", "mean_total_return", "mean_balance"):
        score = eval_json.get(key)
        if score is not None:
            break

    try:
        value = float(score)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings/containers, or integers too large for float.
        return failure_score
    return value if math.isfinite(value) else failure_score

def safe_copy_best_checkpoint(dst_dir, src_checkpoint_path_candidates):
    """Copy every existing checkpoint candidate into ``dst_dir`` (best effort).

    ``dst_dir`` is created if needed. Each candidate that exists is copied
    under the name ``<base>_<timestamp>_<hash6><ext>``, where ``<base>`` is the
    part of the file name before its first dot, ``<hash6>`` is a 6-character
    SHA-1 prefix of the source path, and ``<ext>`` is everything from the first
    dot onwards. Keeping the extension matters for ``.keras`` files: the
    archived copy stays recognisable as a Keras archive without renaming.

    Files are copied with ``shutil.copy2`` and directories with
    ``shutil.copytree``. A failed copy is reported with ``print`` and skipped,
    so one bad candidate does not abort the others.

    Args:
        dst_dir: Destination directory (``str`` or ``Path``).
        src_checkpoint_path_candidates: Iterable of paths the trainer may have
            saved a model to; missing ones are ignored.

    Returns:
        List of destination paths (as strings) that were copied successfully.
    """
    copied = []
    dst_dir = Path(dst_dir)
    dst_dir.mkdir(parents=True, exist_ok=True)
    for candidate in src_checkpoint_path_candidates:
        src = Path(candidate)
        if not src.exists():
            continue
        # Split at the FIRST dot so the full extension chain survives.
        base, dot, extension = src.name.partition(".")
        path_tag = hashlib.sha1(str(src).encode()).hexdigest()[:6]
        dst = dst_dir / f"{base}_{timestamp()}_{path_tag}{dot}{extension}"
        try:
            if src.is_dir():
                shutil.copytree(src, dst)
            else:
                shutil.copy2(src, dst)
        except OSError as e:
            # shutil.Error is an OSError subclass, so this covers both the
            # file and the directory copy paths.
            print("Copy failed:", e)
        else:
            copied.append(str(dst))
    return copied

# === Load current config & eval ===
# read_json may return a falsy value; `or {}` normalises that to an empty dict
# so both objects can be used as mappings below.
raw_cfg = read_json(CONFIG_PATH) or {}
raw_eval = read_json(EVAL_PATH) or {}
for label, json_path in (
    ("Loaded current CONFIG snapshot:", CONFIG_PATH),
    ("Loaded last evaluation:", EVAL_PATH),
):
    print(label, str(json_path), "exists?", json_path.exists())

# Score of the most recent evaluation; this is the baseline proposals must beat.
current_score = score_from_eval(raw_eval)
raw_eval_keys = [] if not raw_eval else list(raw_eval.keys())
print("Current run score:", current_score, "raw eval keys:", raw_eval_keys)

# Meta-loop: iterate, propose, (optionally) run trainer, record results
# State carried across generations: best score/eval/config seen so far, plus a
# log of every improvement that gets archived.
best_seen_score, best_seen_eval, best_cfg = current_score, raw_eval, raw_cfg
best_archive_items = []

# Per-generation workflow: mutate the parent config, archive the proposal,
# optionally train with it, score the resulting evaluation and keep the best.
for gen in range(1, MAX_GENERATIONS + 1):
    print("\n=== META GEN", gen, "===")
    # Score to beat for this generation. Outside DRY_RUN the parent config
    # (raw_cfg) is always the best config seen so far, so this is the parent's
    # own score. In DRY_RUN nothing is evaluated and it stays at the initial
    # baseline score.
    parent_score = best_seen_score

    # 1) Propose a new config mutation
    proposal_cfg = propose_mutations(raw_cfg)
    proposal_cfg["_meta"] = {
        "generated_at": timestamp(),
        "generation": gen,
        "parent_score": parent_score,
        "parent_hash": hash_dict(raw_cfg)
    }
    proposal_id = f"gen{gen}_{timestamp()}_{hash_dict(proposal_cfg)}"
    proposal_folder = META_ARCHIVE_DIR / proposal_id
    proposal_folder.mkdir(parents=True, exist_ok=True)
    proposal_path = proposal_folder / "proposed_config.json"
    write_json(proposal_cfg, proposal_path)
    print("Proposed config saved to", str(proposal_path))

    # 2) Dry-run only?
    if DRY_RUN:
        print("DRY_RUN is True -> not launching trainer. Inspect proposed_config.json in meta_archive.")
        # Chain proposals: the next generation mutates this (unevaluated) proposal.
        raw_cfg = proposal_cfg
        continue

    # 3) Not a dry run: hand the proposal to the trainer by writing it into
    #    RL_CHECKPOINT_DIR as 'config_snapshot.json'. The trainer is launched
    #    with RL_CHECKPOINT_DIR as its working directory; nothing here isolates
    #    the run from checkpoints already in that directory.
    #    NOTE(review): presumably the trainer reads config_snapshot.json on
    #    start-up -- confirm against the trainer script.
    #    NOTE(review): a rejected proposal's config stays in RL_CHECKPOINT_DIR
    #    after the run; it is not rolled back to the parent config.
    config_snapshot_path = RL_CHECKPOINT_DIR / "config_snapshot.json"
    write_json(proposal_cfg, config_snapshot_path)
    print("Wrote proposed config into RL checkpoint dir for trainer to pick up.")

    # 4) Launch the trainer subprocess. Modify TRAIN_CMD to your environment if needed.
    #    We call the script directly with Python; if you run trainer via %run in the same notebook,
    #    you can instead `exec(open(TRAINER_SCRIPT).read(), globals())` but subprocess keeps isolation.
    TRAIN_CMD = ["python3", str(TRAINER_SCRIPT)]
    # Model files that are archived after a run if they exist.
    checkpoint_candidates = [
        RL_CHECKPOINT_DIR / "rl_policy_model_best.keras",
        RL_CHECKPOINT_DIR / "rl_policy_model.keras",
    ]
    train_info_path = proposal_folder / "train_result_info.json"
    run_results = None
    try:
        for run_idx in range(1, MAX_RUNS_PER_GEN + 1):
            print(f"Launching trainer run {run_idx}/{MAX_RUNS_PER_GEN} ...")
            start_t = time.time()
            proc = subprocess.run(TRAIN_CMD, cwd=str(RL_CHECKPOINT_DIR), timeout=TIMEOUT_SECONDS)
            elapsed = time.time() - start_t
            print("Trainer finished (exitcode {}) in {:.1f}s".format(proc.returncode, elapsed))
            # 5) Load evaluation and snapshot
            # NOTE(review): the exit code is recorded but not acted on; after a
            # crashed run EVAL_PATH may still hold an earlier run's evaluation.
            eval_after = read_json(EVAL_PATH)
            write_json(eval_after or {}, proposal_folder / f"evaluation_after_run{run_idx}.json")
            # copy any checkpoints
            copied = safe_copy_best_checkpoint(proposal_folder / "checkpoints", checkpoint_candidates)
            # Overwritten on every run, so it describes the last run only.
            write_json({"exitcode": proc.returncode, "elapsed_s": elapsed}, train_info_path)
            # Only the last run's results survive the loop and get scored.
            run_results = {"eval": eval_after, "copied_checkpoints": copied, "exitcode": proc.returncode, "elapsed": elapsed}
    except subprocess.TimeoutExpired:
        print("Trainer timed out for proposal", proposal_id)
        write_json({"timeout": True}, train_info_path)
        run_results = {"eval": None, "timeout": True}
    except Exception as e:
        # Best-effort boundary: one failed proposal must not abort the whole meta-loop.
        print("Trainer run failed:", e)
        write_json({"error": str(e)}, train_info_path)
        run_results = {"eval": None, "error": str(e)}

    # 6) Scoring the run and archiving
    eval_after = run_results.get("eval") if run_results else None
    score_after = score_from_eval(eval_after)
    # "score_before" is the score this proposal had to beat, so the snapshot
    # agrees with the improvement decision below.
    write_json({"score_before": parent_score, "score_after": score_after}, proposal_folder / "score_snapshot.json")
    print("Score before:", parent_score, "score after:", score_after)

    # 7) If improved, archive model & optionally accept as new baseline
    improved = score_after > best_seen_score
    if improved:
        print("=== IMPROVEMENT detected! ===")
        # copy candidate checkpoints to best archive
        copied = safe_copy_best_checkpoint(BEST_ARCHIVE_DIR, checkpoint_candidates)
        best_archive_items.append({"gen": gen, "proposal": proposal_id, "score": score_after, "copied": copied})
        write_json({"gen": gen, "score": score_after, "proposal": proposal_id, "copied": copied}, BEST_ARCHIVE_DIR / f"improve_{proposal_id}.json")
        best_seen_score = score_after
        best_seen_eval = eval_after
        best_cfg = proposal_cfg
        # auto accept?
        if AUTO_ACCEPT:
            print("AUTO_ACCEPT is True -> accepting proposal as new baseline (copying into rl_checkpoints).")
            write_json(proposal_cfg, config_snapshot_path)
        else:
            print("AUTO_ACCEPT is False -> NOT accepting automatically. To accept, set AUTO_ACCEPT=True or copy the config manually from archive.")
    else:
        print("No improvement detected for this proposal.")

    # 8) Anchor the next generation's mutations to the best-known config
    #    (unchanged when this proposal did not improve).
    raw_cfg = best_cfg

    # Optional: short cooldown to let logs settle
    time.sleep(1)

print("\n=== Meta-loop complete ===")
print("Best seen score:", best_seen_score)
# Shows at most the first five entries found in BEST_ARCHIVE_DIR.
print("Best seen eval snapshot stored in best_archive:", list(BEST_ARCHIVE_DIR.glob("*"))[:5])
print("Meta archive folder:", META_ARCHIVE_DIR)