# O2O RL: DSR + PPO (Colab Demo)
This notebook collects a tiny offline dataset, trains a Data Support Representation (DSR), and runs online PPO using the DSR for both pessimism and intrinsic exploration.

In [None]:
# Print GPU information when a GPU is attached; `|| true` keeps the cell from
# failing when the `nvidia-smi` command is unavailable or errors out.
!nvidia-smi || true
# Record the interpreter version that the `!python` commands in this notebook use.
!python --version
# Install the project requirements plus the two packages the plotting cell imports.
# NOTE(review): `!pip` installs into whatever `pip` is first on PATH; this matches the
# `!python` script invocations, but confirm it is also the kernel's environment.
!pip -q install -r requirements.txt pandas matplotlib

## 1) Prepare a small offline dataset

In [None]:
# Run the `o2o.datasets` module for CartPole-v1 (30 episodes, seed 0); the result is
# written to data/cartpole_offline.npz, which the following training cells read.
!python -m o2o.datasets --env_id CartPole-v1 --episodes 30 --out data/cartpole_offline.npz --seed 0
# List the output directory to confirm the file exists. `tail -n +1` passes every line
# through unchanged; `dir data` is the fallback for shells without `ls`/`tail`.
!ls -lah data | tail -n +1 || dir data

## 2) Train DSR on offline data

In [None]:
!python train_dsr.py --offline_path data/cartpole_offline.npz --env_id CartPole-v1 \
+    --dsr_out checkpts/cartpole_dsr.pt --epochs 10 --batch_size 1024 --device cuda
!ls -lah checkpts | tail -n +1 || dir checkpts

## 3) (Optional) Behavior-cloning (BC) initialization for the actor

In [None]:
# Uncomment the three command lines below to pretrain the actor with BC
# (can speed up online learning). Every line here must stay a pure comment until
# then, so the cell runs as a no-op instead of raising a SyntaxError.
# NOTE(review): the online training command in the next section does not pass
# checkpts/bc_actor.pt to train_online.py — confirm which flag loads it.
# !python pretrain_bc.py --offline_path data/cartpole_offline.npz --env_id CartPole-v1 \
#     --out_actor checkpts/bc_actor.pt --epochs 10 --device cuda
# !ls -lah checkpts | tail -n +1 || dir checkpts

## 4) Online learning with DSR-guided PPO

In [None]:
LOGDIR="logs"
!mkdir -p $LOGDIR || true
!python train_online.py --env_id CartPole-v1 --dsr_path checkpts/cartpole_dsr.pt \
+      --total_steps 50000 --steps_per_epoch 2048 --train_iters 10 --minibatch_size 256 --device cuda \
+      --pess_alpha0 1.0 --pess_alpha_final 0.1 --pess_anneal_steps 100000 --pess_gamma 1.0 \
+      --adv_gate_tau 0.5 --adv_gate_k 5.0 --bonus_type boundary --bonus_eta 0.1 \
+      --log_csv $LOGDIR/run.csv
!tail -n 5 $LOGDIR/run.csv || type $LOGDIR\run.csv

## 5) Plot learning curves

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the training log written by the online training step (--log_csv logs/run.csv).
df = pd.read_csv('logs/run.csv')

# One panel per logged metric, both plotted against the 'steps' column.
# Each entry is (CSV column, panel title); the column name doubles as the y-axis label
# so every panel can be read on its own.
panels = [
    ('avg_ep_ret', 'Average Episode Return'),
    ('support_mean', 'Mean DSR Support (on-policy)'),
]

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
for axis, (column, title) in zip(ax, panels):
    axis.plot(df['steps'], df[column])
    axis.set_title(title)
    axis.set_xlabel('steps')
    axis.set_ylabel(column)
    axis.grid(True)

plt.tight_layout()
plt.show()