In [30]:
# Colab cell: Train PPO expert -> collect PPO dataset -> train BC Transformer -> record MP4/GIF -> display
# Paste into one Colab cell and run.

# -----------------------
# Install dependencies
# -----------------------
!apt-get update -qq
!apt-get install -y -qq xvfb ffmpeg > /dev/null
!pip install -q gymnasium imageio[ffmpeg] pyvirtualdisplay stable-baselines3[extra] torch torchvision

# -----------------------
# Start virtual display (don't shadow `display`)
# -----------------------
from pyvirtualdisplay import Display
# Create and start an off-screen virtual display (visible=0, 900x600).
# The handle is called `vdisplay` so it does not collide with IPython's
# display helper imported further down in this cell.
vdisplay = Display(visible=0, size=(900,600))
vdisplay.start()

# -----------------------
# Imports
# -----------------------
import os, random, time
from base64 import b64encode
import numpy as np
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import gymnasium as gym
import imageio
from IPython.display import HTML, display as ipy_display, Image

# stable-baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# -----------------------
# Hyperparameters (tweakable)
# -----------------------
SEED = 42  # single seed used for `random`, NumPy and PyTorch below, and passed to PPO
ENV_NAME = "CartPole-v1"  # default environment id for training, collection and evaluation

# PPO expert training
PPO_TIMESTEPS = 150_000   # set lower for faster runs (e.g. 50_000) or higher for stronger expert

# Data collection and BC training
COLLECT_EPISODES = 300    # episodes sampled from PPO expert
STACK_K = 4               # number of most recent observations concatenated into one BC input
DATASET_PATH = "ppo_expert_cartpole.npz"         # normalised states + expert actions
MODEL_PATH_PPO = "ppo_cartpole.zip"              # saved PPO expert
MODEL_PATH_BC = "bc_transformer_cartpole.pth"    # saved BC policy state_dict
NORM_PATH = "norm_stack.npz"                     # per-feature mean/std used to normalise stacked states

# BC model / training hyperparams
TRAIN_EPOCHS = 60
TRAIN_BATCH_SIZE = 128
LR = 3e-4  # Adam learning rate for BC training
HIDDEN_DIM = 256  # embedding width / Transformer d_model
NUM_HEADS = 4  # attention heads per encoder layer
NUM_LAYERS = 3  # number of stacked encoder layers
# Prefer CUDA when PyTorch reports it as available, otherwise run on CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG this cell touches directly.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# -----------------------
# Helper: gym compatibility (reset/step wrappers)
# -----------------------
def env_reset(env):
    """Reset *env* and return only the initial observation.

    Supports both reset conventions: when ``env.reset()`` returns a tuple
    (observation first, e.g. ``(obs, info)``) the first element is returned;
    any other return value is passed through unchanged.
    """
    result = env.reset()
    if isinstance(result, tuple):
        return result[0]
    return result

def env_step(env, action):
    """Step *env* and normalise the result to ``(obs, reward, done, info)``.

    A 5-element result is read as ``(obs, reward, terminated, truncated,
    info)`` and the two flags are folded into a single ``done``; a 4-element
    result is passed through. ``done`` is always coerced to ``bool``.

    Raises:
        RuntimeError: if ``env.step`` returns neither 4 nor 5 elements.
    """
    result = env.step(action)
    arity = len(result)
    if arity == 5:
        obs, reward, terminated, truncated, info = result
        done = terminated or truncated
    elif arity == 4:
        obs, reward, done, info = result
    else:
        raise RuntimeError("Unexpected env.step return shape")
    return obs, reward, bool(done), info

# -----------------------
# Model: Transformer BC (works on stacked observations of size 4*STACK_K)
# -----------------------
class TransformerPolicy(nn.Module):
    """Behaviour-cloning policy: linear embedding -> Transformer encoder -> action logits.

    The flat stacked observation is embedded as ONE token, so the encoder
    (and its self-attention) operates on a sequence of length 1; the mean
    over the sequence axis therefore just drops that axis.

    Args:
        obs_dim: size of the flat input vector (default ``4*STACK_K``).
        hidden_dim: embedding width, used as the encoder ``d_model``.
        num_heads: attention heads per encoder layer.
        num_layers: number of encoder layers.
        n_actions: number of output logits.
    """

    def __init__(self, obs_dim=4*STACK_K, hidden_dim=HIDDEN_DIM, num_heads=NUM_HEADS, num_layers=NUM_LAYERS, n_actions=2):
        super().__init__()
        self.embed = nn.Linear(obs_dim, hidden_dim)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, batch_first=True),
            num_layers=num_layers,
        )
        self.fc = nn.Linear(hidden_dim, n_actions)

    def forward(self, x):
        """Map a batch of flat observations ``(B, obs_dim)`` to logits ``(B, n_actions)``."""
        tokens = self.embed(x).unsqueeze(1)   # (B, 1, hidden_dim): single-token sequence
        encoded = self.encoder(tokens)        # (B, 1, hidden_dim)
        pooled = encoded.mean(dim=1)          # (B, hidden_dim)
        return self.fc(pooled)

# -----------------------
# Train PPO expert (returns trained model)
# -----------------------
def train_ppo_expert(env_name=ENV_NAME, timesteps=PPO_TIMESTEPS):
    """Train a PPO ("MlpPolicy") expert, save it to ``MODEL_PATH_PPO`` and return it.

    Args:
        env_name: environment id handed to ``gym.make``.
        timesteps: value passed to ``learn(total_timesteps=...)``.

    Returns:
        The trained PPO model.
    """
    print(f"[PPO] Training expert for {timesteps} timesteps (may take a few minutes)...")
    # Wrap a single env factory in a DummyVecEnv for stable-baselines3.
    vec_env = DummyVecEnv([lambda: gym.make(env_name)])
    expert = PPO("MlpPolicy", vec_env, verbose=1, seed=SEED)
    expert.learn(total_timesteps=timesteps)
    expert.save(MODEL_PATH_PPO)
    print(f"[PPO] Saved expert at {MODEL_PATH_PPO}")
    return expert

# -----------------------
# Collect trajectories from expert (deterministic)
# stacked = last K obs concatenated
# -----------------------
def collect_from_expert(model, env_name=ENV_NAME, episodes=COLLECT_EPISODES, stack_k=STACK_K, save_path=DATASET_PATH):
    """Roll out the expert deterministically and save a normalised BC dataset.

    Each sample pairs the concatenation of the last ``stack_k`` observations
    (zero-padded at the start of an episode) with the action the expert
    chose for the current, un-stacked observation.

    Args:
        model: expert exposing ``predict(obs, deterministic=True)``.
        env_name: environment id handed to ``gym.make``.
        episodes: number of episodes to roll out.
        stack_k: how many consecutive observations are concatenated per sample.
        save_path: ``.npz`` destination for the normalised states and actions.

    Returns:
        ``(states_norm, actions)``: float32 normalised stacked states and
        int64 expert actions.

    Raises:
        ValueError: if no transition was collected (e.g. ``episodes <= 0``),
            since mean/std of an empty dataset would be NaN.

    Side effects:
        Writes the dataset to ``save_path`` and the per-feature mean/std to
        ``NORM_PATH``.
    """
    from collections import deque

    env = gym.make(env_name)
    states, actions = [], []
    try:
        for ep in range(episodes):
            obs = env_reset(env)
            current = np.array(obs, dtype=np.float32)
            # Zero-pad the history so the very first sample already holds
            # stack_k frames; the padding takes its shape from the actual
            # observation instead of a hard-coded size of 4.
            dq = deque((np.zeros_like(current) for _ in range(stack_k - 1)), maxlen=stack_k)
            dq.append(current)
            done = False
            steps = 0
            while not done:
                # The expert acts on the raw (un-stacked) observation.
                action, _ = model.predict(obs, deterministic=True)
                # np.concatenate allocates a fresh array, so no extra copy is needed.
                states.append(np.concatenate(list(dq), axis=0))
                actions.append(int(action))
                obs, _, done, _ = env_step(env, int(action))
                dq.append(np.array(obs, dtype=np.float32))
                steps += 1
                # Safety cap in case the environment never signals done.
                if steps > 1000:
                    break
            if (ep+1) % 50 == 0 or ep == episodes-1:
                print(f"[collect] {ep+1}/{episodes} episodes collected (total steps: {len(states)})")
    finally:
        # Release the environment even if the rollout raised.
        env.close()

    if not states:
        raise ValueError("No transitions collected; `episodes` must be at least 1.")

    states = np.array(states, dtype=np.float32)
    actions = np.array(actions, dtype=np.int64)
    # Per-feature normalisation; the epsilon guards constant features.
    mean = states.mean(axis=0, keepdims=True)
    std = states.std(axis=0, keepdims=True) + 1e-8
    states_norm = (states - mean) / std
    np.savez_compressed(save_path, states=states_norm, actions=actions)
    np.savez_compressed(NORM_PATH, mean=mean, std=std)
    print(f"[collect] Saved dataset: {save_path} (states {states.shape}) and norm {NORM_PATH}")
    return states_norm, actions

# -----------------------
# Train BC on the collected PPO dataset
# -----------------------
class ImitationDataset(Dataset):
    """Tensor-backed dataset of ``(state, action)`` pairs for behaviour cloning.

    States are stored as ``float32`` tensors and actions as ``torch.long``.
    """

    def __init__(self, states, actions):
        self.states = torch.tensor(states, dtype=torch.float32)
        self.actions = torch.tensor(actions, dtype=torch.long)

    def __len__(self):
        # Sample count is the leading dimension of the state tensor.
        return len(self.states)

    def __getitem__(self, idx):
        state = self.states[idx]
        label = self.actions[idx]
        return state, label

def train_bc(dataset_path=DATASET_PATH, model_path=MODEL_PATH_BC, epochs=TRAIN_EPOCHS):
    """Train the Transformer policy by behaviour cloning on a saved dataset.

    Args:
        dataset_path: ``.npz`` file holding ``states`` and ``actions`` arrays.
        model_path: destination for the trained model's ``state_dict``.
        epochs: number of full passes over the dataset.

    Returns:
        The trained ``TransformerPolicy`` (on ``DEVICE``, left in train mode).
    """
    # np.load on an .npz returns a lazily-read archive handle; read both
    # arrays inside a context manager so the file is closed afterwards.
    with np.load(dataset_path) as data:
        states = data["states"]
        actions = data["actions"]

    ds = ImitationDataset(states, actions)
    loader = DataLoader(ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    model = TransformerPolicy().to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = nn.CrossEntropyLoss()
    model.train()
    for epoch in range(epochs):
        total = 0.0
        for xb, yb in loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            loss = loss_fn(model(xb), yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
            # Weight by batch size so the epoch average is per-sample even
            # when the last batch is smaller.
            total += loss.item() * xb.size(0)
        avg = total / len(ds)
        if (epoch+1) % 10 == 0 or epoch == 0 or epoch == epochs-1:
            print(f"[BC] Epoch {epoch+1}/{epochs} - loss {avg:.5f}")
    torch.save(model.state_dict(), model_path)
    print(f"[BC] Saved BC model -> {model_path}")
    return model

# -----------------------
# Evaluate BC and record frames (rgb_array)
# -----------------------
def evaluate_and_record_bc(model_path=MODEL_PATH_BC, env_name=ENV_NAME, episodes=2, max_steps=500, fps=30):
    """Run the trained BC policy greedily, record rendered frames, save MP4 and GIF.

    Args:
        model_path: path of the saved ``TransformerPolicy`` ``state_dict``.
        env_name: environment id handed to ``gym.make``.
        episodes: number of evaluation episodes.
        max_steps: per-episode step cap.
        fps: frame rate used for both the MP4 and the GIF.

    Returns:
        ``(mp4_path, rewards)`` where ``rewards`` lists the per-episode
        return; ``mp4_path`` is ``None`` when no frame could be captured.

    Side effects:
        Writes ``bc_cartpole_demo.mp4`` and ``bc_cartpole_demo.gif`` to the
        working directory when frames were captured.
    """
    from collections import deque

    # Normalisation statistics are optional: without the file, raw stacked
    # observations are fed to the model.
    norm = None
    if os.path.exists(NORM_PATH):
        with np.load(NORM_PATH) as stats:  # close the archive handle after reading
            norm = (stats["mean"].squeeze(), stats["std"].squeeze())

    # Load the policy BEFORE creating the env so a missing or bad checkpoint
    # cannot leave an open environment behind.
    model = TransformerPolicy().to(DEVICE)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    try:
        env = gym.make(env_name, render_mode="rgb_array")
    except TypeError:
        # gym.make rejected the render_mode keyword; build the env without it.
        env = gym.make(env_name)

    frames = []
    rewards = []
    try:
        for ep in range(episodes):
            first = np.array(env_reset(env), dtype=np.float32)
            # Zero-pad the history exactly as during data collection; the
            # padding is shaped from the observation rather than hard-coded.
            dq = deque((np.zeros_like(first) for _ in range(STACK_K - 1)), maxlen=STACK_K)
            dq.append(first)
            done = False
            ep_reward = 0.0
            steps = 0
            while not done and steps < max_steps:
                stacked = np.concatenate(list(dq), axis=0)
                if norm is not None:
                    stacked = (stacked - norm[0]) / norm[1]
                inp = torch.tensor(stacked, dtype=torch.float32).unsqueeze(0).to(DEVICE)
                with torch.no_grad():
                    action = int(torch.argmax(model(inp), dim=-1).item())
                obs, reward, done, _ = env_step(env, action)
                dq.append(np.array(obs, dtype=np.float32))
                ep_reward += reward
                steps += 1
                # Rendering is best-effort: a failed render drops the frame
                # but must not abort the evaluation.
                try:
                    frame = env.render()
                except Exception:
                    frame = None
                if frame is not None:
                    frames.append(frame)
            rewards.append(ep_reward)
            print(f"[eval] Episode {ep+1}: reward = {ep_reward}")
    finally:
        # Always release the environment, even if the rollout raised.
        env.close()

    if not frames:
        print("[eval] No frames captured. Ensure gymnasium supports render_mode='rgb_array'.")
        return None, rewards

    mp4_path = "bc_cartpole_demo.mp4"
    gif_path = "bc_cartpole_demo.gif"
    # The context manager closes (and finalises) the video file even if
    # appending a frame fails part-way through.
    with imageio.get_writer(mp4_path, fps=fps, codec="libx264") as writer:
        for f in frames:
            writer.append_data(f)
    imageio.mimsave(gif_path, frames, fps=fps)
    print(f"[eval] Saved {mp4_path} and {gif_path}")
    return mp4_path, rewards

# -----------------------
# Utility to show mp4 in notebook
# -----------------------
def show_video(path, width=640):
    """Display the MP4 at *path* inline in the notebook.

    The file is read fully, base64-encoded and embedded as a ``data:`` URL
    inside an HTML ``<video>`` element.

    Args:
        path: path of the MP4 file to embed.
        width: value of the ``<video>`` element's ``width`` attribute.
    """
    # Use a context manager so the file handle is closed right after reading.
    with open(path, "rb") as fh:
        encoded = b64encode(fh.read()).decode()
    data_url = "data:video/mp4;base64," + encoded
    html = f'<video width="{width}" controls><source src="{data_url}" type="video/mp4"></video>'
    ipy_display(HTML(html))

# -----------------------
# Run pipeline
# -----------------------
# End-to-end run: expert training -> data collection -> BC training -> evaluation.
print("STEP 1/4: Train PPO expert")
ppo_model = train_ppo_expert(timesteps=PPO_TIMESTEPS)

print("\nSTEP 2/4: Collect high-quality dataset from PPO expert")
states, actions = collect_from_expert(ppo_model, episodes=COLLECT_EPISODES, stack_k=STACK_K)

print("\nSTEP 3/4: Train BC transformer on PPO dataset")
bc_model = train_bc()

print("\nSTEP 4/4: Evaluate BC and record demo")
mp4_path, rewards = evaluate_and_record_bc(episodes=2)

print(f"\nBC eval rewards: {rewards}")
if mp4_path:
    print("\nDemo video:")
    show_video(mp4_path)
else:
    print("No demo video generated (no frames).")

# Optional: download file in Colab. Only attempted when a video exists;
# outside Colab the import fails and the download is skipped quietly.
if mp4_path:
    try:
        from google.colab import files
    except ImportError:
        pass
    else:
        try:
            files.download(mp4_path)
        except Exception as exc:
            # Best-effort: a failed download must not abort the cell, but it
            # should not be hidden either.
            print(f"[download] Skipped: {exc}")

print("Done.")


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hSTEP 1/4: Train PPO expert
[PPO] Training expert for 150000 timesteps (may take a few minutes)...
Using cuda device


  return datetime.utcnow().replace(tzinfo=utc)


-----------------------------
| time/              |      |
|    fps             | 572  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 519         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008758047 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00345    |
|    learning_rate        | 0.0003      |
|    loss                 | 8.6         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.017      |
|    value_loss           | 57.6        |
-----------------------------------------
----------------------------------



[eval] Episode 2: reward = 500.0




[eval] Saved bc_cartpole_demo.mp4 and bc_cartpole_demo.gif

BC eval rewards: [500.0, 500.0]

Demo video:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Done.
