In [None]:
!sudo apt-get update
!sudo apt-get install -y swig g++ python3-dev
!pip install gymnasium[box2d] torch
!pip install sympy==1.12

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building depe

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Beta
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
from torch.optim.lr_scheduler import LambdaLR
from torch.cuda.amp import GradScaler, autocast
import time
import datetime
from collections import deque
import random
import os

# --- Hyperparameters and run configuration ---------------------------------
ENV_NAME = "BipedalWalker-v3"  # environment id passed to gym.make
RANDOM_SEED = 0  # seeds random, NumPy, torch and the first training-env reset
LEARNING_RATE = 3e-4  # initial Adam learning rate
DISCOUNT = 0.99  # reward discount (gamma) used in the TD residuals / GAE
LAMBDA = 0.95  # GAE lambda
CLIP_VALUE = 0.2  # PPO ratio clip: ratios are clamped to [1 - clip, 1 + clip]
TRAIN_EPOCHS = 10  # passes over each rollout per learn() call
MINIBATCH = 64  # minibatch size for each gradient step
ROLLOUT_SIZE = 2048  # max environment steps collected before each learn() call
NEURONS = 256  # hidden-layer width of the value and policy networks
SHOW = False  # if True, the evaluation env is created with render_mode="human"
SAVE_INTERVAL = 100000  # write a checkpoint every this many environment steps
MODEL_PATH = "ppo_models"  # directory for checkpoints and the final model
DECAY_LR = True  # linearly decay the learning rate towards 0
MIXED_PRECISION = True  # use autocast/GradScaler; forced to False below without CUDA

# --- Stopping conditions -----------------------------------------------------
GOAL = 300  # training stops once the windowed average reward reaches this
AVG_WINDOW = 20  # number of most recent episodes in the average
MAX_EPISODES = 600  # hard cap on training episodes

# Seed every RNG source used by the notebook.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed_all(RANDOM_SEED)
    # NOTE(review): these cuDNN flags favour speed over determinism, so GPU
    # runs may not be bit-for-bit reproducible despite the seeding above --
    # confirm this is intended.
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True
else:
    # Mixed precision is only used on CUDA; disable it on CPU-only machines.
    MIXED_PRECISION = False

# Ensure the checkpoint directory exists before training starts.
os.makedirs(MODEL_PATH, exist_ok=True)

# Device on which the model and all training tensors are placed.
compute_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {compute_device}")
if MIXED_PRECISION:
    print("Using Mixed Precision Training")
if DECAY_LR:
    print("Using Linear Learning Rate Decay")

class NeuralNet(nn.Module):
    """Actor-critic network with two independent two-hidden-layer MLP trunks.

    The critic trunk maps an observation to a single state value. The actor
    trunk maps the same observation to the ``alpha`` and ``beta`` concentration
    parameters of a Beta distribution, one pair per action dimension. Both
    concentrations are ``softplus(.) + 1``, i.e. never below 1.

    Args:
        input_size: Number of observation features.
        output_size: Number of action dimensions.
        hidden_size: Width of every hidden layer.
    """

    def __init__(self, input_size, output_size, hidden_size):
        super().__init__()
        # Layers are created in a fixed order (critic first, then actor) so
        # parameter names and seeded initialisation stay stable.
        self.value_layer1 = nn.Linear(input_size, hidden_size)
        self.value_layer2 = nn.Linear(hidden_size, hidden_size)
        self.value_output = nn.Linear(hidden_size, 1)
        self.policy_layer1 = nn.Linear(input_size, hidden_size)
        self.policy_layer2 = nn.Linear(hidden_size, hidden_size)
        self.alpha_head = nn.Linear(hidden_size, output_size)
        self.beta_head = nn.Linear(hidden_size, output_size)

    def _critic(self, x):
        """Return the state-value estimate for ``x`` (last dimension of size 1)."""
        hidden = self.value_layer1(x).relu()
        hidden = self.value_layer2(hidden).relu()
        return self.value_output(hidden)

    def _actor(self, x):
        """Return the ``(alpha, beta)`` Beta concentrations for ``x``."""
        hidden = self.policy_layer1(x).relu()
        hidden = self.policy_layer2(hidden).relu()
        concentration_alpha = F.softplus(self.alpha_head(hidden)) + 1.0
        concentration_beta = F.softplus(self.beta_head(hidden)) + 1.0
        return concentration_alpha, concentration_beta

    def forward(self, x):
        """Compute ``(value, alpha, beta)`` for a batch of observations."""
        state_value = self._critic(x)
        concentration_alpha, concentration_beta = self._actor(x)
        return state_value, concentration_alpha, concentration_beta

class Agent:
    """PPO agent with a Beta-distributed policy over normalised actions.

    Transitions are buffered with ``remember`` and consumed by ``learn``, which
    computes GAE advantages, runs several epochs of clipped-surrogate
    minibatch updates on ``self.model`` and then clears the buffer.

    The class reads the module-level ``compute_device``, ``MIXED_PRECISION``
    and ``DECAY_LR`` settings.
    """

    def __init__(self, obs_size, act_size, hidden_size, lr, gamma, lambda_, clip, epochs, batch_size, total_timesteps):
        """Build the network, optimizer, optional LR schedule and empty buffer.

        Args:
            obs_size: Number of observation features.
            act_size: Number of action dimensions.
            hidden_size: Hidden-layer width of the network.
            lr: Initial Adam learning rate.
            gamma: Reward discount factor.
            lambda_: GAE lambda.
            clip: PPO probability-ratio clip range.
            epochs: Number of passes over each rollout in ``learn``.
            batch_size: Minibatch size for each gradient step.
            total_timesteps: Number of environment steps over which the
                learning rate decays linearly to 0 (when ``DECAY_LR`` is set).
        """
        self.discount = gamma
        self.lambda_param = lambda_
        self.clip_param = clip
        self.train_epochs = epochs
        self.minibatch_size = batch_size
        self.max_steps = total_timesteps

        self.model = NeuralNet(obs_size, act_size, hidden_size).to(compute_device)
        self.optim = optim.Adam(self.model.parameters(), lr=lr, eps=1e-5)

        if DECAY_LR:
            # Multiplicative factor on the initial LR: 1 at step 0, 0 from
            # max_steps onwards. Guard against a non-positive horizon.
            lr_func = lambda step: max(1.0 - float(step) / float(max(self.max_steps, 1)), 0)
            self.lr_scheduler = LambdaLR(self.optim, lr_lambda=lr_func)
        else:
            self.lr_scheduler = None

        self.grad_scaler = GradScaler(enabled=MIXED_PRECISION)
        self.memory = self._empty_memory()
        self.current_steps = 0

    @staticmethod
    def _empty_memory():
        """Return a fresh, empty rollout buffer."""
        return {'obs': [], 'acts': [], 'rews': [], 'next_obs': [], 'logps': [], 'dones': []}

    def remember(self, obs, act, rew, next_obs, logp, done):
        """Append one transition to the rollout buffer.

        ``act`` and ``logp`` are tensors (as returned by ``get_action``) and
        are moved to the CPU for storage; ``done`` marks the end of an episode.
        """
        self.memory['obs'].append(obs)
        self.memory['acts'].append(act.cpu())
        self.memory['rews'].append(rew)
        self.memory['next_obs'].append(next_obs)
        self.memory['logps'].append(logp.cpu())
        self.memory['dones'].append(done)

    def get_action(self, state, low, high):
        """Sample an action for a single observation.

        Args:
            state: One observation (array-like, without a batch dimension).
            low: Lower action bounds.
            high: Upper action bounds.

        Returns:
            Tuple ``(action, action_norm, log_prob)``: the action rescaled from
            the Beta sample in the unit interval to ``[low, high]`` as a NumPy
            array, the raw Beta sample as a tensor, and the log-probability of
            that sample summed over action dimensions.
        """
        state_tensor = torch.tensor(state, dtype=torch.float32, device=compute_device).unsqueeze(0)
        with torch.no_grad():
            with autocast(enabled=MIXED_PRECISION):
                _, alpha, beta = self.model(state_tensor)
        dist = Beta(alpha, beta)
        action_norm = dist.sample()
        log_prob = dist.log_prob(action_norm).sum(axis=-1)
        action = low + (high - low) * action_norm.cpu().numpy()[0]
        action = np.clip(action, low, high)
        return action, action_norm.squeeze(0), log_prob.squeeze(0)

    def learn(self):
        """Run one PPO update on the buffered rollout, then clear the buffer.

        Returns:
            Tuple ``(avg_policy_loss, avg_value_loss, avg_entropy,
            avg_abs_advantage)``. The first three are averaged over the
            gradient steps actually taken; the last is the mean absolute
            advantage used for the policy loss. All four are ``0.0`` when the
            buffer is empty.
        """
        n_samples = len(self.memory['obs'])
        if n_samples == 0:
            # Same arity as the normal return so callers can always unpack.
            return 0.0, 0.0, 0.0, 0.0

        obs_tensor = torch.tensor(np.array(self.memory['obs']), dtype=torch.float32, device=compute_device)
        acts_tensor = torch.stack(self.memory['acts']).to(compute_device)
        rews_tensor = torch.tensor(np.array(self.memory['rews']), dtype=torch.float32, device=compute_device)
        next_obs_tensor = torch.tensor(np.array(self.memory['next_obs']), dtype=torch.float32, device=compute_device)
        old_logps_tensor = torch.stack(self.memory['logps']).to(compute_device)
        # 1.0 while the episode continues, 0.0 on the step that ended it.
        masks_tensor = torch.tensor([1.0 - float(d) for d in self.memory['dones']], dtype=torch.float32, device=compute_device)

        with torch.no_grad():
            with autocast(enabled=MIXED_PRECISION):
                values, _, _ = self.model(obs_tensor)
                next_values, _, _ = self.model(next_obs_tensor)
            values = values.squeeze(-1).float()
            next_values = next_values.squeeze(-1).float()

            # Generalised Advantage Estimation, accumulated backwards in time;
            # the mask stops both the bootstrap and the trace at episode ends.
            deltas = rews_tensor + self.discount * next_values * masks_tensor - values
            advantages = torch.zeros_like(deltas)
            last_gae = 0.0
            for t in reversed(range(n_samples)):
                last_gae = deltas[t] + self.discount * self.lambda_param * masks_tensor[t] * last_gae
                advantages[t] = last_gae

            # Value targets must come from the raw advantages; normalising
            # first would rescale the returns the critic is regressed onto.
            targets = advantages + values
            if n_samples > 1:
                # std() of a single element is NaN, so only normalise when
                # there is more than one sample.
                advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        avg_advantage = advantages.abs().mean().item()
        policy_loss = 0.0
        value_loss = 0.0
        entropy_loss = 0.0
        n_updates = 0

        for _ in range(self.train_epochs):
            batch_indices = BatchSampler(SubsetRandomSampler(range(n_samples)), self.minibatch_size, drop_last=False)
            for idx in batch_indices:
                obs_batch = obs_tensor[idx]
                acts_batch = acts_tensor[idx]
                old_logps_batch = old_logps_tensor[idx]
                advantages_batch = advantages[idx]
                targets_batch = targets[idx]

                with autocast(enabled=MIXED_PRECISION):
                    value_pred, alpha, beta = self.model(obs_batch)
                    value_pred = value_pred.squeeze(-1)
                    dist = Beta(alpha, beta)
                    new_logps = dist.log_prob(acts_batch).sum(axis=-1)
                    entropy = dist.entropy().sum(axis=-1).mean()
                    # Clipped surrogate objective.
                    ratios = torch.exp(new_logps - old_logps_batch)
                    surr1 = ratios * advantages_batch
                    surr2 = torch.clamp(ratios, 1.0 - self.clip_param, 1.0 + self.clip_param) * advantages_batch
                    p_loss = -torch.min(surr1, surr2).mean()
                    v_loss = F.mse_loss(value_pred, targets_batch)
                    total_loss = p_loss + 0.5 * v_loss - 0.01 * entropy

                self.optim.zero_grad()
                self.grad_scaler.scale(total_loss).backward()
                # Gradients are still multiplied by the loss scale here;
                # unscale them so the clip threshold applies to true norms
                # (a no-op when the scaler is disabled).
                self.grad_scaler.unscale_(self.optim)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
                self.grad_scaler.step(self.optim)
                self.grad_scaler.update()

                policy_loss += p_loss.item()
                value_loss += v_loss.item()
                entropy_loss += entropy.item()
                n_updates += 1

        if self.lr_scheduler is not None:
            # Drive the schedule by environment steps rather than by calls:
            # step() increments last_epoch, so the LR factor is evaluated at
            # exactly current_steps.
            self.current_steps += n_samples
            self.lr_scheduler.last_epoch = self.current_steps - 1
            self.lr_scheduler.step()

        self.memory = self._empty_memory()
        # With drop_last=False the final minibatch may be partial, so average
        # over the counted number of gradient steps.
        updates = max(n_updates, 1)
        avg_p_loss = policy_loss / updates
        avg_v_loss = value_loss / updates
        avg_ent = entropy_loss / updates

        return avg_p_loss, avg_v_loss, avg_ent, avg_advantage

def test_agent(agent, env, low, high, test_episodes=5, render=False):
    """Evaluate the agent with the mean of its Beta policy (no sampling).

    Args:
        agent: Object exposing ``model``; ``model(state)`` must return
            ``(value, alpha, beta)``.
        env: Environment with the Gymnasium ``reset``/``step`` API.
        low: Lower action bounds.
        high: Upper action bounds.
        test_episodes: Number of evaluation episodes.
        render: If True, call ``env.render()`` after every step; rendering is
            switched off for the rest of the evaluation if it raises.

    Returns:
        Average episode reward, or ``0.0`` when ``test_episodes`` is not
        positive.
    """
    total_reward = 0.0
    print(f"\nEvaluating ({test_episodes} episodes)")

    # The step limit is constant across episodes. Fall back to 1600 when the
    # env has no spec or the spec leaves max_episode_steps unset (None).
    spec = getattr(env, 'spec', None)
    max_steps = getattr(spec, 'max_episode_steps', None) or 1600

    for i in range(test_episodes):
        state, _ = env.reset()
        episode_reward = 0.0
        done = False
        truncated = False
        step_count = 0

        while not done and not truncated:
            state_tensor = torch.tensor(state, dtype=torch.float32, device=compute_device).unsqueeze(0)
            with torch.no_grad():
                with autocast(enabled=MIXED_PRECISION):
                    _, alpha, beta = agent.model(state_tensor)
            # Mean of Beta(alpha, beta), rescaled from [0, 1] to [low, high].
            action_norm = (alpha / (alpha + beta)).squeeze(0)
            action = low + (high - low) * action_norm.cpu().numpy()
            action = np.clip(action, low, high)

            next_state, reward, done, truncated, _ = env.step(action)
            episode_reward += reward
            state = next_state
            step_count += 1

            if render:
                try:
                    env.render()
                    time.sleep(0.01)
                except Exception as exc:
                    # Rendering is best-effort: report the failure once and
                    # continue without it. KeyboardInterrupt still propagates.
                    print(f"Rendering disabled: {exc}")
                    render = False

            # Safety net in case the environment never signals truncation.
            if step_count > max_steps + 100:
                truncated = True

        print(f"Episode {i+1}/{test_episodes} | Reward: {episode_reward:.2f} | Steps: {step_count}")
        total_reward += episode_reward

    average = total_reward / test_episodes if test_episodes > 0 else 0.0
    print(f"Average Reward: {average:.2f}")
    return average

if __name__ == "__main__":
    # Training driver: collect rollouts of up to ROLLOUT_SIZE steps, run a PPO
    # update after each one, stop on the reward goal or the episode cap, then
    # save and evaluate the final model.
    training_env = gym.make(ENV_NAME, render_mode=None)
    testing_env = None
    try:
        testing_env = gym.make(ENV_NAME, render_mode="human" if SHOW else None)

        obs_size = training_env.observation_space.shape[0]
        act_size = training_env.action_space.shape[0]
        act_low = training_env.action_space.low
        act_high = training_env.action_space.high

        # Estimated training length that sets the LR-decay horizon. If the run
        # takes more steps than this, the learning rate stays clamped at 0.
        total_timesteps = MAX_EPISODES * 1000

        print("-" * 30)
        print(f"Environment: {ENV_NAME}")
        print(f"State Dim: {obs_size}, Action Dim: {act_size}")
        print(f"Action Range: {act_low} to {act_high}")
        print(f"Update Interval: {ROLLOUT_SIZE}")
        print(f"PPO Epochs: {TRAIN_EPOCHS}, Batch Size: {MINIBATCH}")
        print(f"Device: {compute_device}")
        print(f"AMP: {MIXED_PRECISION}")
        print(f"LR Decay: {DECAY_LR} (Initial: {LEARNING_RATE}, Steps: {total_timesteps})")
        print(f"Model Dir: {MODEL_PATH}")
        print(f"Stopping Conditions:")
        print(f"  Reward >= {GOAL} over {AVG_WINDOW} episodes")
        print(f"  Max Episodes: {MAX_EPISODES}")
        print("-" * 30)

        ppo_agent = Agent(obs_size, act_size, NEURONS, LEARNING_RATE, DISCOUNT, LAMBDA, CLIP_VALUE, TRAIN_EPOCHS, MINIBATCH, total_timesteps)

        step_count = 0
        episode_count = 0
        state, _ = training_env.reset(seed=RANDOM_SEED)
        episode_reward = 0.0

        start_time = time.time()
        reward_history = deque(maxlen=AVG_WINDOW)
        success = False

        print("Training started...")

        while episode_count < MAX_EPISODES and not success:
            # Collect one rollout; episodes may span rollout boundaries.
            for _ in range(ROLLOUT_SIZE):
                step_count += 1

                env_action, norm_action, log_prob = ppo_agent.get_action(state, act_low, act_high)
                next_state, reward, done, truncated, _ = training_env.step(env_action)
                ppo_agent.remember(state, norm_action, reward, next_state, log_prob, done or truncated)

                state = next_state
                episode_reward += reward

                if step_count % SAVE_INTERVAL == 0:
                    model_path = os.path.join(MODEL_PATH, f'ppo_{ENV_NAME}_checkpoint_{step_count}.pth')
                    try:
                        torch.save(ppo_agent.model.state_dict(), model_path)
                        print(f"Checkpoint saved: {model_path} at step {step_count}")
                    except Exception as e:
                        print(f"Failed to save checkpoint: {e}")

                if done or truncated:
                    episode_count += 1
                    reward_history.append(episode_reward)

                    current_lr = ppo_agent.optim.param_groups[0]['lr']
                    log_msg = f"Episode: {episode_count}/{MAX_EPISODES}, Steps: {step_count}, LR: {current_lr:.2e}, Reward: {episode_reward:.2f}"

                    if len(reward_history) == AVG_WINDOW:
                        avg_reward = np.mean(reward_history)
                        log_msg += f", Avg({AVG_WINDOW}): {avg_reward:.2f}"
                        if avg_reward >= GOAL:
                            success = True
                            print(log_msg)
                            print(f"Solved! Average reward {avg_reward:.2f} >= {GOAL}")
                            break
                    else:
                        log_msg += f", Avg({len(reward_history)}/{AVG_WINDOW}): ---"

                    print(log_msg)

                    # Stop before resetting when the episode budget is spent.
                    if episode_count >= MAX_EPISODES:
                        break

                    state, _ = training_env.reset()
                    episode_reward = 0.0

            # Update on whatever was collected, including a partial rollout.
            if len(ppo_agent.memory['obs']) > 0:
                ppo_agent.learn()

        training_time = time.time() - start_time
        time_str = str(datetime.timedelta(seconds=int(training_time)))
        print(f"\nTraining completed")
        if success:
            print(f"Reason: Reward threshold reached")
        elif episode_count >= MAX_EPISODES:
            print(f"Reason: Max episodes reached")
        else:
            print(f"Reason: Unknown")

        print(f"Total Episodes: {episode_count}")
        print(f"Total Steps: {step_count}")
        print(f"Time: {time_str}")

        final_model_path = os.path.join(MODEL_PATH, f'ppo_{ENV_NAME}_final_ep{episode_count}_step{step_count}.pth')
        try:
            torch.save(ppo_agent.model.state_dict(), final_model_path)
            print(f"Final model saved: {final_model_path}")
        except Exception as e:
            print(f"Failed to save final model: {e}")

        print("\nFinal evaluation")
        # Seed the evaluation env once so the evaluation episodes are
        # reproducible; later unseeded resets continue from this RNG state.
        testing_env.reset(seed=RANDOM_SEED)
        final_score = test_agent(ppo_agent, testing_env, act_low, act_high, 10, SHOW)
        print(f"Final evaluation score: {final_score:.2f}")
    finally:
        # Release the environments even if training fails or is interrupted.
        training_env.close()
        if testing_env is not None:
            testing_env.close()
    print("Done")

Using device: cpu
Using Linear Learning Rate Decay
------------------------------
Environment: BipedalWalker-v3
State Dim: 24, Action Dim: 4
Action Low: [-1. -1. -1. -1.], Action High: [1. 1. 1. 1.]
Update Interval: 2048
PPO Epochs: 10, Batch Size: 64
Using Device: cpu
Mixed Precision: False
LR Decay: True (Initial: 0.0003, Est. Total Steps: 600000)
Saving models to: ppo_models
Stopping Conditions:
  1. Avg Reward >= 300 over last 20 episodes.
  2. Max Episodes Reached: 600
------------------------------
Starting training...


  self.scaler = GradScaler(enabled=use_mixed_precision)
  with autocast(enabled=use_mixed_precision):


Ep: 1/600, Timesteps: 1600, LR: 3.00e-04, Ep Reward: -66.83, Avg(1/20): ---
Ep: 2/600, Timesteps: 1657, LR: 3.00e-04, Ep Reward: -99.37, Avg(2/20): ---
Ep: 3/600, Timesteps: 1717, LR: 3.00e-04, Ep Reward: -98.16, Avg(3/20): ---
Ep: 4/600, Timesteps: 1783, LR: 3.00e-04, Ep Reward: -98.20, Avg(4/20): ---
Ep: 5/600, Timesteps: 1851, LR: 3.00e-04, Ep Reward: -113.23, Avg(5/20): ---
Ep: 6/600, Timesteps: 1973, LR: 3.00e-04, Ep Reward: -119.21, Avg(6/20): ---


  with autocast(enabled=use_mixed_precision):
  with autocast(enabled=use_mixed_precision):


Ep: 7/600, Timesteps: 2058, LR: 2.99e-04, Ep Reward: -104.66, Avg(7/20): ---
Ep: 8/600, Timesteps: 2180, LR: 2.99e-04, Ep Reward: -117.45, Avg(8/20): ---
Ep: 9/600, Timesteps: 2243, LR: 2.99e-04, Ep Reward: -111.46, Avg(9/20): ---
Ep: 10/600, Timesteps: 3843, LR: 2.99e-04, Ep Reward: -57.68, Avg(10/20): ---
Ep: 11/600, Timesteps: 3905, LR: 2.99e-04, Ep Reward: -110.11, Avg(11/20): ---
Ep: 12/600, Timesteps: 3954, LR: 2.99e-04, Ep Reward: -108.59, Avg(12/20): ---
Ep: 13/600, Timesteps: 4093, LR: 2.99e-04, Ep Reward: -125.16, Avg(13/20): ---
Ep: 14/600, Timesteps: 4158, LR: 2.98e-04, Ep Reward: -110.51, Avg(14/20): ---
Ep: 15/600, Timesteps: 5758, LR: 2.98e-04, Ep Reward: -62.31, Avg(15/20): ---
Ep: 16/600, Timesteps: 5971, LR: 2.98e-04, Ep Reward: -130.23, Avg(16/20): ---
Ep: 17/600, Timesteps: 6021, LR: 2.98e-04, Ep Reward: -109.28, Avg(17/20): ---
Ep: 18/600, Timesteps: 6110, LR: 2.98e-04, Ep Reward: -119.67, Avg(18/20): ---
Ep: 19/600, Timesteps: 6182, LR: 2.97e-04, Ep Reward: -114.3

  with autocast(enabled=use_mixed_precision):


  Eval Episode 1/10 | Reward: 19.67 | Steps: 1600
  Eval Episode 2/10 | Reward: 10.88 | Steps: 1600
  Eval Episode 3/10 | Reward: 9.87 | Steps: 1600
  Eval Episode 4/10 | Reward: 6.22 | Steps: 1600
  Eval Episode 5/10 | Reward: 7.83 | Steps: 1600
  Eval Episode 6/10 | Reward: 3.93 | Steps: 1600
  Eval Episode 7/10 | Reward: 11.41 | Steps: 1600
  Eval Episode 8/10 | Reward: 14.06 | Steps: 1600
  Eval Episode 9/10 | Reward: 13.68 | Steps: 1600
  Eval Episode 10/10 | Reward: 13.54 | Steps: 1600
--- Evaluation Finished | Average Reward: 11.11 ---
Final Average Evaluation Reward (over 10 episodes): 11.11

Script finished.
