<a href="https://colab.research.google.com/github/NithinReddychallagonda/RLML-AIML/blob/main/RLML_LAB_09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#2303A51550

Ch.Nithin Reddy

Batch:-08

Q:-Proximal Policy Optimization -PPO using a deep learning framework -TensorFlow or PyTorch.

In [3]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque

env_name = "CartPole-v1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(obs_dim, 64), nn.Tanh(),
            nn.Linear(64, 64), nn.Tanh()
        )
        self.policy = nn.Sequential(nn.Linear(64, act_dim), nn.Softmax(dim=-1))
        self.value = nn.Linear(64, 1)

    def forward(self, x):
        h = self.shared(x)
        return self.policy(h), self.value(h)


def collect_trajectories(env, net, steps, gamma, lam):
    obs, _ = env.reset()
    obs_buf, act_buf, rew_buf, val_buf, logp_buf = [], [], [], [], []
    ep_rews, ep_len = [], []

    for _ in range(steps):
        obs_t = torch.as_tensor(obs, dtype=torch.float32).to(device)
        pi, v = net(obs_t)
        dist = torch.distributions.Categorical(pi)
        a = dist.sample().cpu().numpy()
        logp = dist.log_prob(torch.as_tensor(a)).cpu().item()

        obs_buf.append(obs.copy())
        act_buf.append(a)
        val_buf.append(v.cpu().item())
        logp_buf.append(logp)

        next_obs, r, terminated, truncated, _ = env.step(int(a))
        done = terminated or truncated
        rew_buf.append(r)
        ep_rews.append(r)

        obs = next_obs
        if done:
            obs, _ = env.reset()
            ep_len.append(len(ep_rews))
            ep_rews = []

    obs_buf, act_buf = np.array(obs_buf), np.array(act_buf)
    rew_buf, val_buf, logp_buf = np.array(rew_buf), np.array(val_buf), np.array(logp_buf)

    last_val = net(torch.as_tensor(obs, dtype=torch.float32).to(device))[1].cpu().item()
    adv_buf = np.zeros_like(rew_buf)
    lastgaelam = 0

    for t in reversed(range(len(rew_buf))):
        if t == len(rew_buf) - 1:
            nextnonterminal = 1.0
            nextvalues = last_val
        else:
            nextnonterminal = 1.0
            nextvalues = val_buf[t + 1]
        delta = rew_buf[t] + gamma * nextvalues * nextnonterminal - val_buf[t]
        lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
        adv_buf[t] = lastgaelam

    ret_buf = adv_buf + val_buf
    return obs_buf, act_buf, logp_buf, adv_buf, ret_buf


def ppo_train(env_name="CartPole-v1", total_steps=20000, batch_steps=1024, epochs=10, minibatch_size=64,
              gamma=0.99, lam=0.95, clip=0.2, pi_lr=3e-4):
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    net = ActorCritic(obs_dim, act_dim).to(device)
    optimizer = optim.Adam(net.parameters(), lr=pi_lr)
    steps = 0

    while steps < total_steps:
        obs_buf, act_buf, logp_buf, adv_buf, ret_buf = collect_trajectories(env, net, batch_steps, gamma, lam)
        steps += batch_steps
        adv_buf = (adv_buf - adv_buf.mean()) / (adv_buf.std() + 1e-8)
        inds = np.arange(batch_steps)

        for _ in range(epochs):
            np.random.shuffle(inds)
            for start in range(0, batch_steps, minibatch_size):
                mb = inds[start:start + minibatch_size]
                obs_mb = torch.as_tensor(obs_buf[mb], dtype=torch.float32).to(device)
                act_mb = torch.as_tensor(act_buf[mb], dtype=torch.int64).to(device)
                old_logp_mb = torch.as_tensor(logp_buf[mb], dtype=torch.float32).to(device)
                adv_mb = torch.as_tensor(adv_buf[mb], dtype=torch.float32).to(device)
                ret_mb = torch.as_tensor(ret_buf[mb], dtype=torch.float32).to(device)

                pi, v = net(obs_mb)
                dist = torch.distributions.Categorical(pi)
                logp = dist.log_prob(act_mb)
                ratio = torch.exp(logp - old_logp_mb)
                surr1 = ratio * adv_mb
                surr2 = torch.clamp(ratio, 1 - clip, 1 + clip) * adv_mb
                policy_loss = -torch.min(surr1, surr2).mean()
                value_loss = ((v.squeeze(-1) - ret_mb) ** 2).mean()
                entropy = dist.entropy().mean()
                loss = policy_loss + 0.5 * value_loss - 0.01 * entropy

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        print(f"Steps: {steps}\tLoss: {loss.item():.3f}\tPolicy: {policy_loss.item():.3f}\tValue: {value_loss.item():.3f}")
    env.close()


if __name__ == "__main__":
    ppo_train(env_name)


Steps: 1024	Loss: 58.331	Policy: 0.076	Value: 116.522
Steps: 2048	Loss: 50.379	Policy: 0.199	Value: 100.372
Steps: 3072	Loss: 57.655	Policy: -0.149	Value: 115.619
Steps: 4096	Loss: 53.559	Policy: 0.169	Value: 106.791
Steps: 5120	Loss: 50.189	Policy: 0.308	Value: 99.772
Steps: 6144	Loss: 46.991	Policy: 0.024	Value: 93.945
Steps: 7168	Loss: 43.378	Policy: 0.006	Value: 86.756
Steps: 8192	Loss: 39.856	Policy: 0.024	Value: 79.674
Steps: 9216	Loss: 36.072	Policy: -0.006	Value: 72.168
Steps: 10240	Loss: 32.451	Policy: 0.205	Value: 64.504
Steps: 11264	Loss: 29.832	Policy: -0.062	Value: 59.798
Steps: 12288	Loss: 25.994	Policy: 0.273	Value: 51.452
Steps: 13312	Loss: 24.449	Policy: -0.023	Value: 48.955
Steps: 14336	Loss: 21.921	Policy: -0.145	Value: 44.144
Steps: 15360	Loss: 19.332	Policy: -0.084	Value: 38.843
Steps: 16384	Loss: 17.209	Policy: 0.017	Value: 34.394
Steps: 17408	Loss: 15.037	Policy: -0.032	Value: 30.147
Steps: 18432	Loss: 13.076	Policy: 0.054	Value: 26.055
Steps: 19456	Loss: 11.486	