<a href="https://colab.research.google.com/github/Vyshnavijulapelly/Reinforcement-Learning/blob/main/RL_Lab_08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import math
import time
import random
from collections import deque, namedtuple
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
try:
    import gym
except Exception as e:
    raise ImportError("This script needs gym. Install with: pip install gym")

class Actor(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_sizes=(64,64), log_std_init=-0.5):
        super().__init__()
        layers = []
        last = obs_dim
        for h in hidden_sizes:
            layers.append(nn.Linear(last, h))
            layers.append(nn.ReLU())
            last = h
        self.net = nn.Sequential(*layers)
        self.mean = nn.Linear(last, act_dim)
        self.log_std = nn.Parameter(torch.ones(act_dim) * log_std_init)

    def forward(self, obs):
        x = self.net(obs)
        mean = self.mean(x)
        log_std = self.log_std.expand_as(mean)
        std = torch.exp(log_std)
        return mean, std, log_std

class Critic(nn.Module):
    def __init__(self, obs_dim, hidden_sizes=(64,64)):
        super().__init__()
        layers = []
        last = obs_dim
        for h in hidden_sizes:
            layers.append(nn.Linear(last, h))
            layers.append(nn.ReLU())
            last = h
        layers.append(nn.Linear(last, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, obs):
        return self.net(obs).squeeze(-1)

Transition = namedtuple('Transition', ['obs', 'act', 'rew', 'next_obs', 'done', 'logp'])

def gaussian_log_prob(mean, log_std, act):
    var = torch.exp(2 * log_std)
    logp = -0.5 * (((act - mean) ** 2) / var + 2 * log_std + math.log(2 * math.pi))
    return logp.sum(axis=-1)

def sample_action(mean, std):
    eps = torch.randn_like(mean)
    return mean + eps * std

class A2CAgent:
    def __init__(self, obs_dim, act_dim, device='cpu', actor_lr=3e-4, critic_lr=1e-3, gamma=0.99, value_coef=0.5, entropy_coef=1e-3, max_grad_norm=0.5):
        self.device = device
        self.actor = Actor(obs_dim, act_dim).to(device)
        self.critic = Critic(obs_dim).to(device)
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.gamma = gamma
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.max_grad_norm = max_grad_norm

    def get_action(self, obs):
        obs_t = torch.as_tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
        with torch.no_grad():
            mean, std, log_std = self.actor(obs_t)
            act = sample_action(mean, std)
            logp = gaussian_log_prob(mean, log_std, act)
            value = self.critic(obs_t)
        return act.cpu().numpy()[0], logp.cpu().numpy()[0], value.cpu().numpy()[0]

    def update(self, transitions, last_value=0.0):
        obs = torch.as_tensor(np.vstack([t.obs for t in transitions]), dtype=torch.float32, device=self.device)
        acts = torch.as_tensor(np.vstack([t.act for t in transitions]), dtype=torch.float32, device=self.device)
        rewards = [t.rew for t in transitions]
        dones = [t.done for t in transitions]
        old_logps = torch.as_tensor(np.array([t.logp for t in transitions]), dtype=torch.float32, device=self.device)
        returns = []
        R = last_value
        for r, done in zip(reversed(rewards), reversed(dones)):
            R = r + self.gamma * R * (1.0 - float(done))
            returns.insert(0, R)
        returns = torch.as_tensor(returns, dtype=torch.float32, device=self.device)
        values = self.critic(obs)
        mean, std, log_std = self.actor(obs)
        new_logps = gaussian_log_prob(mean, log_std, acts)
        entropy = (0.5 * (1.0 + math.log(2 * math.pi)) + log_std).sum(-1).mean()
        advantages = returns - values.detach()
        policy_loss = -(new_logps * advantages).mean()
        value_loss = 0.5 * (returns - values).pow(2).mean()
        actor_loss = policy_loss - self.entropy_coef * entropy
        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
        self.optimizer_actor.step()
        self.optimizer_critic.zero_grad()
        value_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.optimizer_critic.step()
        return dict(policy_loss=policy_loss.item(), value_loss=value_loss.item(), entropy=entropy.item())

def train(env_name='Pendulum-v1', seed=1, episodes=500, max_steps=200, n_steps=5, render=False):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    agent = A2CAgent(obs_dim, act_dim, device=device)
    total_rewards = []
    for ep in range(1, episodes + 1):
        obs = env.reset()
        if isinstance(obs, tuple):
            obs = obs[0]
        ep_reward = 0.0
        done = False
        step = 0
        buffer = []
        while not done and step < max_steps:
            act, logp, value = agent.get_action(obs)
            next_obs, rew, terminated, truncated, info = None, None, None, None, None
            try:
                next_obs, rew, terminated, truncated, info = env.step(act)
                done_flag = terminated or truncated
            except Exception:
                next_obs, rew, done_flag, info = env.step(act)
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            buffer.append(Transition(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done_flag, logp=logp))
            ep_reward += rew
            obs = next_obs
            step += 1
            if len(buffer) >= n_steps or done_flag or step >= max_steps:
                if done_flag:
                    last_value = 0.0
                else:
                    obs_t = torch.as_tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
                    with torch.no_grad():
                        last_value = agent.critic(obs_t).cpu().numpy()[0]
                stats = agent.update(buffer, last_value=last_value)
                buffer = []
            if done_flag:
                break
        total_rewards.append(ep_reward)
        if ep % 10 == 0:
            avg = np.mean(total_rewards[-50:])
            print(f"Ep {ep}\tAvgReward(50)={avg:.2f}\tLastReward={ep_reward:.2f}\tPolicyLoss={stats['policy_loss']:.4f}\tValueLoss={stats['value_loss']:.4f}\tEntropy={stats['entropy']:.4f}")
    env.close()
    return agent, total_rewards

if __name__ == '__main__':
    start = time.time()
    agent, rewards = train(episodes=60, max_steps=200, n_steps=5)
    print('Done in', time.time() - start)

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  deprecation(
  deprecation(
  return datetime.utcnow().replace(tzinfo=utc)


Ep 10	AvgReward(50)=-704.79	LastReward=-588.47	PolicyLoss=33.3017	ValueLoss=748.8160	Entropy=0.9336


  return datetime.utcnow().replace(tzinfo=utc)


Ep 20	AvgReward(50)=-656.62	LastReward=-534.66	PolicyLoss=26.7379	ValueLoss=1161.5837	Entropy=0.9358
Ep 30	AvgReward(50)=-679.25	LastReward=-484.70	PolicyLoss=137.9226	ValueLoss=11563.3984	Entropy=0.9461
Ep 40	AvgReward(50)=-697.97	LastReward=-763.56	PolicyLoss=230.4587	ValueLoss=31512.4062	Entropy=0.9535
Ep 50	AvgReward(50)=-703.34	LastReward=-788.49	PolicyLoss=320.4993	ValueLoss=85216.2031	Entropy=0.9594
Ep 60	AvgReward(50)=-715.46	LastReward=-784.14	PolicyLoss=559.5554	ValueLoss=218181.1562	Entropy=0.9626
Done in 18.598207473754883
