In [1]:
import gymnasium as gym
from gymnasium.spaces import Discrete,Box,Tuple
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch import nn
from torch import optim
from torch.distributions.categorical import Categorical
import os

sns.set()

In [2]:
DEVICE = 'cuda'
game_name = 'LunarLander-v2'

In [3]:
env = gym.make(game_name,render_mode = 'human')
print(isinstance(env.action_space,Box))
print(env.action_space.sample())
print(env.observation_space.sample())

False
0
[ 1.4343741e+00 -3.8181785e-03  3.8957295e+00 -3.2106454e+00
  2.6929338e+00 -1.7445391e-01  3.5152283e-01  6.0487342e-01]


In [4]:
class ActorCriticNetwork(nn.Module):
  def __init__(self, obs_space_size, action_space):
    super().__init__()

    self.trained_for = 0
    action_space_size = action_space.shape[0] if isinstance(action_space,Box) else action_space.n

    self.shared_layers = nn.Sequential(
        nn.Linear(obs_space_size, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU())

    self.policy_layers = nn.Sequential(
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, action_space_size))

    self.value_layers = nn.Sequential(
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 1))

  def value(self, obs):
    z = self.shared_layers(obs)
    value = self.value_layers(z)
    return value

  def policy(self, obs):
    z = self.shared_layers(obs)
    policy_logits = self.policy_layers(z)
    return policy_logits

  def forward(self, obs):
    z = self.shared_layers(obs)
    policy_logits = self.policy_layers(z)
    value = self.value_layers(z)
    return policy_logits, value

In [5]:
class PPOTrainer():
  def __init__(self,
              actor_critic,
              ppo_clip_val=0.2,
              target_kl_div=0.01,
              max_policy_train_iters=80,
              value_train_iters=80,
              policy_lr=3e-4,
              value_lr=1e-2):
    self.ac = actor_critic
    self.ppo_clip_val = ppo_clip_val
    self.target_kl_div = target_kl_div
    self.max_policy_train_iters = max_policy_train_iters
    self.value_train_iters = value_train_iters

    policy_params = list(self.ac.shared_layers.parameters()) + \
        list(self.ac.policy_layers.parameters())
    self.policy_optim = optim.Adam(policy_params, lr=policy_lr)

    value_params = list(self.ac.shared_layers.parameters()) + \
        list(self.ac.value_layers.parameters())
    self.value_optim = optim.Adam(value_params, lr=value_lr)

  def train_policy(self, obs, acts, old_log_probs, gaes):
    for _ in range(self.max_policy_train_iters):
      self.policy_optim.zero_grad()

      new_logits = self.ac.policy(obs)
      new_logits = Categorical(logits=new_logits)
      new_log_probs = new_logits.log_prob(acts)

      policy_ratio = torch.exp(new_log_probs - old_log_probs)
      clipped_ratio = policy_ratio.clamp(
          1 - self.ppo_clip_val, 1 + self.ppo_clip_val)

      clipped_loss = clipped_ratio * gaes
      full_loss = policy_ratio * gaes
      policy_loss = -torch.min(full_loss, clipped_loss).mean()

      policy_loss.backward()
      self.policy_optim.step()

      kl_div = (old_log_probs - new_log_probs).mean()
      if kl_div >= self.target_kl_div:
        break

  def train_value(self, obs, returns):
    for _ in range(self.value_train_iters):
      self.value_optim.zero_grad()

      values = self.ac.value(obs)
      value_loss = (returns - values) ** 2
      value_loss = value_loss.mean()

      value_loss.backward()
      self.value_optim.step()

In [6]:
def discount_rewards(rewards, gamma=0.99):
    """
    Return discounted rewards based on the given rewards and gamma param.
    """
    new_rewards = [float(rewards[-1])]
    for i in reversed(range(len(rewards)-1)):
        new_rewards.append(float(rewards[i]) + gamma * new_rewards[-1])
    return np.array(new_rewards[::-1])

def calculate_gaes(rewards, values, gamma=0.99, decay=0.97):
    """
    Return the General Advantage Estimates from the given rewards and values.
    Paper: https://arxiv.org/pdf/1506.02438.pdf
    """
    next_values = np.concatenate([values[1:], [0]])
    deltas = [rew + gamma * next_val - val for rew, val, next_val in zip(rewards, values, next_values)]

    gaes = [deltas[-1]]
    for i in reversed(range(len(deltas)-1)):
        gaes.append(deltas[i] + decay * gamma * gaes[-1])

    return np.array(gaes[::-1])

In [7]:
def rollout(model, env, max_steps=1000):
    """
    Performs a single rollout.
    Returns training data in the shape (n_steps, observation_shape)
    and the cumulative reward.
    """
    ### Create data storage
    train_data = [[], [], [], [], []] # obs, act, reward, values, act_log_probs
    obs = env.reset()[0]

    ep_reward = 0
    for _ in range(max_steps):
        logits, val = model(torch.tensor([obs], dtype=torch.float32,
                                         device=DEVICE))
        act_distribution = Categorical(logits=logits)
        act = act_distribution.sample()
        act_log_prob = act_distribution.log_prob(act).item()

        act, val = act.item(), val.item()

        next_obs, reward, done, _, _ = env.step(act if isinstance(env.action_space,Discrete) else [act])

        for i, item in enumerate((obs, act, reward, val, act_log_prob)):
          train_data[i].append(item)

        obs = next_obs
        ep_reward += reward
        if done:
            break

    train_data = [np.asarray(x) for x in train_data]

    ### Do train data filtering
    train_data[3] = calculate_gaes(train_data[2], train_data[3])

    return train_data, ep_reward

In [8]:
env = gym.make(game_name,render_mode = 'human')
model = ActorCriticNetwork(env.observation_space.shape[0], env.action_space)
model = model.to(DEVICE)

train_data, reward = rollout(model, env,max_steps=10) # Test rollout function
env.render()

  logits, val = model(torch.tensor([obs], dtype=torch.float32,


In [9]:
# Define training params
n_episodes = 500
print_freq = 1
save_freq = 1
max_steps = 500

ppo = PPOTrainer(
    model,
    policy_lr = 3e-4,
    value_lr = 1e-3,
    target_kl_div = 0.02,
    max_policy_train_iters = 40,
    value_train_iters = 40)

In [10]:
if(os.path.isfile(f'{game_name}.pt')):
    model.load_state_dict(torch.load(f'{game_name}.pt'))
    print('loaded')
else:
    print('not found')

loaded


In [11]:
ep_rewards = []
for episode_idx in range(n_episodes):
  # Perform rollout
  train_data, reward = rollout(model, env, max_steps=max_steps)
  ep_rewards.append(reward)

  # Shuffle
  permute_idxs = np.random.permutation(len(train_data[0]))

  # Policy data
  obs = torch.tensor(train_data[0][permute_idxs],
                     dtype=torch.float32, device=DEVICE)
  acts = torch.tensor(train_data[1][permute_idxs],
                      dtype=torch.int32, device=DEVICE)
  gaes = torch.tensor(train_data[3][permute_idxs],
                      dtype=torch.float32, device=DEVICE)
  act_log_probs = torch.tensor(train_data[4][permute_idxs],
                               dtype=torch.float32, device=DEVICE)

  # Value data
  returns = discount_rewards(train_data[2])[permute_idxs]
  returns = torch.tensor(returns, dtype=torch.float32, device=DEVICE)

  # Train model
  ppo.train_policy(obs, acts, act_log_probs, gaes)
  ppo.train_value(obs, returns)

  model.trained_for += 1

  if (episode_idx + 1) % save_freq == 0:
    torch.save(model.state_dict(), f'{game_name}.pt')

  if (episode_idx + 1) % print_freq == 0:
    print('Episode {} | Avg Reward {:.1f}'.format(
        episode_idx + 1, np.mean(ep_rewards[-print_freq:])))

Episode 1 | Avg Reward 1.0
Episode 2 | Avg Reward -159.9
Episode 3 | Avg Reward -47.6
Episode 4 | Avg Reward 20.0
Episode 5 | Avg Reward -109.7
Episode 6 | Avg Reward -122.1
Episode 7 | Avg Reward -24.9
Episode 8 | Avg Reward -49.9
Episode 9 | Avg Reward -11.8
Episode 10 | Avg Reward -22.2
Episode 11 | Avg Reward -93.9
Episode 12 | Avg Reward -11.5
Episode 13 | Avg Reward -139.1
Episode 14 | Avg Reward -62.4
Episode 15 | Avg Reward -141.2
Episode 16 | Avg Reward 75.9
Episode 17 | Avg Reward -137.1
Episode 18 | Avg Reward -360.5
Episode 19 | Avg Reward -363.0
Episode 20 | Avg Reward -67.2
Episode 21 | Avg Reward -253.7
Episode 22 | Avg Reward -171.1
Episode 23 | Avg Reward -138.1
Episode 24 | Avg Reward -111.7
Episode 25 | Avg Reward -324.7
Episode 26 | Avg Reward -183.7
Episode 27 | Avg Reward -76.1
Episode 28 | Avg Reward -88.6
Episode 29 | Avg Reward -69.4
Episode 30 | Avg Reward -88.6
Episode 31 | Avg Reward -225.4
Episode 32 | Avg Reward -85.0
Episode 33 | Avg Reward -333.5
Episode

KeyboardInterrupt: 

In [None]:
# Test the trained agent
model.load_state_dict(torch.load(f'{game_name}.pt'))
print(f'Trained for {model.trained_for}')
while(True):
  # Perform rollout
  train_data, reward = rollout(model, env)
  env.render()

  print('Episode {} | Avg Reward {:.1f}'.format(
        episode_idx + 1, reward))
