In [4]:
import gymnasium as gym
import numpy as np

## Gymnasium API Notes

In [20]:
env = gym.make("BipedalWalker-v3", render_mode=None)

env.action_space.shape[0]

4

In [21]:
env2 = gym.make("LunarLander-v3", render_mode=None)

env2.action_space.n

np.int64(4)

#### Action space dimension for continuous parallel environments

In [22]:
envs = gym.make_vec("Pendulum-v1", num_envs=2, vectorization_mode="sync")

envs.single_action_space.shape[0]
envs.action_space.shape[-1]

1

#### Action space dimension for discrete parallel environments

In [23]:
envs = gym.make_vec("LunarLander-v3", num_envs=2, vectorization_mode="sync")

envs.single_action_space.n
envs.action_space.shape # gives number of parallel environments
envs.action_space[0].n

np.int64(4)

## Tensor Manipulation

for A2C, we have several tensors with the following shapes:
- states: (N + 1,)
- actions: (N, )
- rewards: (N, )
- log_probs: (N, )
- dones: (N, )
where N is the number of rewards received.

Initially, these were saved to their own separate lists. But, this forces the API to be sequential and inefficient. To introduce parallelism, we need to generalize this by using Tensors and NumPy Arrays

In [8]:
# Lunar Lander uses 4 dimensional discrete action space
envs = gym.make_vec("LunarLander-v3", num_envs=2, vectorization_mode="sync")
envs.reset(seed=42)

actions = envs.action_space.sample()

obs, rewards, terminates, truncates, infos = envs.step(actions)

print(actions)

print("Actions:")
for a in actions:
    print(a)

print("Observations:")
for ob in obs:
    print(ob)

[3 1]
Actions:
3
1
Observations:
[ 0.00465546  1.4247642   0.24004106  0.29480776 -0.00680472 -0.08300382
  0.          0.        ]
[-0.00712929  1.3984202  -0.3651231  -0.29073122  0.0096392   0.11075787
  0.          0.        ]


To parallelize our environment, we can actually create the following numpy arrays instead of lists:
- states: (T + 1, state_space_size, E)
- rewards: (T, E)
- log_probs: (T, E)
where T is the number of timesteps and E is the number of environments. This actually means we can keep our original code, and parallelize our operations over each environment. This seems pretty efficient and is the best we can do for these algorithms, which have data dependencies.

# A2C Mini Batching

In [60]:
# add project root to sys.path
import sys
import os
from pathlib import Path
import os

here = Path(os.getcwd()).resolve()
sys.path.append(str(here.parent))
from networks import *
from agents import *

import json

config_file_name = "a2c_config.json"

# --------------------------------------------------------------- #
#                     Loading JSON Config File                    #
# --------------------------------------------------------------- #
script_dir = os.path.dirname(os.path.abspath(here))
config_path = os.path.join(script_dir, "config", config_file_name)

with open(config_path, "r") as f:
    cfg = json.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("=" * 50)
print("*")
print(f"* Using config file: {config_path}")
print(f"* Using device: {device}")
print("*")
print("=" * 50)
# add device to cfg
cfg["device"] = device
np.random.seed(cfg["seed"])
torch.manual_seed(cfg["seed"])

agent = A2CAgent(cfg)

env = gym.make("BipedalWalker-v3")
envs = gym.make_vec("BipedalWalker-v3", num_envs=2, vectorization_mode="sync")
env.reset()
envs.reset()

*
* Using config file: /home/xavier/projects/BipedalWalker/config/a2c_config.json
* Using device: cuda
*
Using activation function: relu


(array([[ 2.7455471e-03,  1.2283065e-05, -1.6016494e-03, -1.6000077e-02,
          9.2544638e-02,  3.7181317e-03,  8.5972905e-01, -1.5536519e-03,
          1.0000000e+00,  3.2846723e-02,  3.7179857e-03,  8.5350478e-01,
         -2.5298817e-03,  1.0000000e+00,  4.4081339e-01,  4.4581950e-01,
          4.6142212e-01,  4.8954949e-01,  5.3410202e-01,  6.0246021e-01,
          7.0914787e-01,  8.8593054e-01,  1.0000000e+00,  1.0000000e+00],
        [ 2.7474896e-03, -6.9800271e-06,  5.4296060e-04, -1.5999954e-02,
          9.2052542e-02, -7.1652001e-04,  8.6021698e-01,  1.9326812e-03,
          1.0000000e+00,  3.2458264e-02, -7.1647146e-04,  8.5376561e-01,
          5.0673110e-04,  1.0000000e+00,  4.4081402e-01,  4.4582012e-01,
          4.6142277e-01,  4.8955020e-01,  5.3410280e-01,  6.0246104e-01,
          7.0914888e-01,  8.8593185e-01,  1.0000000e+00,  1.0000000e+00]],
       dtype=float32),
 {})

In [61]:
# sample random actions from single environment's action space
action = env.action_space.sample()
print(action)
actions = np.array([action for _ in range(envs.num_envs)])
print(actions)
print("=" * 50)
# use step() to generate next states from random actions
obs, rewards, terminates, truncates, infos = env.step(action)
print(obs)
obss, rewards, terminates, truncates, infos = envs.step(actions)
print(obss)

[-0.45927367  0.6635488   0.04158337  0.896212  ]
[[-0.45927367  0.6635488   0.04158337  0.896212  ]
 [-0.45927367  0.6635488   0.04158337  0.896212  ]]
[ 0.0275713   0.02042601  0.02294076  0.02646169 -0.30862036 -0.9203621
  1.7221355   1.0037428   1.         -0.40061867 -0.91190886  1.7239717
  0.9924399   1.          0.45596296  0.4611411   0.47727996  0.506374
  0.5524577   0.62316513  0.7335194   0.9163776   1.          1.        ]
[[ 0.02757297  0.02045241  0.02138251  0.02655961 -0.3080452  -0.9191171
   1.7215612   1.0037447   1.         -0.39995864 -0.91069496  1.7232398
   0.99246436  1.          0.45599863  0.46117717  0.4773173   0.5064136
   0.5525009   0.6232138   0.7335768   0.9164493   1.          1.        ]
 [ 0.02757056  0.02041726  0.0234743   0.02643744 -0.3087883  -0.92078876
   1.7222743   1.00374     1.         -0.4008174  -0.9123346   1.7241647
   0.99244195  1.          0.4559534   0.46113142  0.47726995  0.50636333
   0.55244607  0.623152    0.733504    0.91

In [None]:
# compute output of actor network for single state
# If the environment is not vectorized, we need to add a batch dimension
# using unsqueeze(0)
state_t = torch.FloatTensor(obs).unsqueeze(0).to(agent.device)
state_t.shape
logits_t, action_probs = agent.actor(state_t)
print(action_probs)

dist = torch.distributions.Categorical(probs=action_probs)
action = dist.sample()
log_prob = dist.log_prob(action).sum(dim=-1)
entropy = dist.entropy().sum(dim=-1)

print("Action:", action)
print("Log Probability:", log_prob)
print("Entropy:", entropy)

print("=" * 50)
# for vectorized envs
states_t = torch.FloatTensor(obss).to(agent.device)
print(states_t.shape)
logits_t, actions_probs = agent.actor(states_t)
print(actions_probs)
# In both cases, we just need to use this code
dist = torch.distributions.Categorical(probs=actions_probs)
actions = dist.sample()
log_probs = dist.log_prob(actions).sum(dim=-1)
entropies = dist.entropy().sum(dim=-1)

print("Actions:", actions.cpu().numpy())
print("Log Probabilities:", log_probs)
print("Entropies:", entropies)

tensor([[0.5041, 0.4959]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Action: tensor([0], device='cuda:0')
Log Probability: tensor([-0.6850], device='cuda:0', grad_fn=<SqueezeBackward1>)
Entropy: tensor([0.6931], device='cuda:0', grad_fn=<NegBackward0>)
torch.Size([4, 4])
tensor([[0.5044, 0.4956],
        [0.5038, 0.4962],
        [0.5039, 0.4961],
        [0.5036, 0.4964]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Actions: [0 0 0 0]
Log Probabilities: tensor([-0.6844, -0.6855, -0.6854, -0.6859], device='cuda:0',
       grad_fn=<SqueezeBackward1>)
Entropies: tensor([0.6931, 0.6931, 0.6931, 0.6931], device='cuda:0',
       grad_fn=<NegBackward0>)


In [None]:
# continuous action spaces
# If the action space is multi-dimensional, we need to use the sum
# across the actions to generate a single log probability and entropy value
state_t = torch.FloatTensor(obs).unsqueeze(0).to(agent.device)
state_t.shape
logits_t, action_probs = agent.actor(state_t)
print(action_probs)
mean, log_std = agent.actor(state_t)
mean = mean.squeeze()
log_std = log_std.squeeze()
std = torch.exp(log_std)

dist = torch.distributions.Normal(mean, std)
action_t = dist.sample()
action_t = torch.clamp(action_t, -1, 1)
log_prob_t = dist.log_prob(action_t).sum(dim=-1)
entropy_t = dist.entropy().sum(dim=-1)

print("Action:", action_t)
print("Log Probability:", log_prob_t)
print("Entropy:", entropy_t)

print("=" * 50)
# for vectorized envs
states_t = torch.FloatTensor(obss).to(agent.device)
print(states_t.shape)
means, log_stds = agent.actor(states_t)
print(actions_probs)

dist = torch.distributions.Normal(means, torch.exp(log_stds))
action_t = dist.sample()
action_t = torch.clamp(action_t, -1, 1)
log_prob_t = dist.log_prob(action_t).sum(dim=-1)
entropy_t = dist.entropy().sum(dim=-1)

print("Actions:", action_t.cpu().numpy())
print("Log Probabilities:", log_prob_t)
print("Entropies:", entropy_t)

tensor([[ 0.1041,  0.0455,  0.1117, -0.1002]], device='cuda:0',
       grad_fn=<ClampBackward1>)
Action: tensor([-0.6209,  1.0000,  0.6564, -0.9662], device='cuda:0')
Log Probability: tensor(-5.0351, device='cuda:0', grad_fn=<SumBackward1>)
Entropy: tensor(5.8368, device='cuda:0', grad_fn=<SumBackward1>)
torch.Size([2, 24])
tensor([[0.5044, 0.4956],
        [0.5038, 0.4962],
        [0.5039, 0.4961],
        [0.5036, 0.4964]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Actions: [[-0.819049   -1.         -1.         -1.        ]
 [-0.5251849   0.9991143   0.69724464 -0.10119598]]
Log Probabilities: tensor([-5.3571, -4.6022], device='cuda:0', grad_fn=<SumBackward1>)
Entropies: tensor([5.8367, 5.8368], device='cuda:0', grad_fn=<SumBackward1>)
