In [2]:
# define the mlp network for lunarlander
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
from typing import Tuple

def layer_init(layer: nn.Module, std: float = 2 ** 0.5, bias: float = 0.0) -> nn.Module:
    """Initialize a linear/convolutional layer in place and return it.

    Args:
        layer: Module to initialize. ``weight`` and ``bias`` are each touched
            only when the attribute exists and is not ``None``.
        std: Gain passed to ``nn.init.orthogonal_`` for the weight matrix.
            Defaults to sqrt(2), so hidden layers can be built with
            ``layer_init(nn.Linear(...))`` without repeating the gain.
        bias: Constant written into every bias element. Defaults to 0.

    Returns:
        The same ``layer`` object, so the call can be nested inside
        ``nn.Sequential(...)``.
    """
    weight = getattr(layer, "weight", None)
    if weight is not None:
        # Orthogonal weights scaled by the requested gain.
        nn.init.orthogonal_(weight, gain=std)
    bias_param = getattr(layer, "bias", None)
    if bias_param is not None:
        nn.init.constant_(bias_param, bias)
    return layer

class LunarLanderMLP(nn.Module):
    """Actor-critic model built from two independent tanh MLPs.

    The critic maps an observation to one scalar value; the actor maps it to
    one logit per discrete action. The two networks share no parameters.
    """

    def __init__(self, envs):
        """Build the actor and critic networks.

        Args:
            envs: Environment (or vector environment) used only to size the
                networks. For a vector environment the per-env spaces
                (``single_observation_space`` / ``single_action_space``) are
                used, because the batched ``action_space`` is not a
                ``Discrete`` space and has no ``n``. A plain environment's
                ``observation_space`` / ``action_space`` are used otherwise.
                If no observation space is exposed the input width falls back
                to 8.
        """
        super().__init__()

        action_space = getattr(envs, "single_action_space", None)
        if action_space is None:
            action_space = envs.action_space
        num_actions = int(action_space.n)

        obs_space = getattr(envs, "single_observation_space", None)
        if obs_space is None:
            obs_space = getattr(envs, "observation_space", None)
        # Observations are fed to nn.Linear as flat vectors of this width.
        obs_dim = 8 if obs_space is None else int(np.prod(obs_space.shape))

        # std and bias are spelled out at every call so each layer's
        # initialization is explicit.
        hidden_gain = float(np.sqrt(2))

        # define the critic network
        self.critic = nn.Sequential(
            layer_init(nn.Linear(obs_dim, 64), std=hidden_gain, bias=0.0),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64), std=hidden_gain, bias=0.0),
            nn.Tanh(),
            layer_init(nn.Linear(64, 1), std=1.0, bias=0.0)
        )
        # define the actor network; the tiny output gain keeps the initial
        # logits close to zero, i.e. a near-uniform starting policy
        self.actor = nn.Sequential(
            layer_init(nn.Linear(obs_dim, 64), std=hidden_gain, bias=0.0),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64), std=hidden_gain, bias=0.0),
            nn.Tanh(),
            layer_init(nn.Linear(64, num_actions), std=0.01, bias=0.0)
        )

    def get_value(self, x):
        """Return the critic output for ``x``.

        The trailing dimension of size 1 produced by the last linear layer is
        kept (unlike ``get_action_and_value``, which squeezes it).
        """
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on ``x``.

        Args:
            x: Batch of observations.
            action: Actions to score. When ``None`` (rollout/sampling stage)
                an action is sampled from the policy distribution.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``; ``value`` has the
            critic's trailing size-1 dimension squeezed away.
        """
        logits = self.actor(x) # [batch, num_actions]
        dist = Categorical(logits=logits) # softmax over logits defines the action distribution
        if action is None: # output actions in sample stage
            action = dist.sample() # sample from the distribution object, not from the logits tensor
        log_prob = dist.log_prob(action) # [batch]
        entropy = dist.entropy() # [batch]
        value = self.critic(x).squeeze(-1) # [batch]
        return action, log_prob, entropy, value

In [3]:
@torch.no_grad()
def compute_gae(
    rewards: torch.Tensor,
    dones: torch.Tensor,
    values: torch.Tensor,
    next_value: torch.Tensor,
    next_done: torch.Tensor,
    gamma: float,
    gae_lambda: float) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute Generalized Advantage Estimation (GAE) over a rollout buffer.

    The buffer is walked backwards in time. For step ``t`` the "next" state is
    ``values[t + 1]`` / ``dones[t + 1]``, or ``next_value`` / ``next_done`` for
    the final step. That done flag is read as "the episode ended on step t":
    when it is 1, neither the next state's value nor the running advantage is
    propagated across the episode boundary. ``dones[0]`` is never read.

    Args:
        rewards: Rewards indexed by step along dim 0.
        dones: Done flags (0/1, float) indexed like ``rewards``.
        values: Critic values indexed like ``rewards``.
        next_value: Critic value of the state following the last buffer step.
            A tensor with the same number of elements as one step of
            ``values`` (e.g. ``(num_envs, 1)``) is reshaped to that step shape.
        next_done: Done flag of the state following the last buffer step.
        gamma: Discount factor.
        gae_lambda: GAE smoothing coefficient.

    Returns:
        ``(advantages, returns)`` where ``returns = advantages + values``.
    """
    num_steps = rewards.shape[0]

    # Align e.g. a (num_envs, 1) critic output with the per-step shape of
    # `values`; otherwise broadcasting would blow delta up to 2-D.
    step_shape = values.shape[1:]
    if next_value.shape != step_shape and next_value.numel() == step_shape.numel():
        next_value = next_value.reshape(step_shape)

    advantages = torch.zeros_like(rewards)
    next_advantage = 0.0
    for t in reversed(range(num_steps)):
        # e.g. num_steps == 100 -> t = 99, 98, ..., 0
        if t == num_steps - 1:
            # last buffer step: the successor lives outside the buffer
            not_done = 1.0 - next_done
            bootstrap_value = next_value
        else:
            not_done = 1.0 - dones[t + 1]
            bootstrap_value = values[t + 1]
        # TD error; the mask zeroes the bootstrap term when the episode ended,
        # so the value of the next episode's first state cannot leak in.
        delta = rewards[t] + gamma * bootstrap_value * not_done - values[t]
        # GAE recursion, cut at episode boundaries by the same mask.
        next_advantage = delta + gamma * gae_lambda * not_done * next_advantage
        advantages[t] = next_advantage
    returns = advantages + values
    return advantages, returns

In [4]:
import gymnasium as gym
from typing import Callable

# Hyper-parameters for environment creation (read by the cells below).
# Gymnasium environment id handed to gym.make() inside make_env.
env_id = "LunarLander-v3"
# Number of environment copies placed in the vector environment.
num_envs = 16
# When True, make_env wraps the copy with index 0 in a video recorder.
capture_video = False

# environment factory function
def make_env(env_id: str, idx: int, capture_video: bool, run_name: str) -> Callable[[], gym.Env]:
    """Return a zero-argument factory that builds one wrapped environment.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        idx: Index of this copy; only index 0 is ever video-recorded.
        capture_video: Enables video recording for the copy with ``idx == 0``.
        run_name: Sub-directory of ``videos/`` that receives the recordings.

    Returns:
        A callable that creates the environment when invoked. Construction is
        deferred so a vector environment can instantiate each copy itself.
    """
    # Recording applies to a single copy (index 0), never to the others.
    should_record = capture_video and idx == 0

    def thunk() -> gym.Env:
        if not should_record:
            base_env = gym.make(env_id)
        else:
            # RecordVideo is given an environment created in rgb_array mode.
            base_env = gym.wrappers.RecordVideo(
                gym.make(env_id, render_mode='rgb_array'),
                f"videos/{run_name}",
            )
        # A wrapper adds behaviour around env.step()/reset() without editing
        # the environment's source; this one records per-episode statistics.
        return gym.wrappers.RecordEpisodeStatistics(base_env)

    return thunk

# Build the vector environment: one factory per copy (indices 0..num_envs-1),
# all sharing the run name "exp_1" for any recorded videos.
envs = gym.vector.SyncVectorEnv(
    [make_env(env_id, i, capture_video, "exp_1") for i in range(num_envs)]
)   

# Fail early unless each individual environment has a Discrete action space
# (the single_* attributes describe one copy, not the batched spaces).
assert isinstance(envs.single_action_space, gym.spaces.Discrete)
# Report the per-environment observation shape and number of actions.
print(
    f"LunarLander envs are created! "
    f"obs_shape:{envs.single_observation_space.shape}; "
    f"action_n:{envs.single_action_space.n}"
)

LunarLander envs are created! obs_shape:(8,); action_n:4


  from pkg_resources import resource_stream, resource_exists
