# Dependencies

In [5]:
from typing import Optional, Tuple
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import torch
from torch import nn
from torch.optim import Adam
from torch.distributions import Normal

import gym

# The Model
* Diagonal Gaussian
  - 2 MLPs
* Value function Learner
  - 1 MLP
* Agent
  - Class to tie both together

In [6]:
class DiagonalGaussian(nn.Module):
    """Gaussian policy with a state-dependent mean and a learned, state-independent std.

    The mean comes from a one-hidden-layer MLP over the observation. The
    standard deviation is ``exp`` of a free parameter with one entry per
    action dimension, so the action dimensions are independent.

    Args:
        obs_dim: Size of the observation vector fed to the first linear layer.
        hidden_dim: Width of the hidden layer.
        action_dim: Number of action dimensions (size of the mean output).
        activation: Module applied after the hidden layer.
    """

    def __init__(
        self, obs_dim: int, hidden_dim: int, action_dim: int, activation
    ) -> None:
        super().__init__()
        # The attribute keeps its historical name so existing checkpoints still
        # load, but it stores the per-dimension *log standard deviation*
        # (initialised to -0.5). It is created in torch's default float dtype
        # so it matches the nn.Linear weights; a float64 parameter here would
        # promote every log-probability to float64.
        self.covariance_matrix = torch.nn.Parameter(
            torch.full((action_dim,), -0.5)
        )
        # The output layer is left linear: applying `activation` to the mean
        # squashes it (a Softmax over a single action dimension is always 1.0,
        # which freezes the policy mean and kills its gradient).
        self.mean_action_net = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            activation,
            nn.Linear(hidden_dim, action_dim),
        )

    def _distribution(self, observation):
        """Return the ``Normal`` action distribution for ``observation``."""
        mean_act = self.mean_action_net(observation)
        std = torch.exp(self.covariance_matrix)
        return Normal(mean_act, std)

    def _log_probs_from_dist(self, policy_dist, action):
        """Return the log-probability of ``action``, summed over the last axis."""
        return policy_dist.log_prob(action).sum(axis=-1)

    def forward(self, observation, action=None):
        """Build the action distribution and optionally score an action.

        Returns:
            ``(policy_dist, logp_act)`` where ``logp_act`` is ``None`` when no
            ``action`` is given.
        """
        policy_dist = self._distribution(observation)
        logp_act = None
        if action is not None:
            logp_act = self._log_probs_from_dist(policy_dist, action)
        return policy_dist, logp_act


class ValueFunctionLearner(nn.Module):
    """State-value estimator: a one-hidden-layer MLP with a scalar output.

    Args:
        obs_dim: Size of the observation vector fed to the first linear layer.
        hidden_dim: Width of the hidden layer.
        activation: Module applied after the hidden layer.
    """

    def __init__(self, obs_dim: int, hidden_dim: int, activation) -> None:
        super().__init__()
        # The output layer is left linear so the estimate is unbounded.
        # Applying `activation` to the single output unit constrains the value
        # (a Softmax over one unit is always exactly 1.0), so it could never
        # fit the return targets.
        self.v_net = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            activation,
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, observation):
        """Return the value estimate with the trailing size-1 axis removed."""
        return torch.squeeze(self.v_net(observation), -1)


class Agent:
    """Ties a ``DiagonalGaussian`` policy and a ``ValueFunctionLearner`` together.

    Args:
        obs_dim: Size of the observation vector.
        action_dim: Number of action dimensions.
        hidden_dim: Hidden-layer width used by both networks.
        activation: Activation module handed to both networks.
        save_path: Directory used by ``save_model`` / ``load_model``; may be
            ``None`` if the model is never saved or loaded.
    """

    def __init__(
        self,
        obs_dim: int,
        action_dim: int,
        hidden_dim: int = 32,
        activation=nn.Softmax(dim=-1),
        save_path: Optional[Path] = None
    ) -> None:
        super().__init__()
        self.policy = DiagonalGaussian(obs_dim, hidden_dim, action_dim, activation)
        self.value_func = ValueFunctionLearner(obs_dim, hidden_dim, activation)
        self.save_path = save_path

    def step(self, obs: torch.Tensor) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Sample an action for ``obs`` without tracking gradients.

        Returns:
            ``(action, value, logp)`` as NumPy arrays: the sampled action, the
            value estimate for ``obs``, and the log-probability of the sampled
            action under the current policy.
        """
        with torch.no_grad():
            policy_dist = self.policy._distribution(obs)
            # The sample is used as drawn. Overwriting it (the previous
            # `action[0] = 1`) disables exploration and records a
            # log-probability for an action the policy did not choose.
            action = policy_dist.sample()
            logp = self.policy._log_probs_from_dist(policy_dist, action)
            value = self.value_func(obs)
        return action.numpy(), value.numpy(), logp.numpy()

    def act(self, obs: torch.Tensor) -> np.ndarray:
        """Return only the sampled action from ``step``."""
        return self.step(obs)[0]

    def save_model(self) -> None:
        """Write both state dicts to ``save_path`` as ``policy`` and ``value_function``."""
        assert self.save_path is not None
        torch.save(self.policy.state_dict(), Path(self.save_path, "policy"))
        torch.save(self.value_func.state_dict(), Path(self.save_path, "value_function"))

    def load_model(self, eval: bool = False) -> None:
        """Load both state dicts from ``save_path``.

        Args:
            eval: When true, switch both networks to evaluation mode after
                loading. (The name shadows the builtin but is kept for
                backward compatibility with keyword callers.)
        """
        assert self.save_path is not None
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted locations.
        pi = torch.load(Path(self.save_path, "policy"))
        val = torch.load(Path(self.save_path, "value_function"))
        self.policy.load_state_dict(pi)
        self.value_func.load_state_dict(val)
        if eval:
            self.policy.eval()
            self.value_func.eval()

# Utilities
* Return Estimators
* Advantage Function
* Trajectory Buffer

In [7]:
class ReturnEstimator(ABC):
    """Interface for turning a per-step reward sequence into per-step returns."""

    @abstractmethod
    def get_return(self, rewards: np.ndarray) -> np.ndarray:
        """Calculates a return over a trajectory"""
        raise NotImplementedError

@dataclass
class DiscountReturn(ReturnEstimator):
    """Calculate the discounted return over a trajectory, with discount factor gamma."""
    gamma: float = 0.99

    def get_return(self, rewards: np.ndarray) -> np.ndarray:
        """Return the discounted reward-to-go for every step.

        Entry ``t`` of the result is ``sum_k gamma**k * rewards[t + k]``, i.e.
        the discounted sum of the rewards from step ``t`` to the end of the
        sequence. The previous implementation returned ``rewards * gamma**t``,
        which only scales each reward and never accumulates future rewards.

        Args:
            rewards: One-dimensional sequence of per-step rewards.

        Returns:
            Float array with the same length as ``rewards``; empty input gives
            an empty array.
        """
        rewards = np.asarray(rewards, dtype=float)
        returns = np.zeros_like(rewards)
        running = 0.0
        # Walk backwards so each step reuses the return of its successor:
        # G_t = r_t + gamma * G_{t+1}.
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self.gamma * running
            returns[t] = running
        return returns

class Advantage(ABC):
    """Interface for advantage estimators over a single trajectory."""

    @abstractmethod
    def estimate(self, rewards: np.ndarray, values: np.ndarray) -> np.ndarray:
        """Compute an advantage from per-step rewards and value estimates.

        Concrete subclasses must override this method; the base version only
        raises.
        """
        raise NotImplementedError

@dataclass
class GAE(Advantage):
    """Advantage estimator built from one-step TD residuals.

    The residuals are handed to ``return_estimator`` to be combined across
    time. ``lamda`` is stored on the instance but is not read by ``estimate``;
    any lambda-dependent weighting has to come from how ``return_estimator``
    is configured.
    """
    return_estimator: ReturnEstimator
    lamda: float = 0.5
    gamma: float = 0.99

    def estimate(self, rewards: np.ndarray, values: np.ndarray) -> np.ndarray:
        """Return ``return_estimator.get_return`` applied to the TD residuals.

        The residual at step ``t`` is ``r_t + gamma * V_{t+1} - V_t``, with the
        value after the final step taken to be 0.
        """
        # Pad both sequences with a trailing zero so the "next value" slice is
        # defined for the last step.
        padded_rewards = np.append(rewards, 0)
        padded_values = np.append(values, 0)
        next_values = padded_values[1:]
        current_values = padded_values[:-1]
        td_residuals = padded_rewards[:-1] + (self.gamma * next_values) - current_values
        return self.return_estimator.get_return(td_residuals)

class TrajectoryReplayBuffer:
    """Fixed-size, index-addressed store for the per-step data of a trajectory.

    Returns and advantages are not computed while storing; they are produced
    by ``finish_trajectory`` from the stored rewards and values.
    """

    def __init__(
        self,
        return_estimator: ReturnEstimator,
        advantage: Advantage,
        buf_size: int,
        obs_dim: int,
        act_dim: int,
    ) -> None:
        self._buf_size = buf_size
        self._ret_estimator = return_estimator
        self._adv_estimator = advantage
        # Observations and actions carry a feature axis ...
        self._obs = np.zeros((buf_size, obs_dim), dtype=float)
        self._act = np.zeros((buf_size, act_dim), dtype=float)
        # ... while the remaining quantities hold one scalar per step.
        self._val = np.zeros(buf_size, dtype=float)
        self._rewards = np.zeros(buf_size, dtype=float)
        self._mean_act = np.zeros(buf_size, dtype=float)

    def store(
        self,
        idx: int,
        obs: np.ndarray,
        action: np.ndarray,
        value: np.ndarray,
        reward: float,
        mean_act: float,
    ) -> None:
        """Write one step's data into slot ``idx`` (must be below the buffer size)."""
        assert idx < self._buf_size
        self._obs[idx] = obs
        self._act[idx] = action
        self._val[idx] = value
        self._rewards[idx] = reward
        self._mean_act[idx] = mean_act

    def finish_trajectory(self):
        """Compute returns and advantages over the whole reward/value buffers."""
        rewards = self._rewards
        self._ret = self._ret_estimator.get_return(rewards)
        self._adv = self._adv_estimator.estimate(rewards, self._val)

    def get_trajectories(self):
        """Return the buffer contents as a dict of float32 tensors.

        Reads the results of ``finish_trajectory``, so that method has to be
        called first. The ``logp`` entry is the array filled from the
        ``mean_act`` argument of ``store``.
        """
        arrays = {
            "obs": self._obs,
            "act": self._act,
            "val": self._val,
            "ret": self._ret,
            "logp": self._mean_act,
            "adv": self._adv,
        }
        tensors = {}
        for key, array in arrays.items():
            tensors[key] = torch.as_tensor(array, dtype=torch.float32)
        return tensors

# Loss functions & Update function

In [10]:
def ppo_policy_loss(
    actor_critic,
    clip_ratio: float,
    obs: torch.Tensor,
    act: torch.Tensor,
    adv: torch.Tensor,
    logp_old: torch.Tensor
) -> torch.Tensor:
    """Clipped surrogate policy loss.

    Args:
        actor_critic: Object whose ``policy(obs, act)`` returns a pair whose
            second item is the log-probability of ``act``.
        clip_ratio: Half-width of the interval around 1 that the probability
            ratio is clamped to.
        obs: Observations passed to the policy.
        act: Actions whose log-probabilities are evaluated.
        adv: Advantage estimates weighting each sample.
        logp_old: Log-probabilities of ``act`` recorded at collection time.

    Returns:
        Scalar tensor: the negated mean of the element-wise minimum of the
        unclipped and clipped surrogate terms.
    """
    _, logp_new = actor_critic.policy(obs, act)
    prob_ratio = (logp_new - logp_old).exp()
    unclipped = prob_ratio * adv
    clipped = prob_ratio.clamp(1 - clip_ratio, 1 + clip_ratio) * adv
    surrogate = torch.min(unclipped, clipped)
    return -surrogate.mean()

def ppo_value_loss(
    actor_critic,
    obs: torch.Tensor,
    ret: torch.Tensor
) -> torch.Tensor:
    """Mean squared error between ``actor_critic.value_func(obs)`` and ``ret``."""
    predicted = actor_critic.value_func(obs)
    residual = predicted - ret
    return residual.pow(2).mean()

def update(
    agent: Agent, data: dict, update_cycles: int, clip_ratio: float = 0.2
) -> None:
    """Run alternating policy and value gradient steps on one batch.

    Uses the module-level optimisers ``pi_optim`` (policy) and ``val_optim``
    (value function); both must exist before this is called.

    Args:
        agent: Agent whose ``policy`` and ``value_func`` are optimised.
        data: Batch with the keys ``"obs"``, ``"act"``, ``"adv"``, ``"logp"``
            and ``"ret"``.
        update_cycles: Number of policy-step / value-step pairs to perform.
        clip_ratio: Clip range forwarded to ``ppo_policy_loss``. Defaults to
            0.2, the value that was previously hard-coded.
    """
    for _ in range(update_cycles):
        # Policy step.
        pi_optim.zero_grad()
        pi_loss = ppo_policy_loss(
            agent,
            clip_ratio,
            data["obs"],
            data["act"],
            data["adv"],
            data["logp"],
        )
        pi_loss.backward()
        pi_optim.step()

        # Value-function step.
        val_optim.zero_grad()
        v_loss = ppo_value_loss(agent, data["obs"], data["ret"])
        v_loss.backward()
        val_optim.step()

# Space Description, Hyperparams & Main Loop

### Action Space
    The action is a `ndarray` with shape `(1,)` representing the torque applied to the free end of the pendulum.
    | Num | Action | Min  | Max |
    |-----|--------|------|-----|
    | 0   | Torque | -2.0 | 2.0 |
### Observation Space
    The observation is a `ndarray` with shape `(3,)` representing the x-y coordinates of the pendulum's free end and its angular velocity.
    | Num | Observation      | Min  | Max |
    |-----|------------------|------|-----|
    | 0   | x = cos(theta)   | -1.0 | 1.0 |
    | 1   | y = sin(theta)   | -1.0 | 1.0 |
    | 2   | Angular Velocity | -8.0 | 8.0 |

In [11]:
### Setting up Hyperparameters ###
# Rollout schedule: the buffer holds exactly one episode's worth of steps.
EPISODES = 2
EPISODE_LEN = 10
BUF_SIZE = EPISODE_LEN

# Optimiser step size, shared by the policy and value optimisers.
LEARNING_RATE = 1e-4

# Environment / network dimensions.
OBS_DIM = 3
ACT_DIM = 1
HIDDEN_DIM = 32

# Discount factor and GAE lambda.
GAMMA = 0.99
LAMDA = 0.5
# --------------- #
### Init Estimators & Buffer ###
# Value targets are discounted with GAMMA; the advantage estimator gets its
# own DiscountReturn configured with GAMMA * LAMDA.
return_estimator = DiscountReturn(gamma=GAMMA)
advantage_return = DiscountReturn(gamma=GAMMA * LAMDA)
advantage = GAE(return_estimator=advantage_return, lamda=LAMDA, gamma=GAMMA)

trajectory_buffer = TrajectoryReplayBuffer(
    return_estimator=return_estimator,
    advantage=advantage,
    buf_size=BUF_SIZE,
    obs_dim=OBS_DIM,
    act_dim=ACT_DIM,
)

### Init Agent ###
agent = Agent(
    OBS_DIM,
    ACT_DIM,
    hidden_dim=HIDDEN_DIM,
    activation=nn.Softmax(dim=-1),
)

### Init Optimizer ###
# One Adam optimiser per network so policy and value updates stay independent.
pi_optim = Adam(agent.policy.parameters(), lr=LEARNING_RATE)
val_optim = Adam(agent.value_func.parameters(), lr=LEARNING_RATE)

In [12]:
### Training Loop ###
env = gym.make("Pendulum-v0")
env = gym.wrappers.FlattenObservation(env)
try:
    for _ in range(EPISODES):
        obs = env.reset()
        for t in range(EPISODE_LEN):
            act, value, logp = agent.step(torch.as_tensor(obs, dtype=torch.float32))
            next_obs, reward, done, _ = env.step(act)
            # Store the observation the action and value were computed from.
            # Storing the successor observation would pair every action,
            # value and log-probability with the wrong state during the update.
            trajectory_buffer.store(t, obs, act, value, reward, logp)
            obs = next_obs
            if done:
                print(f"Episode finished after {t} timesteps")
                break

        # Turn the stored rewards/values into return and advantage targets,
        # then run the PPO update on the collected batch.
        trajectory_buffer.finish_trajectory()
        data = trajectory_buffer.get_trajectories()

        update(agent, data, 80)
finally:
    # Release the environment even if a rollout or update raises.
    env.close()