In [1]:
import torch
import snntorch
from torch import nn
import torch.nn.functional as F
import numpy as np

import gym
import numpy as np
import torch
from torch import nn
from torch.distributions import MultivariateNormal
from torch.optim.adam import Adam

## ANN definition

In [2]:
class FeedForwardNN(nn.Module):
    """
        Fully connected network shaped in_dim -> 64 -> 64 -> out_dim,
        with ReLU after each of the two hidden layers and a linear output.
    """

    def __init__(self, in_dim, out_dim):
        """
            Build the three linear layers of the network.

            Parameters:
                in_dim - size of the input feature vector, as an int
                out_dim - size of the output vector, as an int

            Return:
                None
        """
        super().__init__()

        hidden_dim = 64  # Width shared by both hidden layers

        # Attribute names are kept as layer1/2/3 so parameter names stay stable.
        self.layer1 = nn.Linear(in_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, out_dim)

    def forward(self, obs):
        """
            Push an observation (or batch of observations) through the network.

            Parameters:
                obs - input to the first layer; a numpy array is first copied
                      into a float tensor, anything else is used as given

            Return:
                The un-activated output of the last linear layer
        """
        # numpy input is accepted for convenience and converted up front
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float)

        hidden = obs
        for hidden_layer in (self.layer1, self.layer2):
            hidden = F.relu(hidden_layer(hidden))

        # No activation on the final layer: callers receive raw values
        return self.layer3(hidden)

In [3]:
def compute_rtgs(batch_rews, discount=None):
    """
        Compute the discounted Reward-To-Go of each timestep in a batch given the rewards.

        Parameters:
            batch_rews - the rewards in a batch: a sequence of episodes, each a
                         sequence of per-timestep rewards
            discount - discount factor applied per timestep. When None (default)
                       the notebook-level `gamma` hyperparameter is used, so
                       existing calls behave exactly as before.

        Return:
            batch_rtgs - float tensor of rewards-to-go, Shape: (number of timesteps in batch),
                         in the same episode/timestep order as batch_rews
    """
    if discount is None:
        # Looked up at call time, so the hyperparameter cell only has to run
        # before the first call, not before this definition.
        discount = gamma

    # Filled back-to-front and flipped once at the end; this avoids the
    # quadratic cost of repeatedly inserting at the head of a list.
    rtgs_reversed = []

    # Iterate through each episode, last episode first
    for ep_rews in reversed(batch_rews):

        discounted_reward = 0  # The discounted return accumulated so far in this episode

        # Walk each episode backwards: the return at step t is
        # r_t + discount * (return at step t + 1), so one pass suffices.
        for rew in reversed(ep_rews):
            discounted_reward = rew + discounted_reward * discount
            rtgs_reversed.append(discounted_reward)

    rtgs_reversed.reverse()

    # Convert the rewards-to-go into a tensor
    return torch.tensor(rtgs_reversed, dtype=torch.float)

In [4]:
# Sizes of the observation and action vectors. They fix the input width of both
# networks, the actor's output width, and the size of the action covariance.
obs_dim = 5
act_dim = 5

In [5]:
# Rollout/update schedule. NOTE(review): these three are not referenced by the
# cells visible in this notebook, which work on a hard-coded batch instead.
timesteps_per_batch = 4800  # Number of timesteps to run per batch
max_timesteps_per_episode = 1600  # Max number of timesteps per episode
n_updates_per_iteration = 5  # Number of times to update actor/critic per iteration
lr = 0.005  # Learning rate passed to both the actor and the critic Adam optimizers
gamma = 0.95  # Discount factor; read as a global by compute_rtgs when calculating Rewards-To-Go
clip = 0.2  # Recommended 0.2; the probability ratio is clamped to [1 - clip, 1 + clip]

# Miscellaneous parameters
render = True  # If we should render during rollout
render_every_i = 10  # Only render every n iterations
save_freq = 10  # How often we save in number of iterations
# NOTE(review): seed is never applied in the visible cells (no torch.manual_seed
# call), so network initialisation and action sampling differ from run to run.
seed = None  # Intended seed of our program, for reproducibility of results

In [6]:
# Policy network: maps an observation to the mean of the action distribution.  # ALG STEP 1
actor = FeedForwardNN(in_dim=obs_dim, out_dim=act_dim)

# Value network: maps an observation to a single scalar estimate.
critic = FeedForwardNN(in_dim=obs_dim, out_dim=1)

# One Adam optimizer per network, both driven by the same learning rate.
actor_optim = Adam(params=actor.parameters(), lr=lr)
critic_optim = Adam(params=critic.parameters(), lr=lr)

# Fixed diagonal covariance for the action distribution: variance 0.5 on every
# action dimension, no correlation between dimensions.
cov_var = torch.full((act_dim,), 0.5)
cov_mat = cov_var.diag()

In [33]:
# first observation, from reset
# Hard-coded 5-element observation (matching obs_dim) so the following cells
# can run without an environment.
# NOTE(review): presumably copied from an env.reset() call; the meaning of each
# component is not shown in this notebook.
obs = torch.tensor([300.0000, 450.0000,   0.0000,   4.7124,   0.0000])

In [34]:
# Forward pass through the actor; its raw output serves as the mean of the
# action distribution built in the next cell.
mean = actor(obs)

# Detach only for display, so the printout omits the autograd grad_fn.
print(mean.detach())

tensor([  3.1073,   8.2000, -21.1976,  23.6841, -15.8548])


In [35]:
# Gaussian policy: centred on the actor's output, with the fixed diagonal
# covariance defined earlier. Background on the multivariate normal:
# https://www.youtube.com/watch?v=JjB58InuTqM
dist = MultivariateNormal(loc=mean, covariance_matrix=cov_mat)

# Draw one action from the policy ...
action = dist.sample()

# ... and score it: log-density of that action under the same distribution.
log_prob = dist.log_prob(action)

# Show the sampled action (as a numpy array) and its log probability.
print(action.detach().numpy())
print(log_prob.detach())

[  3.0343742   7.223215  -22.181799   24.653101  -16.455553 ]
tensor(-6.0896)


In [36]:
# rollout
# Hard-coded stand-in for one collected batch: 2 episodes of 3 timesteps each,
# flattened to 6 rows.

# batch obs, Shape: (6 timesteps, 5 observation components)
batch_obs = torch.tensor([[300.0000, 450.0000,   0.0000,   4.7124,   0.0000],
        [300.0667, 450.0000,   3.9801,   4.7124,   0.0000],
        [300.1997, 450.0000,   7.9404,   4.7124,   0.0000],
        [300.0000, 450.0000,   0.0000,   4.7124,   0.0000],
        [300.0667, 450.0000,   3.9801,   4.7124,   0.0000],
        [300.1997, 450.0000,   7.9404,   4.7124,   0.0000]])

# batch actions, Shape: (6 timesteps, 5 action components)
batch_acts =  torch.tensor([[13.1795, 10.1534, 10.7613,  9.3591, -7.8791],
        [13.7555,  9.6675, 10.0287,  9.6279, -6.4844],
        [13.1954, 10.4574,  9.9011,  9.5315, -7.1549],
        [13.4455,  9.4522,  9.5629, 10.1283, -6.6890],
        [14.7801,  8.4991,  8.6375,  9.9466, -6.9821],
        [13.8373, 10.2118, 10.8795,  8.7003, -6.5202]])

# batch log prob: log-probability of each action under the policy that sampled it
# NOTE(review): these values (and batch_acts) appear to come from a different
# policy than the freshly initialised `actor`. The recorded outputs further down
# show current log-probs around -1300 and every ratio exp(curr - old) equal to 0.
# Confirm, or regenerate this batch with the current actor.
batch_log_probs = torch.tensor([-5.0741, -3.2771, -3.4157, -4.1163, -8.0700, -4.7627])

# batch rewards-to-go (already discounted returns, not raw per-step rewards)
batch_rtgs = torch.tensor([-3.1173, -2.1154, -1.0746, -3.1173, -2.1154, -1.0746])

# batch lengths: number of timesteps in each episode
batch_lens = [3, 3]

In [37]:
def evaluate(batch_obs, batch_acts):
    """
        Estimate the value of each observation, and the log probability of
        each action, using the current critic and actor networks.

        Reads the notebook-level `critic`, `actor` and `cov_mat`.

        Parameters:
            batch_obs - the observations from the most recently collected batch as a tensor.
                        Shape: (number of timesteps in batch, dimension of observation)
            batch_acts - the actions from the most recently collected batch as a tensor.
                        Shape: (number of timesteps in batch, dimension of action)

        Return:
            V - the predicted values of batch_obs. Shape: (number of timesteps in batch)
            log_probs - the log probabilities of the actions taken in batch_acts given batch_obs.
                        Shape: (number of timesteps in batch)
    """

    # Query critic network for a value V for each batch_obs. Only the trailing
    # size-1 output dimension is dropped: a bare squeeze() would also collapse a
    # one-timestep batch to a 0-d tensor, which no longer matches batch_rtgs.
    V = critic(batch_obs).squeeze(-1)

    # Calculate the log probabilities of batch actions using the most recent
    # actor network: the actor output is the mean of a Gaussian with the fixed
    # covariance matrix, and each action is scored under that distribution.
    mean = actor(batch_obs)
    dist = MultivariateNormal(mean, cov_mat)
    log_probs = dist.log_prob(batch_acts)

    # Return the value vector V of each observation in the batch
    # and log probabilities log_probs of each action in the batch
    return V, log_probs

In [51]:
# Calculate advantage at k-th iteration
# Only the critic's value estimates are needed here; the log-probs are discarded.
V, _ = evaluate(batch_obs, batch_acts)
# Advantage = reward-to-go minus predicted value. V is detached so the
# advantage carries no gradient back into the critic.
A_k = batch_rtgs - V.detach()

print(V)
print(A_k)

tensor([10.6725, 10.7427, 10.8127, 10.6725, 10.7427, 10.8127],
       grad_fn=<SqueezeBackward0>)
tensor([-13.7898, -12.8581, -11.8873, -13.7898, -12.8581, -11.8873])


: 

In [47]:
# Standardise the advantages to zero mean and unit standard deviation.
# The 1e-10 guards against division by zero when all advantages are equal.
A_k = A_k.sub(A_k.mean()).div(A_k.std().add(1e-10))

print(A_k)

tensor([-1.1103, -0.0152,  1.1256, -1.1103, -0.0152,  1.1256])


In [48]:
# Re-evaluate the batch with the current networks: V_phi(s_t) and log pi_theta(a_t | s_t).
V, curr_log_probs = evaluate(batch_obs, batch_acts)

# Probability ratio pi_theta(a_t | s_t) / pi_theta_k(a_t | s_t).
# Computed in log space: subtracting log-probabilities and exponentiating is
# the same as dividing the probabilities. Background on why log-probabilities
# are preferred in gradient-based RL:
# https://cs.stackexchange.com/questions/70518/why-do-we-use-the-log-in-gradient-based-reinforcement-algorithms
ratios = (curr_log_probs - batch_log_probs).exp()

print(V, curr_log_probs, ratios, sep="\n")

tensor([12.0563, 12.1266, 12.1969, 12.0563, 12.1266, 12.1969],
       grad_fn=<SqueezeBackward0>)
tensor([-1398.3192, -1358.0565, -1313.4099, -1325.2916, -1275.5676, -1421.1865],
       grad_fn=<SubBackward0>)
tensor([0., 0., 0., 0., 0., 0.], grad_fn=<ExpBackward0>)


In [49]:
# PPO surrogate objectives:
#   surr1 - unclipped: ratio * advantage
#   surr2 - clipped:   ratio limited to [1 - clip, 1 + clip], then * advantage
surr1 = ratios * A_k
surr2 = ratios.clamp(1 - clip, 1 + clip) * A_k

print(surr1, surr2, sep="\n")

tensor([-0., -0., 0., -0., -0., 0.], grad_fn=<MulBackward0>)
tensor([-0.8883, -0.0122,  0.9005, -0.8883, -0.0122,  0.9005],
       grad_fn=<MulBackward0>)


In [50]:
# Actor loss: PPO maximises the element-wise minimum of the two surrogates, but
# Adam minimises, so the objective is negated before averaging.
# Critic loss: mean squared error between predicted values and rewards-to-go.
actor_loss = torch.min(surr1, surr2).neg().mean()
critic_loss = F.mse_loss(V, batch_rtgs)

print(actor_loss, critic_loss, sep="\n")


def _apply_update(optimizer, loss, retain_graph=False):
    """Clear stale gradients, backpropagate `loss`, then take one optimizer step."""
    optimizer.zero_grad()
    loss.backward(retain_graph=retain_graph)
    optimizer.step()


# Actor first (keeping its graph alive, as before), then the critic.
_apply_update(actor_optim, actor_loss, retain_graph=True)
_apply_update(critic_optim, critic_loss)

tensor(0.3002, grad_fn=<MeanBackward0>)
tensor(203.0672, grad_fn=<MseLossBackward0>)
