In [23]:
import torch
import snntorch
from torch import nn
import torch.nn.functional as F
import numpy as np

import gym
import numpy as np
import torch
from torch import nn
from torch.distributions import MultivariateNormal
from torch.optim.adam import Adam
import random

### Network definition

In [11]:
class FeedForwardNN(nn.Module):
    """
        A standard in_dim-64-64-out_dim Feed Forward Neural Network.
    """

    def __init__(self, in_dim, out_dim):
        """
            Initialize the network and set up the layers.

            Parameters:
                in_dim - input dimensions as an int
                out_dim - output dimensions as an int

                Return:
                None
        """
        super(FeedForwardNN, self).__init__()

        self.layer1 = nn.Linear(in_dim, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, out_dim)

    def forward(self, obs):
        """
            Runs a forward pass on the neural network.

            Parameters:
                obs - observation to pass as input

            Return:
                output - the output of our forward pass
        """

        # Convert observation to tensor if it's a numpy array
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float)

        activation1 = torch.relu(self.layer1(obs))
        activation2 = torch.relu(self.layer2(activation1))
        output = self.layer3(activation2)

        return output

In [24]:
class ReplayBuffer:
    def __init__(self, max_size):
        self.buffer = []
        self.max_size = max_size
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.max_size:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.max_size

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)

### Simulation set up


In [12]:
obs_dim = 5
act_dim = 5

### Hyper-parameters

In [13]:
timesteps_per_batch = 4800  # Number of timesteps to run per batch
max_timesteps_per_episode = 1600  # Max number of timesteps per episode
n_updates_per_iteration = 5  # Number of times to update actor/critic per iteration
lr = 0.005  # Learning rate of actor optimizer
gamma = 0.95  # Discount factor to be applied when calculating Rewards-To-Go
clip = 0.2  # Recommended 0.2, helps define the threshold to clip the ratio during SGA

# Miscellaneous parameters
render = True  # If we should render during rollout
render_every_i = 10  # Only render every n iterations
save_freq = 10  # How often we save in number of iterations
seed = None  # Sets the seed of our program, used for reproducibility of results

### Initialization
#### actor and critic networks
#### optimizers
#### covariance matrix

In [14]:
# create an instance for the actor network
actor = FeedForwardNN(obs_dim, act_dim)  # ALG STEP 1

# create an instance for the critic network
critic = FeedForwardNN(obs_dim, 1)

# Initialize optimizers for actor and critic
actor_optim = Adam(actor.parameters(), lr=lr)
critic_optim = Adam(critic.parameters(), lr=lr)

cov_var = torch.full(size=(act_dim,), fill_value=0.5)
cov_mat = torch.diag(cov_var)

replay_buffer = ReplayBuffer(max_size=10000)

### Methods
#### rewards to go calculation
#### evaluation


In [15]:
def compute_rtgs(batch_rews):
    """
        Compute the Reward-To-Go of each timestep in a batch given the rewards.

        Parameters:
            batch_rews - the rewards in a batch, Shape: (number of episodes, number of timesteps per episode)

        Return:
            batch_rtgs - the rewards to go, Shape: (number of timesteps in batch)
    """
    # The rewards-to-go (rtg) per episode per batch to return.
    # The shape will be (num timesteps per episode)
    batch_rtgs = []

    # Iterate through each episode
    for ep_rews in reversed(batch_rews):

        discounted_reward = 0  # The discounted reward so far

        # Iterate through all rewards in the episode. We go backwards for smoother calculation of each
        # discounted return (think about why it would be harder starting from the beginning)
        for rew in reversed(ep_rews):
            discounted_reward = rew + discounted_reward * gamma
            batch_rtgs.insert(0, discounted_reward)

    # Convert the rewards-to-go into a tensor
    batch_rtgs = torch.tensor(batch_rtgs, dtype=torch.float)

    return batch_rtgs

def evaluate(batch_obs, batch_acts):
    """
        Estimate the values of each observation, and the log probs of
        each action in the most recent batch with the most recent
        iteration of the actor network. Should be called from learn.

        Parameters:
            batch_obs - the observations from the most recently collected batch as a tensor.
                        Shape: (number of timesteps in batch, dimension of observation)
            batch_acts - the actions from the most recently collected batch as a tensor.
                        Shape: (number of timesteps in batch, dimension of action)

        Return:
            V - the predicted values of batch_obs
            log_probs - the log probabilities of the actions taken in batch_acts given batch_obs
    """

    # Query critic network for a value V for each batch_obs. Shape of V should be same as batch_rtgs
    V = critic(batch_obs).squeeze()

    print(batch_obs)

    # Calculate the log probabilities of batch actions using most recent actor network.
    # This segment of code is similar to that in get_action()
    mean = actor(batch_obs)
    dist = MultivariateNormal(mean, cov_mat)
    log_probs = dist.log_prob(batch_acts)

    # Return the value vector V of each observation in the batch
    # and log probabilities log_probs of each action in the batch
    return V, log_probs

def get_action(obs):
    """
        Queries an action from the actor network, should be called from rollout.

        Parameters:
            obs - the observation at the current timestep

        Return:
            action - the action to take, as a numpy array
            log_prob - the log probability of the selected action in the distribution
    """
    # Query the actor network for a mean action
    mean = actor(obs)

    # Create a distribution with the mean action and std from the covariance matrix above.
    dist = MultivariateNormal(mean, cov_mat)

    # Sample an action from the distribution
    action = dist.sample()

    # Calculate the log probability for that action
    log_prob = dist.log_prob(action)

    # Return the sampled action and the log probability of that action in our distribution
    return action.detach().numpy(), log_prob.detach()

### Reset, First observation

In [16]:
obs = torch.tensor([300.0000, 450.0000,   0.0000,   4.7124,   0.0000])

print(obs.shape)

torch.Size([5])


### Get action and log prob of first observation
This is a one iteration of get_action() method

- Get network output to be used as a mean for the distribution.

- Create a distribution with the mean action and std from the covariance matrix above. <br/>
For more information on how this distribution works, check out Andrew Ng's lecture on it: <br/>
https://www.youtube.com/watch?v=JjB58InuTqM <br/>

- Sample an action from the distribution

- Calculate the log probability for that action


In [17]:
mean = actor(obs)

dist = MultivariateNormal(mean, cov_mat)

action = dist.sample()

log_prob = dist.log_prob(action)

print("mean of distribution ", mean.detach())
print("action to take ", action.detach().numpy())
print("log probability of the action ",log_prob.detach())


mean of distribution  tensor([  9.5382, -10.3285,   2.1436,  -3.4395,  18.5752])
action to take  [  9.466348  -11.078942    1.7510126  -3.4813359  17.64109  ]
log probability of the action  tensor(-4.4588)


In [18]:
act , log = get_action(obs)

print(act)
print(log)

[ 9.685039  -9.983261   2.6699722 -4.2243786 18.117476 ]
tensor(-4.1052)


### Rollout

- batch observations collected from simulation, first obs O_0 is from reset [0, n-1]
- batch actions collected from querying the network given observations [1, n]
- batch log probabilities collected from querying the network given observations [1, n]
- batch rewards collected from simulation after taking an action. [1, n]
- batch lenghts stores batch and episode lengths

In [19]:
# rollout

# batch obs
batch_obs = torch.tensor([[300.0000, 450.0000,   0.0000,   4.7124,   0.0000],
        [299.9333, 450.0000,   3.9801,   4.7124,   0.0000],
        [299.8003, 450.0000,   7.9404,   4.7124,   0.0000],
        [300.0000, 450.0000,   0.0000,   4.7124,   0.0000],
        [299.9333, 450.0000,   3.9801,   4.7124,   0.0000],
        [299.8003, 450.0000,   7.9404,   4.7124,   0.0000]])

batch_acts = []
batch_log_probs = []

for i in range(len(batch_obs)):
        action, log_prob = get_action(batch_obs[i])
        batch_acts.append(action)
        batch_log_probs.append(log_prob)

batch_rews = [[-1.1209449646284, -1.134160306418572, -1.1539176437616128], [-1.1209449646284, -1.134160306418572, -1.1539176437616128]]

batch_rtgs = compute_rtgs(batch_rews)

batch_lens = [3, 3]

batch_acts = torch.tensor(np.array(batch_acts), dtype=torch.float)
batch_log_probs = torch.tensor(np.array(batch_log_probs), dtype=torch.float)


for i in range(len(batch_obs)):
        replay_buffer.push(batch_obs[i], batch_acts[i], batch_rews[i])



print("batch acts ",batch_acts)
print("batch log probs ",batch_log_probs)
print("batch rtg ",batch_rtgs)

batch acts  tensor([[  9.0482, -10.4983,   2.8699,  -2.3475,  18.9719],
        [ 10.7451, -10.0399,   1.1352,  -4.5502,  18.8192],
        [ 10.7088, -10.7789,   1.3703,  -2.2861,  18.8478],
        [ 10.1925,  -9.6547,   2.3445,  -3.8386,  18.1726],
        [  9.4776, -10.6591,   0.6093,  -2.4303,  17.7746],
        [  8.3343,  -9.7671,   2.0907,  -4.7859,  19.6051]])
batch log probs  tensor([-5.0080, -6.4758, -5.7374, -4.1057, -6.5133, -7.3456])
batch rtg  tensor([-3.2398, -2.2304, -1.1539, -3.2398, -2.2304, -1.1539])


### Evaluate one iteration (for demonstration)
- Query critic network for a value V for each batch_obs. Shape of V should be same as batch_rtgs
- Calculate the log probabilities of batch actions using most recent actor network.
This segment of code is similar to that in get_action()

In [20]:
V = critic(batch_obs).squeeze()

mean = actor(batch_obs)
dist = MultivariateNormal(mean, cov_mat)
log_probs = dist.log_prob(batch_acts)

print(mean.shape)
print(V.detach())
print(log_probs.detach())

torch.Size([6, 5])
tensor([14.5844, 14.4048, 14.2315, 14.5844, 14.4048, 14.2315])
tensor([-5.0080, -6.4758, -5.7373, -4.1057, -6.5133, -7.3456])


In [21]:
# Calculate advantage at k-th iteration
V, _ = evaluate(batch_obs, batch_acts)
A_k = batch_rtgs - V.detach()

print(V)
print(A_k)

tensor([[300.0000, 450.0000,   0.0000,   4.7124,   0.0000],
        [299.9333, 450.0000,   3.9801,   4.7124,   0.0000],
        [299.8003, 450.0000,   7.9404,   4.7124,   0.0000],
        [300.0000, 450.0000,   0.0000,   4.7124,   0.0000],
        [299.9333, 450.0000,   3.9801,   4.7124,   0.0000],
        [299.8003, 450.0000,   7.9404,   4.7124,   0.0000]])
tensor([14.5844, 14.4048, 14.2315, 14.5844, 14.4048, 14.2315],
       grad_fn=<SqueezeBackward0>)
tensor([-17.8242, -16.6352, -15.3854, -17.8242, -16.6352, -15.3854])


In [22]:
# normalizing the advantage
A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

print(A_k)

tensor([-1.1086, -0.0186,  1.1272, -1.1086, -0.0186,  1.1272])


### Update the network through a number of iterations

- Calculate V_phi and pi_theta(a_t | s_t)
- Calculate the ratio pi_theta(a_t | s_t) / pi_theta_k(a_t | s_t) <br/>
NOTE: we just subtract the logs, which is the same as<br/>
dividing the values and then canceling the log with e^log.<br/>
For why we use log probabilities instead of actual probabilities,<br/>
here's a great explanation:<br/>
https://cs.stackexchange.com/questions/70518/why-do-we-use-the-log-in-gradient-based-reinforcement-algorithms<br/>
TL;DR makes gradient ascent easier behind the scenes.<br/>
- Calculate surrogate losses.
- Calculate actor and critic losses. <br/>
NOTE: we take the negative min of the surrogate losses because we're trying to maximize <br/>
the performance function, but Adam minimizes the loss. So minimizing the negative <br/>
performance function maximizes it. <br/>
- Calculate gradients and perform backward propagation for actor and critic network



In [30]:
num_updates_per_iteration = 1

actor_loss_arr = []
critic_loss_arr = []

# observations, actions, rewards = replay_buffer.sample(batch_size)



for _ in range(num_updates_per_iteration):

    print("## update start ##")

    V, curr_log_probs = evaluate(batch_obs, batch_acts)

    ratios = torch.exp(curr_log_probs - batch_log_probs)

    surr1 = ratios * A_k
    surr2 = torch.clamp(ratios, 1 - clip, 1 + clip) * A_k

    actor_loss = (-torch.min(surr1, surr2)).mean()
    critic_loss = nn.MSELoss()(V, batch_rtgs)

    actor_loss_arr.append(actor_loss)
    critic_loss_arr.append(critic_loss)

    actor_optim.zero_grad()
    actor_loss.backward(retain_graph=True)
    actor_optim.step()

    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()


    print("* V \n",V.detach())
    print("* curr_log_probs \n",curr_log_probs.detach())
    print("* ratios \n",ratios.detach())
    print("* surr1 \n",surr1.detach())
    print("* surr2 \n",surr2.detach())

print(actor_loss)
print(critic_loss)

## update start ##
tensor([-5.1444, -6.9009, -5.5648, -3.8300, -7.4699, -7.2108])
tensor([-7.6773, -7.7762, -7.8732, -7.6773, -7.7762, -7.8732],
       grad_fn=<SqueezeBackward0>)
tensor([-24573.3047, -24026.8809, -24084.8027, -24505.0898, -23910.9277,
        -23803.5879], grad_fn=<SubBackward0>)
* V 
 tensor([-7.6773, -7.7762, -7.8732, -7.6773, -7.7762, -7.8732])
* curr_log_probs 
 tensor([-24573.3047, -24026.8809, -24084.8027, -24505.0898, -23910.9277,
        -23803.5879])
* ratios 
 tensor([0., 0., 0., 0., 0., 0.])
* surr1 
 tensor([-0., -0., 0., -0., -0., 0.])
* surr2 
 tensor([-0.8862, -0.0163,  0.9025, -0.8862, -0.0163,  0.9025])
tensor(0.3008, grad_fn=<MeanBackward0>)
tensor(31.8651, grad_fn=<MseLossBackward0>)
