In [74]:
import torch
import snntorch
from torch import nn
import torch.nn.functional as F
import numpy as np

import gym
import numpy as np
import torch
from torch import nn
from torch.distributions import MultivariateNormal, Categorical
from torch.optim.adam import Adam


### Network definition

In [75]:
class FFNetwork(nn.Module):
    """
        A standard in_dim-64-64-out_dim Feed Forward Neural Network.
    """

    def __init__(self, in_dim, out_dim):
        """
            Initialize the network and set up the layers.

            Parameters:
                in_dim - input dimensions as an int
                out_dim - output dimensions as an int

                Return:
                None
        """
        super(FFNetwork, self).__init__()

        self.layer1 = nn.Linear(in_dim, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, out_dim)

    def forward(self, obs):
        """
            Runs a forward pass on the neural network.

            Parameters:
                obs - observation to pass as input

            Return:
                output - the output of our forward pass
        """

        # Convert observation to tensor if it's a numpy array
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float)


        activation1 = torch.relu(self.layer1(obs))
        activation2 = torch.relu(self.layer2(activation1))
        output = self.layer3(activation2)

        return output

### Simulation set up


In [76]:
obs_dim = 8
act_dim = 5

### Hyper-parameters

In [77]:
timesteps_per_batch = 4800  # Number of timesteps to run per batch
max_timesteps_per_episode = 1600  # Max number of timesteps per episode
n_updates_per_iteration = 5  # Number of times to update actor/critic per iteration
lr = 0.005  # Learning rate of actor optimizer
gamma = 0.95  # Discount factor to be applied when calculating Rewards-To-Go
clip = 0.2  # Recommended 0.2, helps define the threshold to clip the ratio during SGA

# Miscellaneous parameters
render = True  # If we should render during rollout
render_every_i = 10  # Only render every n iterations
save_freq = 10  # How often we save in number of iterations
seed = None  # Sets the seed of our program, used for reproducibility of results
gae_lambda = 0.95

### Initialization
#### actor and critic networks
#### optimizers
#### covariance matrix

In [78]:
# create an instance for the actor network
actor = FFNetwork(obs_dim, act_dim)  # ALG STEP 1

# create an instance for the critic network
critic = FFNetwork(obs_dim, 1)

# Initialize optimizers for actor and critic
actor_optim = Adam(actor.parameters(), lr=lr)
critic_optim = Adam(critic.parameters(), lr=lr)

cov_var = torch.full(size=(act_dim,), fill_value=0.5)
cov_mat = torch.diag(cov_var)

### Methods
#### rewards to go calculation
#### evaluation


In [79]:
def calculate_gae(rewards, values, dones):
    batch_advantages = []
    for ep_rews, ep_vals, ep_dones in zip(rewards, values, dones):
        advantages = []
        last_advantage = 0

        for t in reversed(range(len(ep_rews))):
            if t + 1 < len(ep_rews):
                delta = ep_rews[t] + gamma * ep_vals[t + 1] * (1 - ep_dones[t + 1]) - ep_vals[t]
            else:
                delta = ep_rews[t] - ep_vals[t]

            advantage = delta + gamma * gae_lambda * (1 - ep_dones[t]) * last_advantage
            last_advantage = advantage
            advantages.insert(0, advantage)

        batch_advantages.extend(advantages)

    return torch.tensor(batch_advantages, dtype=torch.float)

def evaluate(batch_obs, batch_acts):
    """
    Estimate the values of each observation, and the log probs of
    each action in the most recent batch with the most recent
    iteration of the actor network. Should be called from learn.

    Parameters:
        batch_obs - the observations from the most recently collected batch as a tensor.
                    Shape: (number of timesteps in batch, dimension of observation)
        batch_acts - the actions from the most recently collected batch as a tensor.
                    Shape: (number of timesteps in batch, dimension of action)

    Return:
        V - the predicted values of batch_obs
        log_probs - the log probabilities of the actions taken in batch_acts given batch_obs
    """

    # Query critic network for a value V for each batch_obs
    V = critic(batch_obs).squeeze()

    logits = actor(batch_obs)
    dist = Categorical(logits=logits)

    # Calculate entropy loss for regularization
    entropy_loss = -dist.entropy().mean()

    log_probs = dist.log_prob(batch_acts)

    # Return the value vector V of each observation in the batch
    # and log probabilities log_probs of each action in the batch
    return V, log_probs, dist, entropy_loss

def get_action(obs):
    """
    Queries an action from the actor network, should be called from rollout.

    Parameters:
        obs - the observation at the current timestep

    Return:
        action - the action to take, as a numpy array
        log_prob - the log probability of the selected action in the distribution
    """

    # For discrete action spaces
    logits = actor(obs)
    dist = Categorical(logits=logits)

    # Sample an action from the distribution
    action = dist.sample()

    # Calculate the log probability for that action
    log_prob = dist.log_prob(action)

    # Return the sampled action and the log probability of that action in our distribution
    return action.detach().numpy(), log_prob.detach()

### Reset, First observation

In [80]:
LL_obs = np.array([1.2, 0.5, 2.0, -5, 1.0, -4.5, 0.8, 0.3])

RE_obs = np.array([300.0000, 450.0000,   0.0000,   4.7124,   0.0000, 1, 1, 0])

print(RE_obs.shape)

(8,)


### Get action and log prob of first observation
This is a one iteration of get_action() method

- Get network output to be used as a mean for the distribution.

- Create a distribution with the mean action and std from the covariance matrix above. <br/>
For more information on how this distribution works, check out Andrew Ng's lecture on it: <br/>
https://www.youtube.com/watch?v=JjB58InuTqM <br/>

- Sample an action from the distribution

- Calculate the log probability for that action


In [81]:
m = nn.Softmax(dim=0)


logits = actor(RE_obs)
dist = Categorical(logits=logits)

action = dist.sample()

log_prob = dist.log_prob(action)

print("softmax probabilities ", m(logits))
print("logits ", logits.detach())
print("action to take ", action.detach().numpy())
print("log probability of the action ",log_prob.detach())


softmax probabilities  tensor([2.0514e-12, 4.9073e-23, 1.1674e-23, 1.0000e+00, 5.3230e-24],
       grad_fn=<SoftmaxBackward0>)
logits  tensor([  8.2407, -16.2155, -17.6515,  35.1532, -18.4368])
action to take  3
log probability of the action  tensor(0.)


In [88]:
action_array = np.zeros(5)
action_array[action] = 1

print(action_array)

[0. 0. 0. 1. 0.]


In [86]:
act , log = get_action(RE_obs)

print(act)
print(log)

3
tensor(0.)


In [83]:
val = critic(LL_obs)

print(val)

tensor([-0.0053], grad_fn=<ViewBackward0>)


### Rollout

- batch observations collected from simulation, first obs O_0 is from reset [0, n-1]
- batch actions collected from querying the network given observations [1, n]
- batch log probabilities collected from querying the network given observations [1, n]
- batch rewards collected from simulation after taking an action. [1, n]
- batch lenghts stores batch and episode lengths

In [85]:
rollout_obs = torch.tensor([[300.0000, 450.0000,   0.0000,   4.7124,   0.0000, 1, 1, 0],
        [299.9333, 450.0000,   3.9801,   4.7124,   0.0000, 1, 1, 0],
        [299.8003, 450.0000,   7.9404,   4.7124,   0.0000, 1, 1, 0],
        [300.0000, 450.0000,   0.0000,   4.7124,   0.0000, 1, 1, 0],
        [299.9333, 450.0000,   3.9801,   4.7124,   0.0000, 1, 1, 0],
        [299.8003, 450.0000,   7.9404,   4.7124,   0.0000, 1, 1, 0]])

rollout_rews = torch.tensor([[-1.1209449646284, -1.134160306418572, -1.1539176437616128], [-1.1209449646284, -1.134160306418572, -1.1539176437616128]])

obs = RE_obs

for i in range(6):
    
        action, log_prob = get_action(obs)
        val = critic(obs)

        obs = rollout_obs[i]


In [21]:
# rollout

# batch obs
batch_obs = torch.tensor([[300.0000, 450.0000,   0.0000,   4.7124,   0.0000],
        [299.9333, 450.0000,   3.9801,   4.7124,   0.0000],
        [299.8003, 450.0000,   7.9404,   4.7124,   0.0000],
        [300.0000, 450.0000,   0.0000,   4.7124,   0.0000],
        [299.9333, 450.0000,   3.9801,   4.7124,   0.0000],
        [299.8003, 450.0000,   7.9404,   4.7124,   0.0000]])

batch_acts = []
batch_log_probs = []

for i in range(len(batch_obs)):
        action, log_prob = get_action(batch_obs[i])
        batch_acts.append(action)
        batch_log_probs.append(log_prob)

batch_rews = [[-1.1209449646284, -1.134160306418572, -1.1539176437616128], [-1.1209449646284, -1.134160306418572, -1.1539176437616128]]

batch_rtgs = compute_rtgs(batch_rews)

batch_lens = [3, 3]

batch_acts = torch.tensor(np.array(batch_acts), dtype=torch.float)
batch_log_probs = torch.tensor(np.array(batch_log_probs), dtype=torch.float)


print("batch acts ",batch_acts)
print("batch log probs ",batch_log_probs)
print("batch rtg ",batch_rtgs)

torch.Size([6])
batch acts  tensor([[-27.7970,  65.2302,  29.4832,  72.2285, -51.3564],
        [-28.3933,  64.6171,  27.6463,  73.8009, -51.5109],
        [-28.9280,  64.6414,  28.5367,  73.0541, -51.5062],
        [-27.7130,  65.7168,  28.7141,  72.3472, -51.4211],
        [-28.7506,  65.7660,  28.3840,  74.7082, -51.9623],
        [-28.7255,  65.5319,  28.7791,  73.1720, -48.8167]])
batch log probs  tensor([-5.1444, -6.9009, -5.5648, -3.8300, -7.4699, -7.2108])
batch rtg  tensor([-3.2398, -2.2304, -1.1539, -3.2398, -2.2304, -1.1539])


### Evaluate one iteration (for demonstration)
- Query critic network for a value V for each batch_obs. Shape of V should be same as batch_rtgs
- Calculate the log probabilities of batch actions using most recent actor network.
This segment of code is similar to that in get_action()

In [25]:
V = critic(batch_obs).squeeze()

mean = actor(batch_obs)
dist = MultivariateNormal(mean, cov_mat)
log_probs = dist.log_prob(batch_acts)

print(mean.shape)
print(V.detach())
print(log_probs.detach())

torch.Size([6, 5])
tensor([-14.2175, -14.3675, -14.5060, -14.2175, -14.3675, -14.5060])
tensor([-5.1444, -6.9009, -5.5648, -3.8300, -7.4698, -7.2108])


In [11]:
# Calculate advantage at k-th iteration
V, _ = evaluate(batch_obs, batch_acts)
A_k = batch_rtgs - V.detach()

print(V)
print(A_k)

tensor([-16.2353, -16.3779, -16.5182, -16.2353, -16.3779, -16.5182],
       grad_fn=<SqueezeBackward0>)
tensor([12.9955, 14.1475, 15.3643, 12.9955, 14.1475, 15.3643])


In [12]:
# normalizing the advantage
A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

print(A_k)

tensor([-1.1077, -0.0204,  1.1281, -1.1077, -0.0204,  1.1281])


### Update the network through a number of iterations

- Calculate V_phi and pi_theta(a_t | s_t)
- Calculate the ratio pi_theta(a_t | s_t) / pi_theta_k(a_t | s_t) <br/>
NOTE: we just subtract the logs, which is the same as<br/>
dividing the values and then canceling the log with e^log.<br/>
For why we use log probabilities instead of actual probabilities,<br/>
here's a great explanation:<br/>
https://cs.stackexchange.com/questions/70518/why-do-we-use-the-log-in-gradient-based-reinforcement-algorithms<br/>
TL;DR makes gradient ascent easier behind the scenes.<br/>
- Calculate surrogate losses.
- Calculate actor and critic losses. <br/>
NOTE: we take the negative min of the surrogate losses because we're trying to maximize <br/>
the performance function, but Adam minimizes the loss. So minimizing the negative <br/>
performance function maximizes it. <br/>
- Calculate gradients and perform backward propagation for actor and critic network



In [30]:
num_updates_per_iteration = 1

actor_loss_arr = []
critic_loss_arr = []


for _ in range(num_updates_per_iteration):

    print("## update start ##")

    V, curr_log_probs = evaluate(batch_obs, batch_acts)

    ratios = torch.exp(curr_log_probs - batch_log_probs)

    surr1 = ratios * A_k
    surr2 = torch.clamp(ratios, 1 - clip, 1 + clip) * A_k

    actor_loss = (-torch.min(surr1, surr2)).mean()
    critic_loss = nn.MSELoss()(V, batch_rtgs)

    actor_loss_arr.append(actor_loss)
    critic_loss_arr.append(critic_loss)

    actor_optim.zero_grad()
    actor_loss.backward(retain_graph=True)
    actor_optim.step()

    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()


    print("* V \n",V.detach())
    print("* curr_log_probs \n",curr_log_probs.detach())
    print("* ratios \n",ratios.detach())
    print("* surr1 \n",surr1.detach())
    print("* surr2 \n",surr2.detach())

print(actor_loss)
print(critic_loss)

## update start ##
tensor([-5.1444, -6.9009, -5.5648, -3.8300, -7.4699, -7.2108])
tensor([-7.6773, -7.7762, -7.8732, -7.6773, -7.7762, -7.8732],
       grad_fn=<SqueezeBackward0>)
tensor([-24573.3047, -24026.8809, -24084.8027, -24505.0898, -23910.9277,
        -23803.5879], grad_fn=<SubBackward0>)
* V 
 tensor([-7.6773, -7.7762, -7.8732, -7.6773, -7.7762, -7.8732])
* curr_log_probs 
 tensor([-24573.3047, -24026.8809, -24084.8027, -24505.0898, -23910.9277,
        -23803.5879])
* ratios 
 tensor([0., 0., 0., 0., 0., 0.])
* surr1 
 tensor([-0., -0., 0., -0., -0., 0.])
* surr2 
 tensor([-0.8862, -0.0163,  0.9025, -0.8862, -0.0163,  0.9025])
tensor(0.3008, grad_fn=<MeanBackward0>)
tensor(31.8651, grad_fn=<MseLossBackward0>)
