In [91]:
import numpy as np
import torch
import gym

We can consider using a *baseline* $b$ in our result, which was originally introduced by [Williams, 1992](https://link.springer.com/article/10.1007/BF00992696). The paper in which the REINFORCE algorithm is introduced. 
$$
\nabla_{\boldsymbol{\theta}} J(\boldsymbol{\theta})=\frac{1}{m} \sum_{i=1}^{m} \sum_{t=1}^{H} \nabla_{\boldsymbol{\theta}} \log \pi_{\boldsymbol{\theta}}\left(\mathbf{a}_{t}^{(i)} \mid \mathbf{s}_{t}^{(i)}\right)\left[\sum_{k=t}^{H} \gamma^{k} r\left(\mathbf{s}_{k}^{(i)}, \mathbf{a}_{k}^{(i)}\right)-b\right]
$$

In [108]:
env = gym.make("CartPole-v1")

env.observation_space, env.action_space, env._max_episode_steps

(Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32),
 Discrete(2),
 500)

In [119]:
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical


class Model(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super(Model, self).__init__()
        self.linear1 = nn.Linear(obs_dim, 128)
        self.linear2 = nn.Linear(128, act_dim)
        self.activation = nn.Tanh()
        
    def forward(self, x):
        x = self.activation(self.linear1(x))
        x = self.linear2(x)
        return x
    
class Agent:
    def __init__(self, env, gamma=0.99, lr=3e-2, hidden_size=64):
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.n
        self.max_ep_len = env._max_episode_steps
        
        self.log_pi = Model(self.obs_dim, self.act_dim)
        self.opt = optim.Adam(self.log_pi.parameters(), lr=lr)
    
    def update_policy(self, act, obs, baseline):
        self.opt.zero_grad()
        
        # Calculate the loss
        logp = self.get_policy(obs).log_prob(act)
        batch_loss = -(logp * baseline).mean()  
        
        batch_loss.backward()
        self.opt.step()
        return batch_loss
        
    def get_policy(self, obs):
        logits = self.log_pi(obs)
        return Categorical(logits=logits)
    
    def get_action(self, obs, deterministic = False):
        # TODO: Setup deterministic get_action
        return self.get_policy(obs).sample().item()
    
    def sample_batch(self, batch_size=5000):
        batch_obs = []          # for observations
        batch_acts = []         # for actions
        batch_weights = []      # for R(tau) weighting in policy gradient
        batch_rets = []         # for measuring episode returns
        batch_lens = []         # for measuring episode lengths
        ep_rews = []            # list for rewards accrued throughout ep
        
        done = False
        o, ep_ret, ep_len = env.reset(), 0, 0
        while True:
            batch_obs.append(o)
            
            a = self.get_action(torch.as_tensor(o, dtype=torch.float32))
            o, r, done, _ = env.step(a)
            
            # save action, reward
            batch_acts.append(a)
            ep_rews.append(r)
            
            if done or(ep_len == self.max_ep_len):
                # if episode is over, record info about episode
                ep_ret, ep_len = sum(ep_rews), len(ep_rews)
                batch_rets.append(ep_ret)
                batch_lens.append(ep_len)

                # the weight for each logprob(a|s) is R(tau)
                # batch_weights += [ep_ret] * ep_len
                
                # Reward-to-go
                # batch_weights += list(reward_to_go(ep_rews))
                
                # Discounted reward
                batch_weights += list(discounted_reward(ep_rews))
                
                # reset episode-specific variables
                o, done, ep_rews = env.reset(), False, []
                
                if len(batch_obs) > batch_size:
                    break
        
        batch_loss = self.update_policy(
            torch.as_tensor(batch_acts, dtype=torch.float32),
            torch.as_tensor(batch_obs, dtype=torch.float32),
            torch.as_tensor(batch_weights, dtype=torch.float32))
  
        return batch_loss, batch_rets, batch_lens
    
    def train(self, epochs=50, batch_size=5000):
        
        for i in range(epochs):
            batch_loss, batch_rets, batch_lens = self.sample_batch(
                batch_size=batch_size)
            print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f' %
                  (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))       


In [None]:
agent = Agent(env)
agent.train(epochs=1000)

epoch:   0 	 loss: 7.111 	 return: 19.761 	 ep_len: 19.761
epoch:   1 	 loss: 10.994 	 return: 40.500 	 ep_len: 40.500
epoch:   2 	 loss: 10.426 	 return: 43.216 	 ep_len: 43.216
epoch:   3 	 loss: 11.518 	 return: 59.106 	 ep_len: 59.106
epoch:   4 	 loss: 12.210 	 return: 80.556 	 ep_len: 80.556
epoch:   5 	 loss: 11.339 	 return: 83.967 	 ep_len: 83.967
epoch:   6 	 loss: 11.193 	 return: 85.661 	 ep_len: 85.661
epoch:   7 	 loss: 11.428 	 return: 94.717 	 ep_len: 94.717
epoch:   8 	 loss: 11.613 	 return: 125.146 	 ep_len: 125.146
epoch:   9 	 loss: 11.066 	 return: 194.000 	 ep_len: 194.000
epoch:  10 	 loss: 11.013 	 return: 220.435 	 ep_len: 220.435
epoch:  11 	 loss: 11.561 	 return: 184.500 	 ep_len: 184.500
epoch:  12 	 loss: 10.930 	 return: 213.375 	 ep_len: 213.375
epoch:  13 	 loss: 10.561 	 return: 244.429 	 ep_len: 244.429
epoch:  14 	 loss: 10.036 	 return: 268.842 	 ep_len: 268.842
epoch:  15 	 loss: 10.219 	 return: 255.450 	 ep_len: 255.450
epoch:  16 	 loss: 9.370 

In [110]:
# TODO: Trick introduced in DeepRL bootcamp, lecture 5
def reward_to_go(rews):
    n = len(rews)
    rtgs = np.zeros_like(rews)
    for i in reversed(range(n)):
        rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0)
    return rtgs

reward_to_go([10, 1, 2, 1, 3, 10])

array([27, 17, 16, 14, 13, 10])

In [118]:
def discounted_reward(rews, gamma=0.99):
    n = len(rews)
    rtgs = np.zeros_like(rews)
    for i in reversed(range(n)):
        rtgs[i] = gamma**i * rews[i] + (rtgs[i+1] if i+1 < n else 0)
    return rtgs


discounted_reward([10, 1, 2, 1, 3, 10])

array([22, 12, 12, 11, 11,  9])

Finally there are several possibilities for picking a baseline, namely,
$\nabla_{\boldsymbol{\theta}} J(\boldsymbol{\theta})=\mathbb{E}_{\mathbf{s}_{t}, \mathbf{a}_{t} \pi_{\boldsymbol{\theta}}(.)}\left[\nabla_{\boldsymbol{\theta}} \log \pi_{\boldsymbol{\theta}}\left(\mathbf{a}_{t}^{(i)} \mid \mathbf{s}_{t}^{(i)}\right)\right] \psi_{t} \text { where } \psi_{t} \text { can be: }$

* $\sum_{t=0}^{H} \gamma^{t} r_{t}:$ total (discounted) reward of trajectory
* $\sum_{k=t}^{H} \gamma^{k-t} r_{k}:$ sum of rewards after $\mathbf{a}_{t}$
* $\sum_{k=t}^{H} \gamma^{k-t} r_{k}-b\left(\mathbf{s}_{t}\right):$ sum of rewards after $\mathbf{a}_{t}$ with baseline
* $\delta_{t}=r_{t}+\gamma V^{\pi}\left(\mathbf{s}_{t+1}\right)-V^{\pi}\left(\mathbf{s}_{t}\right):$ error, with $V^{\pi}\left(\mathbf{s}_{t}\right)=\mathbb{E}_{\mathbf{a}_{t}}\left[\sum_{k=0}^{H} \gamma^{k} r_{t+l}\right]$
* $\hat{Q}_{\phi}^{\pi_{\theta}}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)=\mathbb{E}_{a_{t+1}}\left[\sum_{k=0}^{H} \gamma^{k} r_{t+l}\right]:$ action-value function
* $\hat{A}_{\phi}^{\pi_{\theta}}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)=\hat{Q}_{\phi}^{\pi_{\theta}}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\hat{V}_{\phi}^{\pi_{\theta}}\left(\mathbf{s}_{t}\right)=\mathbb{E}\left[\delta_{t}\right]$, advantage function