In [2]:
import numpy
import torch
import gym

We can consider using a *baseline* $b$ in our result, which was originally introduced by [Williams, 1992](https://link.springer.com/article/10.1007/BF00992696). The paper in which the REINFORCE algorithm is introduced. 
$$
\nabla_{\boldsymbol{\theta}} J(\boldsymbol{\theta})=\frac{1}{m} \sum_{i=1}^{m} \sum_{t=1}^{H} \nabla_{\boldsymbol{\theta}} \log \pi_{\boldsymbol{\theta}}\left(\mathbf{a}_{t}^{(i)} \mid \mathbf{s}_{t}^{(i)}\right)\left[\sum_{k=t}^{H} \gamma^{k} r\left(\mathbf{s}_{k}^{(i)}, \mathbf{a}_{k}^{(i)}\right)-b\right]
$$

In [8]:
env = gym.make("CartPole-v1")

env.observation_space, env.action_space

(Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32),
 Discrete(2))

In [25]:
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical


class Model(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super(Model, self).__init__()
        self.linear1 = nn.Linear(obs_dim, 128)
        self.linear2 = nn.Linear(128, act_dim)
        self.activation = nn.Tanh()
        
    def forward(self, x):
        x = self.activation(self.linear1(x))
        x = self.linear2(x)
        return x
    
class Agent:
    def __init__(self, env, lr=3e-2, hidden_size=64):
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.n
        
        self.log_pi = Model(self.obs_dim, self.act_dim)
        self.opt = optim.Adam(self.log_pi.parameters(), lr=lr)
    
    def sample_batch(batch_size):
        # TODO: Setup deterministic get_action
        max_ep_len = self.env._max_episode_steps
        obs = []
        acts = []
        ep_rews = []
        
        o, ep_ret, ep_len = env.reset(), 0, 0
        for i in range(batch_size):
            a = self.get_action(obs)
            o2, r, d, _ = env.step(a)
            
            ep_ret += r
            ep_len += 1
            d = False if ep_len == max_ep_len else d
            
            o = o2
            
            if d or(ep_len == max_ep_len):
                o, ep_ret, ep_len = env.reset(), 0, 0
                
            
    
    def train(self, epochs=50, batch_size=50):
        
        for i in range(epochs):
            for j in range(batch_size):
                
            
        return NotImplemented
    
    def get_action(self, obs, deterministic = False):
        logits = self.log_pi(obs)
        return Categorical(logits=logits).sample().item()

    

In [21]:
agent = Agent(env)
agent.train()

NotImplementedError

500

Finally there are several possibilities for picking a baseline, namely,
$\nabla_{\boldsymbol{\theta}} J(\boldsymbol{\theta})=\mathbb{E}_{\mathbf{s}_{t}, \mathbf{a}_{t} \pi_{\boldsymbol{\theta}}(.)}\left[\nabla_{\boldsymbol{\theta}} \log \pi_{\boldsymbol{\theta}}\left(\mathbf{a}_{t}^{(i)} \mid \mathbf{s}_{t}^{(i)}\right)\right] \psi_{t} \text { where } \psi_{t} \text { can be: }$

* $\sum_{t=0}^{H} \gamma^{t} r_{t}:$ total (discounted) reward of trajectory
* $\sum_{k=t}^{H} \gamma^{k-t} r_{k}:$ sum of rewards after $\mathbf{a}_{t}$
* $\sum_{k=t}^{H} \gamma^{k-t} r_{k}-b\left(\mathbf{s}_{t}\right):$ sum of rewards after $\mathbf{a}_{t}$ with baseline
* $\delta_{t}=r_{t}+\gamma V^{\pi}\left(\mathbf{s}_{t+1}\right)-V^{\pi}\left(\mathbf{s}_{t}\right):$ error, with $V^{\pi}\left(\mathbf{s}_{t}\right)=\mathbb{E}_{\mathbf{a}_{t}}\left[\sum_{k=0}^{H} \gamma^{k} r_{t+l}\right]$
* $\hat{Q}_{\phi}^{\pi_{\theta}}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)=\mathbb{E}_{a_{t+1}}\left[\sum_{k=0}^{H} \gamma^{k} r_{t+l}\right]:$ action-value function
* $\hat{A}_{\phi}^{\pi_{\theta}}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)=\hat{Q}_{\phi}^{\pi_{\theta}}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\hat{V}_{\phi}^{\pi_{\theta}}\left(\mathbf{s}_{t}\right)=\mathbb{E}\left[\delta_{t}\right]$, advantage function