# Cartpole solving by Policy Gradient

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import sys

import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.distributions import Categorical

In [113]:
class PolicyNetwork(nn.Module):
    """
    The policy network
    Args:
        n_inputs (int)
        n_outputs (int)
    """
    
    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        
        self.reward_history = []
        self.loss_history = []
        
        self.fc1 = nn.Linear(self.n_inputs, self.n_outputs)
        
    def forward(self, x):
        """
        Forward pass
        Args:
            x (torch.Tensor)
        """
        x = F.relu(self.fc1(x))
        x = F.softmax(x, dim=1)
        return x

    def act(self, state):
        probs = self.forward(torch.from_numpy(np.array(state)).float().unsqueeze(0))
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action).reshape(1)

#### Preview the environment

In [52]:
env = gym.make("CartPole-v1")
obs = env.reset()
next_obs, reward, done, info = env.step(env.action_space.sample())
env.render()
env.close()

In [53]:
next_obs

array([ 0.02465587,  0.23436827,  0.01544156, -0.27958903])

In [54]:
env.observation_space.shape[0]

4

#### Preview the policy

In [55]:
policy = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)

In [56]:
policy.forward(torch.from_numpy(next_obs).float().unsqueeze(0))

tensor([[0.5000, 0.5000]], grad_fn=<SoftmaxBackward>)

In [57]:
policy.state_dict()

OrderedDict([('fc1.weight', tensor([[-0.3598, -0.4244, -0.0431, -0.3849],
                      [-0.3363,  0.1368, -0.1586,  0.4164]])),
             ('fc1.bias', tensor([-0.3971, -0.1613]))])

In [63]:
policy.act(obs)

(1, tensor([-0.6931], grad_fn=<SqueezeBackward1>))

## Algorithms

---
```
Input: a differentiable policy parameterization pi(a|s, theta)
Algorithm parameter: step size alpha > 0
Initialise policy parameter theta with dimension d'

Loop forever for each episode:
        Generate an episode S0, A0, R1, ..., ST-1, AT-1, RT, following pi(.|., theta) (def generate())
        Loop for each step of the episode t = 0, 1, ..., T-1
        G = sum(t+1:T)(gamma^(k-t-1))Rk (def calculate_reward)
        theta = theta + alpha * gamma^t * grad of ln pi(At|St, theta) * Gt
```
---
####  Policy Gradient as a supervised learning problem
http://karpathy.github.io/2016/05/31/rl/ 

> Okay, but what do we do if we do not have the correct label in the Reinforcement Learning setting?  
Here is the Policy Gradients solution (again refer to diagram below). Our policy network calculated probability of going UP as 30% (logprob -1.2) and DOWN as 70% (logprob -0.36). We will now sample an action from this distribution; E.g. suppose we sample DOWN, and we will execute it in the game. **At this point notice one interesting fact: We could immediately fill in a gradient of 1.0 for DOWN as we did in supervised learning, and find the gradient vector that would encourage the network to be slightly more likely to do the DOWN action in the future.**

> So we can immediately evaluate this gradient and that’s great, but the problem is that at least for now we do not yet know if going DOWN is good.   
But the critical point is that that’s okay, because we can simply wait a bit and see!  
For example in Pong we could wait until the end of the game, then take the reward we get (either +1 if we won or -1 if we lost), and enter that scalar as the gradient for the action we have taken (DOWN in this case). In the example below, going DOWN ended up to us losing the game (-1 reward). So if we fill in -1 for log probability of DOWN and do backprop we will find a gradient that discourages the network to take the DOWN action for that input in the future (and rightly so, since taking that action led to us losing the game)

> Policy gradients is exactly the same as supervised learning with two minor differences:  
    1) We don’t have the correct labels yi so as a “fake label” we substitute the action we happened to sample from the policy when it saw xi, and  
    2) We modulate the loss for each example multiplicatively based on the eventual outcome, since we want to increase the log probability for actions that worked and decrease it for those that didn’t.  
    So in summary our loss now looks like $\sum_{i}A_{i}\log p(yi∣xi)$, where yi is the action we happened to sample and Ai is a number that we call an advantage. 
 
i.e., using the sampled outcome of the SARSA sequence as the label

In [114]:
pn = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)

In [115]:
def generate_episode(env, policy):
    """
    Generate an episode
    Args:
        env (gym.env)
    """
    
    obs = env.reset()
    done = False
    episode_series = []
    
    while not done:
        action, log_prob = policy.act(obs)
        next_obs, rewards, done, _ = env.step(action)
        episode_series.append((obs, next_obs, rewards, log_prob, done))
        obs = next_obs
        
    return episode_series

In [116]:
sample = generate_episode(env, pn)

In [117]:
obs, next_obs, rewards, log_probs, dones = zip(*sample)

In [119]:
def calculate_reward(episode_series, gamma=0.9):
    """
    Calculate the discounted return by a given episode series
    We use the start value of the episode as the performance measure function
    Recall the return of a monte carlo policy is
    G = sum(t+1:T)(gamma^(k-t-1))Rk
    Args:
        episode_series (list)
        gamma (float)
    Return:
        Return (float)
    """
    G = 0
    G_list = []
    
    obs, next_obs, rewards, log_probs, dones = zip(*episode_series)
    for r in rewards:
        G = gamma * G + r
        G_list.append(G)
    G_list = torch.FloatTensor(G_list)
    G_list = (G_list - G_list.mean()) / (G_list.std() + eps)
    return G_list

#### Steps
1. Loss function: $\sum \log p(A_t | S_t) * G_t$

In [120]:
obs, next_obs, rewards, log_probs, dones = zip(*sample)

In [121]:
g_list = calculate_reward(sample)

In [122]:
g_list

tensor([-2.5068, -2.1433, -1.8162, -1.5218, -1.2569, -1.0184, -0.8038, -0.6107,
        -0.4368, -0.2804, -0.1396, -0.0129,  0.1012,  0.2038,  0.2962,  0.3794,
         0.4542,  0.5215,  0.5822,  0.6367,  0.6858,  0.7300,  0.7698,  0.8056,
         0.8378,  0.8668,  0.8928,  0.9163,  0.9375,  0.9565,  0.9736])

In [124]:
pn = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
optimizer = optim.Adam(pn.parameters(), lr=1e-2)

print(pn.state_dict())

total_rewards = []

# Generate 10 episodes
for i in range(5000):
    
    sample = generate_episode(env, pn)
    obs, next_obs, rewards, log_probs, dones = zip(*sample)
    
    reward_sum = sum(rewards)
    g_list = calculate_reward(sample)
    
    log_probs_torch = torch.cat(log_probs)
    
    loss = (torch.sum(torch.mul(log_probs_torch, g_list).mul(-1), -1))
    
    # Backprop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    total_rewards.append(reward_sum)
    
    if i % 1000 == 0:
        print('\rEpisode: %s \tLoss: %s \tSum of Reward: %s' % (i, loss, np.mean(total_rewards)))
    
print(pn.state_dict())

OrderedDict([('fc1.weight', tensor([[-0.3934,  0.2478,  0.4851,  0.4900],
        [ 0.0833,  0.4846,  0.2927, -0.0955]])), ('fc1.bias', tensor([ 0.2978, -0.3185]))])
Episode: 0 	Loss: tensor(0.5315, grad_fn=<SumBackward2>) 	Sum of Reward: 15.0
Episode: 1000 	Loss: tensor(-3.7940, grad_fn=<SumBackward2>) 	Sum of Reward: 11.698301698301698
Episode: 2000 	Loss: tensor(-1.2205, grad_fn=<SumBackward2>) 	Sum of Reward: 10.91704147926037
Episode: 3000 	Loss: tensor(-0.9151, grad_fn=<SumBackward2>) 	Sum of Reward: 10.548483838720427
Episode: 4000 	Loss: tensor(-1.1759, grad_fn=<SumBackward2>) 	Sum of Reward: 10.3056735816046
OrderedDict([('fc1.weight', tensor([[-0.6245, -2.2341,  1.4952,  3.0133],
        [-0.1660,  3.0492, -1.0103, -2.7476]])), ('fc1.bias', tensor([1.0885, 1.5203]))])


In [126]:
rewards

(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)