# Cartpole solving by Policy Gradient

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import sys

from collections import deque

import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.distributions import Categorical

In [2]:
class PolicyNetwork(nn.Module):
    """
    The policy network
    Args:
        n_inputs (int)
        n_outputs (int)
    """
    
    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        
        self.reward_history = []
        self.loss_history = []
        
        self.fc1 = nn.Linear(self.n_inputs, 64)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(64, self.n_outputs)
        self.softmax = nn.Softmax(dim=-1)
        
        # save log probs history and rewards history
        self.saved_log_probs_list = []
        self.saved_log_probs = torch.Tensor([])
        self.rewards = []
        
        # Logs
        self.loss_history = []
        self.reward_history = []
        
    def reset(self):
        self.saved_log_probs_list = []
        self.saved_log_probs = torch.Tensor([])
        self.rewards = []
        
    def forward(self, x):
        """
        Forward pass
        Args:
            x (torch.Tensor)
        """
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=-1)

#### Preview the environment

In [3]:
env = gym.make("CartPole-v1")

In [None]:
obs = env.reset()
next_obs, reward, done, info = env.step(env.action_space.sample())
env.render()
env.close()

In [4]:
next_obs

NameError: name 'next_obs' is not defined

In [5]:
env.observation_space.shape[0]

4

#### Preview the policy

## Algorithms

---
```
Input: a differentiable policy parameterization pi(a|s, theta)
Algorithm parameter: step size alpha > 0
Initialise policy parameter theta with dimension d'

Loop forever for each episode:
        Generate an episode S0, A0, R1, ..., ST-1, AT-1, RT, following pi(.|., theta) (def generate())
        Loop for each step of the episode t = 0, 1, ..., T-1
        G = sum(t+1:T)(gamma^(k-t-1))Rk (def calculate_reward)
        theta = theta + alpha * gamma^t * grad of ln pi(At|St, theta) * Gt
```
---
####  Policy Gradient as a supervised learning problem
http://karpathy.github.io/2016/05/31/rl/   
https://amoudgl.github.io/blog/policy-gradient/

> Okay, but what do we do if we do not have the correct label in the Reinforcement Learning setting?  
Here is the Policy Gradients solution (again refer to diagram below). Our policy network calculated probability of going UP as 30% (logprob -1.2) and DOWN as 70% (logprob -0.36). We will now sample an action from this distribution; E.g. suppose we sample DOWN, and we will execute it in the game. **At this point notice one interesting fact: We could immediately fill in a gradient of 1.0 for DOWN as we did in supervised learning, and find the gradient vector that would encourage the network to be slightly more likely to do the DOWN action in the future.**

> So we can immediately evaluate this gradient and that’s great, but the problem is that at least for now we do not yet know if going DOWN is good.   
But the critical point is that that’s okay, because we can simply wait a bit and see!  
For example in Pong we could wait until the end of the game, then take the reward we get (either +1 if we won or -1 if we lost), and enter that scalar as the gradient for the action we have taken (DOWN in this case). In the example below, going DOWN ended up to us losing the game (-1 reward). So if we fill in -1 for log probability of DOWN and do backprop we will find a gradient that discourages the network to take the DOWN action for that input in the future (and rightly so, since taking that action led to us losing the game)

> Policy gradients is exactly the same as supervised learning with two minor differences:  
    1) We don’t have the correct labels yi so as a “fake label” we substitute the action we happened to sample from the policy when it saw xi, and  
    2) We modulate the loss for each example multiplicatively based on the eventual outcome, since we want to increase the log probability for actions that worked and decrease it for those that didn’t.  
    So in summary our loss now looks like $\sum_{i}A_{i}\log p(yi∣xi)$, where yi is the action we happened to sample and Ai is a number that we call an advantage. 
 
i.e., using the sampled outcome of the SARSA sequence as the label

In [6]:
policy = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
eps = np.finfo(np.float32).eps.item()

In [7]:
policy.reset()

In [8]:
def act(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(Variable(state))
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs_list.append(m.log_prob(action))

    policy.saved_log_probs = torch.cat([
        policy.saved_log_probs,
        m.log_prob(action).reshape(1)
    ])

    return action.item()

In [9]:
def generate_episode(env, policy):
    """
    Generate an episode
    Args:
        env (gym.env)
    """
    
    obs = env.reset()
    ep_reward = 0
    done = False
    
    while True:
        action = act(obs)
        next_obs, reward, done, _ = env.step(action)
        policy.rewards.append(reward)
        ep_reward += reward
        obs = next_obs
        
        if done:
            break

In [10]:
def calculate_discount_return(rewards, gamma=0.99):
    """
    Calculate the discount return by given rewards series
    Args:
        rewards (np.array)
    """
    G = 0
    T = len(rewards)
    returns = []
    for r in rewards[::-1]:
        G = gamma * G + r
        returns.insert(0, G)
    
    # Flip the returns list
#     reversed_returns = reversed(returns)
    
    return torch.FloatTensor(returns)

In [17]:
def gradient_update(episode_series, gamma=0.9):
    """
    Calculate the discounted return by a given episode series
    We use the start value of the episode as the performance measure function
    Recall the return of a monte carlo policy is
    G = sum(t+1:T)(gamma^(k-t-1))Rk
    Args:
        episode_series (list)
        gamma (float)
    Return:
        Return (float)
    """

    advantage_torch = calculate_discount_return(policy.rewards)
    advantage_torch = (advantage_torch - advantage_torch.mean()) / \
        (advantage_torch.std() + np.finfo(np.float32).eps)
    probs_torch = torch.cat(policy.saved_log_probs_list)
    
    # Calculate performance measure (~loss) function
    # Use expected average reward as the advantage At

    loss = (torch.sum(torch.mul(probs_torch, advantage_torch).mul(-1), -1))
#     loss = torch.neg(torch.matmul(probs_torch, advantage_torch))

    # Update network weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Log the records
    policy.loss_history.append(loss.item())
    policy.reward_history.append(np.sum(policy.rewards))
    
    return loss, np.sum(policy.rewards)

In [18]:
print(policy)

PolicyNetwork(
  (fc1): Linear(in_features=4, out_features=64, bias=True)
  (dropout1): Dropout(p=0.5)
  (fc2): Linear(in_features=64, out_features=2, bias=True)
  (softmax): Softmax()
)


In [19]:
policy.reset()

In [13]:
generate_episode(env, policy)
# ep_loss, ep_rewards = gradient_update(policy.rewards)
# # running_rewards.append(ep_rewards)
# print(policy.saved_log_probs)
# policy.reset()

In [14]:
policy.saved_log_probs

tensor([-0.5822, -0.7809, -0.8175, -0.8424, -0.5350, -0.8495, -0.8854, -0.8996,
        -0.9243, -0.4889, -0.5024, -0.5173, -0.8920, -0.9038, -0.9264, -0.4933,
        -0.5075], grad_fn=<CatBackward>)

In [16]:
torch.cat(policy.saved_log_probs_list)

tensor([-0.5822, -0.7809, -0.8175, -0.8424, -0.5350, -0.8495, -0.8854, -0.8996,
        -0.9243, -0.4889, -0.5024, -0.5173, -0.8920, -0.9038, -0.9264, -0.4933,
        -0.5075], grad_fn=<CatBackward>)

In [None]:
# Generate 10 episodes
running_rewards = deque(maxlen=100)

for i in range(1000):
    generate_episode(env, policy)
    ep_loss, ep_rewards = gradient_update(policy.rewards)
    running_rewards.append(ep_rewards)
#     print(policy.saved_log_probs)
    policy.reset()
    
    if i % 100 == 0:

        print('\rEpisode: %s \tLoss: %s \tAverage episode Rewards: %s' % (i, ep_loss, np.mean(running_rewards)))

Episode: 0 	Loss: tensor(0.9638, grad_fn=<SumBackward2>) 	Average episode Rewards: 34.0
Episode: 100 	Loss: tensor(-2.5283, grad_fn=<SumBackward2>) 	Average episode Rewards: 90.05
Episode: 200 	Loss: tensor(-2.6714, grad_fn=<SumBackward2>) 	Average episode Rewards: 404.29
Episode: 300 	Loss: tensor(1.4144, grad_fn=<SumBackward2>) 	Average episode Rewards: 382.67