# Solving Taxi-v2 with Policy Gradient

In [35]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import sys

from collections import deque

import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.distributions import Categorical

In [36]:
class PolicyNetwork(nn.Module):
    """
    Single-hidden-layer policy network that maps a state vector to action log-probabilities
    Besides the layers it carries the per-episode buffers (log-probs, rewards)
    and the training logs (loss, total reward)
    Args:
        n_inputs (int): size of the input state vector
        n_outputs (int): number of actions
    """

    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        self.n_inputs, self.n_outputs = n_inputs, n_outputs

        # Layers: input -> 128 hidden units -> dropout -> one score per action
        hidden_units = 128
        self.fc1 = nn.Linear(n_inputs, hidden_units)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_units, n_outputs)
        # Registered as a submodule but not used by forward()
        self.softmax = nn.Softmax(dim=-1)

        # Per-episode buffers, start empty
        self.reset()

        # Training logs, one entry per update
        self.loss_history = []
        self.reward_history = []

    def reset(self):
        """
        Empty the per-episode buffers of log-probabilities and rewards
        """
        self.saved_log_probs, self.rewards = [], []

    def forward(self, x):
        """
        Forward pass
        Args:
            x (torch.Tensor): state vector(s) whose last dimension is n_inputs
        Return:
            torch.Tensor: log-probabilities over the actions (log_softmax on the last dimension)
        """
        hidden = self.dropout1(F.relu(self.fc1(x)))
        scores = self.fc2(hidden)
        return F.log_softmax(scores, dim=-1)

#### Preview the environment

In [37]:
# Instantiate the Taxi-v2 environment from the gym registry.
# NOTE(review): newer gym releases may only register Taxi-v3 — confirm against the installed version.
env = gym.make("Taxi-v2")

In [6]:
# Reset the environment, take one randomly sampled action, render the resulting state, then close the environment.
obs = env.reset()
next_obs, reward, done, info = env.step(env.action_space.sample())
env.render()
env.close()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)


In [38]:
# Display the observation returned by the step above (an integer, as the cell output shows).
next_obs

266

In [39]:
# Report the number of discrete observations and actions exposed by the environment.
print("Observation space:", env.observation_space.n)
print("Action space:", env.action_space.n)

Observation space: 500
Action space: 6


#### Preview the policy

## Algorithms

---
```
Input: a differentiable policy parameterization pi(a|s, theta)
Algorithm parameter: step size alpha > 0
Initialise policy parameter theta with dimension d'

Loop forever for each episode:
        Generate an episode S0, A0, R1, ..., ST-1, AT-1, RT, following pi(.|., theta) (def generate_episode)
        Loop for each step of the episode t = 0, 1, ..., T-1:
                G = sum_{k=t+1}^{T} gamma^(k-t-1) * R_k (def calculate_discount_return)
                theta = theta + alpha * gamma^t * G * grad of ln pi(At|St, theta)
```
---

In [40]:
# Build the policy network sized to the environment's observation and action counts,
# and an Adam optimizer (learning rate 1e-2) over its parameters.
policy = PolicyNetwork(env.observation_space.n, env.action_space.n)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
# float32 machine epsilon as a Python float.
# NOTE(review): the functions visible in this section recompute it inline instead of reading `eps`.
eps = np.finfo(np.float32).eps.item()

In [41]:
# Start with empty per-episode buffers (saved log-probs and rewards).
policy.reset()

In [42]:
def act(state):
    """
    Sample an action for a state from the module-level policy and record its log-probability
    The log-probability is appended to policy.saved_log_probs for the later gradient update
    Args:
        state (int): discrete observation index, smaller than env.observation_space.n
    Return:
        action (int): index of the sampled action
    """
    # one_hot needs an int64 index tensor; force the dtype so that
    # platform-dependent numpy integer widths cannot break the encoding
    state = torch.as_tensor(state, dtype=torch.long)
    state = F.one_hot(state, num_classes=env.observation_space.n).float()

    # The network returns log-probabilities (log_softmax), so they have to be
    # handed over as logits; passed as `probs` the negative values would be
    # treated as (unnormalised) probabilities and give the wrong distribution
    log_probs = policy(state)
    m = Categorical(logits=log_probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))

    return action.item()

In [43]:
def generate_episode(env, policy):
    """
    Roll out one episode and record every step's reward in policy.rewards
    Actions are chosen by act(), which stores their log-probabilities
    NOTE(review): act() reads the module-level `policy`, not this argument —
    confirm both refer to the same network
    Args:
        env (gym.env): environment exposing reset() and a 4-tuple step()
        policy (PolicyNetwork): receives the per-step rewards
    Return:
        ep_reward (float): undiscounted sum of the rewards of the episode
    """
    obs = env.reset()
    ep_reward = 0
    done = False

    # done starts False, so at least one step is always taken
    while not done:
        action = act(obs)
        obs, reward, done, _ = env.step(action)
        policy.rewards.append(reward)
        ep_reward += reward

    # Previously accumulated and thrown away; returning it is harmless for
    # callers that ignore the result
    return ep_reward

In [44]:
def calculate_discount_return(rewards, gamma=0.99):
    """
    Calculate the discounted return G_t for every step of a reward series
    Uses the recursion G_t = r_t + gamma * G_{t+1}, accumulated from the last reward backwards
    Args:
        rewards (list or np.array): per-step rewards in time order
        gamma (float): discount factor
    Return:
        returns (torch.FloatTensor): G_t for each step, same order and length as rewards
    """
    G = 0
    returns = []
    # Append while walking backwards and flip once at the end: O(n) instead of
    # the O(n^2) of inserting at the front on every step
    for r in reversed(rewards):
        G = gamma * G + r
        returns.append(G)
    returns.reverse()

    return torch.FloatTensor(returns)

In [45]:
def gradient_update(episode_series, gamma=0.9):
    """
    Run one REINFORCE update of the module-level policy from a finished episode
    The discounted returns
    G = sum(t+1:T)(gamma^(k-t-1))Rk
    are standardised and used as weights on the log-probabilities stored in
    policy.saved_log_probs; the loss is the negated weighted sum
    Args:
        episode_series (list): per-step rewards of the episode, aligned with
            policy.saved_log_probs
        gamma (float): discount factor forwarded to calculate_discount_return
            (it used to be ignored, so the 0.99 default of that function applied)
    Return:
        loss (torch.Tensor): scalar loss of this update
        total_reward (float): undiscounted sum of the episode rewards
    """

    returns = calculate_discount_return(episode_series, gamma)
    # Standardise the returns to reduce variance. A single-step episode has an
    # undefined (NaN) standard deviation, which would poison the weights, so
    # it is left unnormalised
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / \
            (returns.std() + np.finfo(np.float32).eps)
    log_probs = torch.stack(policy.saved_log_probs)

    # Performance measure (~loss): minimising the negated dot product
    # maximises the return-weighted log-likelihood of the taken actions
    loss = torch.neg(torch.matmul(log_probs, returns))

    # Update network weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Log the records
    total_reward = np.sum(episode_series)
    policy.loss_history.append(loss.item())
    policy.reward_history.append(total_reward)

    return loss, total_reward

#### Running one step

In [46]:
# Show the layer structure of the policy network.
print(policy)

PolicyNetwork(
  (fc1): Linear(in_features=500, out_features=128, bias=True)
  (dropout1): Dropout(p=0.5)
  (fc2): Linear(in_features=128, out_features=6, bias=True)
  (softmax): Softmax()
)


In [47]:
# Clear the per-episode buffers before rolling out a single episode.
policy.reset()

In [48]:
# Roll out one episode, apply a single policy-gradient update from it,
# then inspect the log-probabilities stored during the rollout.
generate_episode(env, policy)
ep_loss, ep_rewards = gradient_update(policy.rewards)
print(policy.saved_log_probs)

[tensor(-1.8043, grad_fn=<SqueezeBackward1>), tensor(-1.7619, grad_fn=<SqueezeBackward1>), tensor(-1.8048, grad_fn=<SqueezeBackward1>), tensor(-1.8061, grad_fn=<SqueezeBackward1>), tensor(-1.8489, grad_fn=<SqueezeBackward1>), tensor(-1.7756, grad_fn=<SqueezeBackward1>), tensor(-1.8064, grad_fn=<SqueezeBackward1>), tensor(-1.7959, grad_fn=<SqueezeBackward1>), tensor(-1.7964, grad_fn=<SqueezeBackward1>), tensor(-1.8477, grad_fn=<SqueezeBackward1>), tensor(-1.7697, grad_fn=<SqueezeBackward1>), tensor(-1.8035, grad_fn=<SqueezeBackward1>), tensor(-1.7629, grad_fn=<SqueezeBackward1>), tensor(-1.7908, grad_fn=<SqueezeBackward1>), tensor(-1.7968, grad_fn=<SqueezeBackward1>), tensor(-1.7823, grad_fn=<SqueezeBackward1>), tensor(-1.7841, grad_fn=<SqueezeBackward1>), tensor(-1.8489, grad_fn=<SqueezeBackward1>), tensor(-1.7554, grad_fn=<SqueezeBackward1>), tensor(-1.8016, grad_fn=<SqueezeBackward1>), tensor(-1.8395, grad_fn=<SqueezeBackward1>), tensor(-1.7568, grad_fn=<SqueezeBackward1>), tensor(-1

#### Running one batch

In [51]:
# Clear the buffers left over from the previous cell before training starts.
policy.reset()

In [52]:
# Train for 5000 episodes, with one policy-gradient update per episode
# Rolling window holding the total reward of the 100 most recent episodes
running_rewards = deque(maxlen=100)

for i in range(5000):
    generate_episode(env, policy)
    ep_loss, ep_rewards = gradient_update(policy.rewards)
    running_rewards.append(ep_rewards)
    # Empty the per-episode buffers so the next rollout starts clean
    policy.reset()
    
    # Progress report every 1000 episodes; '\r' with end="" rewrites the same output line
    if i % 1000 == 0:
        print('\rEpisode: %s \tLoss: %s \tAverage episode Rewards: %s' % (i, ep_loss, np.mean(running_rewards)), end="")

Episode: 4000 	Loss: tensor(0.5291, grad_fn=<NegBackward>) 	Average episode Rewards: -220.887

## Reduce variance by using a critic
The problem with REINFORCE is that it has high variance; we can use a "baseline" to reduce it.

Therefore, we use a "critic" to estimate the action-value function, just as we did in value-function approximation
\begin{equation}
Q_{w}(s, a) \approx Q^{\pi_{\theta}}(s, a)
\end{equation}

Now, we have two sets of parameters:
1. Critic - Updates action-value function parameters $w$
2. Actor - Updates policy parameters $\theta$, in direction suggested by critic

Hence, the actor-critic algorithms follow an approximate policy gradient
\begin{equation}
\begin{split}
\nabla_{\theta} J(\theta) &= \mathop{\mathbb{E}_{\pi_{\theta}}}\big[\nabla_{\theta} \log \pi_{\theta}(s, a) Q_{w}(s,a) \big]\\
\Delta \theta &= \alpha \nabla_{\theta} \log \pi_{\theta}(s,a) Q_{w}(s,a)
\end{split}
\end{equation}

## From REINFORCE to Actor-Critic

The baseline can take various values. The set of equations below illustrates the classic variants of actor critic methods. 

\begin{equation}
\begin{split}
\nabla_{\theta}J(\theta) &= \mathop{\mathbb{E}_{\pi_{\theta}}}\big[\nabla_{\theta} \log \pi_{\theta}(s,a)G_{t} \big] \hspace{3cm} \text{REINFORCE}\\
&= \mathop{\mathbb{E}_{\pi_{\theta}}}\big[\nabla_{\theta} \log \pi_{\theta}(s,a)Q^{w}(s,a) \big] \hspace{2cm} \text{Q Actor-Critic}\\
&= \mathop{\mathbb{E}_{\pi_{\theta}}}\big[\nabla_{\theta} \log \pi_{\theta}(s,a)A^{w}(s,a) \big] \hspace{2cm} \text{Advantage Actor-Critic}\\
&= \mathop{\mathbb{E}_{\pi_{\theta}}}\big[\nabla_{\theta} \log \pi_{\theta}(s,a)\delta \big] \hspace{3.2cm} \text{TD Actor-Critic}\\
\end{split}
\end{equation}

Implementation examples: 
- https://www.datahubbs.com/policy-gradients-and-advantage-actor-critic/
- https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f

#### Algorithm
---
```
Input: a differentiable policy parameterization pi(a|s, theta)
Input: a differentiable action-value function parameterization Q_w(s, a, w)
Parameters: step sizes alpha_theta > 0; alpha_w > 0

Loop forever for each episode:

        Initialise S, theta
        Sample a in pi_theta
        
        Loop while S is not terminal for each time step:
                A = pi(.|S, theta) [policy(state)]
                Take action A, observe S', R
                delta = R + gamma * Q_w(S', A', w) - Q_w(S, A, w)  [TD(0) error, or advantage]
                theta = theta + alpha_theta * grad_pi log pi_theta(s,a) Q_w(S,A)     [policy gradient update]
                w = w + alpha_w * delta * x(s, a)    [TD(0)]
                A = A', S = S'
```
---

#### Create a new Value Network

In [53]:
class ValueNetwork(nn.Module):
    """
    Two-layer MLP for value approximation
    Maps a state vector to one output per action
    Args:
        state_size (int): size of the input state vector
        action_size (int): number of outputs
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        self.state_size, self.action_size = state_size, action_size

        # MLP layers: state -> 64 hidden units -> one value per action
        hidden_units = 64
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """
        Forward pass
        Args:
            x (torch.Tensor): state vector(s) whose last dimension is state_size
        Return:
            torch.Tensor: unnormalised outputs, last dimension action_size
        """
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

In [None]:
def actor_critic(env, policy_network, value_network, num_episodes, gamma=0.9)
    """
    Actor Critic Algorithm
    """
    
    for i in range(num_episodes):
        
        obs = env.reset()
        action = act(obs)
        
        while True:
            