### Advantage Actor Critic implementation

An example implementation adapted from the tutorial at http://inoryy.com/post/tensorflow2-deep-reinforcement-learning/

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend

import time
import random


# Seed every random number generator imported above from one constant.
# Seeding NumPy alone does not make a run repeatable: actions are sampled
# with TensorFlow ops, so TensorFlow needs a seed as well.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
# `tf.compat.v1.set_random_seed` exists in both TF 1.15 and TF 2.x.
# NOTE(review): in graph mode this seeds the current default graph only;
# a freshly created `tf.Graph()` would need its own seed -- confirm if needed.
tf.compat.v1.set_random_seed(SEED)


"tensorflow version", tf.__version__

('tensorflow version', '1.15.0')

In [2]:
import gym
import tensorflow_probability as tfp

In [11]:
import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as kl

In [12]:
class ProbabilityDistribution(tf.keras.Model):
    """Keras model whose forward pass samples one categorical action per row."""

    def call(self, logits, **kwargs):
        # Draw exactly one sample for each row of logits ...
        sampled = tf.random.categorical(logits, 1)
        # ... then drop the trailing "number of samples" axis.
        return tf.squeeze(sampled, axis=-1)
    
    
class Model(tf.keras.Model):
    """Actor-critic network with separate policy and value branches.

    Each branch has its own 128-unit ReLU hidden layer fed directly from the
    input; the policy branch ends in `num_actions` logits and the value branch
    ends in a single scalar.
    """

    def __init__(self, num_actions):
        """Create the layers.

        Args:
            num_actions: number of discrete actions, i.e. the width of the
                policy logits output.
        """
        # Pass the name by keyword. `tf.keras.Model.__init__` reserves
        # positional arguments for the functional-API `inputs`/`outputs`, so a
        # positional name is not applied to a subclassed model.
        # NOTE(review): depending on the Keras version a positional name is
        # either silently dropped or rejected -- the keyword form works in both.
        super().__init__(name='mlp_policy')
        # Note: no tf.get_variable(), just simple Keras API!
        self.hidden1 = kl.Dense(128, activation='relu')
        self.hidden2 = kl.Dense(128, activation='relu')
        self.value = kl.Dense(1, name='value')
        # Logits are unnormalized log probabilities.
        self.logits = kl.Dense(num_actions, name='policy_logits')
        self.dist = ProbabilityDistribution()

    def call(self, inputs, **kwargs):
        """Forward pass.

        Args:
            inputs: batch of observations (array-like; converted to a tensor).

        Returns:
            Tuple `(logits, value)` from the policy and value branches.
        """
        # Inputs is a numpy array, convert to a tensor.
        x = tf.convert_to_tensor(inputs)
        # The two hidden layers are NOT stacked: each one reads the raw input
        # and feeds only its own output head.
        hidden_logs = self.hidden1(x)
        hidden_vals = self.hidden2(x)
        return self.logits(hidden_logs), self.value(hidden_vals)

    def action_value(self, obs):
        """Sample an action and estimate the state value for a batch of observations.

        Args:
            obs: batch of observations, e.g. `single_obs[None, :]`.

        Returns:
            Tuple `(action, value)`, each with its last axis squeezed away.
        """
        # `predict_on_batch` executes `call()` under the hood.
        logits, value = self.predict_on_batch(obs)
        # Sampling goes through a Keras model as well, so it is evaluated via
        # the same `predict_on_batch` API as the network itself.
        # Another way to sample actions:
        #   action = tf.random.categorical(logits, 1)
        action = self.dist.predict_on_batch(logits)
        return np.squeeze(action, axis=-1), np.squeeze(value, axis=-1)

### Useful example of how to debug RL algorithms

In [16]:
# env = gym.make('CartPole-v0')
# model = Model(num_actions=env.action_space.n)

# obs = env.reset()
# # No feed_dict or tf.Session() needed at all!
# action, value = model.action_value(obs[None, :])
# print(action, value) # [1] [-0.00145713]

# class A2CAgent:
#     def __init__(self, model):
#         self.model = model

#     def test(self, env, render=True):
#         obs, done, ep_reward = env.reset(), False, 0
#         while not done:
#             action, _ = self.model.action_value(obs[None, :])
#             obs, reward, done, _ = env.step(action)
#             ep_reward += reward
#             if render:
#                 env.render()
#         return ep_reward


# agent = A2CAgent(model)
# rewards_sum = agent.test(env)
# print("%d out of 200" % rewards_sum)

In [17]:
import tensorflow.keras.losses as kls
import tensorflow.keras.optimizers as ko


class A2C:
    '''Advantage Actor Critic Agent.

    Wraps a Keras model with two outputs (policy logits and a value estimate)
    and trains it on fixed-size batches of transitions collected from a
    gym-style environment.
    '''
    def __init__(self, model, lr=7e-3, gamma=0.99, value_c=0.5, entropy_c=1e-4):
        '''Store hyper-parameters and compile `model` with the A2C losses.

        Args:
            model: Keras model exposing `compile`, `train_on_batch` and an
                `action_value(obs)` method returning `(action, value)`.
            lr: learning rate handed to the RMSprop optimizer.
            gamma: discount factor used when computing returns.
            value_c: coefficient applied to the value (critic) loss.
            entropy_c: coefficient applied to the entropy bonus.
        '''
        # Coefficients are used for the loss terms.
        self.value_c = value_c
        self.entropy_c = entropy_c

        # `gamma` is the discount factor
        self.gamma = gamma

        self.model = model
        self.model.compile(
          # `learning_rate` is the supported argument name; the `lr` alias is
          # deprecated in tf.keras optimizers.
          optimizer=ko.RMSprop(learning_rate=lr),
          # Define separate losses for policy logits and value estimate.
          loss=[self._logits_loss, self._value_loss])

    def test(self, env, render=False):
        '''Run one episode with the current policy and return its total reward.

        Args:
            env: environment with `reset()` and a `step(action)` that returns
                `(obs, reward, done, info)`.
            render: when true, call `env.render()` after every step.

        Returns:
            Sum of the rewards received during the episode.
        '''
        obs, done, ep_reward = env.reset(), False, 0
        while not done:
            # The model works on batches, so add a leading batch axis.
            action, _ = self.model.action_value(obs[None, :])
            obs, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
        return ep_reward

    def train(self, env, batch_sz=64, updates=250):
        '''Collect transitions and update the model.

        Args:
            env: environment with `reset()`, `step(action)` and an
                `observation_space.shape` attribute.
            batch_sz: number of environment steps collected per update.
            updates: number of gradient updates to perform.

        Returns:
            List of per-episode reward totals; the last entry belongs to the
            episode that was still in progress when training stopped.
        '''
        # Storage helpers for a single batch of data (reused for every update).
        actions = np.empty((batch_sz,), dtype=np.int32)
        rewards, dones, values = np.empty((3, batch_sz))
        observations = np.empty((batch_sz,) + env.observation_space.shape)

        # Training loop: collect samples, send to optimizer, repeat updates times.
        ep_rewards = [0.0]
        next_obs = env.reset()
        for update in range(updates):
            for step in range(batch_sz):
                observations[step] = next_obs.copy()
                actions[step], values[step] = self.model.action_value(next_obs[None, :])
                next_obs, rewards[step], dones[step], _ = env.step(actions[step])

                ep_rewards[-1] += rewards[step]
                if dones[step]:
                    # Episode finished: open a new reward slot, reset the
                    # environment and report the episode that just ended.
                    ep_rewards.append(0.0)
                    next_obs = env.reset()
                    print("Episode: %03d, Reward: %03d" % (
                        len(ep_rewards) - 1, ep_rewards[-2]))

            # Value of the state following the last collected step; used to
            # bootstrap the returns of a batch that ends mid-episode.
            _, next_value = self.model.action_value(next_obs[None, :])

            returns, advs = self._returns_advantages(rewards, dones, values, next_value)
            # A trick to input actions and advantages through same API.
            acts_and_advs = np.concatenate([actions[:, None], advs[:, None]], axis=-1)

            # Performs a full training step on the collected batch.
            # Note: no need to mess around with gradients, Keras API handles it.
            losses = self.model.train_on_batch(observations, [acts_and_advs, returns])

            print("[%d/%d] Losses: %s" % (update + 1, updates, losses))

        return ep_rewards

    def _returns_advantages(self, rewards, dones, values, next_value):
        '''Compute discounted returns and advantages for one batch.

        Args:
            rewards: 1-D array of rewards, one per step.
            dones: 1-D array of episode-termination flags (non-zero = done).
            values: 1-D array of value estimates for the visited states.
            next_value: value estimate for the state after the last step,
                appended to the returns as the bootstrap term.

        Returns:
            Tuple `(returns, advantages)`, both the same length as `rewards`.
        '''
        # `next_value` is the bootstrap value estimate of the future state (critic).
        returns = np.append(np.zeros_like(rewards), next_value, axis=-1)

        # Returns are calculated as discounted sum of future rewards.
        # The `(1 - dones[t])` factor stops the sum at episode boundaries.
        for t in reversed(range(rewards.shape[0])):
            returns[t] = rewards[t] + self.gamma * returns[t + 1] * (1 - dones[t])
        # Drop the bootstrap slot so the result lines up with `rewards`.
        returns = returns[:-1]

        # Advantages are equal to returns - baseline (value estimates in our case).
        advantages = returns - values

        return returns, advantages


    def _value_loss(self, returns, value):
        '''Critic loss: `value_c` times the MSE between returns and value estimates.'''
        # Value loss is typically MSE between value estimates and returns.
        return self.value_c * kls.mean_squared_error(returns, value)

    def _logits_loss(self, actions_and_advantages, logits):
        '''Actor loss: advantage-weighted cross-entropy minus an entropy bonus.

        Args:
            actions_and_advantages: tensor whose last axis packs
                `[action, advantage]` (see `train`).
            logits: policy logits produced by the model.
        '''
        # A trick to input actions and advantages through the same API.
        actions, advantages = tf.split(actions_and_advantages, 2, axis=-1)

        # Sparse categorical CE loss obj that supports sample_weight arg on `call()`.
        # `from_logits` argument ensures transformation into normalized probabilities.
        weighted_sparse_ce = kls.SparseCategoricalCrossentropy(from_logits=True)

        # Policy loss is defined by policy gradients, weighted by advantages.
        # Note: we only calculate the loss on the actions we've actually taken.
        actions = tf.cast(actions, tf.int32)
        policy_loss = weighted_sparse_ce(actions, logits, sample_weight=advantages)

        # Entropy loss can be calculated as cross-entropy over itself.
        probs = tf.nn.softmax(logits)
        entropy_loss = kls.categorical_crossentropy(probs, probs)

        # We want to minimize policy and maximize entropy losses.
        # Here signs are flipped because the optimizer minimizes.
        return policy_loss - self.entropy_c * entropy_loss

In [18]:

# Build the environment and the model in this cell so it runs on a fresh
# kernel instead of relying on objects left over from other cells.
env = gym.make('CartPole-v0')
model = Model(num_actions=env.action_space.n)

agent = A2C(model)
rewards_history = agent.train(env)
print("Finished training, testing...")
print("%d out of 200" % agent.test(env))

Episode: 001, Reward: 019
Episode: 002, Reward: 018
Episode: 003, Reward: 024
[1/250] Losses: [69.9031753540039, 6.674508571624756, 63.22866439819336]
Episode: 004, Reward: 016
Episode: 005, Reward: 011
[2/250] Losses: [146.6148223876953, 8.87785816192627, 137.73696899414062]
Episode: 006, Reward: 051
Episode: 007, Reward: 049
[3/250] Losses: [227.6979217529297, 10.87327766418457, 216.82464599609375]
Episode: 008, Reward: 063
[4/250] Losses: [355.46429443359375, 14.739948272705078, 340.7243347167969]
[5/250] Losses: [445.7047119140625, 17.5133113861084, 428.19140625]
Episode: 009, Reward: 093
Episode: 010, Reward: 017
[6/250] Losses: [66.22960662841797, 6.010138511657715, 60.21946716308594]
Episode: 011, Reward: 031
Episode: 012, Reward: 048
[7/250] Losses: [194.1044921875, 9.785928726196289, 184.3185577392578]
Episode: 013, Reward: 014
Episode: 014, Reward: 019
Episode: 015, Reward: 018
[8/250] Losses: [44.76246643066406, 4.429111003875732, 40.33335494995117]
Episode: 016, Reward: 061

In [30]:
env = gym.make('CartPole-v0')

# Build and train the model inside an explicit graph (non-eager execution).
# The model is created only here, inside the graph context, so that its
# layers and the training ops all live in the same graph.
with tf.Graph().as_default():
    print(tf.executing_eagerly()) # False

    model = Model(num_actions=env.action_space.n)
    agent = A2C(model)

    rewards_history = agent.train(env)
    print("Finished training, testing...")
    print("%d out of 200" % agent.test(env)) # e.g. 200 out of 200

False
Episode: 001, Reward: 017
Episode: 002, Reward: 026
[1/250] Losses: [82.02988, 7.392539, 74.63734]
Episode: 003, Reward: 023
Episode: 004, Reward: 042
Episode: 005, Reward: 015
[2/250] Losses: [162.14333, 9.606596, 152.53673]
Episode: 006, Reward: 011
Episode: 007, Reward: 010
Episode: 008, Reward: 022
Episode: 009, Reward: 012
[3/250] Losses: [42.31421, 4.928278, 37.385933]
Episode: 010, Reward: 022
Episode: 011, Reward: 011
Episode: 012, Reward: 012
[4/250] Losses: [89.80336, 6.76303, 83.04033]
Episode: 013, Reward: 041
Episode: 014, Reward: 022
[5/250] Losses: [115.04283, 8.0042305, 107.038605]
Episode: 015, Reward: 035
Episode: 016, Reward: 013
[6/250] Losses: [247.10014, 11.977857, 235.12228]
Episode: 017, Reward: 052
Episode: 018, Reward: 032
Episode: 019, Reward: 012
[7/250] Losses: [83.11281, 6.5223646, 76.59045]
Episode: 020, Reward: 070
[8/250] Losses: [241.58469, 12.514875, 229.06981]
Episode: 021, Reward: 029
Episode: 022, Reward: 017
[9/250] Losses: [85.89472, 6.7527

In [34]:

# NOTE: stable_baselines imports `tensorflow.contrib`, which only exists in
# TensorFlow 1.x; under TensorFlow 2.x this cell fails with
# "ModuleNotFoundError: No module named 'tensorflow.contrib'".
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
# Import under an alias so the `A2C` class defined earlier in this notebook
# is not overwritten by the library class of the same name.
from stable_baselines import A2C as StableBaselinesA2C

# Vectorized environment wrapper (a single environment here).
env = make_vec_env('CartPole-v1', n_envs=1)

model = StableBaselinesA2C(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("a2c_cartpole")

# Test of A2C
# obs = env.reset()
# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     env.render()

ModuleNotFoundError: No module named 'tensorflow.contrib'