# Actor-Critic-TensorFlow

Here we use the `Keras` API of `TensorFlow 2.0` to play the `Acrobot-v1` game.

The acrobot system includes two joints and two links, where the joint between the two links is actuated. Initially, the links are hanging downwards, and the goal is to swing the end of the lower link up to a given height.

### 1. Import Libraries

In [1]:
import gym
from gym.wrappers import Monitor

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

%matplotlib inline

  from ._conv import register_converters as _register_converters


### 2. Define a class for Actor

In [2]:
class Actor:
    """Policy network that samples discrete actions from a softmax output.

    The network is a small MLP (two hidden ReLU layers of 16 units) whose
    final layer has one softmax unit per action. It is updated with the loss
    ``-td_error * log(pi(action | state))``, where ``td_error`` is supplied
    by the caller.

    Args:
        n_features: Length of the state vector fed to the network.
        n_actions: Number of discrete actions (size of the softmax layer).
        learning_rate: Step size handed to the Adam optimizer.
        discount: Stored as ``self.gamma``; no method of this class reads it.
    """

    def __init__(self, n_features, n_actions, learning_rate, discount):
        self.n_features = n_features
        self.n_actions = n_actions
        self.lr = learning_rate
        self.gamma = discount

        self._build_model()

    def _build_model(self):
        """Create the Keras model and its Adam optimizer."""
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(16, activation='relu', input_shape=(self.n_features,)),
            tf.keras.layers.Dense(16, activation='relu'),
            tf.keras.layers.Dense(self.n_actions, activation='softmax'),
        ])
        self.optimizer = tf.keras.optimizers.Adam(self.lr)

    def _grad(self, state, action, td_error):
        """Return the gradients of the policy loss w.r.t. the model weights."""
        with tf.GradientTape() as tape:
            objective = self._loss(state, action, td_error)
        weights = self.model.trainable_variables
        return tape.gradient(objective, weights)

    def _loss(self, state, action, td_error):
        """Return ``-td_error * log(p)`` for the probability of ``action``.

        Only row 0 of the model output is used, so ``state`` is expected to
        be a batch holding a single state.
        """
        action_prob = self.model(state)[0, action]
        return - td_error * tf.math.log(action_prob)

    def choose_action(self, state):
        """Sample an action index from the policy's output for ``state``."""
        # The model works on batches, so add a leading batch axis of size 1.
        batch = np.expand_dims(state, axis=0)
        distribution = self.model.predict(batch).ravel()
        return np.random.choice(self.n_actions, p=distribution)

    def learn(self, state, action, td_error):
        """Apply one optimizer step for a single (state, action) sample."""
        batch = np.expand_dims(state, axis=0)
        gradients = self._grad(batch, action, td_error)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))
        

### 3. Define a class for Critic

In [3]:
class Critic:
    """State-value network trained on the squared one-step TD error.

    The network is a small MLP (two hidden ReLU layers of 8 units) with a
    single linear output interpreted as V(state).

    Args:
        n_features: Length of the state vector fed to the network.
        learning_rate: Step size handed to the Adam optimizer.
        discount: Discount factor ``gamma`` applied to the next-state value.
    """

    def __init__(self, n_features, learning_rate, discount):
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = discount

        self._build_model()

    def _build_model(self):
        """Create the Keras model and its Adam optimizer."""
        self.model = tf.keras.Sequential()
        self.model.add(tf.keras.layers.Dense(8, activation='relu', input_shape=(self.n_features,)))
        self.model.add(tf.keras.layers.Dense(8, activation='relu'))
        self.model.add(tf.keras.layers.Dense(1, activation=None))

        self.optimizer = tf.keras.optimizers.Adam(self.lr)

    def _grad(self, state, next_state, reward):
        """Return ``(td_error, gradients)`` for one transition.

        The gradients are those of the squared TD error with respect to the
        model's trainable variables.
        """
        with tf.GradientTape() as tape:
            td_error, loss_value = self._loss(state, next_state, reward)
        return td_error, tape.gradient(loss_value, self.model.trainable_variables)

    def _loss(self, state, next_state, reward):
        """Return ``(td_error, td_error ** 2)`` for one transition.

        ``reward`` is an explicit argument so the result depends only on the
        transition passed in, never on a module-level variable.
        """
        v = self.model(state)
        v_ = self.model(next_state)
        # NOTE: v_ is computed inside the caller's GradientTape, so the
        # gradient of the loss also flows through the next-state value.
        td_error = reward + self.gamma * v_ - v

        return td_error, tf.square(td_error)

    def learn(self, state, reward, next_state):
        """Run one optimizer step on a transition and return its TD error.

        Args:
            state: State before the action (a leading batch axis is added here).
            reward: Reward received for the transition.
            next_state: State after the action (a leading batch axis is added here).

        Returns:
            The TD error ``reward + gamma * V(next_state) - V(state)`` as
            computed before the weights are updated.
        """
        state = np.expand_dims(state, axis=0)
        next_state = np.expand_dims(next_state, axis=0)

        # Forward the reward explicitly; previously the loss read a global.
        td_error, grads = self._grad(state, next_state, reward)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        return td_error

### 4. Let's train an A2C

##### 4.1 Create an environment and wrap it by `Monitor` for recording

In [4]:
# Build the Acrobot environment and wrap it in Monitor, which writes its
# recordings to ./video. A video is requested whenever the count that Monitor
# passes to the callable is a positive multiple of 50.
env = Monitor(
    gym.make('Acrobot-v1'),
    directory='./video',
    video_callable=lambda episode_id: episode_id > 0 and episode_id % 50 == 0,
)

# Reset once and echo the first observation so its layout can be inspected.
state = env.reset()
state

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


  result = entry_point.load(False)


array([ 0.99926078,  0.03844342,  0.99999821, -0.00189067,  0.05553292,
       -0.07179652])

##### 4.2 Create an actor and a critic

In [5]:
# Both networks take the observation vector as input; the actor additionally
# needs one output per discrete action of the environment.
n_features = len(state)
n_actions = env.action_space.n

actor = Actor(
    n_features=n_features,
    n_actions=n_actions,
    learning_rate=0.001,
    discount=0.9,
)
critic = Critic(
    n_features=n_features,
    learning_rate=0.001,
    discount=0.9,
)

##### 4.3 Start to train

The reward is designed so that its value is 0 when the goal is reached and -1 otherwise.

For simplicity, we don't need to modify the reward.

In [6]:
# Online actor-critic training: after every environment step the critic is
# updated first, and the TD error it returns is used to update the actor.
step = 0
periods, rewards = [], []

for episode in range(100):
    state = env.reset()
    period, total_reward = 0, 0
    done = False

    while not done:
        env.render()

        action = actor.choose_action(state)
        next_state, reward, done, info = env.step(action)
        total_reward += reward

        td_error = critic.learn(state, reward, next_state)
        actor.learn(state, action, td_error)

        state = next_state

        # The counters only advance on non-terminal steps, so the step that
        # ends the episode is not included in `period`.
        if not done:
            step += 1
            period += 1

    print('Episode: {:3d}, period: {:3d}, reward: {:3.3f}'.format(episode, period, total_reward))
    periods.append(period)
    rewards.append(total_reward)

env.close()
print('game over')

Episode:   0, period: 499, reward: -500.000
Episode:   1, period: 499, reward: -500.000
Episode:   2, period: 499, reward: -500.000
Episode:   3, period: 499, reward: -500.000
Episode:   4, period: 321, reward: -321.000
Episode:   5, period: 249, reward: -249.000
Episode:   6, period:  85, reward: -85.000
Episode:   7, period: 132, reward: -132.000
Episode:   8, period:  97, reward: -97.000
Episode:   9, period: 149, reward: -149.000
Episode:  10, period:  93, reward: -93.000
Episode:  11, period: 112, reward: -112.000
Episode:  12, period:  95, reward: -95.000
Episode:  13, period:  98, reward: -98.000
Episode:  14, period: 109, reward: -109.000
Episode:  15, period:  83, reward: -83.000
Episode:  16, period:  99, reward: -99.000
Episode:  17, period: 109, reward: -109.000
Episode:  18, period:  84, reward: -84.000
Episode:  19, period: 113, reward: -113.000
Episode:  20, period:  97, reward: -97.000
Episode:  21, period:  83, reward: -83.000
Episode:  22, period: 200, reward: -200.00