# Actor-Critic Method

The actor-critic method is a temporal-difference, on-policy deep reinforcement learning algorithm. The 'actor' learns a policy and acts according to it, while the 'critic' evaluates this policy by estimating its value function and feeding that estimate back to the actor. This produces a more efficient algorithm for training the agent.

The code for training the agent with the actor-critic method on our custom environment was developed using the following Keras tutorial as a baseline, with further modifications: https://keras.io/examples/rl/actor_critic_cartpole/
 

In [None]:
# Import the environment and set the goal
from ipynb.fs.full.PianoHandv1 import *
# Goal passed to PianoHandEnv when the Agent class body builds the environment.
train_key = "C"

In [None]:
# Import libraries
import datetime
import os

import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Dense, Input

In [None]:
class Agent:
    """Actor-critic agent trained on ``PianoHandEnv`` for the goal in ``train_key``.

    Everything below up to ``create_model`` is a *class* attribute: it is
    evaluated once, when the class body runs (this includes constructing the
    environment, the optimizer and the TensorBoard writer), and it is shared
    by every ``Agent`` instance, including the mutable history lists.
    """

    # Create the gym environment using the specific goal
    env = PianoHandEnv(train_key)
    # Smallest float32 increment; keeps the return normalisation from dividing by zero
    eps = np.finfo(np.float32).eps.item()

    # Setting parameters
    gamma           = 0.9                            # Discount rate - how important are the immediate rewards vs later rewards
    learning_rate   = 0.1                            # Learning rate - the amount of change to the model during each step
    state_space     = env.observation_space.shape[0] # State space of the environment
    action_space    = env.action_space.n             # Action space of the environment
    hidden_layers   = 2                              # Passed to Dense as the unit count of the shared hidden layer
    max_steps       = 50                             # Maximum steps the agent is allowed to take
    total_episodes  = 100                            # Total episodes

    # Initialization for running tensorboard (one timestamped run directory under "logs")
    log_dir         = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    summary_writer  = tf.summary.create_file_writer(logdir=log_dir)

    # Initialising variables and arrays used in training
    actor_history           = []   # log-probability of each chosen action (cleared every episode)
    critic_history          = []   # critic value estimate at each step (cleared every episode)
    rewards_history         = []   # reward received at each step (cleared every episode)
    rewards_in_all_episodes = []   # one entry per episode, never cleared
    train_loss_results      = []   # one entry per episode, never cleared
    train_rewards_results   = []   # one entry per episode, never cleared

    # Count of how often each action was chosen. Sized from the action space
    # (previously a hard-coded 16) because it is indexed by the sampled action.
    action_distribution = np.zeros(action_space)
    done = False

    # The best optimizer determined is Root Mean Square Propagation (RMSProp) and the best loss function is Huber loss.
    optimizer = keras.optimizers.RMSprop(learning_rate)
    loss      = keras.losses.Huber()

    # Uncomment one of the following lines to use other optimizers/loss functions:
    # optimizer = keras.optimizers.Adagrad(learning_rate=0.1)
    # optimizer = keras.optimizers.Adam(learning_rate=0.1)
    # optimizer = keras.optimizers.SGD(learning_rate=0.1)
    # loss = keras.losses.KLDivergence()
    # loss = keras.losses.SquaredHinge()

    def create_model(self, state_space, action_space, hidden_layers):
        """Build the shared actor-critic network.

        A single dense (fully connected) layer feeds two output heads.

        Args:
            state_space: Length of the flat state vector fed to the network.
            action_space: Number of discrete actions (size of the actor head).
            hidden_layers: Passed to ``Dense`` as the number of units of the
                shared layer. NOTE: despite its name this is the layer's
                width, not a layer count; the value settled on after testing
                is 2.

        Returns:
            A ``keras.Model`` whose outputs are ``[actor, critic]``: the
            softmax action probabilities and a single value estimate.
        """
        inputs = Input(shape=(state_space,))
        common = Dense(hidden_layers, activation="relu")(inputs)     # The first layer is common to both actor and critic
        actor  = Dense(action_space, activation="softmax")(common)   # Output for the actor - probabilities for each action at a state
        critic = Dense(1)(common)                                    # Output for the critic - estimated total returns

        model = keras.Model(inputs=inputs, outputs=[actor, critic])
        return model

    @staticmethod
    def _flatten_state(st):
        """Flatten an observation into the 8-tuple ``(st[0][0], st[0][1], ..., st[3][1])``."""
        return tuple(st[i][j] for i in range(4) for j in range(2))

    def _normalized_returns(self, rewards):
        """Return the discounted, normalised return of every step, in time order.

        Args:
            rewards: Per-step rewards of one episode, first step first.

        Returns:
            A list of floats with one entry per reward; entry ``t`` is the
            normalised discounted return from step ``t`` onwards.
        """
        returns = []
        ds = 0.0  # Discounted sum
        # Starting from the last time-step and going to the first, the rewards
        # for each time step are discounted by the factor of gamma.
        for r in reversed(rewards):
            ds = r + self.gamma * ds
            returns.append(ds)
        # The loop produced the returns last-step-first; put them back in time
        # order so they line up with actor_history / critic_history.
        returns.reverse()

        returns = np.array(returns)
        # Since our reward structure ranges from -10 to 200, we normalized the returns.
        returns = (returns - np.mean(returns)) / (np.std(returns) + self.eps)
        return returns.tolist()

    def train(self):
        """Train a freshly built model for ``total_episodes`` episodes.

        Each episode runs at most ``max_steps`` environment steps, sampling
        actions from the actor's probabilities, then applies one gradient
        update from the summed actor and critic losses. Per-episode results
        are appended to the class-level result lists and written to
        TensorBoard; progress is printed every 10 episodes.

        Returns:
            None. The trained model is local to this method.
        """
        model = self.create_model(self.state_space, self.action_space, self.hidden_layers)

        for episode in range(self.total_episodes):
            state = self._flatten_state(self.env.reset())

            # To demonstrate how the losses and rewards change in real time using TensorBoard
            epoch_loss_avg = tf.keras.metrics.Mean()
            epoch_rewards_avg = tf.keras.metrics.Mean()
            epoch_critic_loss_avg = tf.keras.metrics.Mean()
            epoch_actor_loss_avg = tf.keras.metrics.Mean()

            # GradientTape is used for its automatic differentiation, which
            # drives the backpropagation step of this algorithm. Only the
            # forward passes and the loss computation need to be recorded.
            with tf.GradientTape() as tape:
                for timestep in range(self.max_steps):

                    # State is converted into the dimensions recognised by the model
                    state = tf.convert_to_tensor(state)
                    state = tf.expand_dims(state, 0)

                    action_probs, critic_value = model(state)
                    self.critic_history.append(critic_value[0, 0])

                    # Choosing an action based on the action probability
                    action = np.random.choice(self.action_space, p=np.squeeze(action_probs))

                    # Updating the action count based on the action chosen
                    self.action_distribution[action] += 1

                    self.actor_history.append(tf.math.log(action_probs[0, action]))
                    st, reward, done, final, link = self.env.step(action)
                    state = self._flatten_state(st)
                    self.rewards_history.append(reward)

                    if done:
                        # Render and save weights only for last episode
                        #if episode == (self.total_episodes-1):
                            #self.env.render(timestep, done)
                            #weight_path = "./" + train_key + ".h5"
                            #model.save_weights(weight_path)
                        break

                # NOTE(review): this records the reward of the final step only,
                # not the sum over the episode - confirm that is intended.
                self.rewards_in_all_episodes.append(reward)

                returns = self._normalized_returns(self.rewards_history)

                actor_losses = []
                critic_losses = []
                for log_prob, value, return_obtained in zip(
                    self.actor_history, self.critic_history, returns
                ):
                    # return_obtained is the (normalised) return the agent got
                    # and value is the return the critic estimated.
                    variation = return_obtained - value

                    # Updates the actor so that it gives a higher probability
                    # to the actions which give a higher reward.
                    actor_losses.append(-log_prob * variation)

                    # Updates the critic so that it can better approximate the
                    # future returns.
                    critic_losses.append(
                        self.loss(tf.expand_dims(value, 0), tf.expand_dims(return_obtained, 0))
                    )

                loss_value = sum(actor_losses) + sum(critic_losses)

            # Here is where backpropagation occurs: the gradient of the loss
            # with respect to the network's weights is computed from the tape
            # (outside its context, so the gradient ops are not recorded).
            grads = tape.gradient(loss_value, model.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, model.trainable_variables))

            epoch_loss_avg(loss_value)
            epoch_rewards_avg(reward)
            epoch_critic_loss_avg(sum(critic_losses))
            epoch_actor_loss_avg(sum(actor_losses))

            self.train_loss_results.append(epoch_loss_avg.result())
            self.train_rewards_results.append(epoch_rewards_avg.result())

            # The per-step buffers are shared class-level lists; empty them in
            # place before the next episode.
            self.actor_history.clear()
            self.critic_history.clear()
            self.rewards_history.clear()

            episodes_done = episode + 1
            if episodes_done % 10 == 0:
                print("Episode {} Average rewards = {}".format(episodes_done, np.mean(self.rewards_in_all_episodes)), end='\r')

            # Display results in tensorboard
            with self.summary_writer.as_default():
                tf.summary.scalar('epoch_loss_avg', epoch_loss_avg.result(), step=self.optimizer.iterations)
                tf.summary.scalar('epoch_reward_avg', epoch_rewards_avg.result(), step=self.optimizer.iterations)
                tf.summary.scalar('epoch_critic_loss_avg', epoch_critic_loss_avg.result(), step=self.optimizer.iterations)
                tf.summary.scalar('epoch_actor_loss_avg', epoch_actor_loss_avg.result(), step=self.optimizer.iterations)
               

In [None]:
# Train agent
# The environment, optimizer and TensorBoard writer are class attributes of
# Agent, so they already exist by the time an instance is created here.
agent = Agent()
# Runs the full training loop (Agent.total_episodes episodes); the average
# reward is printed every 10 episodes and metrics are written to TensorBoard.
agent.train()

In [None]:
# Tensorboard 
# Raw string: "\l" is not a recognised escape sequence, so the plain literal
# triggers an invalid-escape warning on recent Python versions. The value is
# unchanged (".", a literal backslash, then "logs").
log_dir = r".\logs"
# Load the TensorBoard notebook extension, then open it on the "logs"
# directory, the top-level directory under which the Agent class creates its
# timestamped run directory.
%load_ext tensorboard
%tensorboard --logdir logs