## Import Dependencies

In [14]:
import gym

import numpy as np
import tensorflow as tf
import tensorflow.keras as ks
import os

## Environment creation

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The pendulum is placed upright on the cart and the goal is to balance the pole by applying forces in the left and right direction on the cart.

In [15]:
seed = 14

# Seed every random source the notebook draws from, not just the environment:
# NumPy samples the actions and TensorFlow initialises the network weights.
np.random.seed(seed)
tf.random.set_seed(seed)

# Create the CartPole environment and seed its internal random generator.
# env.seed stays the last expression so the cell still displays its return value.
env = gym.make("CartPole-v0")
env.seed(seed)

[14]

## Define the Actor-Critic network

As an agent takes actions and moves through an environment, it learns to map the observed state of the environment to two possible outputs: 

1. **Recommended action**: A probability value for each action in the action space. The part of the agent responsible for this output is called the **actor**. 

2. **Estimated rewards in the future**: Sum of all rewards it expects to receive in the future. The part of the agent responsible for this output is the **critic**. 

The actor and the critic learn to perform their tasks such that the actions recommended by **the actor maximize the rewards**.

In [16]:
# Read the problem dimensions from the environment: how many discrete actions
# are available and the shape of a single observation.
num_actions = env.action_space.n
input_shape = env.observation_space.shape

def actor_critic(input_shape, n_actions=None, hidden_units=128):
    """Build a Keras model with one shared hidden layer and two output heads.

    Args:
        input_shape: Shape passed to ``ks.layers.Input`` for one observation.
        n_actions: Width of the softmax actor head. When omitted, the
            notebook-level ``num_actions`` is used, as before.
        hidden_units: Width of the shared ReLU layer (128 by default).

    Returns:
        A ``ks.Model`` whose outputs are ``[actor, critic]``: the action
        probabilities and a single unactivated value estimate.
    """
    # Fall back to the global so existing `actor_critic(input_shape)` calls
    # keep working unchanged.
    if n_actions is None:
        n_actions = num_actions

    inputs = ks.layers.Input(input_shape)

    # Common layer shared by both heads
    x = ks.layers.Dense(hidden_units, activation="relu")(inputs)

    # Actor: one probability per action
    actor = ks.layers.Dense(n_actions, activation="softmax")(x)

    # Critic: scalar estimate of the future reward
    critic = ks.layers.Dense(1)(x)

    return ks.Model(inputs=inputs, outputs=[actor, critic])


# Instantiate the two-headed network for the environment's observation shape.
model = actor_critic(input_shape)

# Training tools: the Huber loss scores the critic's value estimates and
# Adam applies the parameter updates.
huber_loss = ks.losses.Huber()
optimizer = ks.optimizers.Adam(learning_rate=1e-2)

## Training

To train the agent, you will follow these steps:

1. Run the agent on the environment to **collect training data** per episode.
2. Compute **expected return** at each time step.
3. **Compute the loss** for the combined actor-critic model.
4. Compute gradients and **update network** parameters.
5. Repeat steps 1-4 until either the success criterion is met or the maximum number of episodes has been reached.

In [17]:
# HYPER-PARAMETERS

# Discount factor applied to future rewards
gamma = 0.99

# Training budget: number of episodes and the step cap within one episode
episodes = 100
max_steps_per_episode = 10_000

# float32 machine epsilon (smallest number such that 1.0 + eps != 1.0),
# stored as a plain Python float
eps = float(np.finfo(np.float32).eps)

In [18]:
# Per-episode buffers: log-probabilities of the chosen actions, the critic's
# value estimates, and the rewards received. Each is its own list.
action_probs_history, critic_value_history, reward_history = [], [], []

# Smoothed episode reward, updated after every episode
running_reward = 0

In [19]:
# Running-average reward at which the task counts as solved and training stops
# early (195 is the usual threshold quoted for CartPole-v0).
solved_reward = 195

try:
    for episode in range(1, episodes + 1):

        state = env.reset()
        episode_reward = 0

        # Only the forward passes and the loss have to be recorded on the tape.
        with tf.GradientTape() as tape:

            # Generate one episode of at most `max_steps_per_episode` steps
            for _step in range(max_steps_per_episode):

                env.render()

                # The model expects a batch dimension in front of the observation
                state = tf.convert_to_tensor(state)
                state = tf.expand_dims(state, 0)

                # Evaluate action probs and estimate reward
                action_probs, critic_value = model(state)
                critic_value_history.append(critic_value[0, 0])

                # Sample the action from the actor's distribution and keep its
                # log-probability for the policy-gradient term
                action = np.random.choice(num_actions, p=np.squeeze(action_probs))
                action_probs_history.append(tf.math.log(action_probs[0, action]))

                # Apply action to the environment
                state, reward, done, _ = env.step(action)
                reward_history.append(reward)
                episode_reward += reward

                if done:
                    break

            # Discounted return for every step, accumulated backwards in time.
            # Appending and reversing once avoids the quadratic insert(0, ...).
            returns = []
            discounted_sum = 0
            for r in reversed(reward_history):
                discounted_sum = r + gamma * discounted_sum
                returns.append(discounted_sum)
            returns.reverse()

            # Normalize (eps guards against a zero standard deviation)
            returns = np.array(returns)
            returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
            returns = returns.tolist()

            actor_losses = []
            critic_losses = []
            for log_prob, value, ret in zip(action_probs_history, critic_value_history, returns):
                # NOTE(review): `diff` still depends on `value`, so the actor term
                # also sends gradients into the critic head; wrap `diff` in
                # tf.stop_gradient if that coupling is not wanted.
                diff = ret - value
                actor_losses.append(-log_prob * diff)
                critic_losses.append(huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))

            total_loss = sum(actor_losses) + sum(critic_losses)

        # Backpropagation
        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Exponential moving average of the episode reward
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        reward_history.clear()

        # Log details
        if episode % 10 == 0:
            print(f'running reward: {running_reward:.2f} at episode {episode}')

        # Success criterion: stop once the smoothed reward passes the threshold
        if running_reward > solved_reward:
            print(f'Solved at episode {episode}!')
            break
finally:
    # Runs on errors and interrupts too: drop any half-collected episode so a
    # re-run of this cell starts clean, and release the render window.
    action_probs_history.clear()
    critic_value_history.clear()
    reward_history.clear()
    env.close()

running reward: 9.54 at episode 10
running reward: 21.77 at episode 20
running reward: 32.51 at episode 30
running reward: 34.57 at episode 40
running reward: 37.97 at episode 50
running reward: 68.99 at episode 60
running reward: 50.89 at episode 70
running reward: 44.06 at episode 80
running reward: 67.44 at episode 90
running reward: 111.75 at episode 100
