In [34]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [35]:
seed = 42
gamma = 0.99
max_steps_per_episode = 10000
env = gym.make("CartPole-v1")
obs, info = env.reset(seed=seed)
eps = np.finfo(np.float32).eps.item()

In [36]:
num_inputs = 4
num_actions = 2
num_hidden = 128

inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)

model = keras.Model(inputs=inputs, outputs=[action, critic])

In [37]:
optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0

In [49]:
while True:  # Run until solved
    state, info = env.reset(seed=seed)
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            # env.render(); Adding this line would show the attempts
            # of the agent in a pop up window.

            print(state)
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action probabilities and estimated future rewards
            # from environment state
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # Sample action from action probability distribution
            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            # Apply the sampled action in our environment
            state, reward, done, _, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculating loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log probability
            # of `log_prob` and ended up recieving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        # Backpropagation
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))

    if running_reward > 150:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break


[ 0.0273956  -0.00611216  0.03585979  0.0197368 ]
[ 0.02727336 -0.20172954  0.03625453  0.32351476]
[ 0.02323877 -0.39734846  0.04272482  0.62740684]
[ 0.0152918  -0.5930399   0.05527296  0.9332334 ]
[ 0.003431   -0.7888622   0.07393762  1.2427603 ]
[-0.01234624 -0.594763    0.09879284  0.9741247 ]
[-0.0242415  -0.7910615   0.11827533  1.2961347 ]
[-0.04006273 -0.98747045  0.14419802  1.6233817 ]
[-0.05981214 -1.1839656   0.17666565  1.9573127 ]
[ 0.0273956  -0.00611216  0.03585979  0.0197368 ]
[ 0.02727336  0.18847767  0.03625453 -0.26141977]
[ 0.03104291  0.38306385  0.03102613 -0.5424507 ]
[ 0.03870419  0.5777363   0.02017712 -0.8251987 ]
[ 0.05025892  0.7725766   0.00367314 -1.111468  ]
[ 0.06571045  0.5774066  -0.01855621 -0.817635  ]
[ 0.07725858  0.77277756 -0.03490891 -1.1160963 ]
[ 0.09271413  0.5781308  -0.05723084 -0.83456504]
[ 0.10427675  0.38383546 -0.07392213 -0.5604159 ]
[ 0.11195346  0.1898246  -0.08513046 -0.29190773]
[ 0.11574995 -0.00398689 -0.09096861 -0.02724144]
