In [1]:
import numpy as np
import tensorflow as tf
import gym

# Warnings configuration
# ------------------------------------------------------------------------------
import warnings
# Alternative kept for reference: warnings.filterwarnings('once')
# Install an 'ignore' filter with no category/module restriction, so it
# applies to every warning raised after this cell runs.
warnings.filterwarnings('ignore')

In [2]:
# Create the CartPole-v1 environment used by the training loop.
# NOTE(review): render_mode="human" presumably renders every step in a window,
# which would slow down a 1000-episode training run — confirm this is intended.
env = gym.make('CartPole-v1', render_mode="human")


In [3]:
# Build the actor and critic networks layer by layer.

# Actor (policy): a 32-unit ReLU hidden layer followed by a softmax layer with
# one output per discrete action in the environment's action space.
actor = tf.keras.Sequential()
actor.add(tf.keras.layers.Dense(32, activation='relu'))
actor.add(tf.keras.layers.Dense(env.action_space.n, activation='softmax'))

# Critic (state value): a 32-unit ReLU hidden layer followed by a single
# linear output unit.
critic = tf.keras.Sequential()
critic.add(tf.keras.layers.Dense(32, activation='relu'))
critic.add(tf.keras.layers.Dense(1))


In [4]:
# Define one Adam optimizer per network (both with learning rate 0.001).
# No loss functions are defined here; the actor and critic losses are computed
# inline inside the training loop.
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)


In [5]:
#Training Loop

# The main training loop runs for a fixed number of episodes (1000).
# At the start of each episode, the agent resets the environment and sets the episode reward to 0.
# The `with tf.GradientTape` block records the forward passes so that gradients can be computed for the actor and critic networks.
# At each step, the agent samples an action from the actor's output probabilities and takes that action in the environment.
# It then observes the next state, the reward, and whether the episode is done.
# The advantage is computed as the difference between the expected return (the reward plus the discounted value of the next state) and the estimated value of the current state.
# The actor and critic losses are calculated from the advantage.
# Gradients are computed with tape.gradient and then applied to the actor and critic networks by their respective optimizers.
# The episode's total reward is updated, and the loop continues until the episode ends.
# Every 10 episodes, the current episode number and reward are printed.

In [None]:
# Main training loop: one-step (TD(0)) actor-critic updates on the environment.
#
# For every environment step the loop:
#   1. samples an action from the actor's softmax output for the current state,
#   2. steps the environment,
#   3. forms the TD target  reward + gamma * V(next_state)  (no bootstrap term
#      once the episode has terminated),
#   4. updates the actor with  -log pi(action | state) * advantage  and the
#      critic with  advantage ** 2,
#   5. advances `state` to `next_state`.
num_episodes = 1000
gamma = 0.99  # discount factor applied to the next-state value

for episode in range(num_episodes):
    state, info = env.reset()
    episode_reward = 0

    for t in range(1, 10000): # Limit the number of time steps
        state_batch = np.array([state])

        # Use a fresh tape for every step. A single persistent tape wrapped
        # around the whole episode keeps recording every forward pass, and
        # calling tape.gradient inside its context records the gradient
        # computation as well, so cost grows with episode length.
        with tf.GradientTape(persistent=True) as tape:
            # Choose an action using the actor
            action_probs = actor(state_batch)
            action = np.random.choice(env.action_space.n, p=action_probs.numpy()[0])

            # Take the chosen action and observe the next state and reward.
            # env.step returns separate `terminated` and `truncated` flags;
            # either one ends the episode.
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Compute the advantage (one-step TD error).
            state_value = critic(state_batch)[0, 0]
            next_state_value = critic(np.array([next_state]))[0, 0]
            # A terminal state has no future return, so only bootstrap from the
            # critic when the episode did not terminate. A truncated episode
            # still bootstraps because the state itself is not terminal.
            bootstrap = 0.0 if terminated else gamma * next_state_value
            # Treat the target as a constant so the critic is only pulled
            # toward it through V(state), not through V(next_state).
            td_target = tf.stop_gradient(tf.cast(reward + bootstrap, state_value.dtype))
            advantage = td_target - state_value

            # Compute actor and critic losses. The advantage only weights the
            # policy-gradient term, so it is held constant in the actor loss.
            actor_loss = -tf.math.log(action_probs[0, action]) * tf.stop_gradient(advantage)
            critic_loss = tf.square(advantage)

        episode_reward += reward

        # Update actor and critic. The tape is persistent because it is queried
        # twice; release it explicitly once both gradients are computed.
        actor_gradients = tape.gradient(actor_loss, actor.trainable_variables)
        critic_gradients = tape.gradient(critic_loss, critic.trainable_variables)
        del tape
        actor_optimizer.apply_gradients(zip(actor_gradients, actor.trainable_variables))
        critic_optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables))

        # Advance to the observed state; without this the networks would keep
        # seeing the reset observation for the whole episode.
        state = next_state

        if done:
            break

    if episode % 10 == 0:
        print(f"Episode {episode}, Reward: {episode_reward}")

env.close()


Episode 0, Reward: 11.0
Episode 10, Reward: 16.0
Episode 20, Reward: 8.0
Episode 30, Reward: 13.0
Episode 40, Reward: 15.0
Episode 50, Reward: 51.0
Episode 60, Reward: 23.0
Episode 70, Reward: 12.0
Episode 80, Reward: 25.0
Episode 90, Reward: 46.0
Episode 100, Reward: 18.0
