In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import gym
import numpy as np
import keras
from keras import ops
from keras import layers
import tensorflow as tf

  from jax import xla_computation as _xla_computation


In [2]:
# Environment setup
env = gym.make("CartPole-v1")
num_inputs = 4
num_actions = 1  # Deterministic actions
num_hidden = 128
gamma = 0.99
exploration_noise_std = 0.1
max_steps_per_episode = 500

  and should_run_async(code)
  deprecation(
  deprecation(


In [3]:
# Actor model
actor_inputs = layers.Input(shape=(num_inputs,))
actor_hidden = layers.Dense(num_hidden, activation="relu")(actor_inputs)
actor_output = layers.Dense(num_actions, activation="tanh")(actor_hidden)
actor_model = tf.keras.Model(inputs=actor_inputs, outputs=actor_output)

In [4]:
# Critic model
critic_inputs = layers.Input(shape=(num_inputs + num_actions,))
critic_hidden = layers.Dense(num_hidden, activation="relu")(critic_inputs)
q_value = layers.Dense(1)(critic_hidden)
critic_model = tf.keras.Model(inputs=critic_inputs, outputs=q_value)

actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [5]:
# Optimizers
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)

# Replay buffer setup
class ReplayBuffer:
    def __init__(self, buffer_capacity=100000, batch_size=64):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        self.buffer_counter = 0
        self.state_buffer = np.zeros((buffer_capacity, num_inputs))
        self.action_buffer = np.zeros((buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((buffer_capacity, 1))
        self.next_state_buffer = np.zeros((buffer_capacity, num_inputs))
        self.done_buffer = np.zeros((buffer_capacity, 1))

    def store(self, state, action, reward, next_state, done):
        index = self.buffer_counter % self.buffer_capacity
        self.state_buffer[index] = state
        self.action_buffer[index] = action
        self.reward_buffer[index] = reward
        self.next_state_buffer[index] = next_state
        self.done_buffer[index] = done
        self.buffer_counter += 1

    def sample(self):
        max_buffer = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(max_buffer, self.batch_size)
        return (
            self.state_buffer[batch_indices],
            self.action_buffer[batch_indices],
            self.reward_buffer[batch_indices],
            self.next_state_buffer[batch_indices],
            self.done_buffer[batch_indices],
        )

# Initialize replay buffer
buffer = ReplayBuffer()

In [6]:
# Training loop
for episode in range(1000):
    state = env.reset()
    episode_reward = 0

    for step in range(max_steps_per_episode):
        state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
        state_tensor = tf.expand_dims(state_tensor, axis=0)

        # Actor chooses action deterministically
        action = actor_model(state_tensor).numpy()[0]
        action += np.random.normal(0, exploration_noise_std)  # Add noise for exploration
        action = np.clip(action, -1, 1)  # Ensure action is within valid range

        # Execute action in environment
        next_state, reward, done, _ = env.step(int(action > 0))  # Convert continuous action to discrete
        episode_reward += reward

        # Store transition in replay buffer
        buffer.store(state, action, reward, next_state, done)

        # Train models if the replay buffer has enough samples
        if buffer.buffer_counter >= buffer.batch_size:
            states, actions, rewards, next_states, dones = buffer.sample()

            # Convert to tensors
            states = tf.convert_to_tensor(states, dtype=tf.float32)
            actions = tf.convert_to_tensor(actions, dtype=tf.float32)
            rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
            next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
            dones = tf.convert_to_tensor(dones, dtype=tf.float32)

            # Critic training
            with tf.GradientTape() as tape:
                # Target Q-value
                target_actions = actor_model(next_states)
                target_q_values = rewards + (1 - dones) * gamma * critic_model(
                    tf.concat([next_states, target_actions], axis=1)
                )
                # Predicted Q-value
                predicted_q_values = critic_model(tf.concat([states, actions], axis=1))
                critic_loss = tf.keras.losses.MSE(target_q_values, predicted_q_values)

            critic_grads = tape.gradient(critic_loss, critic_model.trainable_variables)
            critic_optimizer.apply_gradients(zip(critic_grads, critic_model.trainable_variables))

            # Actor training
            with tf.GradientTape() as tape:
                # Actor loss is the negative of the Q-value
                actions_pred = actor_model(states)
                actor_loss = -tf.reduce_mean(critic_model(tf.concat([states, actions_pred], axis=1)))

            actor_grads = tape.gradient(actor_loss, actor_model.trainable_variables)
            actor_optimizer.apply_gradients(zip(actor_grads, actor_model.trainable_variables))

        if done:
            break
        state = next_state

    print(f"Episode {episode + 1}: Reward: {episode_reward:.2f}")

    if episode_reward > 475:
        print("Solved!")
        break

  next_state, reward, done, _ = env.step(int(action > 0))  # Convert continuous action to discrete
  if not isinstance(terminated, (bool, np.bool8)):


Episode 1: Reward: 81.00
Episode 2: Reward: 12.00
Episode 3: Reward: 9.00
Episode 4: Reward: 10.00
Episode 5: Reward: 8.00
Episode 6: Reward: 11.00
Episode 7: Reward: 10.00
Episode 8: Reward: 12.00
Episode 9: Reward: 10.00
Episode 10: Reward: 8.00
Episode 11: Reward: 8.00
Episode 12: Reward: 9.00
Episode 13: Reward: 10.00
Episode 14: Reward: 23.00
Episode 15: Reward: 9.00
Episode 16: Reward: 10.00
Episode 17: Reward: 9.00
Episode 18: Reward: 10.00
Episode 19: Reward: 10.00
Episode 20: Reward: 10.00
Episode 21: Reward: 11.00
Episode 22: Reward: 9.00
Episode 23: Reward: 9.00
Episode 24: Reward: 8.00
Episode 25: Reward: 8.00
Episode 26: Reward: 10.00
Episode 27: Reward: 10.00
Episode 28: Reward: 10.00
Episode 29: Reward: 8.00
Episode 30: Reward: 9.00
Episode 31: Reward: 9.00
Episode 32: Reward: 10.00
Episode 33: Reward: 9.00
Episode 34: Reward: 9.00
Episode 35: Reward: 8.00
Episode 36: Reward: 10.00
Episode 37: Reward: 9.00
Episode 38: Reward: 9.00
Episode 39: Reward: 9.00
Episode 40: Rew