In [32]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import mse

# Build the CartPole environment, then read the network sizes off its spaces:
# one output per discrete action, one input per observation component.
env = gym.make('CartPole-v1')
action_dim = env.action_space.n
state_dim = env.observation_space.shape[0]

# Define the actor and critic models
def create_actor():
    """Build the policy (actor) network.

    The model is a two-layer Keras ``Sequential``: a 32-unit ReLU hidden
    layer taking ``state_dim`` inputs, followed by an ``action_dim``-wide
    softmax output. Both sizes are read from module-level variables when
    the function is called.

    Returns:
        The freshly constructed, untrained ``tf.keras.Sequential`` model.
    """
    hidden_layer = Dense(32, activation='relu', input_shape=(state_dim,))
    policy_head = Dense(action_dim, activation='softmax')
    return tf.keras.Sequential([hidden_layer, policy_head])

def create_critic():
    """Build the state-value (critic) network.

    The model is a two-layer Keras ``Sequential``: a 32-unit ReLU hidden
    layer taking ``state_dim`` inputs, followed by a single linear output
    unit. ``state_dim`` is read from the module level when the function is
    called.

    Returns:
        The freshly constructed, untrained ``tf.keras.Sequential`` model.
    """
    hidden_layer = Dense(32, activation='relu', input_shape=(state_dim,))
    value_head = Dense(1)
    return tf.keras.Sequential([hidden_layer, value_head])

# Hyperparameters: reward discount factor and number of training episodes
gamma = 0.99
num_episodes = 1000

# Loss function alias used when fitting the critic
mse_loss = mse

# Networks (actor first, then critic)
actor = create_actor()
critic = create_critic()

# One Adam optimizer per network; the critic's learning rate is 5x the actor's
actor_optimizer = Adam(learning_rate=0.001)
critic_optimizer = Adam(learning_rate=0.005)

# Run the training loop: one-step (TD(0)) actor-critic, updating both networks
# after every environment transition.
#
# Per step:
#   1. sample an action from the actor's softmax policy;
#   2. regress the critic's V(s) toward the bootstrapped target
#      r + gamma * V(s'), with no bootstrap on a terminal transition;
#   3. move the actor along -log pi(a|s) * advantage, where the advantage is
#      the TD error (target - V(s)) treated as a constant.
#
# The environment is closed in `finally` so it is released even if training
# raises or is interrupted.
try:
    for episode in range(num_episodes):
        reset_result = env.reset()
        # Gym >= 0.26 returns (observation, info); older releases return the
        # observation alone.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        episode_reward = 0
        while not done:
            state_batch = np.expand_dims(np.asarray(state, dtype=np.float32), axis=0)

            # Sample an action. No gradient is needed for sampling, so no tape
            # is opened. The float32 softmax output is renormalised in float64
            # because np.random.choice validates that `p` sums to 1 at float64
            # precision.
            probs = actor(state_batch).numpy()[0].astype(np.float64)
            probs /= probs.sum()
            action = int(np.random.choice(env.action_space.n, p=probs))

            # Take a step in the environment and observe the reward and next state
            step_result = env.step(action)
            if len(step_result) == 5:
                # Gym >= 0.26: (obs, reward, terminated, truncated, info)
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                # Older Gym: (obs, reward, done, info). `done` does not
                # distinguish time-limit truncation, so it is treated as
                # terminal here.
                next_state, reward, done, _ = step_result
                terminated = done
            episode_reward += reward

            next_batch = np.expand_dims(np.asarray(next_state, dtype=np.float32), axis=0)

            # TD target, computed outside the tape so no gradient flows through
            # the bootstrap term. A terminal next state contributes no value.
            bootstrap = 0.0 if terminated else gamma
            td_target = reward + bootstrap * critic(next_batch)

            # Critic update: fit V(s) to the TD target
            with tf.GradientTape() as critic_tape:
                value = critic(state_batch)
                critic_loss = mse_loss(td_target, value)
            critic_gradients = critic_tape.gradient(critic_loss, critic.trainable_variables)
            critic_optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables))

            # The advantage estimate is the TD error from the pre-update critic;
            # it is a constant with respect to the actor's parameters.
            advantage = tf.stop_gradient(tf.squeeze(td_target - value))

            # Actor update: policy-gradient step weighted by the advantage
            with tf.GradientTape() as actor_tape:
                action_probs = actor(state_batch)
                # The small epsilon keeps log() finite if a probability underflows to 0
                log_prob = tf.math.log(action_probs[0, action] + 1e-8)
                actor_loss = -log_prob * advantage
            actor_gradients = actor_tape.gradient(actor_loss, actor.trainable_variables)
            # Clip each gradient and keep it paired with its own variable;
            # filtering the gradient list alone would misalign the pairs.
            clipped_pairs = [
                (tf.clip_by_value(grad, -1., 1.), var)
                for grad, var in zip(actor_gradients, actor.trainable_variables)
                if grad is not None
            ]
            actor_optimizer.apply_gradients(clipped_pairs)

            # Update the state
            state = next_state

        # Print the episode score
        print("Episode {}: {}".format(episode+1, episode_reward))
finally:
    env.close()


Episode 1: 45.0
Episode 2: 20.0
Episode 3: 38.0
Episode 4: 13.0
Episode 5: 23.0
Episode 6: 42.0
Episode 7: 11.0
Episode 8: 49.0
Episode 9: 19.0
Episode 10: 33.0
Episode 11: 32.0
Episode 12: 31.0
Episode 13: 40.0
Episode 14: 15.0
Episode 15: 31.0
Episode 16: 22.0
Episode 17: 89.0
Episode 18: 20.0
Episode 19: 65.0
Episode 20: 21.0
Episode 21: 18.0
Episode 22: 36.0
Episode 23: 17.0
Episode 24: 22.0
Episode 25: 81.0
Episode 26: 46.0
Episode 27: 34.0
Episode 28: 21.0
Episode 29: 13.0
Episode 30: 17.0
Episode 31: 37.0
Episode 32: 25.0
Episode 33: 83.0
Episode 34: 15.0
Episode 35: 45.0
Episode 36: 16.0
Episode 37: 14.0
Episode 38: 15.0
Episode 39: 34.0
Episode 40: 10.0
Episode 41: 34.0
Episode 42: 18.0
Episode 43: 12.0
Episode 44: 22.0
Episode 45: 66.0
Episode 46: 25.0
Episode 47: 87.0
Episode 48: 16.0
Episode 49: 24.0
Episode 50: 35.0
Episode 51: 25.0
Episode 52: 13.0
Episode 53: 35.0
Episode 54: 11.0
Episode 55: 14.0
Episode 56: 14.0
Episode 57: 73.0
Episode 58: 21.0
Episode 59: 45.0
Episod