In [2]:
pip install gymnasium



In [None]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Training configuration
learning_rate = 0.01  # step size handed to the Adam optimizer
gamma = 0.99          # discount factor applied to future rewards
max_episodes = 1000   # number of episodes the training loop runs

# Environment: sizes of the observation vector and the discrete action set
env = gym.make("CartPole-v1")
num_actions = env.action_space.n
num_states = env.observation_space.shape[0]

# Policy network: two hidden ReLU layers of 24 units each, then a softmax
# layer that yields one probability per action.
model = keras.Sequential()
model.add(keras.Input(shape=(num_states,)))
model.add(layers.Dense(24, activation="relu"))
model.add(layers.Dense(24, activation="relu"))
model.add(layers.Dense(num_actions, activation="softmax"))

optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# Discount rewards
def discount_rewards(rewards, gamma):
    """Return the standardized discounted returns of one episode.

    For every step ``t`` the discounted return
    ``G_t = rewards[t] + gamma * G_{t+1}`` is accumulated from the last
    step backwards. The resulting vector is then shifted to zero mean and
    scaled by its standard deviation.

    Args:
        rewards: Sequence of per-step rewards, in the order they were
            received.
        gamma: Discount factor applied to future rewards.

    Returns:
        A ``float32`` NumPy array with the same shape as ``rewards``
        holding the standardized discounted returns. An empty input
        yields an empty array.
    """
    discounted = np.zeros_like(rewards, dtype=np.float32)
    if discounted.size == 0:
        # np.mean / np.std of an empty array emit RuntimeWarnings (and
        # produce NaN); there is nothing to discount, so return early.
        return discounted

    running_sum = 0.0
    # Walk backwards so each step can reuse the return of its successor.
    for t in reversed(range(len(rewards))):
        running_sum = rewards[t] + gamma * running_sum
        discounted[t] = running_sum

    discounted -= np.mean(discounted)
    # The epsilon keeps the division finite when all returns are equal
    # (e.g. a single-step episode), in which case the result is all zeros.
    discounted /= (np.std(discounted) + 1e-8)
    return discounted

# Training loop: play one full episode with the current policy, then apply a
# single policy-gradient update computed from that episode.
try:
    for episode in range(max_episodes):
        state, _ = env.reset()
        states, actions, rewards = [], [], []
        done = False

        # Roll out one episode, sampling each action from the policy output.
        while not done:
            state_tensor = tf.convert_to_tensor([state], dtype=tf.float32)  # (1,4)
            probs = model(state_tensor).numpy()[0]
            action = np.random.choice(num_actions, p=probs)

            next_state, reward, terminated, truncated, _ = env.step(action)
            # The episode ends on either of the two flags returned by step().
            done = terminated or truncated

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            state = next_state

        discounted_rewards = discount_rewards(rewards, gamma)

        states_tensor = tf.convert_to_tensor(states, dtype=tf.float32)
        actions_tensor = tf.convert_to_tensor(actions, dtype=tf.int32)
        rewards_tensor = tf.convert_to_tensor(discounted_rewards, dtype=tf.float32)

        with tf.GradientTape() as tape:
            # The model's final layer is a softmax, so its output is already
            # a probability per action rather than raw logits.
            action_probs = model(states_tensor)
            # Pair each step index with the action taken at that step so
            # gather_nd picks the probability of the chosen action.
            indices = tf.stack([tf.range(len(actions)), actions_tensor], axis=1)
            selected_action_probs = tf.gather_nd(action_probs, indices)
            # Negative return-weighted log-probability; the 1e-10 guards
            # against log(0).
            loss = -tf.reduce_mean(tf.math.log(selected_action_probs + 1e-10) * rewards_tensor)

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        print(f"Episode {episode+1}: reward = {sum(rewards)}")
finally:
    # Close the environment even when training is interrupted or raises.
    env.close()


Episode 1: reward = 11.0
Episode 2: reward = 25.0
Episode 3: reward = 10.0
Episode 4: reward = 11.0
Episode 5: reward = 16.0
Episode 6: reward = 11.0
Episode 7: reward = 13.0
Episode 8: reward = 22.0
Episode 9: reward = 19.0
Episode 10: reward = 10.0
Episode 11: reward = 9.0
Episode 12: reward = 10.0
Episode 13: reward = 12.0
Episode 14: reward = 14.0
Episode 15: reward = 9.0
Episode 16: reward = 15.0
Episode 17: reward = 15.0
Episode 18: reward = 11.0
Episode 19: reward = 12.0
Episode 20: reward = 12.0
Episode 21: reward = 10.0
Episode 22: reward = 12.0
Episode 23: reward = 10.0
Episode 24: reward = 11.0
Episode 25: reward = 19.0
Episode 26: reward = 13.0
Episode 27: reward = 12.0
Episode 28: reward = 11.0
Episode 29: reward = 9.0
Episode 30: reward = 14.0
Episode 31: reward = 14.0
Episode 32: reward = 9.0
Episode 33: reward = 14.0
Episode 34: reward = 9.0
Episode 35: reward = 15.0
Episode 36: reward = 14.0
Episode 37: reward = 10.0
Episode 38: reward = 17.0
Episode 39: reward = 10.0
