In [1]:
pip install gymnasium



In [None]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Hyperparameters
learning_rate = 0.01  # step size handed to the Adam optimizer below
gamma = 0.99          # discount factor passed to discount_rewards
max_episodes = 2000   # number of training episodes
render_every = 200    # run a greedy demonstration rollout every N episodes

# Environment (switch "MountainCar-v0" to "CartPole-v1" for the other assignment)
env = gym.make("MountainCar-v0")
num_actions = env.action_space.n             # size of the discrete action set
num_states = env.observation_space.shape[0]  # length of the observation vector

# Build Policy Network: observation -> two hidden ReLU layers -> softmax over
# actions, so the model output can be used directly as sampling probabilities.
model = keras.Sequential()
model.add(keras.Input(shape=(num_states,)))
model.add(layers.Dense(24, activation="relu"))
model.add(layers.Dense(24, activation="relu"))
model.add(layers.Dense(num_actions, activation="softmax"))
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# Discount and normalize rewards
def discount_rewards(rewards, gamma):
    """Return the standardized discounted return for every step of an episode.

    The return at step t is ``rewards[t] + gamma * return[t + 1]``, computed
    by walking the episode backwards. The resulting vector is then shifted to
    zero mean and scaled by its standard deviation.

    Args:
        rewards: Sequence of per-step rewards, in time order.
        gamma: Discount factor applied to the return of the following step.

    Returns:
        A float32 NumPy array with one normalized return per reward.
    """
    returns = np.zeros_like(rewards, dtype=np.float32)
    last_index = len(rewards) - 1
    future_return = 0
    # Walk from the final step to the first so each return can reuse the
    # already-accumulated return of the step after it.
    for offset, reward in enumerate(reversed(rewards)):
        future_return = reward + gamma * future_return
        returns[last_index - offset] = future_return
    returns -= np.mean(returns)
    # The small epsilon keeps the division finite when all returns are equal.
    returns /= (np.std(returns) + 1e-8)
    return returns

# Training loop (REINFORCE / Monte-Carlo policy gradient). Each episode:
#   1. roll out one full episode, sampling actions from the current policy;
#   2. convert the episode's rewards into normalized discounted returns;
#   3. take one gradient step on -mean(log pi(a_t | s_t) * return_t).
# The try/finally guarantees the training environment is closed even if the
# run is interrupted (e.g. stopping the notebook cell) or raises.
try:
    for episode in range(max_episodes):
        state, _ = env.reset()
        states, actions, rewards = [], [], []
        done = False

        while not done:
            # The model expects a batch dimension: shape (1, num_states)
            state_tensor = tf.convert_to_tensor([state], dtype=tf.float32)
            probs = model(state_tensor).numpy()[0]

            # Sample an action from the policy's output distribution
            action = np.random.choice(num_actions, p=probs)

            next_state, reward, terminated, truncated, _ = env.step(action)
            # An episode ends on a terminal state or when it is truncated.
            done = terminated or truncated

            # Save trajectory
            states.append(state)
            actions.append(action)
            rewards.append(reward)

            state = next_state

        # Compute discounted rewards
        discounted_rewards = discount_rewards(rewards, gamma)

        # Convert to tensors. The per-step observations are stacked with NumPy
        # first so TensorFlow receives one contiguous array, not a Python list.
        states_tensor = tf.convert_to_tensor(
            np.asarray(states, dtype=np.float32), dtype=tf.float32
        )
        actions_tensor = tf.convert_to_tensor(actions, dtype=tf.int32)
        rewards_tensor = tf.convert_to_tensor(discounted_rewards, dtype=tf.float32)

        # Policy gradient update
        with tf.GradientTape() as tape:
            # The network ends in a softmax, so these are probabilities
            # (not logits) with one row per recorded step.
            action_probs = model(states_tensor)
            # Pair each row index with the action taken at that step so
            # gather_nd picks pi(a_t | s_t) out of every row.
            indices = tf.stack([tf.range(len(actions)), actions_tensor], axis=1)
            selected_action_probs = tf.gather_nd(action_probs, indices)
            # 1e-10 keeps log() finite if a probability underflows to zero.
            loss = -tf.reduce_mean(
                tf.math.log(selected_action_probs + 1e-10) * rewards_tensor
            )

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        print(f"Episode {episode+1}: reward = {sum(rewards)}")

        # Optional: render every few episodes
        if (episode + 1) % render_every == 0:
            # The training env is created without a render_mode, and calling
            # env.render() on such an env only logs a warning and draws
            # nothing. Roll out the greedy policy in a dedicated env that has
            # rendering enabled instead; with render_mode="human" every
            # reset()/step() draws a frame, so no explicit render() call is
            # needed.
            # NOTE: "human" needs a display; on a headless machine switch to
            # "rgb_array" and collect the frames returned by render().
            render_env = gym.make(env.spec.id, render_mode="human")
            try:
                state, _ = render_env.reset()
                done = False
                while not done:
                    # Greedy action: the most probable one under the policy.
                    action = np.argmax(model(np.array([state])).numpy()[0])
                    state, _, terminated, truncated, _ = render_env.step(action)
                    done = terminated or truncated
            finally:
                render_env.close()
finally:
    env.close()


Episode 1: reward = -200.0
Episode 2: reward = -200.0
Episode 3: reward = -200.0
Episode 4: reward = -200.0
Episode 5: reward = -200.0
Episode 6: reward = -200.0
Episode 7: reward = -200.0
Episode 8: reward = -200.0
Episode 9: reward = -200.0
Episode 10: reward = -200.0
Episode 11: reward = -200.0
Episode 12: reward = -200.0
Episode 13: reward = -200.0
Episode 14: reward = -200.0
Episode 15: reward = -200.0
Episode 16: reward = -200.0
Episode 17: reward = -200.0
Episode 18: reward = -200.0
Episode 19: reward = -200.0
Episode 20: reward = -200.0
Episode 21: reward = -200.0
Episode 22: reward = -200.0
Episode 23: reward = -200.0
Episode 24: reward = -200.0
Episode 25: reward = -200.0
Episode 26: reward = -200.0
Episode 27: reward = -200.0
Episode 28: reward = -200.0
Episode 29: reward = -200.0
Episode 30: reward = -200.0
Episode 31: reward = -200.0
Episode 32: reward = -200.0
Episode 33: reward = -200.0
Episode 34: reward = -200.0
Episode 35: reward = -200.0
Episode 36: reward = -200.0
E

  gym.logger.warn(


Episode 201: reward = -200.0
Episode 202: reward = -200.0
Episode 203: reward = -200.0
Episode 204: reward = -200.0
Episode 205: reward = -200.0
Episode 206: reward = -200.0
Episode 207: reward = -200.0
Episode 208: reward = -200.0
Episode 209: reward = -200.0
Episode 210: reward = -200.0
Episode 211: reward = -200.0
Episode 212: reward = -200.0
Episode 213: reward = -200.0
Episode 214: reward = -200.0
Episode 215: reward = -200.0
Episode 216: reward = -200.0
Episode 217: reward = -200.0
Episode 218: reward = -200.0
Episode 219: reward = -200.0
Episode 220: reward = -200.0
Episode 221: reward = -200.0
Episode 222: reward = -200.0
Episode 223: reward = -200.0
Episode 224: reward = -200.0
Episode 225: reward = -200.0
Episode 226: reward = -200.0
Episode 227: reward = -200.0
Episode 228: reward = -200.0
Episode 229: reward = -200.0
Episode 230: reward = -200.0
Episode 231: reward = -200.0
Episode 232: reward = -200.0
Episode 233: reward = -200.0
Episode 234: reward = -200.0
Episode 235: r