# REINFORCE with Baseline — Policy Gradient on CartPole-v1

This notebook demonstrates the use of the REINFORCE algorithm with a baseline to train a neural network policy for the CartPole-v1 environment. The baseline helps reduce the variance of the policy gradient updates.

---


In [None]:
# Imports and setup
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import warnings
# NOTE: this silences every warning for the rest of the notebook, including deprecation notices.
warnings.filterwarnings("ignore")

# Set seeds for reproducibility
seed = 65
# Seeds Python's `random`, NumPy and the TensorFlow backend in one call, so that
# Keras weight initialization is covered regardless of which RNG it draws from.
keras.utils.set_random_seed(seed)
np.random.seed(seed)      # global NumPy RNG: used for action sampling via np.random.rand()
tf.random.set_seed(seed)  # TensorFlow global RNG

## Initial Setup

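We start by building a small policy network with a single sigmoid output: the probability of taking action 0 (push the cart left). Action 1 (push right) is taken with the complementary probability. We then check how this untrained policy performs.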

In [None]:
# Create environment and untrained policy model
env = gym.make("CartPole-v1")
# Seed the evaluation environment the same way train_policy_model seeds its own,
# so the before/after evaluation runs are repeatable across "Restart & Run All".
env.seed(seed)
env.action_space.seed(seed)
# Read the observation size from the environment instead of hard-coding it.
n_inputs = env.observation_space.shape[0]

# The single sigmoid unit outputs p(action 0); action 1 is taken with probability 1 - p.
model = keras.Sequential([
    keras.Input(shape=(n_inputs,)),
    keras.layers.Dense(5, activation="elu"),
    keras.layers.Dense(1, activation="sigmoid")
])

# Evaluate untrained model
def evaluate_episode(model, env, max_steps=500):
    """Roll out one episode with the stochastic policy and return its total reward.

    Args:
        model: Callable taking a (1, n_obs) observation batch and returning an
            indexable output whose [0, 0] entry is the probability of action 0.
        env: Environment whose reset() returns an observation and whose
            step(action) returns (observation, reward, done, info).
        max_steps: Upper bound on the number of environment steps.

    Returns:
        Sum of the rewards collected until the episode ended or max_steps was hit.
    """
    observation = env.reset()
    episode_return = 0

    for _step in range(max_steps):
        left_prob = model(observation.reshape(1, -1), training=False)[0, 0].numpy()
        # Action 0 is chosen with probability left_prob, action 1 otherwise.
        go_right = np.random.rand() > left_prob
        observation, reward, finished, _info = env.step(int(go_right))
        episode_return += reward
        if finished:
            break

    return episode_return

# Sanity check: total reward of a single stochastic episode played by the untrained network.
print(f"Initial reward before training: {evaluate_episode(model, env)}")


## Helper Function to Play One Game

The helper below plays one full episode, collecting the states, actions, and rewards. It then computes the discounted return for every step and normalizes these returns to zero mean and unit standard deviation.

In [None]:
def play_game(env, policy_model, gamma=0.99, epsilon=1e-9):
    """Play one episode with the stochastic policy and compute standardized returns.

    Args:
        env: Environment whose reset() returns an observation and whose
            step(action) returns (observation, reward, done, info).
        policy_model: Callable taking a (1, n_obs) observation batch and returning
            an indexable output whose [0, 0] entry is the probability of action 0.
        gamma: Discount factor applied to future rewards.
        epsilon: Small constant added to the standard deviation so that a
            constant-return episode (e.g. a single step) does not divide by zero.

    Returns:
        Tuple (states, actions, norm_rewards, rewards):
            states: array of the observations the policy acted on, one per step.
            actions: integer array of the actions sent to the environment (0 or 1).
            norm_rewards: discounted returns, mean-subtracted and divided by
                (std + epsilon).
            rewards: list of the raw per-step rewards.
    """
    states, actions, rewards = [], [], []
    obs = env.reset()
    done = False

    while not done:
        states.append(obs)
        p_left = policy_model(obs.reshape(1, -1), training=False)[0, 0].numpy()
        # Record the integer action that is actually sent to the environment
        # (0 with probability p_left, else 1) rather than a NumPy bool, so
        # callers get a plain numeric array.
        action = int(np.random.rand() > p_left)
        obs, reward, done, _ = env.step(action)
        actions.append(action)
        rewards.append(reward)

    # Discounted return G_t = r_t + gamma * G_{t+1}, filled from the last step
    # backwards into a preallocated array (avoids O(n^2) list.insert(0, ...)).
    disc_rewards = np.empty(len(rewards), dtype=np.float64)
    G = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        G = rewards[t] + gamma * G
        disc_rewards[t] = G

    # Normalize
    norm_rewards = (disc_rewards - disc_rewards.mean()) / (disc_rewards.std() + epsilon)
    return np.array(states), np.array(actions), norm_rewards, rewards

## Training the Policy

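We train the network using policy gradients and a baseline. The baseline here is the mean discounted return of the episode: `play_game` subtracts it (and divides by the standard deviation) when it normalizes the returns, so the values used as advantages are already centered around zero.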


In [None]:
def train_policy_model(n_episodes, env_name, policy_model, gamma=0.99, epsilon=1e-9):
    """Train `policy_model` with REINFORCE, one gradient step per episode.

    A new environment is created from `env_name`, seeded with the module-level
    `seed`, and closed again when training finishes or fails.

    Args:
        n_episodes: Number of episodes (and therefore gradient updates) to run.
        env_name: Gym environment id passed to gym.make().
        policy_model: Keras model whose single output is the probability of
            action 0; its trainable variables are updated in place.
        gamma: Discount factor forwarded to play_game().
        epsilon: Normalization constant forwarded to play_game().

    Returns:
        List with the undiscounted total reward of each training episode.
    """
    reward_history = []
    env = gym.make(env_name)
    env.seed(seed)
    env.action_space.seed(seed)
    optimizer = keras.optimizers.Adam(learning_rate=0.01)

    try:
        for episode in range(n_episodes):
            # Collect the rollout OUTSIDE the gradient tape: the per-step forward
            # passes are only used to sample actions and must not be recorded.
            states, actions, norm_rewards, rewards = play_game(env, policy_model, gamma, epsilon)

            states = tf.convert_to_tensor(states, dtype=tf.float32)
            # Cast through NumPy first so the conversion does not depend on how
            # TensorFlow treats a bool array when a float dtype is requested.
            actions = tf.convert_to_tensor(np.asarray(actions, dtype=np.float32))
            norm_rewards = tf.convert_to_tensor(norm_rewards, dtype=tf.float32)

            # Baseline as average reward. play_game() already standardizes the
            # returns, so this mean is ~0 and the subtraction only makes the
            # baseline explicit.
            advantage = norm_rewards - tf.reduce_mean(norm_rewards)

            with tf.GradientTape() as tape:
                # Re-evaluate the policy on the whole episode in one batch; this is
                # the only forward pass the gradient is taken through.
                p_left = tf.reshape(policy_model(states), (-1,))
                action_probs = tf.where(actions == 0, p_left, 1 - p_left)
                # Clip to keep log() finite when the sigmoid saturates.
                log_probs = tf.math.log(tf.clip_by_value(action_probs, 1e-8, 1.0))
                loss = -tf.reduce_mean(log_probs * advantage)

            grads = tape.gradient(loss, policy_model.trainable_variables)
            optimizer.apply_gradients(zip(grads, policy_model.trainable_variables))

            episode_reward = sum(rewards)
            reward_history.append(episode_reward)
            if episode % 50 == 0:
                print(f"Episode {episode}, Reward: {episode_reward}, Loss: {loss.numpy():.4f}")
    finally:
        # Release the environment even if training is interrupted.
        env.close()
    return reward_history

## Training the Agent


In [None]:
# Train the policy network defined above; train_policy_model builds its own environment from the id.
reward_history = train_policy_model(
    n_episodes=500,
    env_name="CartPole-v1",
    policy_model=model,
    gamma=0.99,
)


## Post-Training Evaluation


In [None]:
test_runs = 20

# First a single stochastic rollout, then the mean over `test_runs` further rollouts.
final_reward = evaluate_episode(model, env)
print(f"Total reward after training: {final_reward}")

avg_reward = np.array([evaluate_episode(model, env) for _ in range(test_runs)]).mean()
print(f"Average reward over {test_runs} episodes: {avg_reward:.2f}")


## Reward Curve

In [None]:
# Per-episode training reward, drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots()
ax.plot(reward_history)
ax.set_title("Training Progress — REINFORCE with Baseline")
ax.set_xlabel("Episode")
ax.set_ylabel("Total Reward")
ax.grid(True)
plt.show()
