# Actor-Critic on CartPole-v1: Training and Demo

This notebook implements an **Actor-Critic** reinforcement learning agent to solve the classic CartPole-v1 environment from OpenAI Gym.  
We'll train the model, evaluate its performance, and visualize training progress.

---

## Setup and Imports


In [None]:
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Imports
import numpy as np
import gym
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

# Set seed for reproducibility
seed = 65
np.random.seed(seed)
tf.random.set_seed(seed)

# Create environment
env_name = 'CartPole-v1'
env = gym.make(env_name)

# Seed the environment's own RNG. gym < 0.26 exposes env.seed(); later
# releases removed that method and take the seed through reset(seed=...).
if hasattr(env, "seed"):
    env.seed(seed)
else:
    env.reset(seed=seed)
# The action space samples from a separate RNG, so it is seeded on its own.
env.action_space.seed(seed)

print(f"Environment '{env_name}' created with seed {seed}.")


## Utility Functions

We define two helper functions:

- `play_game`: runs one episode with the current actor and critic, then computes the discounted rewards and normalizes them for stable training
- `evaluate_episode`: runs one episode with the actor model alone and returns its total reward


In [None]:
def play_game(env, actor_model, critic_model, gamma=0.99, epsilon=1e-9):
    """Play one episode and collect the data needed for a training step.

    At every step the critic is queried for the value of the current
    observation and the actor for the probability of action 0; the action
    is then sampled with ``np.random.rand()``.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``. Both the
            4-tuple step API ``(obs, reward, done, info)`` and the 5-tuple
            API ``(obs, reward, terminated, truncated, info)`` are accepted,
            as is a ``reset()`` that returns ``(obs, info)``.
        actor_model: Callable ``model(batch, training=False)`` whose
            ``[0, 0]`` entry is the probability of choosing action 0.
        critic_model: Callable ``model(batch, training=False)`` whose
            ``[0, 0]`` entry is the value estimate of the state.
        gamma: Discount factor applied to future rewards.
        epsilon: Added to the standard deviation to avoid division by zero.

    Returns:
        A 5-tuple ``(states, actions, discounted_rewards_norm, rewards,
        state_values)``: the visited observations as an array, the sampled
        actions as a boolean array (``True`` means action 1), the
        discounted returns normalized to zero mean / unit std, the raw
        per-step rewards as a list, and the critic outputs as a list.
    """
    states, actions, rewards, state_values = [], [], [], []

    reset_out = env.reset()
    # Newer gym releases return (obs, info) from reset().
    obs = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    done = False

    while not done:
        batch = obs.reshape(1, -1)
        states.append(obs)
        # Stored exactly as the critic returned it (no float cast): the
        # caller may still want to differentiate through these values.
        state_values.append(critic_model(batch, training=False)[0, 0])

        p_left = float(actor_model(batch, training=False)[0, 0])
        action = np.random.rand() > p_left

        step_out = env.step(int(action))
        obs, reward = step_out[0], step_out[1]
        if len(step_out) == 5:
            # (obs, reward, terminated, truncated, info)
            done = bool(step_out[2] or step_out[3])
        else:
            # (obs, reward, done, info)
            done = bool(step_out[2])

        actions.append(action)
        rewards.append(reward)

    # Discounted return G_t = r_t + gamma * G_{t+1}, filled back to front so
    # each entry is written once instead of shifting a list on every insert.
    discounted_rewards = np.zeros(len(rewards))
    running_return = 0.0
    for t in reversed(range(len(rewards))):
        running_return = rewards[t] + gamma * running_return
        discounted_rewards[t] = running_return

    discounted_rewards_norm = (
        (discounted_rewards - discounted_rewards.mean())
        / (discounted_rewards.std() + epsilon)
    )

    return np.array(states), np.array(actions), discounted_rewards_norm, rewards, state_values


def evaluate_episode(model, env, max_steps=500):
    """Run one episode with the actor alone and return its total reward.

    Actions are sampled from the actor's output with ``np.random.rand()``,
    the same way they are sampled during training.

    Args:
        model: Callable ``model(batch, training=False)`` whose ``[0, 0]``
            entry is the probability of choosing action 0.
        env: Environment exposing ``reset()`` and ``step(action)``. Both the
            4-tuple step API ``(obs, reward, done, info)`` and the 5-tuple
            API ``(obs, reward, terminated, truncated, info)`` are accepted,
            as is a ``reset()`` that returns ``(obs, info)``.
        max_steps: Hard cap on the number of steps, in case the environment
            never signals the end of the episode.

    Returns:
        The sum of the rewards collected during the episode.
    """
    reset_out = env.reset()
    # Newer gym releases return (obs, info) from reset().
    obs = reset_out[0] if isinstance(reset_out, tuple) else reset_out

    total_reward = 0
    steps = 0
    done = False

    while not done and steps < max_steps:
        # float() accepts both eager tensors and NumPy scalars.
        p_left = float(model(obs.reshape(1, -1), training=False)[0, 0])
        action = np.random.rand() > p_left

        step_out = env.step(int(action))
        obs, reward = step_out[0], step_out[1]
        if len(step_out) == 5:
            # (obs, reward, terminated, truncated, info)
            done = bool(step_out[2] or step_out[3])
        else:
            # (obs, reward, done, info)
            done = bool(step_out[2])

        total_reward += reward
        steps += 1

    return total_reward


## Model Definitions

We define:

- **Actor model**: outputs probability of moving left (sigmoid output)  
- **Critic model**: outputs estimated value of a state (no activation on final layer)  


In [None]:
# Size of one observation vector; both networks take it as input.
n_inputs = env.observation_space.shape[0]


def _build_network(output_activation=None):
    """Return a one-hidden-layer network: n_inputs -> 5 (ELU) -> 1.

    ``output_activation`` is applied to the single output unit; ``None``
    leaves the output linear.
    """
    layers = [
        keras.layers.InputLayer(shape=(n_inputs,)),
        keras.layers.Dense(5, activation='elu'),
        keras.layers.Dense(1, activation=output_activation),
    ]
    return keras.Sequential(layers)


# Actor: sigmoid output, read as the probability of moving left.
actor_model = _build_network('sigmoid')

# Critic: linear output, read as the estimated value of the state.
critic_model = _build_network()

print("Actor and Critic models defined.")


## Training Setup and Loop

We set up:

- Loss functions and optimizers for actor and critic  
- The main training loop over episodes  
- Policy gradient with advantage calculation  
- Periodic logging of rewards and losses  


In [None]:
# Loss function for critic
critic_loss_fn = keras.losses.MeanSquaredError()

# Optimizers
learning_rate = 0.01
actor_optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
critic_optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# Training parameters
n_episodes = 500
gamma = 0.99  # Discount factor
epsilon = 1e-9  # Small value to avoid division by zero


# Total (undiscounted) reward of every training episode, in order.
reward_history = []

for episode in range(n_episodes):
    # Roll out the episode OUTSIDE the gradient tape: the per-step forward
    # passes made while acting are only needed to pick actions, so there is
    # no reason to record them. The critic values returned by play_game are
    # ignored here and recomputed in one batched call inside the tape.
    states, actions, discounted_rewards, rewards, _ = play_game(
        env, actor_model, critic_model, gamma, epsilon
    )
    episode_reward = sum(rewards)

    states = tf.convert_to_tensor(states, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.float32)
    discounted_rewards = tf.convert_to_tensor(discounted_rewards, dtype=tf.float32)

    # persistent=True because two separate gradients are taken from the tape.
    with tf.GradientTape(persistent=True) as tape:
        # Critic loss: MSE between the normalized discounted rewards
        # (target, y_true) and the critic's predictions (y_pred).
        state_values = tf.reshape(critic_model(states), (-1,))
        critic_loss = critic_loss_fn(discounted_rewards, state_values)

        # Actor loss: policy gradient weighted by the advantage.
        # The actor outputs P(action 0); pick the probability of the action
        # that was actually taken at each step.
        p_left = tf.reshape(actor_model(states), (-1,))
        action_probs = tf.where(actions == 0, p_left, 1 - p_left)
        # The clip keeps the argument of log() strictly positive.
        action_probs = tf.clip_by_value(action_probs, 1e-8, 1.0)

        # The advantage is a fixed weight for the actor update, so the
        # critic's contribution is explicitly cut out of the graph.
        advantage = tf.stop_gradient(discounted_rewards - state_values)
        advantage = (advantage - tf.reduce_mean(advantage)) / (tf.math.reduce_std(advantage) + epsilon)

        log_actions = tf.math.log(action_probs)
        actor_loss = -tf.reduce_mean(log_actions * advantage)

    # Compute gradients and apply updates
    actor_grads = tape.gradient(actor_loss, actor_model.trainable_variables)
    actor_optimizer.apply_gradients(zip(actor_grads, actor_model.trainable_variables))

    critic_grads = tape.gradient(critic_loss, critic_model.trainable_variables)
    critic_optimizer.apply_gradients(zip(critic_grads, critic_model.trainable_variables))

    # A persistent tape holds its resources until it is dropped.
    del tape

    reward_history.append(episode_reward)

    if episode % 50 == 0:
        print(f"Episode {episode:4d} | Reward: {episode_reward:5.1f} | "
              f"Actor Loss: {actor_loss.numpy():8.4f} | Critic Loss: {critic_loss.numpy():8.4f}")


## Evaluation and Visualization

After training, we evaluate the trained actor model over multiple episodes to estimate performance.  
Finally, we plot the training rewards to visualize learning progress.


In [None]:
# Evaluate trained actor model: collect the total reward of each test episode
test_episodes = 100
test_rewards = []
for _ in range(test_episodes):
    test_rewards.append(evaluate_episode(actor_model, env))
print(f"\nAverage reward over {test_episodes} test episodes: {np.mean(test_rewards):.2f}")

# Plot training rewards (one point per training episode)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(reward_history, label='Episode Reward')
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
ax.set_title('Training Progress of Actor-Critic on CartPole-v1')
ax.legend()
ax.grid(True)
plt.show()


## Conclusion

- Our actor-critic agent learns a policy that balances the pole effectively over time.  
- The training rewards generally increase, demonstrating improvement in policy quality.  
- This simple architecture and training loop provide a solid foundation for policy gradient methods with function approximation.  
- Potential improvements include tuning hyperparameters, experimenting with more complex networks, or applying to other environments.

---

Thanks for following along! Feel free to modify the code and explore further.
