In [None]:
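# Solving continuous-control Gymnasium tasks with PPO (currently LunarLander-v3 in continuous mode; BipedalWalker-v3 is kept as a commented-out alternative)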

In [None]:
!pip install swig
!pip install gymnasium['box2d']

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import math
import random
import matplotlib
import matplotlib.pyplot as plt

from collections import namedtuple, deque
from itertools import count

import gymnasium as gym

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

In [3]:
# True when matplotlib reports a backend whose name contains 'inline',
# i.e. figures are rendered into notebook output rather than a GUI window.
is_ipython = 'inline' in matplotlib.get_backend()

if is_ipython:
    # `display` is only imported (and only used by plot_rewards) for inline backends.
    from IPython import display

# enable interactive mode so figures can be redrawn while training runs
plt.ion()

# Device selection. The automatic CUDA/CPU choice is kept below as a
# commented-out alternative; the notebook is currently pinned to the CPU.
# NOTE(review): `device` is not referenced by any other cell visible in this
# notebook, so tensors and the model are created on the default device.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

In [4]:
# ---------------------------------------------------------------------------
# PPO hyperparameters
# Each constant is followed by a short note on what raising or lowering it does.
# ---------------------------------------------------------------------------

LEARNING_RATE = 0.0001
# Step size of the Adam optimizer.
# - Larger (> 1e-2): faster updates, but training may become unstable or diverge.
# - Smaller (< 1e-4): slower convergence, usually more stable.

GAMMA = 0.99
# Discount factor applied to future rewards.
# - Close to 1 (e.g. 0.99): long-term rewards matter almost as much as immediate ones.
# - Lower (e.g. 0.8): the agent becomes short-sighted, which hurts tasks that need planning.

CLIP_EPSILON = 0.1
# Half-width of the clipping interval for the PPO probability ratio.
# - Larger (e.g. 0.4): permits bigger policy changes per update; faster but riskier.
# - Smaller (e.g. 0.1): conservative updates; more stable but potentially slower.

ENTROPY_BETA = 0.01
# Weight of the entropy bonus in the total loss.
# - Larger (e.g. 0.05): more exploration, possibly slower convergence.
# - Smaller (e.g. 0.001): less exploration, risk of settling on a suboptimal policy early.

EPOCHS = 3
# Number of optimization passes over each collected rollout.
# - More (e.g. 10): squeezes more out of each batch, but may overfit the sampled data.
# - Fewer (e.g. 1): less overfitting risk, but the policy may be under-optimized per update.

STEPS_PER_UPDATE = 2_048
# Environment steps collected before each PPO update.
# - More (e.g. 5000): more diverse data per update, at a higher memory/compute cost.
# - Fewer (e.g. 512): more frequent but noisier updates.

MAX_TIMESTEPS = 200_000
# Total number of environment steps used for training.
# - More (> 1,000,000): needed for harder tasks, at the cost of training time.
# - Fewer (e.g. 50,000): quick runs that may not explore or learn enough.

HIDDEN_SIZE = 256
# Width of each hidden layer of the actor-critic network.
# - Larger (e.g. 256): more capacity, slower computation, possible overfitting.
# - Smaller (e.g. 16): may underfit and miss the environment dynamics.

LAM = 0.8
# Lambda of Generalized Advantage Estimation (GAE).
# - Close to 1 (e.g. 0.98): advantage estimates rely more on observed rewards,
#   giving lower bias but higher variance.
# - Lower (e.g. 0.8): advantage estimates rely more on the critic's one-step
#   predictions, giving lower variance but higher bias.


In [5]:
# Environment selection: exactly one gym.make line is active; the others are
# kept as ready-to-use alternatives.
# env = gym.make("BipedalWalker-v3", hardcore=False, render_mode="human")
# NOTE(review): render_mode="human" presumably draws every step in a window,
# which would slow training considerably — confirm whether it is wanted here.
env = gym.make("LunarLander-v3", render_mode="human", continuous=True)
# env = gym.make("InvertedDoublePendulum-v4", render_mode="human")

# Length of the first axis of the observation and action spaces; these size
# the input layer and the actor heads of the network.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

In [6]:
class ActorCritic(nn.Module):
    """
    Actor-critic network with a shared two-layer trunk, used by PPO.

    The trunk turns a state into features. Two actor heads turn those features
    into the parameters of a diagonal Gaussian over actions, and one critic
    head turns them into a scalar state-value estimate.

    Parameters:
    ----------
    state_dim : int
        Size of the state vector fed to the network.
    action_dim : int
        Number of action components.
    hidden_size : int
        Width of both hidden layers of the shared trunk.

    Returns (from ``forward``):
    -------
    mean : torch.Tensor
        Gaussian mean, squashed by tanh into (-1, 1). Shape: [batch_size, action_dim]
    std : torch.Tensor
        Gaussian standard deviation, limited to [1e-3, 1.0]. Shape: [batch_size, action_dim]
    value : torch.Tensor
        Estimated state value. Shape: [batch_size, 1]
    """

    def __init__(self, state_dim, action_dim, hidden_size):
        super().__init__()

        # Shared trunk
        self.layer1 = nn.Linear(state_dim, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)

        # Actor heads: one for the mean, one for the (pre-exponential) std
        self.actor_mean = nn.Linear(hidden_size, action_dim)
        self.actor_std = nn.Linear(hidden_size, action_dim)

        # Critic head
        self.critic = nn.Linear(hidden_size, 1)

    def forward(self, state):
        """
        Run the trunk once and evaluate all three heads on its features.

        Parameters:
        ----------
        state : torch.Tensor
            Batch of states. Shape: [batch_size, state_dim]

        Returns:
        -------
        mean : torch.Tensor
            Gaussian mean of the action distribution. Shape: [batch_size, action_dim]
        std : torch.Tensor
            Gaussian standard deviation of the action distribution. Shape: [batch_size, action_dim]
        value : torch.Tensor
            Estimated value of each state. Shape: [batch_size, 1]
        """
        # Two ReLU-activated linear layers shared by actor and critic.
        features = self.layer2(self.layer1(state).relu()).relu()

        # tanh keeps the mean inside (-1, 1).
        action_mean = self.actor_mean(features).tanh()

        # The head output is exponentiated to make it positive, then clamped
        # so the std stays within [1e-3, 1.0].
        action_std = self.actor_std(features).exp().clamp(min=1e-3, max=1.0)

        state_value = self.critic(features)

        return action_mean, action_std, state_value


In [7]:

# Build the actor-critic network for the selected environment and attach an
# Adam optimizer that updates all of its parameters.
model = ActorCritic(state_dim=state_dim, action_dim=action_dim, hidden_size=HIDDEN_SIZE)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


In [8]:

# Rollout buffer for the trajectory data gathered between two PPO updates
class Memory:
    """Plain container holding one list per recorded quantity of a rollout."""

    # Names of the per-step lists kept on every instance.
    _FIELDS = ("states", "actions", "log_probs", "rewards", "dones")

    def __init__(self):
        # Start with an empty list for each recorded quantity.
        for field in self._FIELDS:
            setattr(self, field, [])

    def clear(self):
        """Empty every list in place (the list objects themselves are kept)."""
        for field in self._FIELDS:
            getattr(self, field).clear()




In [9]:
# Single rollout buffer shared by the training cell, which clears and refills it for every update.
memory = Memory()

# Compute GAE-based return targets (advantage + value) for a rollout
def compute_gae(next_value, rewards, dones, values, gamma, lam, verbose=False):
    """
    Compute return targets with Generalized Advantage Estimation (GAE).

    Walking the rollout backwards, the TD residual and the GAE advantage are
        delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    and the value returned for step t is ``A_t + V(s_t)``.

    Args:
        next_value: Value estimate of the state that follows the last step
            (scalar); it is ignored when the last step is marked done.
        rewards: Sequence of per-step rewards.
        dones: Sequence of per-step done flags (bools or 0/1), same length as rewards.
        values: Sequence of per-step value predictions (scalars), same length as rewards.
        gamma: Discount factor.
        lam: GAE lambda.
        verbose: If True, print reward, value and TD residual for every step.
            Off by default because a rollout contains thousands of steps.

    Returns:
        List with one return target per step, in chronological order. The
        advantage of step t is ``returns[t] - values[t]``. An empty rollout
        yields an empty list.
    """
    # Append the bootstrap value on a copy so the caller's sequence is untouched.
    padded_values = list(values) + [next_value]

    gae = 0.0
    returns = []
    for step in reversed(range(len(rewards))):
        # 0.0 at an episode boundary, which stops value and advantage from
        # leaking across episodes; 1.0 otherwise.
        not_done = 1.0 - float(dones[step])
        delta = rewards[step] + gamma * padded_values[step + 1] * not_done - padded_values[step]
        gae = delta + gamma * lam * not_done * gae
        # Append now and reverse once at the end; inserting at the front
        # would make the loop quadratic in the rollout length.
        returns.append(gae + padded_values[step])
        if verbose:
            print(f"Step {step}: Reward={rewards[step]}, Value={padded_values[step]}, Delta={delta}")

    returns.reverse()
    return returns


In [10]:
# Live plot of per-episode rewards, used to monitor learning progress
episode_rewards = []
def plot_rewards(show_result=False):
    """
    Redraw figure 1 with the reward of every finished episode and, once at
    least 100 episodes exist, their 100-episode moving average.

    Parameters:
    ----------
    show_result : bool
        If True, title the plot 'Result' and leave it displayed; otherwise
        title it 'Training' and clear the notebook output after displaying it
        so the next call replaces the figure.
    """
    window = 100
    history = torch.tensor(episode_rewards, dtype=torch.float)

    plt.figure(1)
    plt.clf()
    plt.title('Result' if show_result else 'Training')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.plot(history.numpy(), label='Episode Rewards')

    if len(history) >= window:
        # Mean over every sliding window of 100 consecutive episodes ...
        rolling = history.unfold(0, window, 1).mean(1)
        # ... left-padded with zeros so entry i lines up with episode i.
        aligned = torch.cat((torch.zeros(window - 1), rolling))
        plt.plot(aligned.numpy(), label='100-Episode Reward Average', linestyle='--')

    plt.legend()
    plt.pause(0.001)  # brief pause so the figure is refreshed

    # Outside an inline backend there is nothing more to do.
    if not is_ipython:
        return

    display.display(plt.gcf())
    if not show_result:
        display.clear_output(wait=True)
    

In [11]:
# PPO training loop: alternate between collecting up to STEPS_PER_UPDATE
# transitions with the current policy and running EPOCHS full-batch PPO
# updates on them, until MAX_TIMESTEPS environment steps have been taken.
timesteps = 0

# Reset once, before the first rollout. An episode that is still running when
# a rollout ends is continued in the next rollout, so no partial episode (or
# its accumulated reward) is thrown away at update boundaries.
state, _ = env.reset()
cumulative_reward = 0.0  # reward accumulated by the episode currently running

# Bounds used to clip sampled actions before they are sent to the environment.
action_low, action_high = env.action_space.low, env.action_space.high

while timesteps < MAX_TIMESTEPS:
    memory.clear()
    critic_values = []  # critic predictions V(s_t) for this rollout, as floats
    done = False

    # ---- Collect a rollout ----
    for _ in range(STEPS_PER_UPDATE):
        # torch.tensor copies, so the stored state cannot alias an array the
        # environment might reuse.
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # [1, state_dim]

        # No gradients are needed while acting; log-probs and values are
        # recomputed with gradients during the update below.
        with torch.no_grad():
            mean, std, value = model(state_tensor)
            dist = Normal(mean, std)
            # Keep the raw Gaussian sample: its log-prob is evaluated under
            # the very distribution it was drawn from, which keeps the PPO
            # probability ratio exact.
            action = dist.sample()
            log_prob = dist.log_prob(action).sum(dim=-1)

        # Only the copy sent to the environment is clipped to the valid range.
        env_action = np.clip(action.squeeze(0).numpy(), action_low, action_high)
        next_state, reward, terminated, truncated, _ = env.step(env_action)
        # NOTE(review): truncation is treated like termination here, i.e. the
        # value of the final state is not bootstrapped on a time-limit cut-off.
        done = terminated or truncated

        cumulative_reward += reward

        # Store trajectory data
        memory.states.append(state_tensor)
        memory.actions.append(action)
        memory.log_probs.append(log_prob)
        memory.rewards.append(reward)
        memory.dones.append(done)
        critic_values.append(value.item())

        state = next_state
        timesteps += 1

        if done:
            # Log the finished episode, refresh the plot and start a new episode.
            episode_rewards.append(cumulative_reward)
            plot_rewards()
            cumulative_reward = 0.0
            state, _ = env.reset()

        if timesteps >= MAX_TIMESTEPS:
            break

    # ---- Returns and advantages ----
    # Bootstrap from the critic unless the rollout ended exactly on an episode
    # boundary (the done flag of the last step masks the value in that case).
    if done:
        next_value = 0.0
    else:
        with torch.no_grad():
            next_value = model(torch.tensor(state, dtype=torch.float32).unsqueeze(0))[2].item()

    returns = compute_gae(
        next_value,
        memory.rewards,
        memory.dones,
        critic_values,
        GAMMA,
        LAM
    )
    advantages = np.asarray(returns, dtype=np.float64) - np.asarray(critic_values, dtype=np.float64)
    # Standardize advantages across the rollout.
    advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

    # Convert data to tensors
    states = torch.cat(memory.states)        # [N, state_dim]
    actions = torch.cat(memory.actions)      # [N, action_dim]
    log_probs = torch.cat(memory.log_probs)  # [N]
    returns = torch.as_tensor(returns, dtype=torch.float32)
    advantages = torch.as_tensor(advantages, dtype=torch.float32)

    # ---- PPO updates (full batch) ----
    for _ in range(EPOCHS):
        # Re-evaluate the stored actions under the current policy
        mean, std, values = model(states)
        dist = Normal(mean, std)
        new_log_probs = dist.log_prob(actions).sum(dim=-1)

        # Probability ratio between the current and the data-collecting policy
        ratio = torch.exp(new_log_probs - log_probs)

        # Clipped surrogate objective
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - CLIP_EPSILON, 1 + CLIP_EPSILON) * advantages
        policy_loss = -torch.min(surr1, surr2).mean()

        # Value loss; squeeze only the trailing dimension so a one-step
        # rollout keeps its batch dimension.
        value_loss = F.mse_loss(values.squeeze(-1), returns)

        # Entropy bonus
        entropy = dist.entropy().mean()

        # Total loss
        loss = policy_loss + 0.5 * value_loss - ENTROPY_BETA * entropy

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Log progress
    print(f"Timesteps: {timesteps}, Loss: {loss.item()}")

env.close()
print('Training Complete')
plot_rewards(show_result=True)  # Show the final plot
plt.ioff()
plt.show()

KeyboardInterrupt: 

<Figure size 640x480 with 0 Axes>

In [None]:
# Scratch check: shape of the most recent `action` as a NumPy array after dropping its leading (batch) dimension.
action.squeeze(0).detach().numpy().shape

In [None]:

# Smoke test of the environment with a random policy: 100 random steps from a seeded reset.
# NOTE(review): this cell overwrites the notebook-global `action`, `reward`,
# `terminated` and `truncated` left behind by the training cell, and it uses
# `env` after the training cell has already called env.close().
# Reset the environment to generate the first observation
observation, info = env.reset(seed=42)

for _ in range(100):
    # this is where you would insert your policy
    action = env.action_space.sample()

    # step (transition) through the environment with the action
    # receiving the next observation, reward and if the episode has terminated or truncated
    observation, reward, terminated, truncated, info = env.step(action)

    # If the episode has ended then we can reset to start a new episode
    if terminated or truncated:
        observation, info = env.reset()

env.close()

In [None]:
# Close the environment by hand; useful when the training cell was interrupted before reaching its own env.close().
env.close()

In [None]:
# Diagnostics: spread of the critic predictions from the most recent rollout
# and of the rewards of all episodes logged so far.
print(f"Critic values mean: {np.mean(critic_values)}, std: {np.std(critic_values)}")
print(f"Episode rewards mean: {np.mean(episode_rewards)}, std: {np.std(episode_rewards)}")


In [None]:
# Diagnostics: total loss and its policy/value components from the last PPO epoch that ran.
print(f"Loss: {loss.item()}, Policy Loss: {policy_loss.item()}, Value Loss: {value_loss.item()}")


In [None]:
# Diagnostics: the training cell standardizes the advantages before converting
# them to a tensor, so this is expected to print a mean near 0 and a std near 1.
print(f"Advantages mean: {advantages.numpy().mean()}, std: {advantages.numpy().std()}")



In [None]:
# Diagnostics: spot-check 10 randomly chosen rewards from the most recent rollout.
# random.sample raises ValueError if fewer than 10 rewards are stored.
print(random.sample(memory.rewards, 10))
    

In [None]:
# Diagnostics: mean entropy of the action distribution from the last PPO epoch that ran.
print(f"Entropy: {entropy.item()}")


In [None]:
The reward is defined by the environment and should be suitable for PPO. It also works when I train Lunar Lander with DQN, so I should not modify the reward.
The model:
ActorCritic(
  (layer1): Linear(in_features=8, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=256, bias=True)
  (actor_mean): Linear(in_features=256, out_features=2, bias=True)
  (actor_std): Linear(in_features=256, out_features=2, bias=True)
  (critic): Linear(in_features=256, out_features=1, bias=True)
)
How can I check whether the network is properly initialized?

In [None]:
# Diagnostics: variance over all action components stored for the most recent rollout.
print(torch.var(torch.cat(memory.actions)))


In [None]:
# Diagnostics: compare the mean return target with the mean critic prediction for the most recent rollout.
print(f"Returns mean: {returns.numpy().mean()}, Critic Values mean: {np.mean(critic_values)}")


In [None]:
# Scratch check: draw a tanh-squashed sample from `dist`, the most recently built action distribution.
torch.tanh(dist.sample())

In [None]:

 2045: Reward=0.2979755661431485, Value=-0.003003731369972229, Delta=0.2964518185499841
Step 2044: Reward=0.38367845851512034, Value=0.0006709322333335876, Delta=0.38003383222551423
Step 2043: Reward=0.27197683146362817, Value=-0.0011911019682884216, Delta=0.27383215634291685
Step 2042: Reward=0.3942888655563048, Value=-0.0008895173668861389, Delta=0.39399919197458544
Step 2041: Reward=0.3404212006632272, Value=0.001573670655488968, Delta=0.33796690781452093
Step 2040: Reward=0.28639629020577567, Value=0.002844579517841339, Delta=0.2851096446368684
Step 2039: Reward=0.3488555654650009, Value=0.005282096564769745, Delta=0.3463896026228941
Step 2038: Reward=0.3266526474470416, Value=0.007256411015987396, Delta=0.32462551203017626