In [1]:
%matplotlib inline
%load_ext tensorboard

In [None]:
import numpy as np

from collections import deque

import matplotlib
import matplotlib.pyplot as plt

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import torch.optim as optim


from tqdm import tqdm

# Gym
import gymnasium as gym
# import gym_pygame

from gymnasium.envs.registration import register, registry

In [16]:
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
if 'MarineEnv-v0' not in registry:
    register(
        id='MarineEnv-v0',
        entry_point='environments:MarineEnv',  # String reference to the class
    )

In [18]:
env_id = 'MarineEnv-v0'
# Create the env
env = gym.make(env_id, continuous=True)

# Create the evaluation env
eval_env = gym.make(env_id)

# Get the state space and action space
s_size = env.observation_space.shape[0]
a_size = env.action_space.shape[0]

In [19]:
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is:  36
Sample observation [ 2.76829620e+02  6.65461361e-01  2.33108807e+01  4.62088928e+01
  1.20273834e+02  2.08435669e+02  3.91180573e+01  2.22652969e+01
  2.48438370e+02  1.83602009e+01 -5.42292099e+01  1.92076096e+02
  1.61909504e+01  1.74007301e+01 -1.12170672e+00 -4.71105270e+01
 -1.91446018e+01  2.31901665e+01  3.29247345e+02  1.41065136e-01
  6.54572067e+01  2.28515656e+02  3.44887772e+01 -1.82503951e+00
 -7.41438818e+00 -3.30053673e+01  2.74871311e+01  4.98555756e+01
  9.88023300e+01  1.97848873e+01  1.37403397e+02  6.96321487e+01
  4.77097206e+01  1.35796299e+01  5.98651600e+00 -1.20427475e+01]


In [20]:
print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample()) # Take a random action


 _____ACTION SPACE_____ 

The Action Space is:  2
Action Space Sample [0.7571459  0.86562335]


In [30]:
marine_env_params = {
    "h_size": 128,
    "n_training_episodes": 10000,
    "n_evaluation_episodes": 10,
    "max_t": 1000,
    "lr": 1e-4,
    "env_id": env_id,
    "continuous": True,
    "state_space": s_size,
    "action_space": a_size,
    'gamma': 0.99,
    'clip_epsilon': 0.2,  # PPO Clipping
    'update_epochs': 4,  # Multiple updates per batch
    'print_every': 10
}

In [46]:
class ContinuousPolicy(nn.Module):
    def __init__(self, s_size, a_size, h_size):
        super(ContinuousPolicy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, h_size)

        # Actor (outputs mean and standard deviation for actions)
        self.mu_layer = nn.Linear(h_size, a_size)
        self.sigma_layer = nn.Linear(h_size, a_size)

        # Critic (outputs a single scalar value)
        self.value_layer = nn.Linear(h_size, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        mu = torch.tanh(self.mu_layer(x))  # Mean (bounded [-1,1])
        sigma = F.softplus(self.sigma_layer(x)) + 1e-5  # Ensure sigma > 0

        value = self.value_layer(x)  # Value function output
        return mu, sigma, value

    def act(self, state):
        """
        Given a state, take an action for a continuous action space.
        Returns: (action, log_prob, value)
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    
        mu, sigma, value = self.forward(state)  # Forward pass now returns a tensor for value
        dist = Normal(mu, sigma)
    
        action = dist.sample()  # Sample from Gaussian distribution
        log_prob = dist.log_prob(action).sum(dim=-1)  # Sum log probs over multiple actions
    
        return action.detach().cpu().numpy()[0], log_prob, value  # Return `value` as a tensor


In [47]:
def ppo_train(policy, optimizer, n_training_episodes, max_t, gamma, clip_epsilon, update_epochs, print_every):
    """
    Implements PPO with Gaussian policy and proper detach() usage.
    """

    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        saved_values = []
        rewards = []
        states = []
        actions = []

        state, _ = env.reset()
        
        for t in range(max_t):
            action, log_prob, value = policy.act(state)  

            # Detach `log_prob` and `value` before storing
            saved_log_probs.append(log_prob.detach())  
            saved_values.append(value.detach())
            actions.append(action)
            states.append(state)

            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)

            if terminated or truncated:
                break 

        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        # Compute advantage function
        returns = deque(maxlen=max_t)
        advantages = deque(maxlen=max_t)
        n_steps = len(rewards)

        last_advantage = 0
        last_value = saved_values[-1]

        for t in range(n_steps - 1, -1, -1):
            delta = rewards[t] + (gamma * saved_values[t + 1] if t + 1 < n_steps else last_value) - saved_values[t]
            last_advantage = delta + gamma * 0.95 * last_advantage  

            returns.appendleft(last_advantage + saved_values[t])  
            advantages.appendleft(last_advantage)  

        # Convert to tensors
        returns = torch.tensor(returns, dtype=torch.float32).to(device)
        advantages = torch.tensor(advantages, dtype=torch.float32).to(device)

        # Normalize advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # PPO Update
        for _ in range(update_epochs):
            optimizer.zero_grad()  
            
            new_log_probs = []
            new_values = []

            for state, action in zip(states, actions):
                _, new_log_prob, new_value = policy.act(state)
                new_log_probs.append(new_log_prob)
                new_values.append(new_value)

            new_log_probs = torch.stack(new_log_probs).to(device)
            new_values = torch.stack(new_values).squeeze().to(device)

            # Compute ratio correctly (log_prob must be detached!)
            ratio = torch.exp(new_log_probs - torch.tensor(saved_log_probs, dtype=torch.float32).to(device))

            # Clipped Surrogate Loss
            clipped_ratio = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon)
            policy_loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()

            # Separate Value Function Update
            value_loss = nn.MSELoss()(new_values, returns)

            # Combine losses for backpropagation
            total_loss = policy_loss + 0.5 * value_loss

            total_loss.backward()
            optimizer.step()

        if i_episode % print_every == 0:
            print(f"Episode {i_episode}\tAverage Score: {np.mean(scores_deque):.2f}")

    return scores


In [48]:
# Create policy and place it to the device
marine_env_policy = ContinuousPolicy(marine_env_params["state_space"], marine_env_params["action_space"], marine_env_params["h_size"]).to(device)
marine_env_optimizer = optim.Adam(marine_env_policy.parameters(), lr=marine_env_params["lr"])

In [49]:
scores = ppo_train(marine_env_policy,
                   marine_env_optimizer,
                   marine_env_params["n_training_episodes"], 
                   marine_env_params["max_t"],
                   marine_env_params["gamma"], 
                   marine_env_params["update_epochs"], 
                   marine_env_params["print_every"], 
                   100)

Episode 100	Average Score: -549.30
Episode 200	Average Score: -351.50
Episode 300	Average Score: -290.60
Episode 400	Average Score: -226.09
Episode 500	Average Score: -196.77
Episode 600	Average Score: -139.46
Episode 700	Average Score: -97.55
Episode 800	Average Score: -100.42
Episode 900	Average Score: -96.95
Episode 1000	Average Score: -69.82
Episode 1100	Average Score: -52.78


  advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
  return F.mse_loss(input, target, reduction=self.reduction)


ValueError: Expected parameter loc (Tensor of shape (1, 2)) of distribution Normal(loc: torch.Size([1, 2]), scale: torch.Size([1, 2])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan]], device='cuda:0', grad_fn=<TanhBackward0>)