In [None]:
!pip install --upgrade grpcio grpcio-tools

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the TensorBoard notebook extension; the %tensorboard magic used further down needs it.
%load_ext tensorboard

In [2]:
import gymnasium as gym
from gymnasium.envs.registration import register, registry
import time
import numpy as np
import pygame

import matplotlib
import matplotlib.pyplot as plt

from typing import Any, Dict
import torch
import torch.nn as nn
import tensorboard

from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from stable_baselines3.common.vec_env import SubprocVecEnv

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

import environments

2025-01-30 14:58:36.734436: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-30 14:58:36.834676: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738249116.891788   18303 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738249116.908732   18303 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-30 14:58:37.031300: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
# Expose the custom environment to gym.make under the id 'MarineEnv-v0'.
# The entry point is a "module:Class" string reference. The membership check
# skips registration when the id is already in the registry, so this cell can
# be re-run on a live kernel.
if 'MarineEnv-v0' not in registry:
    register(id='MarineEnv-v0', entry_point='environments:MarineEnv')

In [4]:
# is_ipython = 'inline' in matplotlib.get_backend()
# if is_ipython:
#     from IPython import display

# plt.ion()

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Keyword arguments forwarded to the MarineEnv constructor through gym.make.
# Every environment built in the training / evaluation cells below starts from
# this one configuration.
env_kwargs = {
    'render_mode': 'rgb_array',
    'continuous': True,
    'max_episode_steps': 1200,
    'training_stage': 2,
    'timescale': 1 / 3,
}

In [None]:
# Factory for one monitored environment instance
def make_env():
    """Build a ``MarineEnv-v0`` from the global ``env_kwargs`` and wrap it in ``Monitor``.

    ``Monitor`` is applied to the individual environment, i.e. before the
    environment is handed to any vectorized wrapper.

    :return: The ``Monitor``-wrapped environment.
    """
    return Monitor(gym.make('MarineEnv-v0', **env_kwargs))

# Vectorize the single monitored environment first, then stack VecNormalize on
# top of it to normalize observations and rewards (clip_obs=10.0).
env = VecNormalize(
    DummyVecEnv([make_env]),
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.0,
)

In [None]:
# Reset `env`; the value returned by reset() is displayed as the cell output.
env.reset()

In [None]:
# Display the result of get_original_obs() for comparison with the reset() output above.
# NOTE(review): presumably the un-normalized observation; this only works while
# `env` is still the VecNormalize wrapper built earlier (a later cell rebinds `env`).
env.get_original_obs()

In [6]:
# Two environments built from the same configuration:
#   vec_env - `n_envs` copies behind a vectorized interface
#   env     - one plain gym environment (the one handed to PPO below)
n_envs = 4
vec_env = make_vec_env('MarineEnv-v0', n_envs=n_envs, env_kwargs=env_kwargs)
env = gym.make('MarineEnv-v0', **env_kwargs)

In [7]:
# PPO hyperparameters, unpacked into the PPO constructor in the next cell.
# For orientation (per the stable-baselines3 PPO documentation) the library
# defaults are: clip_range=0.2, ent_coef=0.0, gamma=0.99, learning_rate=3e-4,
# n_steps=2048, batch_size=64, gae_lambda=0.95, max_grad_norm=0.5 and a
# two-layer Tanh MLP with 64 units per layer.
kwargs = dict(
    clip_range=0.2,        # PPO clipping range for the policy update
    ent_coef=0.01,         # entropy bonus to encourage exploration
    gamma=0.99,            # discount factor
    learning_rate=3e-4,    # optimizer step size
    n_steps=2048,          # environment steps collected per rollout
    batch_size=512,        # minibatch size for each gradient step
    gae_lambda=0.95,       # Generalized Advantage Estimation smoothing
    max_grad_norm=0.9,     # gradient clipping threshold
    policy_kwargs=dict(
        net_arch=[256, 256],           # two hidden layers of 256 units
        activation_fn=torch.nn.Tanh,
    ),
)

In [8]:
# Build the PPO agent on the single plain `env` (the vectorized `vec_env`
# alternative is left commented out). As the cell output shows, the library
# wraps a plain environment in Monitor and DummyVecEnv on its own.
# Training metrics are written under ./stage_1_tensorboard_logs/.
model = PPO(
    'MlpPolicy',
    env,
    # env=vec_env,
    tensorboard_log='./stage_1_tensorboard_logs/',
    device='cpu',
    verbose=1,
    **kwargs,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [10]:
# Train for 100k environment steps. `total_timesteps` is an integer budget, so
# pass an int literal rather than the float 1e5.
# reset_num_timesteps=False keeps the model's timestep counter running across
# repeated executions of this cell, so TensorBoard curves continue, not restart.
model.learn(total_timesteps=100_000, reset_num_timesteps=False, progress_bar=True, tb_log_name='ppo_2')

Logging to ./stage_1_tensorboard_logs/ppo_2_0


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 188      |
|    ep_rew_mean     | -727     |
| time/              |          |
|    fps             | 527      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 179         |
|    ep_rew_mean          | -611        |
| time/                   |             |
|    fps                  | 387         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007825161 |
|    clip_fraction        | 0.0971      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.83       |
|    explained_variance   | -0.000815   |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x7f19eb716350>

In [11]:
# Evaluate the trained policy on a fresh environment for 10 deterministic episodes.
# The environment is wrapped in Monitor so evaluate_policy reads the episode
# returns recorded by Monitor (the library warns when the wrapper is missing),
# and it is closed afterwards so the evaluation environment is not leaked.
eval_env = Monitor(gym.make('MarineEnv-v0', **env_kwargs))
try:
    mean, std = evaluate_policy(model=model, env=eval_env, n_eval_episodes=10, deterministic=True)
finally:
    eval_env.close()
print(f'Mean: {mean:.2f}, Std: {std:.2f}')



Mean: 709.47, Std: 513.89


In [None]:
# NOTE(review): hard-coded PID left over from one particular session (presumably
# a stale TensorBoard server). On any other run this PID refers to nothing or to
# an unrelated process - look up the actual PID before executing this cell.
!kill 5813

In [9]:
# Launch TensorBoard on the directory the PPO model logs to.
# NOTE(review): --host=0.0.0.0 binds the server to all network interfaces, which
# exposes it to other machines on the network; confirm that this is intended.
%tensorboard --logdir ./stage_1_tensorboard_logs/ --host=0.0.0.0

In [13]:
# Persist the trained policy under the name "ppo_marine_good".
# The normalization statistics are NOT saved here (the env.save call is
# commented out), yet a later cell loads "ppo_normalized_env.pkl"; that file has
# to come from an earlier run in which `env` was a VecNormalize wrapper.
# env.save("ppo_normalized_env.pkl")
model.save("ppo_marine_good")
# Alternatives kept for reference: restore a previously saved policy instead.
# model = model.load("ppo_marine_stage_2", device='cpu')
# model = model.load('ppo_marine_stage_1.zip')

In [None]:
import cv2
import numpy as np

# VecNormalize.load attaches saved statistics to a *vectorized* environment, so
# build a fresh DummyVecEnv here instead of relying on whatever `env` happens to
# be bound to (an earlier cell rebinds it to a plain gym environment).
# render_mode is forced to 'rgb_array' because env.get_images() is used below.
# NOTE(review): "ppo_normalized_env.pkl" must already exist; the env.save(...)
# call that would create it is commented out in the save cell.
# NOTE(review): the statistics only make sense for a model that was trained on a
# VecNormalize-wrapped environment - confirm this matches `model`.
env = VecNormalize.load(
    "ppo_normalized_env.pkl",
    DummyVecEnv([lambda: Monitor(gym.make('MarineEnv-v0', **{**env_kwargs, 'render_mode': 'rgb_array'}))]),
)

# Evaluation mode: stop updating the running statistics and report raw rewards.
env.training = False
env.norm_reward = False

try:
    obs = env.reset()
    for _step in range(100):
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, dones, _infos = env.step(action)

        # get_images() may return an empty list or None entries; skip those.
        images = env.get_images()
        if images and images[0] is not None:
            frame = images[0]

            # Only display frames that have a non-zero height and width.
            if frame.shape[0] > 0 and frame.shape[1] > 0:
                # rgb_array frames are in RGB channel order while OpenCV displays
                # BGR; convert so that colours are not swapped on screen.
                cv2.imshow("PPO MarineEnv Evaluation", cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                cv2.waitKey(1)  # Display for 1ms
            else:
                print("Warning: Received an empty frame from env.get_images()")

        # Single sub-environment: stop after its first finished episode.
        if dones[0]:
            break
finally:
    # Release the environment and the display window even if a step failed.
    env.close()
    cv2.destroyAllWindows()


In [None]:
# Watch the trained policy for up to 10 episodes in a human-rendered environment.
timescale = 1 / 6
max_steps = int(400 / timescale)  # step budget per episode

for _episode in range(10):
    env = gym.make('MarineEnv-v0', render_mode='human', continuous=True, training_stage=2, timescale=timescale, training=False)
    try:
        state, _info = env.reset()
        print(state)
        episode_rewards = 0
        for _step in range(max_steps):
            # predict() returns (action, recurrent_state); only the action is needed.
            action, _ = model.predict(state, deterministic=True)
            observation, reward, terminated, truncated, info = env.step(action)
            env.render()
            time.sleep(0.005)
            episode_rewards += reward
            print('===========================')
            print(observation)
            print(reward)

            # Update `state` before the termination check so the summary below
            # reports the true final observation of the episode.
            state = observation
            if terminated or truncated:
                break

        # Episode summary, printed exactly once per episode.
        print(episode_rewards)
        print(state)
    finally:
        # Close the environment even if a step raised.
        env.close()

[253.49675    10.495135   13.182254   75.36208    -4.4058228  75.36208
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.       ]
[250.99641    10.492107   13.153106   75.21714    -1.9097087  75.19542
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.       ]
4.291481018066406
[249.49289    10.488469   13.123952   75.07646    -0.4070899  75.028755
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.       ]
9.29153823852539
[248.6651      10.484145    13.0948105   74.94064      0.42163566
  74.86209      0.           0.           0.           0.
   0.           0.           0.           0.           0.
   0.        ]
10.291414260864258
[248.23483    10.479444   13.065684   74.80751     0.8538047  74.69543
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.       ]


In [None]:
# Query the policy once for `state` (left over from the rollout cell above) and
# display what predict() returns; the rollout cells use element [0] as the action.
model.predict(state)

In [None]:
# Roll the trained policy out for 400 steps in `n_envs` human-rendered environments.

# Derive the rendering configuration from a copy so the shared `env_kwargs`
# keeps render_mode='rgb_array' for every other cell. The timescale goes in
# through the constructor arguments: assigning it on `vec_env.unwrapped` would
# only set an attribute on the vectorized wrapper, not on the sub-environments.
render_env_kwargs = {**env_kwargs, 'render_mode': 'human', 'timescale': 1 / 3}

vec_env = make_vec_env(env_id='MarineEnv-v0', n_envs=n_envs, env_kwargs=render_env_kwargs)

try:
    state = vec_env.reset()
    # One accumulator per sub-environment. Vectorized environments restart
    # finished episodes automatically, so this is the reward collected over the
    # whole 400-step window, not the return of a single episode.
    episode_rewards = np.zeros(n_envs)
    for _step in range(400):
        # predict() returns (actions, recurrent_states); only the actions are needed.
        action, _ = model.predict(state, deterministic=True)
        observation, reward, dones, info = vec_env.step(action)
        vec_env.render()
        time.sleep(0.005)
        episode_rewards += reward
        state = observation

    print(episode_rewards)
    print(state)
finally:
    # Close the environment created in this cell (not the unrelated global `env`).
    vec_env.close()

# Optimizing hyperparameters with Optuna

In [None]:
# --- Study-level budget ---
N_TRIALS = 100  # Upper bound on the number of trials
N_JOBS = 1  # Number of trials run in parallel
N_STARTUP_TRIALS = 5  # Handed to both the sampler and the pruner as n_startup_trials
TIMEOUT = 15 * 60  # Wall-clock limit for the study, in seconds (15 minutes)

# --- Per-trial training / evaluation budget ---
N_TIMESTEPS = 20_000  # Training budget of one trial
N_EVALUATIONS = 2  # Number of evaluations during the training
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS  # Training budget split evenly between evaluations
N_EVAL_ENVS = 10  # Sub-environments per vectorized env (objective uses it for training and evaluation)
N_EVAL_EPISODES = 10  # Episodes per evaluation

ENV_ID = 'MarineEnv-v0'

# Constructor arguments shared by every trial; sampled hyperparameters are merged on top.
DEFAULT_HYPERPARAMS = {'policy': 'MlpPolicy'}

def make_env(env_id: str, env_kwargs: dict):
    """
    Build a zero-argument factory for one monitored environment.

    Nothing is constructed when this function is called; the environment is
    created each time the returned callable is invoked. This definition
    replaces the zero-argument ``make_env`` defined earlier in the notebook.

    :param env_id: ID of the Gym environment.
    :param env_kwargs: Keyword arguments forwarded to ``gym.make``.
    :return: A callable that creates the environment and wraps it in ``Monitor``.
    """
    def _init():
        # Monitor wraps the individual environment, i.e. before vectorization.
        return Monitor(gym.make(env_id, **env_kwargs))

    return _init

def make_vec_env(env_id: str, n_envs: int, env_kwargs: dict):
    """
    Build a ``DummyVecEnv`` holding ``n_envs`` monitored copies of an environment.

    Note that this definition shadows the ``make_vec_env`` helper imported from
    stable-baselines3 at the top of the notebook.

    :param env_id: The ID of the Gym environment.
    :param n_envs: Number of sub-environments to create.
    :param env_kwargs: Keyword arguments for environment configuration.
    :return: The vectorized environment.
    """
    env_fns = []
    for _ in range(n_envs):
        # One independent factory per sub-environment.
        env_fns.append(make_env(env_id, env_kwargs))
    return DummyVecEnv(env_fns)


In [None]:
def sample_ppo_params(trial: "optuna.Trial") -> Dict[str, Any]:
    """
    Sample one PPO hyperparameter configuration from an Optuna trial.

    ``n_steps`` and ``batch_size`` are sampled as exponents of two, so the
    Optuna parameters of those names hold the exponent, not the value used for
    training. The resolved values are stored as user attributes so that trial
    reports show what was actually trained with.

    :param trial: Optuna trial used to draw the values.
    :return: Keyword arguments for the ``PPO`` constructor.
    """
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    # Exponents 7..12 -> 128..4096 steps per environment and update
    n_steps = 2 ** trial.suggest_int('n_steps', 7, 12)

    # Exponents 5..10 -> minibatches of 32..1024 samples
    batch_size = 2 ** trial.suggest_int('batch_size', 5, 10)

    gamma = trial.suggest_float('gamma', 0.9, 0.9999)  # Discount factor (close to 1 for long-term rewards)

    gae_lambda = trial.suggest_float('gae_lambda', 0.8, 1.0)  # GAE lambda (trade-off bias/variance)

    clip_range = trial.suggest_float('clip_range', 0.1, 0.3)  # PPO clipping range

    ent_coef = trial.suggest_float('ent_coef', 0.0001, 0.1, log=True)  # Entropy coefficient (for exploration)

    vf_coef = trial.suggest_float('vf_coef', 0.1, 1.0)  # Value function loss coefficient

    max_grad_norm = trial.suggest_float('max_grad_norm', 0.3, 5.0)  # Gradient clipping

    target_kl = trial.suggest_float('target_kl', 0.01, 0.2)  # KL divergence target

    n_epochs = trial.suggest_int('n_epochs', 3, 10)  # PPO update epochs per batch

    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])

    net_arch = trial.suggest_categorical('net_arch', ['tiny', 'small'])

    # Translate the categorical labels into the objects PPO expects
    net_arch = [128, 128] if net_arch == 'tiny' else [256, 256]
    activation_fn = {'tanh': nn.Tanh, 'relu': nn.ReLU}[activation_fn]

    # Record the resolved values next to the sampled parameters
    trial.set_user_attr('gamma', gamma)
    trial.set_user_attr('n_steps', n_steps)
    trial.set_user_attr('batch_size', batch_size)

    return {
        'n_steps': n_steps,
        'batch_size': batch_size,
        'gamma': gamma,
        'gae_lambda': gae_lambda,
        'learning_rate': learning_rate,
        'clip_range': clip_range,
        'ent_coef': ent_coef,
        'vf_coef': vf_coef,
        'max_grad_norm': max_grad_norm,
        'target_kl': target_kl,
        'n_epochs': n_epochs,
        'policy_kwargs': {
            'net_arch': net_arch,
            'activation_fn': activation_fn
        }
    }

In [None]:
class TrialEvalCallback(EvalCallback):
    """
    Evaluation callback that reports intermediate results to an Optuna trial.

    After every evaluation the latest mean reward is reported to the trial;
    when Optuna asks for the trial to be pruned, ``is_pruned`` is set and
    training is stopped by returning ``False`` from the step hook.

    :param eval_env: Evaluation environment (``objective`` passes a vectorized one)
    :param trial: Optuna trial object
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq: Evaluate the agent every ``eval_freq`` calls of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose: Verbosity level forwarded to ``EvalCallback``.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        # Number of evaluations reported so far; used as the Optuna step index.
        self.eval_idx = 0
        # Becomes True once Optuna has requested pruning of this trial.
        self.is_pruned = False

    def _on_step(self) -> bool:
        """Evaluate when due, report to Optuna and return ``False`` to stop a pruned trial."""
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # The parent class runs the evaluation and updates last_mean_reward.
        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True

        self.is_pruned = True
        return False

In [None]:
def objective(trial: optuna.Trial) -> float:
    """
    Objective function used by Optuna to evaluate one configuration (i.e., one set of hyperparameters).

    Trains a PPO agent for ``N_TIMESTEPS`` with the sampled hyperparameters and
    evaluates it periodically through ``TrialEvalCallback``.

    :param trial: Optuna trial object.
    :return: Mean episodic reward of the last evaluation, or NaN when training
        aborted with an ``AssertionError``.
    :raises optuna.exceptions.TrialPruned: If the pruner stopped the trial.
    """

    kwargs = DEFAULT_HYPERPARAMS.copy()

    # Sample hyperparameters
    kwargs.update(**sample_ppo_params(trial))

    # Environment configuration used for both training and evaluation
    env_kwargs = {
        "render_mode": "rgb_array",
        "continuous": True,
        "max_episode_steps": 400,
        "training_stage": 2,
        "timescale": 1
    }

    # Create the training environment
    train_env = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS, env_kwargs=env_kwargs)

    # Create the RL model. `progress_bar` is an argument of learn(), not of the
    # PPO constructor, so it is only passed to learn() below.
    model = PPO(device='cpu', verbose=1, env=train_env, **kwargs)

    # Create evaluation environment with same parameters
    eval_envs = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS, env_kwargs=env_kwargs)

    # The callback counts calls, and each call advances every sub-environment by
    # one step. Scale the timestep-based EVAL_FREQ accordingly; otherwise the
    # evaluation threshold is never reached within the training budget.
    eval_freq = max(EVAL_FREQ // train_env.num_envs, 1)

    # Create callback for evaluation
    eval_callback = TrialEvalCallback(eval_envs, trial, N_EVAL_EPISODES, eval_freq, deterministic=True, verbose=0)

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback, progress_bar=True)
    except AssertionError as e:
        # Treated as a failed configuration (e.g. NaN during training)
        print(e)  # Debugging
        nan_encountered = True
    finally:
        # Free the environments whether or not training succeeded
        model.env.close()
        eval_envs.close()

    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


In [None]:
# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Do not prune before 1/3 of the evaluations have been reported.
# Note: with N_EVALUATIONS < 3 the integer division yields 0 warm-up steps.
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3
)
# Create the study and start the hyperparameter optimization
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    # Deliberate: a manual interrupt ends the search early and the trials
    # finished so far are still reported below.
    pass

print("Number of finished trials: ", len(study.trials))

# Write report (done first so it is produced even when no trial completed)
study.trials_dataframe().to_csv("study_results_ppo_marineenv.csv")

# study.best_trial raises when no trial has completed (for example after an
# early interrupt, or when every trial was pruned or failed), so check first.
completed_trials = study.get_trials(deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,))

if not completed_trials:
    print("No completed trials: skipping best-trial summary and plots.")
else:
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print(f"    {key}: {value}")

    fig1 = plot_optimization_history(study)
    fig1.show()

    # Parameter importances cannot be estimated from a single completed trial.
    if len(completed_trials) > 1:
        fig2 = plot_param_importances(study)
        fig2.show()
    else:
        print("Only one completed trial: skipping the parameter-importance plot.")