In [None]:
!pip install --upgrade grpcio grpcio-tools

In [1]:
%matplotlib inline
%load_ext tensorboard

In [2]:
import gymnasium as gym
from gymnasium import spaces
from gymnasium.spaces.utils import flatten
from gymnasium.envs.registration import register, registry
import time
import numpy as np
import pygame

import matplotlib
import matplotlib.pyplot as plt

from typing import Any, Dict
import torch
import torch.nn as nn
import tensorboard

from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from stable_baselines3.common.vec_env import SubprocVecEnv

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

import environments

2025-01-28 22:20:56.264505: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-28 22:20:56.274709: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738102856.286687    2506 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738102856.290317    2506 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-28 22:20:56.303871: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
if 'MarineEnv-v0' not in registry:
    register(
        id='MarineEnv-v0',
        entry_point='environments:MarineEnv',  # String reference to the class
    )

In [None]:
# is_ipython = 'inline' in matplotlib.get_backend()
# if is_ipython:
#     from IPython import display

# plt.ion()

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
env_kwargs = dict(
    render_mode='rgb_array',
    continuous=True,
    max_episode_steps=400,
    training_stage=2,
    timescale=1
)

In [None]:
# Create the environment
def make_env():
    env = gym.make('MarineEnv-v0', **env_kwargs)
    env = Monitor(env)  # ✅ Apply Monitor FIRST before vectorization
    return env

# # Wrap it in `DummyVecEnv` FIRST
env = DummyVecEnv([make_env])  

# Now apply normalization
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

In [None]:
env.reset()

In [None]:
env.get_original_obs()

In [None]:
n_envs = 8  # Number of parallel environments
vec_env = make_vec_env(env_id='MarineEnv-v0', n_envs=n_envs, env_kwargs=env_kwargs)
env = gym.make('MarineEnv-v0', **env_kwargs)

In [None]:
kwargs = { 
    'clip_range': 0.2,  # Reduce to prevent large updates
    'ent_coef': 0.01,  # Higher entropy to encourage exploration
    'gamma': 0.99, 
    'learning_rate': 3e-4,  # Slightly higher for better learning
    'n_steps': 2048,  # Increase from default (512) to 2048
    'batch_size': 512,  # Adjust batch size for stability
    'gae_lambda': 0.95,  # Generalized Advantage Estimation smoothing
    'max_grad_norm': 0.9, 
    'policy_kwargs': {'net_arch': [256, 256], 'activation_fn': torch.nn.Tanh},  # Slightly deeper network
}

In [None]:
model = PPO(
    policy='MlpPolicy',
    env=env,
    # env=vec_env,
    verbose=1,
    device='cpu', 
    tensorboard_log='./stage_1_tensorboard_logs/',
    **kwargs
)

In [None]:
model.learn(total_timesteps=(3e4), reset_num_timesteps=False, progress_bar=True, tb_log_name='ppo_1', log_interval=1000)

In [None]:
eval_env = gym.make('MarineEnv-v0', **env_kwargs)
mean, std = evaluate_policy(model=model, env=eval_env, n_eval_episodes=10, deterministic=True)
print(f'Mean: {mean:.2f}, Std: {std:.2f}')

In [None]:
!kill 5813

In [None]:
%tensorboard --logdir ./stage_1_tensorboard_logs/ --host=0.0.0.0

In [None]:
# Save environment normalization stats
# env.save("ppo_normalized_env.pkl")
# model.save("ppo_marine_stage_2")
model = model.load("ppo_marine_stage_2")
# model = model.load('ppo_marine_stage_1.zip')

In [None]:
env = VecNormalize.load("ppo_normalized_env.pkl", env)

# Disable reward normalization for evaluation
env.training = False
env.norm_reward = False

import cv2
import numpy as np

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, dones, _ = env.step(action)

    # ✅ Ensure env.get_images() is not empty
    images = env.get_images()
    if images and images[0] is not None:
        frame = images[0]
        
        # ✅ Ensure the frame has valid dimensions before displaying
        if frame.shape[0] > 0 and frame.shape[1] > 0:
            cv2.imshow("PPO MarineEnv Evaluation", frame)
            cv2.waitKey(1)  # Display for 1ms
        else:
            print("Warning: Received an empty frame from env.get_images()")

    if dones:
        break

env.close()
cv2.destroyAllWindows()  # Close display window


In [None]:
timescale = 1 / 6
for _ in range(1):
    env = gym.make('MarineEnv-v0', render_mode='human', continuous=True, training_stage=2, timescale=timescale)
    state, _ = env.reset()
    print(state)
    episode_rewards = 0 
    # flatten_state = flatten(env.observation_space, state)
    # state = torch.tensor(flatten_state, dtype=torch.float32, device=device).unsqueeze(0)
    for _ in range(int(400 / timescale)):
        action = model.predict(state, deterministic=True)
        # print(action)
        # observation, reward, terminated, truncated, info = env.step((0, 0))
        observation, reward, terminated, truncated, info = env.step(action[0])
        env.render()
        time.sleep(0.01)
        episode_rewards += reward
        print('===========================')
        print(observation)
        print(reward)
        
        if terminated or truncated:
            print(episode_rewards)
            break
    
        state = observation
            
    print(episode_rewards)
    print(state)
    env.close()

In [None]:
model.predict(state)

In [None]:
# env_kwargs['render_mode'] = 'human'

# vec_env = make_vec_env(env_id='MarineEnv-v0', n_envs=n_envs, env_kwargs=env_kwargs)
# vec_env.unwrapped.timescale = 1 / 3

# state = vec_env.reset()
# # print(state)
# episode_rewards = 0 
# # flatten_state = flatten(env.observation_space, state)
# # state = torch.tensor(flatten_state, dtype=torch.float32, device=device).unsqueeze(0)
# for _ in range(400):
#     action = model.predict(state, deterministic=True)
#     # print(action)
#     # observation, reward, terminated, truncated, info = env.step((0, 0))
#     observation, reward, dones, info = vec_env.step(action[0])
#     vec_env.render()
#     time.sleep(0.05)
#     episode_rewards += reward
#     # print('===========================')
#     # print(observation)
#     # if terminated or truncated:
#     #     print(episode_rewards)
#     #     break

#     state = observation
        
# print(episode_rewards)
# print(state)
# env.close()

# Optimizing hyperparams

In [15]:
N_TRIALS = 100  # Maximum number of trials
N_JOBS = 1 # Number of jobs to run in parallel
N_STARTUP_TRIALS = 5  # Stop random sampling after N_STARTUP_TRIALS
N_EVALUATIONS = 2  # Number of evaluations during the training
N_TIMESTEPS = int(2e4)  # Training budget
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_ENVS = 10
N_EVAL_EPISODES = 10
TIMEOUT = int(60 * 15)  # 15 minutes

ENV_ID = 'MarineEnv-v0'


DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
}

def make_env(env_id: str, env_kwargs: dict):
    """
    Creates a Gym environment with given parameters.

    :param env_id: ID of the Gym environment.
    :param env_kwargs: Keyword arguments for environment configuration.
    :return: A function returning the created environment.
    """
    def _init():
        env = gym.make(env_id, **env_kwargs)
        env = Monitor(env)  # ✅ Apply Monitor FIRST before vectorization
        return env
    return _init

def make_vec_env(env_id: str, n_envs: int, env_kwargs: dict):
    """
    Creates a vectorized environment with the specified number of parallel environments.

    :param env_id: The ID of the Gym environment.
    :param n_envs: Number of parallel environments.
    :param env_kwargs: Keyword arguments for environment configuration.
    :return: A vectorized environment.
    """
    return DummyVecEnv([make_env(env_id, env_kwargs) for _ in range(n_envs)])


In [16]:
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    
    n_steps = 2 ** trial.suggest_int('n_steps', 7, 12)  # Number of steps per update (512-4096)
    
    batch_size = 2 ** trial.suggest_int('batch_size', 5, 10)  # Minibatch size (32-1024)
    
    gamma = trial.suggest_float('gamma', 0.9, 0.9999)  # Discount factor (close to 1 for long-term rewards)
    
    gae_lambda = trial.suggest_float('gae_lambda', 0.8, 1.0)  # GAE lambda (trade-off bias/variance)
    
    clip_range = trial.suggest_float('clip_range', 0.1, 0.3)  # PPO clipping range
    
    ent_coef = trial.suggest_float('ent_coef', 0.0001, 0.1, log=True)  # Entropy coefficient (for exploration)
    
    vf_coef = trial.suggest_float('vf_coef', 0.1, 1.0)  # Value function loss coefficient
    
    max_grad_norm = trial.suggest_float('max_grad_norm', 0.3, 5.0)  # Gradient clipping
    
    target_kl = trial.suggest_float('target_kl', 0.01, 0.2)  # KL divergence target
    
    n_epochs = trial.suggest_int('n_epochs', 3, 10)  # PPO update epochs per batch
    
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])
    
    net_arch = trial.suggest_categorical('net_arch', ['tiny', 'small'])
    
    # Convert architecture choices
    net_arch = [128, 128] if net_arch == 'tiny' else [256, 256]
    
    activation_fn = {'tanh': nn.Tanh, 'relu': nn.ReLU}[activation_fn]
    
    # Store gamma value in Optuna logs
    trial.set_user_attr('gamma', gamma)

    return {
        'n_steps': n_steps,
        'batch_size': batch_size,
        'gamma': gamma,
        'gae_lambda': gae_lambda,
        'learning_rate': learning_rate,
        'clip_range': clip_range,
        'ent_coef': ent_coef,
        'vf_coef': vf_coef,
        'max_grad_norm': max_grad_norm,
        'target_kl': target_kl,
        'n_epochs': n_epochs,
        'policy_kwargs': {
            'net_arch': net_arch,
            'activation_fn': activation_fn
        }
    }

In [17]:
class TrialEvalCallback(EvalCallback):
    """
    Callback used for evaluating and reporting a trial.
    
    :param eval_env: Evaluation environement
    :param trial: Optuna trial object
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq:   Evaluate the agent every ``eval_freq`` call of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose:
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):

        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Evaluate policy (done in the parent class)
            super()._on_step()
            self.eval_idx += 1
            # Send report to Optuna
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True

In [18]:
def objective(trial: optuna.Trial) -> float:
    """
    Objective function using by Optuna to evaluate one configuration (i.e., one set of hyperparameters).

    :param trial: Optuna trial object.
    :return: Mean episodic reward after training.
    """

    kwargs = DEFAULT_HYPERPARAMS.copy()
    
    # Sample hyperparameters
    kwargs.update(**sample_ppo_params(trial))

    # ✅ Ensure env_kwargs is passed correctly
    env_kwargs = {
        "render_mode": "rgb_array",
        "continuous": True,
        "max_episode_steps": 400,
        "training_stage": 2,
        "timescale": 1
    }

    # ✅ Create the training environment
    train_env = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS, env_kwargs=env_kwargs)

    # ✅ Create the RL model
    model = PPO(device='cpu', verbose=1, env=train_env, progress_bar=True, **kwargs)

    # ✅ Create evaluation environment with same parameters
    eval_envs = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS, env_kwargs=env_kwargs)

    # ✅ Create callback for evaluation
    eval_callback = TrialEvalCallback(eval_envs, trial, N_EVAL_EPISODES, EVAL_FREQ, deterministic=True, verbose=0)

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback, progress_bar=True)
    except AssertionError as e:
        print(e)  # Debugging
        nan_encountered = True
    finally:
        model.env.close()
        eval_envs.close()

    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


In [19]:
# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3
)
# Create the study and start the hyperparameter optimization
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    pass

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

print("  User attrs:")
for key, value in trial.user_attrs.items():
    print(f"    {key}: {value}")

# Write report
study.trials_dataframe().to_csv("study_results_ppo_marineenv.csv")

fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

fig1.show()
fig2.show()

[I 2025-01-28 22:23:24,027] A new study created in memory with name: no-name-9a55cdd5-b39b-4567-8c47-fb0345582ee2


Using cpu device


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 160      |
|    ep_rew_mean     | 178      |
| time/              |          |
|    fps             | 1277     |
|    iterations      | 1        |
|    time_elapsed    | 32       |
|    total_timesteps | 40960    |
---------------------------------


[I 2025-01-28 22:23:57,684] Trial 0 finished with value: -inf and parameters: {'learning_rate': 1.5982843041843627e-05, 'n_steps': 12, 'batch_size': 6, 'gamma': 0.9107238741574982, 'gae_lambda': 0.8655387073048394, 'clip_range': 0.2405527117611638, 'ent_coef': 0.02189762679402271, 'vf_coef': 0.7099663631144432, 'max_grad_norm': 1.554596593101503, 'target_kl': 0.1821336744932124, 'n_epochs': 6, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 75.8     |
|    ep_rew_mean     | 27.2     |
| time/              |          |
|    fps             | 1943     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 10240    |
---------------------------------


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 137      |
|    ep_rew_mean          | 293      |
| time/                   |          |
|    fps                  | 1444     |
|    iterations           | 2        |
|    time_elapsed         | 14       |
|    total_timesteps      | 20480    |
| train/                  |          |
|    approx_kl            | 906.5136 |
|    clip_fraction        | 0.5      |
|    clip_range           | 0.196    |
|    entropy_loss         | -2.86    |
|    explained_variance   | -0.0707  |
|    learning_rate        | 0.0309   |
|    loss                 | 1.39e+05 |
|    n_updates            | 1        |
|    policy_gradient_loss | 0.159    |
|    std                  | 1.02     |
|    value_loss           | 1.56e+05 |
--------------------------------------


[I 2025-01-28 22:24:12,021] Trial 1 finished with value: -inf and parameters: {'learning_rate': 0.03092872830963851, 'n_steps': 10, 'batch_size': 7, 'gamma': 0.9147320157831164, 'gae_lambda': 0.8803476955345844, 'clip_range': 0.19617610099375893, 'ent_coef': 0.030665442269363862, 'vf_coef': 0.44667162998308374, 'max_grad_norm': 0.5358331936750367, 'target_kl': 0.14127675178046406, 'n_epochs': 8, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 94.1     |
|    ep_rew_mean     | -89.1    |
| time/              |          |
|    fps             | 2629     |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 10240    |
---------------------------------


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 134      |
|    ep_rew_mean          | 122      |
| time/                   |          |
|    fps                  | 1772     |
|    iterations           | 2        |
|    time_elapsed         | 11       |
|    total_timesteps      | 20480    |
| train/                  |          |
|    approx_kl            | 200.4863 |
|    clip_fraction        | 0.5      |
|    clip_range           | 0.29     |
|    entropy_loss         | -2.84    |
|    explained_variance   | 0.000449 |
|    learning_rate        | 0.798    |
|    loss                 | 1.12e+03 |
|    n_updates            | 1        |
|    policy_gradient_loss | 0.121    |
|    std                  | 1.33     |
|    value_loss           | 1.21e+03 |
--------------------------------------


[I 2025-01-28 22:24:23,735] Trial 2 finished with value: -inf and parameters: {'learning_rate': 0.7975309759330347, 'n_steps': 10, 'batch_size': 7, 'gamma': 0.9032804943008576, 'gae_lambda': 0.9467036302486889, 'clip_range': 0.2899046586071111, 'ent_coef': 0.019693172330313827, 'vf_coef': 0.7630962040930603, 'max_grad_norm': 4.87596191329839, 'target_kl': 0.09136665790855543, 'n_epochs': 5, 'activation_fn': 'tanh', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device


  logger.warn(f"{pre} is not within the observation space.")


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 81.5     |
|    ep_rew_mean     | -105     |
| time/              |          |
|    fps             | 3245     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 5120     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 114         |
|    ep_rew_mean          | 35.4        |
| time/                   |             |
|    fps                  | 2330        |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.033033565 |
|    clip_fraction        | 0.256       |
|    clip_range           | 0.237       |
|    entropy_loss         | -2.84       |
|    explained_variance   | -0.138      |
|    learning_rate        | 0.

[I 2025-01-28 22:24:35,598] Trial 3 finished with value: -inf and parameters: {'learning_rate': 0.00020794866799093092, 'n_steps': 9, 'batch_size': 7, 'gamma': 0.9559894612955998, 'gae_lambda': 0.8915940776376616, 'clip_range': 0.23745629373876048, 'ent_coef': 0.0010649137937541323, 'vf_coef': 0.6579981026093832, 'max_grad_norm': 4.714404889555553, 'target_kl': 0.12410437587193673, 'n_epochs': 3, 'activation_fn': 'relu', 'net_arch': 'tiny'}. Best is trial 0 with value: -inf.


Using cpu device


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 74.2     |
|    ep_rew_mean     | -178     |
| time/              |          |
|    fps             | 4160     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2560     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 102         |
|    ep_rew_mean          | -131        |
| time/                   |             |
|    fps                  | 2918        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 5120        |
| train/                  |             |
|    approx_kl            | 0.014850467 |
|    clip_fraction        | 0.187       |
|    clip_range           | 0.203       |
|    entropy_loss         | -2.84       |
|    explained_variance   | -0.00325    |
|    learning_rate        | 0.

[I 2025-01-28 22:24:51,245] Trial 4 finished with value: -inf and parameters: {'learning_rate': 0.0023297626902393774, 'n_steps': 8, 'batch_size': 9, 'gamma': 0.9809862808116705, 'gae_lambda': 0.8508563111564579, 'clip_range': 0.20301189485619237, 'ent_coef': 0.011927237833534634, 'vf_coef': 0.4914691875480699, 'max_grad_norm': 3.3200769507748937, 'target_kl': 0.1023232058648112, 'n_epochs': 8, 'activation_fn': 'tanh', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 120      |
|    ep_rew_mean     | 200      |
| time/              |          |
|    fps             | 1106     |
|    iterations      | 1        |
|    time_elapsed    | 37       |
|    total_timesteps | 40960    |
---------------------------------


[I 2025-01-28 22:25:28,442] Trial 5 finished with value: -inf and parameters: {'learning_rate': 1.0561435922621972e-05, 'n_steps': 12, 'batch_size': 5, 'gamma': 0.9367308990623988, 'gae_lambda': 0.8268665354790885, 'clip_range': 0.10782572013383833, 'ent_coef': 0.00012260613301712407, 'vf_coef': 0.9622853971152655, 'max_grad_norm': 0.9286382833549754, 'target_kl': 0.1916031533527281, 'n_epochs': 10, 'activation_fn': 'relu', 'net_arch': 'tiny'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 86.5     |
|    ep_rew_mean     | 81.4     |
| time/              |          |
|    fps             | 833      |
|    iterations      | 1        |
|    time_elapsed    | 49       |
|    total_timesteps | 40960    |
---------------------------------


[I 2025-01-28 22:26:17,766] Trial 6 finished with value: -inf and parameters: {'learning_rate': 1.4860633441238727e-05, 'n_steps': 12, 'batch_size': 5, 'gamma': 0.9407024603538277, 'gae_lambda': 0.9915039953720793, 'clip_range': 0.2994576013425273, 'ent_coef': 0.09806256692246783, 'vf_coef': 0.11973878887949557, 'max_grad_norm': 2.011141117890231, 'target_kl': 0.011270195622464091, 'n_epochs': 5, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Using cpu device


Output()

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=128 and n_envs=10)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 43.5     |
|    ep_rew_mean     | -140     |
| time/              |          |
|    fps             | 4428     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 1280     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 60.6       |
|    ep_rew_mean          | -61.9      |
| time/                   |            |
|    fps                  | 3600       |
|    iterations           | 2          |
|    time_elapsed         | 0          |
|    total_timesteps      | 2560       |
| train/                  |            |
|    approx_kl            | 0.04103118 |
|    clip_fraction        | 0.453      |
|    clip_range           | 0.131      |
|    entropy_loss         | -2.84      |
|    explained_variance   | 0.071      |
|    learning_rate        | 0.000255   |
|   

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 255        |
|    ep_rew_mean          | -194       |
| time/                   |            |
|    fps                  | 2493       |
|    iterations           | 11         |
|    time_elapsed         | 5          |
|    total_timesteps      | 14080      |
| train/                  |            |
|    approx_kl            | 0.33272526 |
|    clip_fraction        | 0.291      |
|    clip_range           | 0.131      |
|    entropy_loss         | -2.84      |
|    explained_variance   | 0.664      |
|    learning_rate        | 0.000255   |
|    loss                 | 998        |
|    n_updates            | 59         |
|    policy_gradient_loss | 0.0253     |
|    std                  | 0.999      |
|    value_loss           | 1.21e+03   |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 261        |
|    ep_rew_mean          | -212       |
| time/                   |            |
|    fps                  | 2475       |
|    iterations           | 12         |
|    time_elapsed         | 6          |
|    total_timesteps      | 15360      |
| train/                  |            |
|    approx_kl            | 0.36932936 |
|    clip_fraction        | 0.132      |
|    clip_range           | 0.131      |
|    entropy_loss         | -2.84      |
|    explained_variance   | 0.675      |
|    learning_rate        | 0.000255   |
|    loss                 | 1.86e+03   |
|    n_updates            | 62         |
|    policy_gradient_loss | 0.00306    |
|    std                  | 0.999      |
|    value_loss           | 2.11e+03   |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 274        |
|    ep_rew_mean          | -152       |
| time/                   |            |
|    fps                  | 2452       |
|    iterations           | 13         |
|    time_elapsed         | 6          |
|    total_timesteps      | 16640      |
| train/                  |            |
|    approx_kl            | 0.19674988 |
|    clip_fraction        | 0.0355     |
|    clip_range           | 0.131      |
|    entropy_loss         | -2.84      |
|    explained_variance   | 0.0611     |
|    learning_rate        | 0.000255   |
|    loss                 | 1.81e+04   |
|    n_updates            | 65         |
|    policy_gradient_loss | 0.0207     |
|    std                  | 0.999      |
|    value_loss           | 9.44e+03   |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_me

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 272       |
|    ep_rew_mean          | -71.3     |
| time/                   |           |
|    fps                  | 2324      |
|    iterations           | 16        |
|    time_elapsed         | 8         |
|    total_timesteps      | 20480     |
| train/                  |           |
|    approx_kl            | 2.0969694 |
|    clip_fraction        | 0.00456   |
|    clip_range           | 0.131     |
|    entropy_loss         | -2.84     |
|    explained_variance   | -27.1     |
|    learning_rate        | 0.000255  |
|    loss                 | 3.27e+06  |
|    n_updates            | 79        |
|    policy_gradient_loss | -0.00158  |
|    std                  | 1         |
|    value_loss           | 2.35e+06  |
---------------------------------------


[I 2025-01-28 22:26:26,850] Trial 7 finished with value: -inf and parameters: {'learning_rate': 0.0002547539180369284, 'n_steps': 7, 'batch_size': 10, 'gamma': 0.9983575222849137, 'gae_lambda': 0.934000725398765, 'clip_range': 0.13120926031261398, 'ent_coef': 0.002364925236591501, 'vf_coef': 0.9770190246675703, 'max_grad_norm': 2.1767108172488663, 'target_kl': 0.1960624407037773, 'n_epochs': 6, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Using cpu device


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 112      |
|    ep_rew_mean     | 147      |
| time/              |          |
|    fps             | 1613     |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 20480    |
---------------------------------


[I 2025-01-28 22:26:39,730] Trial 8 finished with value: -inf and parameters: {'learning_rate': 0.006703127745458914, 'n_steps': 11, 'batch_size': 6, 'gamma': 0.9237676001587299, 'gae_lambda': 0.8040892375317884, 'clip_range': 0.23284602467671336, 'ent_coef': 0.00043696037449290594, 'vf_coef': 0.26023122426651113, 'max_grad_norm': 3.2833028360082244, 'target_kl': 0.04791892616755358, 'n_epochs': 3, 'activation_fn': 'tanh', 'net_arch': 'tiny'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 150      |
|    ep_rew_mean     | 83.8     |
| time/              |          |
|    fps             | 1891     |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 20480    |
---------------------------------


[I 2025-01-28 22:26:50,798] Trial 9 finished with value: -inf and parameters: {'learning_rate': 0.0001367032487664507, 'n_steps': 11, 'batch_size': 9, 'gamma': 0.9636837400232788, 'gae_lambda': 0.8656171272883338, 'clip_range': 0.159177253733036, 'ent_coef': 0.005764974844051823, 'vf_coef': 0.7536240110770918, 'max_grad_norm': 1.4816571334874062, 'target_kl': 0.16502835743111693, 'n_epochs': 8, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Using cpu device


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 102      |
|    ep_rew_mean     | -223     |
| time/              |          |
|    fps             | 4426     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 5120     |
---------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 163       |
|    ep_rew_mean          | -30.9     |
| time/                   |           |
|    fps                  | 3526      |
|    iterations           | 2         |
|    time_elapsed         | 2         |
|    total_timesteps      | 10240     |
| train/                  |           |
|    approx_kl            | 6.5433993 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.242     |
|    entropy_loss         | -3.07     |
|    explained_variance   | 0.000872  |
|    learning_rate        | 0.234     |
|    loss                 | 110       |
|    n_updates            | 1         |
|    policy_gradient_loss | 0.123     |
|    std                  | 1.26      |
|    value_loss           | 389       |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 208       |
|    ep_rew_mean          | 68.7      |
| time/                   |           |
|    fps                  | 3125      |
|    iterations           | 3         |
|    time_elapsed         | 4         |
|    total_timesteps      | 15360     |
| train/                  |           |
|    approx_kl            | 160.98416 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.242     |
|    entropy_loss         | -3.45     |
|    explained_variance   | -0.0829   |
|    learning_rate        | 0.234     |
|    loss                 | 402       |
|    n_updates            | 2         |
|    policy_gradient_loss | 0.157     |
|    std                  | 1.46      |
|    value_loss           | 1.05e+03  |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 236       |
|    ep_rew_mean          | 8.4       |
| time/                   |           |
|    fps                  | 2818      |
|    iterations           | 4         |
|    time_elapsed         | 7         |
|    total_timesteps      | 20480     |
| train/                  |           |
|    approx_kl            | 11.506716 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.242     |
|    entropy_loss         | -3.74     |
|    explained_variance   | -0.394    |
|    learning_rate        | 0.234     |
|    loss                 | 353       |
|    n_updates            | 3         |
|    policy_gradient_loss | 0.148     |
|    std                  | 1.68      |
|    value_loss           | 1.21e+03  |
---------------------------------------


[I 2025-01-28 22:26:58,245] Trial 10 finished with value: -inf and parameters: {'learning_rate': 0.2340047624345783, 'n_steps': 9, 'batch_size': 6, 'gamma': 0.9022606376484393, 'gae_lambda': 0.9218157984333905, 'clip_range': 0.24161963727754332, 'ent_coef': 0.053056511358456376, 'vf_coef': 0.3295018823275413, 'max_grad_norm': 3.311461216751466, 'target_kl': 0.06947790654268692, 'n_epochs': 10, 'activation_fn': 'tanh', 'net_arch': 'tiny'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 75.2     |
|    ep_rew_mean     | -9.06    |
| time/              |          |
|    fps             | 2166     |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 10240    |
---------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 146       |
|    ep_rew_mean          | 145       |
| time/                   |           |
|    fps                  | 1625      |
|    iterations           | 2         |
|    time_elapsed         | 12        |
|    total_timesteps      | 20480     |
| train/                  |           |
|    approx_kl            | 1471.1123 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.183     |
|    entropy_loss         | -2.84     |
|    explained_variance   | -0.0243   |
|    learning_rate        | 0.0305    |
|    loss                 | 1.27e+05  |
|    n_updates            | 1         |
|    policy_gradient_loss | 0.152     |
|    std                  | 1         |
|    value_loss           | 1.31e+05  |
---------------------------------------


[I 2025-01-28 22:27:11,030] Trial 11 finished with value: -inf and parameters: {'learning_rate': 0.03054220282589258, 'n_steps': 10, 'batch_size': 8, 'gamma': 0.9190655741963608, 'gae_lambda': 0.8818285162176263, 'clip_range': 0.18317528582647352, 'ent_coef': 0.026859288646219785, 'vf_coef': 0.4886607211697953, 'max_grad_norm': 0.3175377514081626, 'target_kl': 0.1427130081781469, 'n_epochs': 8, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 178      |
|    ep_rew_mean     | 63.1     |
| time/              |          |
|    fps             | 2140     |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 20480    |
---------------------------------


[I 2025-01-28 22:27:20,785] Trial 12 finished with value: -inf and parameters: {'learning_rate': 0.03181994405488457, 'n_steps': 11, 'batch_size': 6, 'gamma': 0.9228295627737116, 'gae_lambda': 0.8443421644709659, 'clip_range': 0.20312574629706168, 'ent_coef': 0.010628476063163715, 'vf_coef': 0.6351554807666688, 'max_grad_norm': 0.4790243381890379, 'target_kl': 0.15783456462012047, 'n_epochs': 7, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 59.4     |
|    ep_rew_mean     | -60      |
| time/              |          |
|    fps             | 620      |
|    iterations      | 1        |
|    time_elapsed    | 65       |
|    total_timesteps | 40960    |
---------------------------------


[I 2025-01-28 22:28:26,956] Trial 13 finished with value: -inf and parameters: {'learning_rate': 0.0020928820966171716, 'n_steps': 12, 'batch_size': 8, 'gamma': 0.9126279925827064, 'gae_lambda': 0.905877250798997, 'clip_range': 0.26359788195747047, 'ent_coef': 0.03936530820519544, 'vf_coef': 0.39542068266232544, 'max_grad_norm': 1.074160252909945, 'target_kl': 0.17065126009932555, 'n_epochs': 6, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 72.3     |
|    ep_rew_mean     | -198     |
| time/              |          |
|    fps             | 4602     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2560     |
---------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 82.7      |
|    ep_rew_mean          | -184      |
| time/                   |           |
|    fps                  | 3812      |
|    iterations           | 2         |
|    time_elapsed         | 1         |
|    total_timesteps      | 5120      |
| train/                  |           |
|    approx_kl            | 3307.6946 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.173     |
|    entropy_loss         | -2.81     |
|    explained_variance   | -0.0275   |
|    learning_rate        | 0.0504    |
|    loss                 | 1.46e+06  |
|    n_updates            | 1         |
|    policy_gradient_loss | 0.157     |
|    std                  | 0.97      |
|    value_loss           | 8.94e+05  |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 140       |
|    ep_rew_mean          | 116       |
| time/                   |           |
|    fps                  | 3395      |
|    iterations           | 3         |
|    time_elapsed         | 2         |
|    total_timesteps      | 7680      |
| train/                  |           |
|    approx_kl            | 1479.7421 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.173     |
|    entropy_loss         | -2.76     |
|    explained_variance   | -5.17     |
|    learning_rate        | 0.0504    |
|    loss                 | 5.43e+04  |
|    n_updates            | 2         |
|    policy_gradient_loss | 0.169     |
|    std                  | 0.953     |
|    value_loss           | 5.46e+05  |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 162       |
|    ep_rew_mean          | 380       |
| time/                   |           |
|    fps                  | 3040      |
|    iterations           | 4         |
|    time_elapsed         | 3         |
|    total_timesteps      | 10240     |
| train/                  |           |
|    approx_kl            | 1969.5583 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.173     |
|    entropy_loss         | -2.73     |
|    explained_variance   | -3.73     |
|    learning_rate        | 0.0504    |
|    loss                 | 2.5e+04   |
|    n_updates            | 3         |
|    policy_gradient_loss | 0.161     |
|    std                  | 0.941     |
|    value_loss           | 1.14e+05  |
---------------------------------------


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 178      |
|    ep_rew_mean          | 375      |
| time/                   |          |
|    fps                  | 2801     |
|    iterations           | 5        |
|    time_elapsed         | 4        |
|    total_timesteps      | 12800    |
| train/                  |          |
|    approx_kl            | 64299.35 |
|    clip_fraction        | 0.5      |
|    clip_range           | 0.173    |
|    entropy_loss         | -2.71    |
|    explained_variance   | -1.54    |
|    learning_rate        | 0.0504   |
|    loss                 | 1.05e+04 |
|    n_updates            | 4        |
|    policy_gradient_loss | 0.172    |
|    std                  | 0.934    |
|    value_loss           | 1.16e+04 |
--------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 199       |
|    ep_rew_mean          | 526       |
| time/                   |           |
|    fps                  | 2620      |
|    iterations           | 6         |
|    time_elapsed         | 5         |
|    total_timesteps      | 15360     |
| train/                  |           |
|    approx_kl            | 52773.395 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.173     |
|    entropy_loss         | -2.69     |
|    explained_variance   | -0.59     |
|    learning_rate        | 0.0504    |
|    loss                 | 2.98e+03  |
|    n_updates            | 5         |
|    policy_gradient_loss | 0.166     |
|    std                  | 0.925     |
|    value_loss           | 3.34e+03  |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 226       |
|    ep_rew_mean          | 726       |
| time/                   |           |
|    fps                  | 2471      |
|    iterations           | 7         |
|    time_elapsed         | 7         |
|    total_timesteps      | 17920     |
| train/                  |           |
|    approx_kl            | 161501.56 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.173     |
|    entropy_loss         | -2.67     |
|    explained_variance   | -0.531    |
|    learning_rate        | 0.0504    |
|    loss                 | 2.04e+03  |
|    n_updates            | 6         |
|    policy_gradient_loss | 0.177     |
|    std                  | 0.917     |
|    value_loss           | 2.8e+03   |
---------------------------------------


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 234      |
|    ep_rew_mean          | 748      |
| time/                   |          |
|    fps                  | 2351     |
|    iterations           | 8        |
|    time_elapsed         | 8        |
|    total_timesteps      | 20480    |
| train/                  |          |
|    approx_kl            | 445547.3 |
|    clip_fraction        | 0.5      |
|    clip_range           | 0.173    |
|    entropy_loss         | -2.66    |
|    explained_variance   | -0.139   |
|    learning_rate        | 0.0504   |
|    loss                 | 1.72e+03 |
|    n_updates            | 7        |
|    policy_gradient_loss | 0.185    |
|    std                  | 0.913    |
|    value_loss           | 2.17e+03 |
--------------------------------------


[I 2025-01-28 22:28:35,849] Trial 14 finished with value: -inf and parameters: {'learning_rate': 0.0503733645797028, 'n_steps': 8, 'batch_size': 7, 'gamma': 0.9332966345808937, 'gae_lambda': 0.9600033330755745, 'clip_range': 0.1726644069256619, 'ent_coef': 0.007018589283438836, 'vf_coef': 0.81603898118974, 'max_grad_norm': 1.5987553914976624, 'target_kl': 0.13433762533389507, 'n_epochs': 9, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 92       |
|    ep_rew_mean     | 42.8     |
| time/              |          |
|    fps             | 1857     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 10240    |
---------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 161       |
|    ep_rew_mean          | 352       |
| time/                   |           |
|    fps                  | 1478      |
|    iterations           | 2         |
|    time_elapsed         | 13        |
|    total_timesteps      | 20480     |
| train/                  |           |
|    approx_kl            | 109.90289 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.267     |
|    entropy_loss         | -2.85     |
|    explained_variance   | 0.0273    |
|    learning_rate        | 0.00743   |
|    loss                 | 1.96e+03  |
|    n_updates            | 1         |
|    policy_gradient_loss | 0.136     |
|    std                  | 1.01      |
|    value_loss           | 1.95e+03  |
---------------------------------------


[I 2025-01-28 22:28:49,874] Trial 15 finished with value: -inf and parameters: {'learning_rate': 0.007433524263906323, 'n_steps': 10, 'batch_size': 6, 'gamma': 0.9093866302240107, 'gae_lambda': 0.8719747651943934, 'clip_range': 0.26660143546032533, 'ent_coef': 0.08665788200540918, 'vf_coef': 0.5924936065197004, 'max_grad_norm': 2.6342005615442856, 'target_kl': 0.12128916875090284, 'n_epochs': 5, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 94.7     |
|    ep_rew_mean     | 129      |
| time/              |          |
|    fps             | 1500     |
|    iterations      | 1        |
|    time_elapsed    | 13       |
|    total_timesteps | 20480    |
---------------------------------


[I 2025-01-28 22:29:03,704] Trial 16 finished with value: -inf and parameters: {'learning_rate': 0.0005515425817187722, 'n_steps': 11, 'batch_size': 5, 'gamma': 0.9488285389302562, 'gae_lambda': 0.9064007200230252, 'clip_range': 0.21671243020293326, 'ent_coef': 0.0028103521645436503, 'vf_coef': 0.415785705684269, 'max_grad_norm': 0.9631026843681079, 'target_kl': 0.16009352548647532, 'n_epochs': 7, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 79.2     |
|    ep_rew_mean     | -111     |
| time/              |          |
|    fps             | 3358     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 5120     |
---------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 126       |
|    ep_rew_mean          | -13.5     |
| time/                   |           |
|    fps                  | 2796      |
|    iterations           | 2         |
|    time_elapsed         | 3         |
|    total_timesteps      | 10240     |
| train/                  |           |
|    approx_kl            | 21150.447 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.154     |
|    entropy_loss         | -2.84     |
|    explained_variance   | -0.103    |
|    learning_rate        | 0.0846    |
|    loss                 | 2.52e+05  |
|    n_updates            | 1         |
|    policy_gradient_loss | 0.147     |
|    std                  | 1.01      |
|    value_loss           | 1.48e+05  |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 161       |
|    ep_rew_mean          | 226       |
| time/                   |           |
|    fps                  | 2509      |
|    iterations           | 3         |
|    time_elapsed         | 6         |
|    total_timesteps      | 15360     |
| train/                  |           |
|    approx_kl            | 13200.338 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.154     |
|    entropy_loss         | -2.86     |
|    explained_variance   | 0.467     |
|    learning_rate        | 0.0846    |
|    loss                 | 4.41e+06  |
|    n_updates            | 2         |
|    policy_gradient_loss | 0.173     |
|    std                  | 1.02      |
|    value_loss           | 2.66e+06  |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 191       |
|    ep_rew_mean          | 395       |
| time/                   |           |
|    fps                  | 2305      |
|    iterations           | 4         |
|    time_elapsed         | 8         |
|    total_timesteps      | 20480     |
| train/                  |           |
|    approx_kl            | 17772.467 |
|    clip_fraction        | 0.5       |
|    clip_range           | 0.154     |
|    entropy_loss         | -2.86     |
|    explained_variance   | -1.38     |
|    learning_rate        | 0.0846    |
|    loss                 | 3.08e+06  |
|    n_updates            | 3         |
|    policy_gradient_loss | 0.173     |
|    std                  | 1.02      |
|    value_loss           | 2.12e+06  |
---------------------------------------


[I 2025-01-28 22:29:12,769] Trial 17 finished with value: -inf and parameters: {'learning_rate': 0.08459580193386712, 'n_steps': 9, 'batch_size': 8, 'gamma': 0.9274014274084127, 'gae_lambda': 0.8254504037644114, 'clip_range': 0.15392020731444667, 'ent_coef': 0.02187477497894267, 'vf_coef': 0.8521731608976866, 'max_grad_norm': 1.5734499152599477, 'target_kl': 0.18476504868067928, 'n_epochs': 4, 'activation_fn': 'relu', 'net_arch': 'small'}. Best is trial 0 with value: -inf.


Output()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 62       |
|    ep_rew_mean     | -171     |
| time/              |          |
|    fps             | 5905     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 1280     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 73.4         |
|    ep_rew_mean          | -208         |
| time/                   |              |
|    fps                  | 3949         |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 2560         |
| train/                  |              |
|    approx_kl            | 0.0027158451 |
|    clip_fraction        | 0.000868     |
|    clip_range           | 0.264        |
|    entropy_loss         | -2.84        |
|    explained_variance   | -0.0149      

[I 2025-01-28 22:29:22,861] Trial 18 finished with value: -inf and parameters: {'learning_rate': 5.5338463090962595e-05, 'n_steps': 7, 'batch_size': 7, 'gamma': 0.9150955958629363, 'gae_lambda': 0.8549745091981609, 'clip_range': 0.26372886978067545, 'ent_coef': 0.0011733094688339293, 'vf_coef': 0.19982286698045404, 'max_grad_norm': 2.3095745882769645, 'target_kl': 0.14265997220944474, 'n_epochs': 9, 'activation_fn': 'tanh', 'net_arch': 'tiny'}. Best is trial 0 with value: -inf.


Output()

Using cpu device


[W 2025-01-28 22:30:03,293] Trial 19 failed with parameters: {'learning_rate': 0.0007703603133968627, 'n_steps': 12, 'batch_size': 6, 'gamma': 0.9711060929398074, 'gae_lambda': 0.8344294028655276, 'clip_range': 0.21919981922330883, 'ent_coef': 0.04644086444532992, 'vf_coef': 0.6808189017981737, 'max_grad_norm': 0.639815129465893, 'target_kl': 0.17287832741566356, 'n_epochs': 7, 'activation_fn': 'relu', 'net_arch': 'small'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_2506/2069060311.py", line 38, in objective
    model.learn(N_TIMESTEPS, callback=eval_callback, progress_bar=True)
  File "/usr/local/lib/python3.11/dist-packages/stable_baselines3/ppo/ppo.py", line 311, in learn
    return super().learn(
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.