In [1]:
import wandb
import os
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
from env_V3_final_2 import CustomEnv  # Assuming CustomEnv is in this file
from stable_baselines3.common.monitor import Monitor
import tensorboard


# Authenticate with Weights & Biases (prompts for credentials if none are cached).
wandb.login()

# Open the W&B run for this experiment; sync_tensorboard=True asks W&B to
# mirror TensorBoard event files written during the run.
wandb.init(
    project="sb3_custom_env",
    sync_tensorboard=True,
)

# Training hyperparameters, set inline because this runs inside a notebook.
# "default" below refers to the Stable-Baselines3 PPO constructor defaults.
learning_rate = 5e-5  # Small step size for more stable gradient updates (default 3e-4)
batch_size = 32  # Minibatch size per gradient step (default 64)
n_steps = 2048  # Rollout length collected before each policy update (same as default)
n_epochs = 20  # Optimisation passes over each rollout (default 10)
ent_coef = 0.02  # Entropy bonus to encourage exploration (default 0.0)
gamma = 0.98  # Discount factor
episodes = 2  # Number of training phases executed by the training loop
iterations_per_episode = 20_000  # PPO iterations (rollout + update) per training phase

# Build the custom environment without rendering.
# NOTE(review): max_steps presumably caps the episode length at one rollout
# (the training log reports ep_len_mean ~= 2.05e+03) -- confirm in CustomEnv.
env = CustomEnv(render=False, max_steps=n_steps)

# PPO agent with an MLP policy, configured from the hyperparameters above.
# TensorBoard event files are written to a directory keyed by the W&B run id
# so that separate runs do not overwrite each other's logs.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    learning_rate=learning_rate,
    batch_size=batch_size,
    n_steps=n_steps,
    n_epochs=n_epochs,
    ent_coef=ent_coef,
    # `gamma` was defined with the other hyperparameters but never passed,
    # so PPO silently fell back to its library default discount factor.
    gamma=gamma,
    tensorboard_log=f"./ppo_custom_env_tensorboard/{wandb.run.id}/"
)


# Train in `episodes` consecutive phases, checkpointing the model after each.
#
# Timesteps per phase = PPO iterations per phase * rollout steps per iteration.
# The value does not change between phases, so it is computed once up front.
total_timesteps = iterations_per_episode * n_steps

try:
    for episode in range(1, episodes + 1):
        print(f"Starting episode {episode}/{episodes}")

        # reset_num_timesteps=False keeps the global timestep counter running
        # across phases instead of restarting it at zero on every learn() call.
        model.learn(
            total_timesteps=total_timesteps,
            reset_num_timesteps=False,
            tb_log_name=f"PPO_run_{wandb.run.id}_episode_{episode}",
        )

        # Save a checkpoint for this phase (SB3 appends the ".zip" extension).
        model.save(f"ppo_model_episode_{episode}")

        # Upload the checkpoint file to the W&B run.
        wandb.save(f"ppo_model_episode_{episode}.zip")
finally:
    # Always close the W&B run, including when training is interrupted
    # (e.g. KeyboardInterrupt from the notebook) or raises, so the run is
    # flushed and marked finished instead of being left open.
    wandb.finish()


2025-01-18 14:16:07.433036: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737206167.453140  295385 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737206167.459393  295385 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-18 14:16:07.507815: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
pybullet build time: Sep  3 2024 12:51:03
[34m[1mwandb[0m: Currently logged in as: [33m222855[0m ([33m222855-breda-univ

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Starting episode 1/2
Logging to ./ppo_custom_env_tensorboard/74cddbb0/PPO_run_74cddbb0_episode_1_0




----------------------------------
| rollout/           |           |
|    ep_len_mean     | 2.05e+03  |
|    ep_rew_mean     | -1.87e+03 |
| time/              |           |
|    fps             | 568       |
|    iterations      | 1         |
|    time_elapsed    | 3         |
|    total_timesteps | 2048      |
----------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 2.05e+03     |
|    ep_rew_mean          | -1.83e+03    |
| time/                   |              |
|    fps                  | 348          |
|    iterations           | 2            |
|    time_elapsed         | 11           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0040955488 |
|    clip_fraction        | 0.00544      |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.25        |
|    explained_variance   | 0.00698      |
|    

KeyboardInterrupt: 