https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html#notes

In [2]:
import gymnasium as gym
import os
import csv
import datetime
import tensorboard
import mujoco
import pygame

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env


rollout/
ep_len_mean: Mean episode length (averaged over stats_window_size episodes, 100 by default)

ep_rew_mean: Mean episodic training reward (averaged over stats_window_size episodes, 100 by default), a Monitor wrapper is required to compute that value (automatically added by make_vec_env).

exploration_rate: Current value of the exploration rate when using DQN, it corresponds to the fraction of actions taken randomly (epsilon of the “epsilon-greedy” exploration)

success_rate: Mean success rate during training (averaged over stats_window_size episodes, 100 by default), you must pass an extra argument to the Monitor wrapper to log that value (info_keywords=("is_success",)) and provide info["is_success"]=True/False on the final step of the episode

time/
episodes: Total number of episodes

fps: Number of frames per seconds (includes time taken by gradient update)

iterations: Number of iterations (data collection + policy update for A2C/PPO)

time_elapsed: Time in seconds since the beginning of training

total_timesteps: Total number of timesteps (steps in the environments)

train/
actor_loss: Current value for the actor loss for off-policy algorithms

approx_kl: approximate mean KL divergence between old and new policy (for PPO), it is an estimation of how much changes happened in the update

clip_fraction: mean fraction of surrogate loss that was clipped (above clip_range threshold) for PPO.

clip_range: Current value of the clipping factor for the surrogate loss of PPO

critic_loss: Current value for the critic function loss for off-policy algorithms, usually error between value function output and TD(0), temporal difference estimate

ent_coef: Current value of the entropy coefficient (when using SAC)

ent_coef_loss: Current value of the entropy coefficient loss (when using SAC)

entropy_loss: Mean value of the entropy loss (negative of the average policy entropy)

explained_variance: Fraction of the return variance explained by the value function, see https://scikit-learn.org/stable/modules/model_evaluation.html#explained-variance-score (ev=0 => might as well have predicted zero, ev=1 => perfect prediction, ev<0 => worse than just predicting zero)

learning_rate: Current learning rate value

loss: Current total loss value

n_updates: Number of gradient updates applied so far

policy_gradient_loss: Current value of the policy gradient loss (its value does not have much meaning)

value_loss: Current value for the value function loss for on-policy algorithms, usually error between value function output and Monte-Carlo estimate (or TD(lambda) estimate)

std: Current standard deviation of the noise when using generalized State-Dependent Exploration (gSDE)

In [45]:
# wrap custom enviroment with changed reward strategies
class CustomMountainCarEnv(gym.Wrapper):       # Reward wird vergeben je höher car kommt
    def __init__(self, inheritance_env):
        super(CustomMountainCarEnv, self).__init__(inheritance_env)

    def step(self, action):

        observation, reward, terminated, truncated, info = self.env.step(action)    # "normale" step methode aufrufen
 
        x_position = observation[0]
        if x_position >= -0.5:
            exp = int((x_position + 0.5) * 10)  # Exponent berechnen (z.B. -0.4 -> 0, -0.3 -> 1, ..., 0,4 -> 8)
            reward = 2 ** exp
            #print("Reward:", reward)

        # elif x_position >= -0.6:                
        #     exp = int((-x_position - 0.6) * 10)  # Exponent berechnen (z.B. -0,6 -> 0, -0,7 -> 1, ..., -1,2 -> 6)
        #     reward = 2 ** exp

        return observation, reward, terminated, truncated, info

In [4]:
# Parallel environments
# vec_env = make_vec_env("CartPole-v1", n_envs=4)

#Single vectorized enviroment
#inheritance_env = gym.make("MountainCar-v0")
#inheritance_env = make_vec_env("MountainCarContinuous-v0")
#env = CustomMountainCarEnv(inheritance_env)
env = make_vec_env('Hopper-v4')

#create model and logging the callbacks
jetzt = datetime.datetime.now()
datum_uhrzeit = jetzt.strftime("%Y%m%d_%H%M%S")
savedir = f'model\\Baselines_Hopper_2000000EP{datum_uhrzeit}\\data'

os.makedirs('model', exist_ok=True)
os.makedirs(savedir, exist_ok=True)

log_dir = f"{savedir}\\log"

model = PPO("MlpPolicy", env, verbose=1, tensorboard_log= log_dir)    #verbose 0-> no output, verbose 1-> info messages, verbose 2-> debug messages

        
model.learn(total_timesteps=2000000) 
model.save(savedir)

Using cpu device
Logging to model\Baselines_Hopper_2000000EP20240105_211146\data\log\PPO_1


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 18.8     |
|    ep_rew_mean     | 14.3     |
| time/              |          |
|    fps             | 719      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 23.4        |
|    ep_rew_mean          | 22.1        |
| time/                   |             |
|    fps                  | 555         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015819628 |
|    clip_fraction        | 0.208       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.24       |
|    explained_variance   | 0.00978     |
|    learning_rate        | 0.

In [5]:
#del model # remove to demonstrate saving and loading
loaddir = './model/Baselines_Hopper_2000000EP20240105_211146/data.zip'
#loaddir = './ppo_cart_pole_2.zip'
model = PPO.load(loaddir)

env = make_vec_env('Hopper-v4')

render_episodes = 10
counter = 0
obs = env.reset()
running = True
pygame.init()

while running:
    for event in pygame.event.get():    #doesnt work with stable baselines
        if event.type == pygame.QUIT:
            running = False

    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render("human")

    if dones == True:
        counter += 1

    if counter > render_episodes:
        break

pygame.quit()
env.close()

KeyboardInterrupt: 

In [None]:
model = PPO.load("ppo_mountain_cart_v0")

print(model.rollout_buffer)

<stable_baselines3.common.buffers.RolloutBuffer object at 0x0000029000AD8B50>
