In [1]:
from tensorflow.keras import backend as K
from tensorflow.keras import activations, initializers
from tensorflow.keras.layers import Layer

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
tfp.__version__

'0.8.0'

In [2]:
def bnn_extractor(flat_observations, net_arch, act_fun):
    """
    Constructs an variational layer that receives observations as an input and outputs a latent representation for the policy and
    a value network. The ``net_arch`` parameter allows to specify the amount and size of the hidden layers and how many
    of them are shared between the policy network and the value network. It is assumed to be a list with the following
    structure:
    1. An arbitrary length (zero allowed) number of integers each specifying the number of units in a shared layer.
       If the number of ints is zero, there will be no shared layers.
    2. An optional dict, to specify the following non-shared layers for the value network and the policy network.
       It is formatted like ``dict(vf=[<value layer sizes>], pi=[<policy layer sizes>])``.
       If it is missing any of the keys (pi or vf), no non-shared layers (empty list) is assumed.
    For example to construct a network with one shared layer of size 55 followed by two non-shared layers for the value
    network of size 255 and a single non-shared layer of size 128 for the policy network, the following layers_spec
    would be used: ``[55, dict(vf=[255, 255], pi=[128])]``. A simple shared network topology with two layers of size 128
    would be specified as [128, 128].
    :param flat_observations: (tf.Tensor) The observations to base policy and value function on.
    :param net_arch: ([int or dict]) The specification of the policy and value networks.
        See above for details on its formatting.
    :param act_fun: (tf function) The activation function to use for the networks.
    :return: (tf.Tensor, tf.Tensor) latent_policy, latent_value of the specified network.
        If all layers are shared, then ``latent_policy == latent_value``
    """
    latent = flat_observations
    policy_only_layers = []  # Layer sizes of the network that only belongs to the policy network
    value_only_layers = []  # Layer sizes of the network that only belongs to the value network
    kernel_divergence_fn=lambda q, p, _: tfp.distributions.kl_divergence(q, p)

    # Iterate through the shared layers and build the shared parts of the network
    for idx, layer in enumerate(net_arch):
        if isinstance(layer, int):  # Check that this is a shared layer
            layer_size = layer
#             latent = act_fun(linear(latent, "shared_fc{}".format(idx), layer_size, init_scale=np.sqrt(2)))
            latent = act_fun(tfp.layers.DenseFlipout(layer_size, activation = 'relu', kernel_divergence_fn=kernel_divergence_fn)(latent))
        else:
            assert isinstance(layer, dict), "Error: the net_arch list can only contain ints and dicts"
            if 'pi' in layer:
                assert isinstance(layer['pi'], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                policy_only_layers = layer['pi']

            if 'vf' in layer:
                assert isinstance(layer['vf'], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                value_only_layers = layer['vf']
            break  # From here on the network splits up in policy and value network

    # Build the non-shared part of the network
    latent_policy = latent
    latent_value = latent
    for idx, (pi_layer_size, vf_layer_size) in enumerate(zip_longest(policy_only_layers, value_only_layers)):
        if pi_layer_size is not None:
            assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
#             latent_policy = act_fun(linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2)))
            latent_policy = act_fun(tfp.layers.DenseFlipout(pi_layer_size, activation = 'relu', kernel_divergence_fn=kernel_divergence_fn)(latent))

        if vf_layer_size is not None:
            assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
#             latent_value = act_fun(linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2)))
            latent_value = act_fun(tfp.layers.DenseFlipout(vf_layer_size, activation = 'relu', kernel_divergence_fn=kernel_divergence_fn)(latent))

    return latent_policy, latent_value

In [3]:
from stable_baselines.common.policies import ActorCriticPolicy, nature_cnn

class FeedForwardPolicy(ActorCriticPolicy):
    """
    Policy object that implements actor critic, using a feed forward neural network.
    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param layers: ([int]) (deprecated, use net_arch instead) The size of the Neural network for the policy
        (if None, default to [64, 64])
    :param net_arch: (list) Specification of the actor-critic policy network architecture (see mlp_extractor
        documentation for details).
    :param act_fun: (tf.func) the activation function to use in the neural network.
    :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction
    :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp")
    :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None, net_arch=None,
                 act_fun=tf.tanh, cnn_extractor=nature_cnn, feature_extraction="cnn", **kwargs):
        super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse,
                                                scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        if layers is not None:
            warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                          "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn("The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                              DeprecationWarning)

        if net_arch is None:
            if layers is None:
                layers = [64, 64]
            net_arch = [dict(vf=layers, pi=layers)]

        with tf.variable_scope("model", reuse=reuse):
            if feature_extraction == "cnn":
                pi_latent = vf_latent = cnn_extractor(self.processed_obs, **kwargs)
            elif feature_extraction == "bnn":
                pi_latent, vf_latent = bnn_extractor(tf.layers.flatten(self.processed_obs), net_arch, act_fun)
            else:
                pi_latent, vf_latent = mlp_extractor(tf.layers.flatten(self.processed_obs), net_arch, act_fun)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._setup_init()

    def step(self, obs, state=None, mask=None, deterministic=False):
        if deterministic:
            action, value, neglogp = self.sess.run([self.deterministic_action, self.value_flat, self.neglogp],
                                                   {self.obs_ph: obs})
        else:
            action, value, neglogp = self.sess.run([self.action, self.value_flat, self.neglogp],
                                                   {self.obs_ph: obs})
        return action, value, self.initial_state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self.value_flat, {self.obs_ph: obs})

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
import warnings
from itertools import zip_longest
from abc import ABC, abstractmethod

import numpy as np
import tensorflow as tf
from gym.spaces import Discrete

from stable_baselines.common.tf_util import batch_to_seq, seq_to_batch
from stable_baselines.common.tf_layers import conv, linear, conv_to_fc, lstm
from stable_baselines.common.distributions import make_proba_dist_type, CategoricalProbabilityDistribution, \
    MultiCategoricalProbabilityDistribution, DiagGaussianProbabilityDistribution, BernoulliProbabilityDistribution
from stable_baselines.common.input import observation_input
from stable_baselines.common.policies import nature_cnn

In [5]:
class BnnPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a Bayesian neural net (2 layers of 64)
    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(BnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                        feature_extraction="bnn", **_kwargs)

# DNN Cartpole

In [8]:
import os
import gym

from stable_baselines.ppo1 import PPO1
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import logger
from stable_baselines.common.callbacks import EvalCallback

NUM_TIMESTEPS = int(1e4)
SEED = 722
EVAL_FREQ = 250000
EVAL_EPISODES = 10  # was 1000

LOGDIR = "dnn_acrobot" # moved to zoo afterwards.
logger.configure(folder=LOGDIR)

env = gym.make("Acrobot-v1")
env.seed(SEED)

Logging to dnn_acrobot


[722]

In [9]:
# take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps.)
dnn = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                 optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)

eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

dnn.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

dnn.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

env.close()





Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.










Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
********** Iteration 0 ************


  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
      0.00051 |       0.00000 |     240.78682 |       0.00021 |       1.09840
     -0.00028 |       0.00000 |     178.88135 |       0.00060 |       1.09799
     -0.00096 |       0.00000 |     112.32079 |       0.00164 |       1.09695
     -0.00157 |       0.00000 |      64.52588 |       0.00269 |       1.09588
     -0.00190 |       0.00000 |      37.71918 |       0.00356 |       1.09502
     -0.00212 |       0.00000 |      22.85038 |       0.00337 |       1.09522
     -0.00244 |       0.00000 |      14.43047 |       0.00381 |       1.09480
     -0.00302 |       0.00000 |       9.68169 |       0.00420 |       1.09440
     -0.00325 |       0.00000 |       7.02277 |       0.00432 |       1.09429
     -0.00325 |       0.00000 |       5.57533 |       0.00461 |       1.09402
Evaluating losses...
     -0.00424 |       0.00000 |       5.10339 |       0.00407 |       1.09453
-----------------------------

# BNN Cartpole

In [10]:
NUM_TIMESTEPS = int(1e4)
SEED = 722
EVAL_FREQ = 250000
EVAL_EPISODES = 10  # was 1000

LOGDIR = "bnn_acrobot" # moved to zoo afterwards.
logger.configure(folder=LOGDIR)

env = gym.make("Acrobot-v1")
env.seed(SEED)

Logging to bnn_acrobot


[722]

In [None]:
# take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps.)
bnn = PPO1(BnnPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                 optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)

eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

bnn.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

bnn.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

env.close()

Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
********** Iteration 0 ************


  "{} != {}".format(self.training_env, self.eval_env))


# Performance

In [10]:
from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(dnn, dnn.get_env(), n_eval_episodes=1000)
print(f"DNN - Mean reward: {mean_reward}, Std reward: {std_reward}")

mean_reward, std_reward = evaluate_policy(bnn, bnn.get_env(), n_eval_episodes=1000)
print(f"BNN - Mean reward: {mean_reward}, Std reward: {std_reward}")

DNN - Mean reward: 187.59, Std reward: 20.69627744305724
BNN - Mean reward: 172.414, Std reward: 27.614210182440488


In [57]:
from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(dnn, dnn.get_env(), n_eval_episodes=1000)
print(f"DNN - Mean reward: {mean_reward}, Std reward: {std_reward}")

mean_reward, std_reward = evaluate_policy(bnn, bnn.get_env(), n_eval_episodes=1000)
print(f"BNN - Mean reward: {mean_reward}, Std reward: {std_reward}")

DNN - Mean reward: 175.151, Std reward: 38.99818199608797
BNN - Mean reward: 164.046, Std reward: 45.66565321989822


In [62]:
from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(dnn, dnn.get_env(), n_eval_episodes=1000)
print(f"DNN - Mean reward: {mean_reward}, Std reward: {std_reward}")

mean_reward, std_reward = evaluate_policy(bnn, bnn.get_env(), n_eval_episodes=1000)
print(f"BNN - Mean reward: {mean_reward}, Std reward: {std_reward}")

DNN - Mean reward: 156.935, Std reward: 48.66798511341928
BNN - Mean reward: 189.304, Std reward: 18.416991719605022


In [67]:
from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(dnn, dnn.get_env(), n_eval_episodes=1000)
print(f"DNN - Mean reward: {mean_reward}, Std reward: {std_reward}")

mean_reward, std_reward = evaluate_policy(bnn, bnn.get_env(), n_eval_episodes=1000)
print(f"BNN - Mean reward: {mean_reward}, Std reward: {std_reward}")

DNN - Mean reward: 161.503, Std reward: 47.323144348193935
BNN - Mean reward: 178.989, Std reward: 32.992982268961384


# Evalutaion

There are a few possible ways we can take this:
1. Small number of iterations on CartPole, to see which model performs better with less data.
2. Run experiment on Acrobot, to hopefully see BNN > DNN.
3. Hide 1 out of every N frames from the learning agent (self-play or single-agent), to hopefully see BNN > DNN.

For all of the above, run a hundred different initialisations using different random seeds, and evaluate using at least 1K episodes (the episodes have extremely high variance even with the same random seed)

In [18]:
import pandas as pd

#To store experiment results
df = pd.DataFrame(columns = ['Seed', 'BNN reward', 'BNN std', 'DNN reward', 'DNN std']) 

df.head()

Unnamed: 0,Seed,BNN reward,BNN std,DNN reward,DNN std


In [None]:
#Evaluation method one

import os
import gym

from stable_baselines.ppo1 import PPO1
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import logger
from stable_baselines.common.callbacks import EvalCallback
from stable_baselines.common.evaluation import evaluate_policy

NUM_TIMESTEPS = int(1e4)
SEED = 722
EVAL_FREQ = 250000
EVAL_EPISODES = 10  # was 1000

for SEED in range (600, 701):
    LOGDIR = "dnn_cartpole" # moved to zoo afterwards.
    logger.configure(folder=LOGDIR)
    
    env = gym.make("CartPole-v0")
    env.seed(SEED)

    # take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps.)
    dnn = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                 optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)

    eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

    dnn.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    dnn.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

    env.close()
    
    LOGDIR = "bnn_cartpole" # moved to zoo afterwards.
    logger.configure(folder=LOGDIR)

    env = gym.make("CartPole-v0")
    env.seed(SEED)
    
    # take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps.)
    bnn = PPO1(BnnPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                 optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)

    eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

    bnn.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    bnn.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

    env.close()
    
    mean_rewardDNN, std_rewardDNN = evaluate_policy(dnn, dnn.get_env(), n_eval_episodes=1000)
    print(f"DNN - Mean reward: {mean_rewardDNN}, Std reward: {std_rewardDNN}")

    mean_rewardBNN, std_rewardBNN = evaluate_policy(bnn, bnn.get_env(), n_eval_episodes=1000)
    print(f"BNN - Mean reward: {mean_rewardBNN}, Std reward: {std_rewardBNN}")
    
    data = {
    "Seed" : SEED,
    "BNN reward" : mean_rewardBNN,
    "BNN std" : std_rewardBNN,
    "DNN reward" : mean_rewardDNN,
    "DNN std" : std_rewardDNN
    }

    df = df.append(data, ignore_index=True)

    

Logging to dnn_cartpole
********** Iteration 0 ************


  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01104 |       0.00000 |      83.31959 |       0.00455 |       0.68846
     -0.01737 |       0.00000 |      70.39660 |       0.00789 |       0.68506
     -0.01893 |       0.00000 |      44.48908 |       0.00881 |       0.68414
     -0.02156 |       0.00000 |      22.42776 |       0.00901 |       0.68393
     -0.02429 |       0.00000 |      15.77996 |       0.01042 |       0.68252
     -0.02641 |       0.00000 |      14.44719 |       0.01168 |       0.68127
     -0.02701 |       0.00000 |      13.38486 |       0.01305 |       0.67991
     -0.02805 |       0.00000 |      12.24819 |       0.01337 |       0.67960
     -0.02847 |       0.00000 |      11.23798 |       0.01376 |       0.67921
     -0.02903 |       0.00000 |      10.48966 |       0.01386 |       0.67911
Evaluating losses...
     -0.02881 |       0.00000 |      10.17848 |       0.01603 |       0.67697
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00168 |       0.00000 |      93.04918 |       0.00010 |       0.69305
     -0.00627 |       0.00000 |      90.33788 |       0.00093 |       0.69221
     -0.01293 |       0.00000 |      87.07071 |       0.00436 |       0.68881
     -0.01600 |       0.00000 |      83.06079 |       0.00770 |       0.68553
     -0.01655 |       0.00000 |      78.25071 |       0.00884 |       0.68440
     -0.01734 |       0.00000 |      72.83126 |       0.00866 |       0.68458
     -0.01846 |       0.00000 |      67.07480 |       0.00882 |       0.68442
     -0.01998 |       0.00000 |      61.22139 |       0.00917 |       0.68406
     -0.02124 |       0.00000 |      55.44053 |       0.00942 |       0.68382
     -0.02289 |       0.00000 |      49.86128 |       0.01027 |       0.68297
Evaluating losses...
     -0.02381 |       0.00000 |      47.13252 |       0.01006 |       0.68318
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00831 |       0.00000 |      79.21246 |       0.00347 |       0.68986
     -0.01438 |       0.00000 |      66.12856 |       0.00804 |       0.68546
     -0.01619 |       0.00000 |      40.38298 |       0.00849 |       0.68502
     -0.01907 |       0.00000 |      20.21524 |       0.00905 |       0.68447
     -0.02272 |       0.00000 |      14.85998 |       0.01055 |       0.68300
     -0.02603 |       0.00000 |      13.72186 |       0.01193 |       0.68165
     -0.02789 |       0.00000 |      12.59174 |       0.01298 |       0.68062
     -0.02908 |       0.00000 |      11.32942 |       0.01344 |       0.68016
     -0.02968 |       0.00000 |      10.20894 |       0.01349 |       0.68011
     -0.02997 |       0.00000 |       9.40086 |       0.01393 |       0.67968
Evaluating losses...
     -0.03028 |       0.00000 |       9.09371 |       0.01264 |       0.68092
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00132 |       0.00000 |      87.21719 |       0.00013 |       0.69301
     -0.00562 |       0.00000 |      84.53976 |       0.00078 |       0.69235
     -0.01187 |       0.00000 |      81.48814 |       0.00370 |       0.68942
     -0.01646 |       0.00000 |      77.90315 |       0.00719 |       0.68598
     -0.01748 |       0.00000 |      73.66091 |       0.00859 |       0.68460
     -0.01823 |       0.00000 |      68.84760 |       0.00906 |       0.68413
     -0.01973 |       0.00000 |      63.66901 |       0.00900 |       0.68419
     -0.02108 |       0.00000 |      58.23838 |       0.00932 |       0.68387
     -0.02271 |       0.00000 |      52.80296 |       0.01061 |       0.68259
     -0.02435 |       0.00000 |      47.55459 |       0.01050 |       0.68270
Evaluating losses...
     -0.02501 |       0.00000 |      44.98935 |       0.01096 |       0.68224
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00671 |       0.00000 |      94.10288 |       0.00339 |       0.68969
     -0.01539 |       0.00000 |      80.57233 |       0.00742 |       0.68564
     -0.01633 |       0.00000 |      52.34605 |       0.00850 |       0.68456
     -0.01884 |       0.00000 |      26.87642 |       0.00842 |       0.68463
     -0.02208 |       0.00000 |      18.23467 |       0.00916 |       0.68389
     -0.02509 |       0.00000 |      16.41891 |       0.01111 |       0.68196
     -0.02714 |       0.00000 |      15.13486 |       0.01211 |       0.68097
     -0.02833 |       0.00000 |      13.73058 |       0.01296 |       0.68013
     -0.02925 |       0.00000 |      12.51557 |       0.01339 |       0.67972
     -0.02970 |       0.00000 |      11.65894 |       0.01338 |       0.67972
Evaluating losses...
     -0.03014 |       0.00000 |      11.33321 |       0.01316 |       0.67995
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00187 |       0.00000 |      78.97748 |       0.00016 |       0.69300
     -0.00719 |       0.00000 |      76.34558 |       0.00168 |       0.69152
     -0.01200 |       0.00000 |      73.13460 |       0.00499 |       0.68827
     -0.01538 |       0.00000 |      69.28011 |       0.00683 |       0.68646
     -0.01724 |       0.00000 |      64.83208 |       0.00834 |       0.68497
     -0.01816 |       0.00000 |      60.01022 |       0.00901 |       0.68431
     -0.01982 |       0.00000 |      54.93562 |       0.00914 |       0.68418
     -0.02103 |       0.00000 |      49.77633 |       0.00935 |       0.68397
     -0.02244 |       0.00000 |      44.69564 |       0.01034 |       0.68300
     -0.02365 |       0.00000 |      39.85924 |       0.01059 |       0.68275
Evaluating losses...
     -0.02441 |       0.00000 |      37.56202 |       0.00991 |       0.68341
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00836 |       0.00000 |      82.48325 |       0.00325 |       0.68983
     -0.01741 |       0.00000 |      69.66510 |       0.00836 |       0.68468
     -0.01912 |       0.00000 |      44.09813 |       0.00838 |       0.68467
     -0.02180 |       0.00000 |      22.33900 |       0.00954 |       0.68351
     -0.02466 |       0.00000 |      15.73257 |       0.01038 |       0.68267
     -0.02665 |       0.00000 |      14.31156 |       0.01175 |       0.68132
     -0.02806 |       0.00000 |      13.07666 |       0.01257 |       0.68051
     -0.02916 |       0.00000 |      11.81603 |       0.01315 |       0.67993
     -0.02978 |       0.00000 |      10.80821 |       0.01342 |       0.67967
     -0.03067 |       0.00000 |      10.15432 |       0.01374 |       0.67935
Evaluating losses...
     -0.03100 |       0.00000 |       9.89902 |       0.01457 |       0.67854
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00109 |       0.00000 |      93.84705 |      5.48e-05 |       0.69309
     -0.00466 |       0.00000 |      91.07639 |       0.00067 |       0.69247
     -0.01052 |       0.00000 |      87.49888 |       0.00304 |       0.69012
     -0.01484 |       0.00000 |      83.00015 |       0.00710 |       0.68612
     -0.01624 |       0.00000 |      77.72357 |       0.00820 |       0.68505
     -0.01691 |       0.00000 |      71.82268 |       0.00916 |       0.68411
     -0.01785 |       0.00000 |      65.54908 |       0.00928 |       0.68399
     -0.01912 |       0.00000 |      59.17629 |       0.00895 |       0.68430
     -0.02073 |       0.00000 |      52.93835 |       0.00953 |       0.68373
     -0.02251 |       0.00000 |      47.01973 |       0.01025 |       0.68301
Evaluating losses...
     -0.02325 |       0.00000 |      44.14791 |       0.01060 |       0.68267
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00830 |       0.00000 |      89.89194 |       0.00344 |       0.68954
     -0.01444 |       0.00000 |      76.85749 |       0.00797 |       0.68491
     -0.01585 |       0.00000 |      49.78482 |       0.00822 |       0.68466
     -0.01832 |       0.00000 |      25.55328 |       0.00842 |       0.68445
     -0.02140 |       0.00000 |      17.54012 |       0.00982 |       0.68306
     -0.02394 |       0.00000 |      15.83537 |       0.01154 |       0.68134
     -0.02544 |       0.00000 |      14.60463 |       0.01247 |       0.68042
     -0.02647 |       0.00000 |      13.25350 |       0.01312 |       0.67978
     -0.02721 |       0.00000 |      12.05355 |       0.01314 |       0.67977
     -0.02758 |       0.00000 |      11.20622 |       0.01381 |       0.67911
Evaluating losses...
     -0.02831 |       0.00000 |      10.84922 |       0.01375 |       0.67916
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00181 |       0.00000 |      83.62863 |      9.11e-05 |       0.69306
     -0.00788 |       0.00000 |      80.76478 |       0.00118 |       0.69198
     -0.01574 |       0.00000 |      77.57241 |       0.00505 |       0.68817
     -0.01905 |       0.00000 |      73.67188 |       0.00844 |       0.68485
     -0.01968 |       0.00000 |      69.07552 |       0.00966 |       0.68366
     -0.02038 |       0.00000 |      63.94521 |       0.00951 |       0.68380
     -0.02177 |       0.00000 |      58.56540 |       0.00934 |       0.68396
     -0.02322 |       0.00000 |      53.11386 |       0.00960 |       0.68369
     -0.02469 |       0.00000 |      47.79222 |       0.01014 |       0.68317
     -0.02634 |       0.00000 |      42.68106 |       0.01060 |       0.68270
Evaluating losses...
     -0.02664 |       0.00000 |      40.23510 |       0.01201 |       0.68133
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00947 |       0.00000 |      91.01717 |       0.00335 |       0.68990
     -0.01518 |       0.00000 |      78.08060 |       0.00834 |       0.68505
     -0.01679 |       0.00000 |      50.77236 |       0.00799 |       0.68539
     -0.01871 |       0.00000 |      25.76315 |       0.00836 |       0.68501
     -0.02195 |       0.00000 |      17.41224 |       0.00922 |       0.68416
     -0.02524 |       0.00000 |      15.76602 |       0.01113 |       0.68229
     -0.02751 |       0.00000 |      14.62425 |       0.01233 |       0.68110
     -0.02904 |       0.00000 |      13.36506 |       0.01297 |       0.68048
     -0.02991 |       0.00000 |      12.22252 |       0.01364 |       0.67982
     -0.03033 |       0.00000 |      11.36738 |       0.01401 |       0.67946
Evaluating losses...
     -0.03077 |       0.00000 |      11.03735 |       0.01286 |       0.68058
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00084 |       0.00000 |      88.45740 |      3.80e-05 |       0.69311
     -0.00350 |       0.00000 |      85.66366 |       0.00047 |       0.69269
     -0.00841 |       0.00000 |      82.34449 |       0.00199 |       0.69120
     -0.01374 |       0.00000 |      78.22843 |       0.00562 |       0.68761
     -0.01595 |       0.00000 |      73.41665 |       0.00788 |       0.68539
     -0.01642 |       0.00000 |      68.12473 |       0.00821 |       0.68507
     -0.01737 |       0.00000 |      62.56273 |       0.00889 |       0.68440
     -0.01863 |       0.00000 |      56.90676 |       0.00847 |       0.68480
     -0.02030 |       0.00000 |      51.30700 |       0.00920 |       0.68408
     -0.02165 |       0.00000 |      45.88834 |       0.01019 |       0.68310
Evaluating losses...
     -0.02282 |       0.00000 |      43.20287 |       0.00991 |       0.68337
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01049 |       0.00000 |      84.72376 |       0.00474 |       0.68845
     -0.01652 |       0.00000 |      72.21685 |       0.00752 |       0.68569
     -0.01829 |       0.00000 |      45.91370 |       0.00810 |       0.68512
     -0.02074 |       0.00000 |      22.95880 |       0.00871 |       0.68451
     -0.02307 |       0.00000 |      15.91994 |       0.01007 |       0.68317
     -0.02547 |       0.00000 |      14.46792 |       0.01144 |       0.68182
     -0.02702 |       0.00000 |      13.24975 |       0.01234 |       0.68093
     -0.02818 |       0.00000 |      11.92383 |       0.01304 |       0.68024
     -0.02865 |       0.00000 |      10.84656 |       0.01327 |       0.68001
     -0.02935 |       0.00000 |      10.09756 |       0.01394 |       0.67935
Evaluating losses...
     -0.02966 |       0.00000 |       9.82160 |       0.01307 |       0.68020
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00146 |       0.00000 |      94.97602 |       0.00016 |       0.69298
     -0.00641 |       0.00000 |      91.94418 |       0.00143 |       0.69170
     -0.01261 |       0.00000 |      88.45521 |       0.00508 |       0.68808
     -0.01593 |       0.00000 |      84.28053 |       0.00725 |       0.68594
     -0.01716 |       0.00000 |      79.40884 |       0.00867 |       0.68455
     -0.01809 |       0.00000 |      73.93264 |       0.00905 |       0.68418
     -0.01920 |       0.00000 |      68.00504 |       0.00899 |       0.68422
     -0.02029 |       0.00000 |      61.88317 |       0.00894 |       0.68427
     -0.02173 |       0.00000 |      55.75229 |       0.00956 |       0.68365
     -0.02298 |       0.00000 |      49.82974 |       0.01030 |       0.68293
Evaluating losses...
     -0.02372 |       0.00000 |      46.92803 |       0.01021 |       0.68301
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01119 |       0.00000 |      84.16708 |       0.00454 |       0.68853
     -0.01684 |       0.00000 |      71.53416 |       0.00859 |       0.68445
     -0.01826 |       0.00000 |      45.59304 |       0.00793 |       0.68510
     -0.02046 |       0.00000 |      23.20214 |       0.00905 |       0.68397
     -0.02324 |       0.00000 |      16.10640 |       0.01034 |       0.68269
     -0.02533 |       0.00000 |      14.66428 |       0.01183 |       0.68121
     -0.02650 |       0.00000 |      13.65542 |       0.01275 |       0.68030
     -0.02693 |       0.00000 |      12.51706 |       0.01357 |       0.67949
     -0.02791 |       0.00000 |      11.44582 |       0.01368 |       0.67939
     -0.02827 |       0.00000 |      10.62386 |       0.01396 |       0.67911
Evaluating losses...
     -0.02869 |       0.00000 |      10.28435 |       0.01323 |       0.67983
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00105 |       0.00000 |      89.86337 |      8.34e-05 |       0.69306
     -0.00442 |       0.00000 |      87.17561 |       0.00063 |       0.69250
     -0.00989 |       0.00000 |      84.04006 |       0.00317 |       0.68996
     -0.01414 |       0.00000 |      80.20712 |       0.00705 |       0.68613
     -0.01529 |       0.00000 |      75.60359 |       0.00877 |       0.68443
     -0.01645 |       0.00000 |      70.33739 |       0.00841 |       0.68478
     -0.01776 |       0.00000 |      64.72981 |       0.00922 |       0.68399
     -0.01963 |       0.00000 |      58.98057 |       0.00925 |       0.68395
     -0.02112 |       0.00000 |      53.25884 |       0.00995 |       0.68325
     -0.02275 |       0.00000 |      47.78210 |       0.01040 |       0.68280
Evaluating losses...
     -0.02328 |       0.00000 |      45.12704 |       0.01022 |       0.68299
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00874 |       0.00000 |      78.49369 |       0.00418 |       0.68915
     -0.01477 |       0.00000 |      65.76054 |       0.00798 |       0.68548
     -0.01640 |       0.00000 |      40.64634 |       0.00852 |       0.68495
     -0.01875 |       0.00000 |      20.66547 |       0.00920 |       0.68428
     -0.02115 |       0.00000 |      15.23471 |       0.01053 |       0.68298
     -0.02359 |       0.00000 |      14.01829 |       0.01142 |       0.68211
     -0.02513 |       0.00000 |      12.85858 |       0.01249 |       0.68106
     -0.02631 |       0.00000 |      11.68509 |       0.01299 |       0.68057
     -0.02688 |       0.00000 |      10.71957 |       0.01346 |       0.68011
     -0.02676 |       0.00000 |      10.05666 |       0.01396 |       0.67962
Evaluating losses...
     -0.02697 |       0.00000 |       9.78788 |       0.01571 |       0.67793
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00169 |       0.00000 |      92.45570 |       0.00012 |       0.69302
     -0.00686 |       0.00000 |      89.53152 |       0.00109 |       0.69203
     -0.01417 |       0.00000 |      86.03674 |       0.00474 |       0.68839
     -0.01663 |       0.00000 |      81.70354 |       0.00846 |       0.68472
     -0.01732 |       0.00000 |      76.46985 |       0.00837 |       0.68481
     -0.01792 |       0.00000 |      70.61603 |       0.00936 |       0.68383
     -0.01834 |       0.00000 |      64.45230 |       0.00882 |       0.68436
     -0.01909 |       0.00000 |      58.12881 |       0.00890 |       0.68428
     -0.01981 |       0.00000 |      51.88945 |       0.00896 |       0.68422
     -0.02041 |       0.00000 |      45.97902 |       0.00939 |       0.68380
Evaluating losses...
     -0.02100 |       0.00000 |      43.07516 |       0.00932 |       0.68387
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00939 |       0.00000 |      87.66176 |       0.00479 |       0.68843
     -0.01704 |       0.00000 |      75.13103 |       0.00770 |       0.68556
     -0.01878 |       0.00000 |      48.78662 |       0.00896 |       0.68432
     -0.02244 |       0.00000 |      24.76236 |       0.00918 |       0.68409
     -0.02576 |       0.00000 |      16.77304 |       0.01080 |       0.68249
     -0.02873 |       0.00000 |      15.08194 |       0.01185 |       0.68146
     -0.03060 |       0.00000 |      13.68200 |       0.01280 |       0.68053
     -0.03184 |       0.00000 |      12.16458 |       0.01333 |       0.68000
     -0.03254 |       0.00000 |      10.88465 |       0.01376 |       0.67959
     -0.03256 |       0.00000 |      10.03540 |       0.01401 |       0.67935
Evaluating losses...
     -0.03324 |       0.00000 |       9.71363 |       0.01323 |       0.68010
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00129 |       0.00000 |      94.78271 |       0.00010 |       0.69306
     -0.00576 |       0.00000 |      92.21814 |       0.00094 |       0.69226
     -0.01291 |       0.00000 |      89.11326 |       0.00419 |       0.68909
     -0.01740 |       0.00000 |      85.14893 |       0.00791 |       0.68546
     -0.01848 |       0.00000 |      80.29318 |       0.00841 |       0.68496
     -0.01940 |       0.00000 |      74.80145 |       0.00937 |       0.68404
     -0.02026 |       0.00000 |      68.84944 |       0.00920 |       0.68419
     -0.02169 |       0.00000 |      62.61634 |       0.00941 |       0.68398
     -0.02340 |       0.00000 |      56.40451 |       0.00962 |       0.68377
     -0.02546 |       0.00000 |      50.41111 |       0.01033 |       0.68307
Evaluating losses...
     -0.02615 |       0.00000 |      47.49701 |       0.01062 |       0.68279
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01037 |       0.00000 |      89.14943 |       0.00388 |       0.68914
     -0.01648 |       0.00000 |      75.54720 |       0.00856 |       0.68442
     -0.01811 |       0.00000 |      47.68260 |       0.00838 |       0.68459
     -0.02100 |       0.00000 |      24.23354 |       0.00880 |       0.68417
     -0.02422 |       0.00000 |      17.25403 |       0.01009 |       0.68288
     -0.02669 |       0.00000 |      15.78735 |       0.01172 |       0.68126
     -0.02836 |       0.00000 |      14.57989 |       0.01285 |       0.68015
     -0.02926 |       0.00000 |      13.33678 |       0.01369 |       0.67932
     -0.02976 |       0.00000 |      12.24130 |       0.01371 |       0.67930
     -0.03025 |       0.00000 |      11.45214 |       0.01421 |       0.67880
Evaluating losses...
     -0.03077 |       0.00000 |      11.13293 |       0.01366 |       0.67935
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00185 |       0.00000 |      89.27676 |       0.00011 |       0.69303
     -0.00732 |       0.00000 |      86.74909 |       0.00110 |       0.69204
     -0.01486 |       0.00000 |      83.65908 |       0.00479 |       0.68838
     -0.01828 |       0.00000 |      79.74551 |       0.00794 |       0.68528
     -0.01912 |       0.00000 |      75.02397 |       0.00908 |       0.68416
     -0.02009 |       0.00000 |      69.66121 |       0.00944 |       0.68380
     -0.02099 |       0.00000 |      63.88931 |       0.00905 |       0.68418
     -0.02261 |       0.00000 |      57.94059 |       0.00946 |       0.68377
     -0.02404 |       0.00000 |      52.03770 |       0.01025 |       0.68299
     -0.02550 |       0.00000 |      46.08271 |       0.01064 |       0.68260
Evaluating losses...
     -0.02575 |       0.00000 |      43.10250 |       0.01103 |       0.68222
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01254 |       0.00000 |      94.58330 |       0.00504 |       0.68790
     -0.01946 |       0.00000 |      81.21828 |       0.00853 |       0.68432
     -0.02144 |       0.00000 |      53.05435 |       0.00840 |       0.68445
     -0.02462 |       0.00000 |      27.59734 |       0.00924 |       0.68361
     -0.02854 |       0.00000 |      18.96503 |       0.01070 |       0.68215
     -0.03107 |       0.00000 |      17.11581 |       0.01204 |       0.68082
     -0.03218 |       0.00000 |      15.84052 |       0.01326 |       0.67961
     -0.03313 |       0.00000 |      14.42817 |       0.01367 |       0.67921
     -0.03364 |       0.00000 |      13.15708 |       0.01385 |       0.67903
     -0.03370 |       0.00000 |      12.19933 |       0.01438 |       0.67851
Evaluating losses...
     -0.03426 |       0.00000 |      11.81378 |       0.01279 |       0.68009
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00154 |       0.00000 |      88.14616 |      9.86e-05 |       0.69304
     -0.00701 |       0.00000 |      85.24597 |       0.00106 |       0.69206
     -0.01451 |       0.00000 |      81.92001 |       0.00506 |       0.68809
     -0.01755 |       0.00000 |      78.09380 |       0.00834 |       0.68485
     -0.01812 |       0.00000 |      73.71841 |       0.00889 |       0.68430
     -0.01890 |       0.00000 |      68.74558 |       0.00940 |       0.68380
     -0.02024 |       0.00000 |      63.35691 |       0.00872 |       0.68446
     -0.02121 |       0.00000 |      57.71946 |       0.00941 |       0.68378
     -0.02250 |       0.00000 |      52.02748 |       0.01003 |       0.68316
     -0.02392 |       0.00000 |      46.53712 |       0.01026 |       0.68293
Evaluating losses...
     -0.02465 |       0.00000 |      43.83368 |       0.01036 |       0.68283
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01066 |       0.00000 |      82.37625 |       0.00472 |       0.68839
     -0.01705 |       0.00000 |      68.95894 |       0.00829 |       0.68481
     -0.01867 |       0.00000 |      42.63371 |       0.00834 |       0.68475
     -0.02140 |       0.00000 |      21.49182 |       0.00909 |       0.68400
     -0.02489 |       0.00000 |      15.36208 |       0.01083 |       0.68228
     -0.02802 |       0.00000 |      13.90825 |       0.01227 |       0.68085
     -0.02975 |       0.00000 |      12.51083 |       0.01303 |       0.68009
     -0.03047 |       0.00000 |      11.10016 |       0.01360 |       0.67954
     -0.03041 |       0.00000 |      10.01041 |       0.01426 |       0.67889
     -0.03118 |       0.00000 |       9.32500 |       0.01396 |       0.67919
Evaluating losses...
     -0.03147 |       0.00000 |       9.07469 |       0.01483 |       0.67833
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00117 |       0.00000 |      87.40546 |      6.97e-05 |       0.69308
     -0.00526 |       0.00000 |      84.68251 |       0.00071 |       0.69245
     -0.01073 |       0.00000 |      81.19988 |       0.00309 |       0.69011
     -0.01451 |       0.00000 |      76.74495 |       0.00650 |       0.68672
     -0.01619 |       0.00000 |      71.45879 |       0.00864 |       0.68461
     -0.01692 |       0.00000 |      65.63575 |       0.00896 |       0.68428
     -0.01787 |       0.00000 |      59.54114 |       0.00891 |       0.68433
     -0.01944 |       0.00000 |      53.48527 |       0.00943 |       0.68380
     -0.02084 |       0.00000 |      47.66557 |       0.00976 |       0.68348
     -0.02241 |       0.00000 |      42.21445 |       0.01047 |       0.68278
Evaluating losses...
     -0.02315 |       0.00000 |      39.55385 |       0.01077 |       0.68248
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00965 |       0.00000 |      89.17068 |       0.00385 |       0.68928
     -0.01603 |       0.00000 |      75.39967 |       0.00795 |       0.68519
     -0.01742 |       0.00000 |      47.85702 |       0.00846 |       0.68469
     -0.01912 |       0.00000 |      24.63680 |       0.00887 |       0.68427
     -0.02253 |       0.00000 |      17.43161 |       0.00991 |       0.68323
     -0.02578 |       0.00000 |      15.91979 |       0.01167 |       0.68149
     -0.02795 |       0.00000 |      14.72400 |       0.01299 |       0.68018
     -0.02880 |       0.00000 |      13.37384 |       0.01340 |       0.67978
     -0.02966 |       0.00000 |      12.16254 |       0.01391 |       0.67928
     -0.03012 |       0.00000 |      11.24682 |       0.01421 |       0.67898
Evaluating losses...
     -0.03029 |       0.00000 |      10.88894 |       0.01554 |       0.67768
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00193 |       0.00000 |      89.42025 |       0.00012 |       0.69303
     -0.00810 |       0.00000 |      86.70559 |       0.00161 |       0.69155
     -0.01528 |       0.00000 |      83.56717 |       0.00566 |       0.68756
     -0.01761 |       0.00000 |      79.76917 |       0.00872 |       0.68454
     -0.01818 |       0.00000 |      75.23604 |       0.00945 |       0.68383
     -0.01894 |       0.00000 |      70.15320 |       0.00917 |       0.68410
     -0.01970 |       0.00000 |      64.73894 |       0.00943 |       0.68384
     -0.02079 |       0.00000 |      59.15982 |       0.00940 |       0.68386
     -0.02191 |       0.00000 |      53.54852 |       0.01042 |       0.68286
     -0.02330 |       0.00000 |      48.12657 |       0.01064 |       0.68265
Evaluating losses...
     -0.02404 |       0.00000 |      45.48777 |       0.01115 |       0.68215
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01012 |       0.00000 |      86.81961 |       0.00439 |       0.68892
     -0.01744 |       0.00000 |      74.08448 |       0.00804 |       0.68539
     -0.01926 |       0.00000 |      47.33062 |       0.00855 |       0.68489
     -0.02202 |       0.00000 |      24.01334 |       0.00930 |       0.68415
     -0.02563 |       0.00000 |      16.71763 |       0.01055 |       0.68292
     -0.02809 |       0.00000 |      15.23231 |       0.01229 |       0.68122
     -0.02967 |       0.00000 |      14.15143 |       0.01322 |       0.68032
     -0.03057 |       0.00000 |      12.94469 |       0.01375 |       0.67980
     -0.03100 |       0.00000 |      11.85802 |       0.01419 |       0.67936
     -0.03098 |       0.00000 |      11.04455 |       0.01496 |       0.67862
Evaluating losses...
     -0.03185 |       0.00000 |      10.74666 |       0.01413 |       0.67942
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00093 |       0.00000 |      90.73298 |      5.92e-05 |       0.69309
     -0.00422 |       0.00000 |      87.81602 |       0.00069 |       0.69245
     -0.00860 |       0.00000 |      84.48897 |       0.00310 |       0.69003
     -0.01113 |       0.00000 |      80.43800 |       0.00608 |       0.68709
     -0.01228 |       0.00000 |      75.75423 |       0.00764 |       0.68555
     -0.01321 |       0.00000 |      70.43546 |       0.00811 |       0.68509
     -0.01370 |       0.00000 |      64.73523 |       0.00831 |       0.68489
     -0.01443 |       0.00000 |      58.81767 |       0.00884 |       0.68437
     -0.01513 |       0.00000 |      52.91713 |       0.00866 |       0.68454
     -0.01597 |       0.00000 |      47.25595 |       0.00933 |       0.68388
Evaluating losses...
     -0.01656 |       0.00000 |      44.50844 |       0.00857 |       0.68462
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00944 |       0.00000 |      83.07502 |       0.00366 |       0.68969
     -0.01498 |       0.00000 |      70.63530 |       0.00808 |       0.68546
     -0.01671 |       0.00000 |      44.53046 |       0.00853 |       0.68501
     -0.01953 |       0.00000 |      22.15381 |       0.00887 |       0.68467
     -0.02352 |       0.00000 |      15.42286 |       0.01066 |       0.68292
     -0.02625 |       0.00000 |      14.03837 |       0.01233 |       0.68129
     -0.02725 |       0.00000 |      12.81180 |       0.01308 |       0.68056
     -0.02767 |       0.00000 |      11.50814 |       0.01357 |       0.68008
     -0.02871 |       0.00000 |      10.36095 |       0.01361 |       0.68003
     -0.02849 |       0.00000 |       9.56190 |       0.01411 |       0.67955
Evaluating losses...
     -0.02934 |       0.00000 |       9.24657 |       0.01422 |       0.67943
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00114 |       0.00000 |      93.96014 |      8.01e-05 |       0.69307
     -0.00490 |       0.00000 |      90.62772 |       0.00086 |       0.69230
     -0.01042 |       0.00000 |      87.04362 |       0.00381 |       0.68939
     -0.01372 |       0.00000 |      83.05396 |       0.00697 |       0.68629
     -0.01465 |       0.00000 |      78.51494 |       0.00865 |       0.68465
     -0.01530 |       0.00000 |      73.46721 |       0.00834 |       0.68495
     -0.01612 |       0.00000 |      68.05845 |       0.00899 |       0.68431
     -0.01687 |       0.00000 |      62.47168 |       0.00904 |       0.68426
     -0.01766 |       0.00000 |      56.93413 |       0.00998 |       0.68334
     -0.01873 |       0.00000 |      51.52541 |       0.00967 |       0.68364
Evaluating losses...
     -0.01905 |       0.00000 |      48.81628 |       0.01003 |       0.68328
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00923 |       0.00000 |      87.74744 |       0.00434 |       0.68885
     -0.01636 |       0.00000 |      75.08533 |       0.00774 |       0.68548
     -0.01839 |       0.00000 |      48.82258 |       0.00856 |       0.68466
     -0.02121 |       0.00000 |      24.96018 |       0.00900 |       0.68422
     -0.02367 |       0.00000 |      16.85196 |       0.01045 |       0.68279
     -0.02554 |       0.00000 |      15.10620 |       0.01156 |       0.68170
     -0.02677 |       0.00000 |      13.84401 |       0.01202 |       0.68125
     -0.02739 |       0.00000 |      12.42750 |       0.01259 |       0.68069
     -0.02816 |       0.00000 |      11.19038 |       0.01325 |       0.68004
     -0.02872 |       0.00000 |      10.30946 |       0.01322 |       0.68007
Evaluating losses...
     -0.02905 |       0.00000 |       9.97772 |       0.01419 |       0.67912
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00139 |       0.00000 |      90.71346 |      7.80e-05 |       0.69307
     -0.00595 |       0.00000 |      88.31053 |       0.00119 |       0.69196
     -0.01175 |       0.00000 |      85.38226 |       0.00468 |       0.68852
     -0.01491 |       0.00000 |      81.70265 |       0.00681 |       0.68640
     -0.01597 |       0.00000 |      77.17358 |       0.00820 |       0.68503
     -0.01677 |       0.00000 |      72.07275 |       0.00844 |       0.68480
     -0.01788 |       0.00000 |      66.68974 |       0.00875 |       0.68449
     -0.01909 |       0.00000 |      61.15518 |       0.00902 |       0.68422
     -0.02040 |       0.00000 |      55.59238 |       0.00968 |       0.68357
     -0.02142 |       0.00000 |      50.14378 |       0.00986 |       0.68338
Evaluating losses...
     -0.02272 |       0.00000 |      47.43156 |       0.01011 |       0.68313
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01199 |       0.00000 |      85.83646 |       0.00461 |       0.68857
     -0.01777 |       0.00000 |      72.33752 |       0.00847 |       0.68474
     -0.01913 |       0.00000 |      45.74072 |       0.00882 |       0.68439
     -0.02069 |       0.00000 |      23.43275 |       0.00887 |       0.68434
     -0.02376 |       0.00000 |      16.51740 |       0.00935 |       0.68385
     -0.02668 |       0.00000 |      15.00756 |       0.01117 |       0.68206
     -0.02847 |       0.00000 |      13.81159 |       0.01252 |       0.68073
     -0.02926 |       0.00000 |      12.54016 |       0.01337 |       0.67990
     -0.03023 |       0.00000 |      11.45200 |       0.01363 |       0.67964
     -0.03068 |       0.00000 |      10.70434 |       0.01393 |       0.67934
Evaluating losses...
     -0.03108 |       0.00000 |      10.39827 |       0.01415 |       0.67912
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00171 |       0.00000 |      96.08575 |       0.00013 |       0.69302
     -0.00733 |       0.00000 |      93.44949 |       0.00150 |       0.69165
     -0.01413 |       0.00000 |      90.32037 |       0.00548 |       0.68773
     -0.01696 |       0.00000 |      86.40935 |       0.00783 |       0.68542
     -0.01770 |       0.00000 |      81.74799 |       0.00855 |       0.68471
     -0.01847 |       0.00000 |      76.52289 |       0.00915 |       0.68413
     -0.01918 |       0.00000 |      70.88748 |       0.00920 |       0.68407
     -0.02019 |       0.00000 |      64.95251 |       0.00911 |       0.68416
     -0.02159 |       0.00000 |      58.95415 |       0.00952 |       0.68375
     -0.02244 |       0.00000 |      53.04932 |       0.01016 |       0.68312
Evaluating losses...
     -0.02320 |       0.00000 |      50.10308 |       0.01138 |       0.68193
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00846 |       0.00000 |      87.44940 |       0.00372 |       0.68950
     -0.01433 |       0.00000 |      74.79963 |       0.00780 |       0.68550
     -0.01638 |       0.00000 |      48.37770 |       0.00789 |       0.68541
     -0.01875 |       0.00000 |      24.54701 |       0.00904 |       0.68428
     -0.02120 |       0.00000 |      16.74004 |       0.01019 |       0.68315
     -0.02294 |       0.00000 |      15.08701 |       0.01119 |       0.68216
     -0.02441 |       0.00000 |      13.86174 |       0.01236 |       0.68102
     -0.02570 |       0.00000 |      12.62034 |       0.01309 |       0.68031
     -0.02664 |       0.00000 |      11.63256 |       0.01352 |       0.67988
     -0.02700 |       0.00000 |      10.96659 |       0.01344 |       0.67996
Evaluating losses...
     -0.02798 |       0.00000 |      10.71350 |       0.01454 |       0.67888
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00144 |       0.00000 |      84.88058 |       0.00013 |       0.69302
     -0.00618 |       0.00000 |      82.08293 |       0.00093 |       0.69223
     -0.01336 |       0.00000 |      78.72737 |       0.00440 |       0.68880
     -0.01752 |       0.00000 |      74.63428 |       0.00759 |       0.68566
     -0.01831 |       0.00000 |      69.87351 |       0.00945 |       0.68386
     -0.01909 |       0.00000 |      64.58508 |       0.00912 |       0.68416
     -0.01997 |       0.00000 |      58.98639 |       0.00923 |       0.68405
     -0.02134 |       0.00000 |      53.34890 |       0.00953 |       0.68375
     -0.02231 |       0.00000 |      47.86625 |       0.00981 |       0.68347
     -0.02353 |       0.00000 |      42.70375 |       0.01028 |       0.68300
Evaluating losses...
     -0.02427 |       0.00000 |      40.24433 |       0.01105 |       0.68225
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00845 |       0.00000 |      89.44132 |       0.00379 |       0.68931
     -0.01410 |       0.00000 |      76.27887 |       0.00756 |       0.68553
     -0.01573 |       0.00000 |      49.09818 |       0.00806 |       0.68503
     -0.01791 |       0.00000 |      25.08253 |       0.00857 |       0.68452
     -0.02089 |       0.00000 |      17.31253 |       0.00952 |       0.68357
     -0.02326 |       0.00000 |      15.65569 |       0.01110 |       0.68201
     -0.02515 |       0.00000 |      14.47383 |       0.01222 |       0.68090
     -0.02634 |       0.00000 |      13.20968 |       0.01275 |       0.68038
     -0.02720 |       0.00000 |      12.09214 |       0.01321 |       0.67993
     -0.02761 |       0.00000 |      11.28876 |       0.01365 |       0.67950
Evaluating losses...
     -0.02818 |       0.00000 |      10.97565 |       0.01476 |       0.67841
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00283 |       0.00000 |      90.63201 |       0.00018 |       0.69297
     -0.01101 |       0.00000 |      88.26908 |       0.00211 |       0.69105
     -0.01840 |       0.00000 |      85.46320 |       0.00663 |       0.68659
     -0.02038 |       0.00000 |      81.95404 |       0.00868 |       0.68459
     -0.02124 |       0.00000 |      77.72620 |       0.00941 |       0.68387
     -0.02199 |       0.00000 |      72.75191 |       0.00942 |       0.68385
     -0.02330 |       0.00000 |      67.20920 |       0.00979 |       0.68348
     -0.02471 |       0.00000 |      61.32216 |       0.00937 |       0.68388
     -0.02672 |       0.00000 |      55.36663 |       0.01036 |       0.68290
     -0.02838 |       0.00000 |      49.51649 |       0.01088 |       0.68239
Evaluating losses...
     -0.02941 |       0.00000 |      46.61491 |       0.01093 |       0.68234
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01204 |       0.00000 |      86.68154 |       0.00445 |       0.68884
     -0.01930 |       0.00000 |      74.16803 |       0.00848 |       0.68491
     -0.02156 |       0.00000 |      47.50610 |       0.00891 |       0.68448
     -0.02450 |       0.00000 |      23.94238 |       0.00931 |       0.68408
     -0.02716 |       0.00000 |      16.53604 |       0.01086 |       0.68255
     -0.02906 |       0.00000 |      14.90595 |       0.01188 |       0.68156
     -0.02992 |       0.00000 |      13.57711 |       0.01225 |       0.68119
     -0.03075 |       0.00000 |      12.25790 |       0.01329 |       0.68018
     -0.03115 |       0.00000 |      11.23499 |       0.01359 |       0.67988
     -0.03180 |       0.00000 |      10.56472 |       0.01392 |       0.67956
Evaluating losses...
     -0.03192 |       0.00000 |      10.31068 |       0.01231 |       0.68112
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00177 |       0.00000 |      88.97130 |       0.00014 |       0.69302
     -0.00720 |       0.00000 |      85.72075 |       0.00144 |       0.69177
     -0.01272 |       0.00000 |      81.89474 |       0.00492 |       0.68838
     -0.01536 |       0.00000 |      77.36967 |       0.00725 |       0.68610
     -0.01649 |       0.00000 |      72.03234 |       0.00834 |       0.68503
     -0.01732 |       0.00000 |      66.16801 |       0.00874 |       0.68464
     -0.01873 |       0.00000 |      60.06392 |       0.00874 |       0.68464
     -0.01956 |       0.00000 |      53.94833 |       0.00948 |       0.68391
     -0.02054 |       0.00000 |      48.00404 |       0.00960 |       0.68379
     -0.02163 |       0.00000 |      42.43956 |       0.00991 |       0.68348
Evaluating losses...
     -0.02199 |       0.00000 |      39.76692 |       0.01031 |       0.68309
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01153 |       0.00000 |      85.34739 |       0.00442 |       0.68871
     -0.01716 |       0.00000 |      72.56855 |       0.00852 |       0.68462
     -0.01928 |       0.00000 |      46.24228 |       0.00899 |       0.68415
     -0.02220 |       0.00000 |      23.34633 |       0.00923 |       0.68390
     -0.02542 |       0.00000 |      16.08258 |       0.01068 |       0.68246
     -0.02778 |       0.00000 |      14.52716 |       0.01166 |       0.68149
     -0.02940 |       0.00000 |      13.37256 |       0.01302 |       0.68015
     -0.03053 |       0.00000 |      12.16312 |       0.01322 |       0.67995
     -0.03091 |       0.00000 |      11.12592 |       0.01393 |       0.67926
     -0.03153 |       0.00000 |      10.38480 |       0.01388 |       0.67931
Evaluating losses...
     -0.03193 |       0.00000 |      10.08811 |       0.01419 |       0.67900
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00157 |       0.00000 |      94.52390 |      8.19e-05 |       0.69306
     -0.00657 |       0.00000 |      91.54572 |       0.00090 |       0.69224
     -0.01432 |       0.00000 |      87.83791 |       0.00384 |       0.68933
     -0.01848 |       0.00000 |      83.26447 |       0.00803 |       0.68520
     -0.01917 |       0.00000 |      77.81002 |       0.00914 |       0.68411
     -0.01999 |       0.00000 |      71.70299 |       0.00919 |       0.68406
     -0.02043 |       0.00000 |      65.35316 |       0.00957 |       0.68369
     -0.02176 |       0.00000 |      58.94538 |       0.00912 |       0.68412
     -0.02270 |       0.00000 |      52.74371 |       0.00971 |       0.68353
     -0.02448 |       0.00000 |      46.86662 |       0.01007 |       0.68318
Evaluating losses...
     -0.02513 |       0.00000 |      44.00035 |       0.00942 |       0.68381
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01044 |       0.00000 |      82.20329 |       0.00467 |       0.68848
     -0.01704 |       0.00000 |      69.79575 |       0.00796 |       0.68520
     -0.01879 |       0.00000 |      44.60650 |       0.00838 |       0.68479
     -0.02143 |       0.00000 |      22.42896 |       0.00904 |       0.68413
     -0.02449 |       0.00000 |      15.37413 |       0.01029 |       0.68288
     -0.02654 |       0.00000 |      13.88037 |       0.01157 |       0.68162
     -0.02792 |       0.00000 |      12.76079 |       0.01289 |       0.68032
     -0.02883 |       0.00000 |      11.59454 |       0.01304 |       0.68018
     -0.02927 |       0.00000 |      10.63551 |       0.01372 |       0.67951
     -0.02941 |       0.00000 |       9.95879 |       0.01416 |       0.67908
Evaluating losses...
     -0.02940 |       0.00000 |       9.71030 |       0.01193 |       0.68125
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00134 |       0.00000 |      88.57794 |       0.00011 |       0.69303
     -0.00550 |       0.00000 |      85.55035 |       0.00115 |       0.69200
     -0.01183 |       0.00000 |      82.14881 |       0.00427 |       0.68890
     -0.01491 |       0.00000 |      78.09451 |       0.00755 |       0.68568
     -0.01561 |       0.00000 |      73.16946 |       0.00855 |       0.68471
     -0.01617 |       0.00000 |      67.64874 |       0.00859 |       0.68466
     -0.01683 |       0.00000 |      61.76417 |       0.00884 |       0.68441
     -0.01816 |       0.00000 |      55.84403 |       0.00903 |       0.68422
     -0.01910 |       0.00000 |      50.06881 |       0.00933 |       0.68392
     -0.01989 |       0.00000 |      44.61015 |       0.01033 |       0.68294
Evaluating losses...
     -0.02038 |       0.00000 |      41.98943 |       0.01069 |       0.68258
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00947 |       0.00000 |      86.16107 |       0.00391 |       0.68937
     -0.01574 |       0.00000 |      74.05270 |       0.00775 |       0.68563
     -0.01784 |       0.00000 |      48.07320 |       0.00812 |       0.68527
     -0.02086 |       0.00000 |      24.47494 |       0.00960 |       0.68381
     -0.02448 |       0.00000 |      16.92872 |       0.01095 |       0.68248
     -0.02664 |       0.00000 |      15.28244 |       0.01242 |       0.68104
     -0.02775 |       0.00000 |      13.91236 |       0.01308 |       0.68039
     -0.02831 |       0.00000 |      12.53746 |       0.01366 |       0.67982
     -0.02832 |       0.00000 |      11.42115 |       0.01358 |       0.67990
     -0.02878 |       0.00000 |      10.68404 |       0.01392 |       0.67956
Evaluating losses...
     -0.02897 |       0.00000 |      10.42091 |       0.01606 |       0.67749
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00164 |       0.00000 |      81.45968 |      9.26e-05 |       0.69305
     -0.00743 |       0.00000 |      78.85304 |       0.00107 |       0.69209
     -0.01575 |       0.00000 |      75.90122 |       0.00483 |       0.68836
     -0.01858 |       0.00000 |      72.34354 |       0.00847 |       0.68479
     -0.01959 |       0.00000 |      68.16105 |       0.00872 |       0.68454
     -0.02028 |       0.00000 |      63.57330 |       0.00930 |       0.68397
     -0.02121 |       0.00000 |      58.64985 |       0.00908 |       0.68417
     -0.02239 |       0.00000 |      53.51159 |       0.00993 |       0.68333
     -0.02412 |       0.00000 |      48.35698 |       0.00995 |       0.68330
     -0.02553 |       0.00000 |      43.32841 |       0.01030 |       0.68295
Evaluating losses...
     -0.02618 |       0.00000 |      40.93987 |       0.01140 |       0.68188
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00838 |       0.00000 |      83.45905 |       0.00398 |       0.68906
     -0.01430 |       0.00000 |      70.77675 |       0.00799 |       0.68500
     -0.01639 |       0.00000 |      44.82957 |       0.00809 |       0.68489
     -0.01943 |       0.00000 |      22.48532 |       0.00918 |       0.68380
     -0.02222 |       0.00000 |      15.62220 |       0.01077 |       0.68222
     -0.02434 |       0.00000 |      14.26787 |       0.01188 |       0.68112
     -0.02520 |       0.00000 |      13.21035 |       0.01272 |       0.68029
     -0.02619 |       0.00000 |      11.99948 |       0.01313 |       0.67988
     -0.02672 |       0.00000 |      10.89864 |       0.01338 |       0.67964
     -0.02661 |       0.00000 |      10.09465 |       0.01380 |       0.67924
Evaluating losses...
     -0.02731 |       0.00000 |       9.77902 |       0.01221 |       0.68080
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00103 |       0.00000 |      83.48862 |      5.59e-05 |       0.69309
     -0.00419 |       0.00000 |      80.41310 |       0.00055 |       0.69260
     -0.00937 |       0.00000 |      76.73897 |       0.00272 |       0.69046
     -0.01336 |       0.00000 |      72.27627 |       0.00700 |       0.68625
     -0.01461 |       0.00000 |      67.03738 |       0.00859 |       0.68470
     -0.01559 |       0.00000 |      61.36800 |       0.00836 |       0.68492
     -0.01621 |       0.00000 |      55.48693 |       0.00911 |       0.68418
     -0.01753 |       0.00000 |      49.63078 |       0.00913 |       0.68415
     -0.01875 |       0.00000 |      43.95876 |       0.00965 |       0.68363
     -0.01952 |       0.00000 |      38.65615 |       0.01023 |       0.68307
Evaluating losses...
     -0.02017 |       0.00000 |      36.15307 |       0.01076 |       0.68255
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01052 |       0.00000 |      94.60532 |       0.00404 |       0.68927
     -0.01823 |       0.00000 |      81.12537 |       0.00842 |       0.68502
     -0.02022 |       0.00000 |      52.48405 |       0.00827 |       0.68515
     -0.02300 |       0.00000 |      26.57899 |       0.00914 |       0.68430
     -0.02635 |       0.00000 |      17.78651 |       0.01026 |       0.68319
     -0.02884 |       0.00000 |      15.94437 |       0.01181 |       0.68168
     -0.02966 |       0.00000 |      14.65883 |       0.01278 |       0.68073
     -0.03101 |       0.00000 |      13.25879 |       0.01337 |       0.68015
     -0.03128 |       0.00000 |      12.04339 |       0.01383 |       0.67970
     -0.03195 |       0.00000 |      11.20896 |       0.01362 |       0.67991
Evaluating losses...
     -0.03210 |       0.00000 |      10.88148 |       0.01498 |       0.67858
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00072 |       0.00000 |      79.65036 |      8.12e-05 |       0.69307
     -0.00392 |       0.00000 |      76.60268 |       0.00056 |       0.69261
     -0.00940 |       0.00000 |      72.76885 |       0.00284 |       0.69037
     -0.01404 |       0.00000 |      68.18700 |       0.00658 |       0.68669
     -0.01631 |       0.00000 |      62.96183 |       0.00847 |       0.68483
     -0.01698 |       0.00000 |      57.26337 |       0.00908 |       0.68423
     -0.01795 |       0.00000 |      51.43753 |       0.00914 |       0.68416
     -0.01851 |       0.00000 |      45.74440 |       0.00917 |       0.68413
     -0.01995 |       0.00000 |      40.32658 |       0.00984 |       0.68347
     -0.02096 |       0.00000 |      35.35489 |       0.00993 |       0.68338
Evaluating losses...
     -0.02161 |       0.00000 |      32.98711 |       0.00984 |       0.68346
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00772 |       0.00000 |      93.05097 |       0.00277 |       0.69032
     -0.01533 |       0.00000 |      79.69557 |       0.00814 |       0.68492
     -0.01720 |       0.00000 |      51.82536 |       0.00797 |       0.68507
     -0.01925 |       0.00000 |      26.32752 |       0.00869 |       0.68435
     -0.02219 |       0.00000 |      17.70644 |       0.00917 |       0.68386
     -0.02477 |       0.00000 |      16.04501 |       0.01081 |       0.68224
     -0.02729 |       0.00000 |      15.05207 |       0.01223 |       0.68083
     -0.02875 |       0.00000 |      13.92579 |       0.01279 |       0.68028
     -0.02917 |       0.00000 |      12.83492 |       0.01334 |       0.67974
     -0.03016 |       0.00000 |      11.93043 |       0.01412 |       0.67897
Evaluating losses...
     -0.03060 |       0.00000 |      11.55408 |       0.01289 |       0.68019
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00213 |       0.00000 |      87.26585 |       0.00013 |       0.69301
     -0.00889 |       0.00000 |      84.27690 |       0.00169 |       0.69145
     -0.01634 |       0.00000 |      80.82898 |       0.00515 |       0.68802
     -0.01965 |       0.00000 |      76.67915 |       0.00852 |       0.68469
     -0.02053 |       0.00000 |      71.90184 |       0.00903 |       0.68418
     -0.02089 |       0.00000 |      66.64265 |       0.00923 |       0.68398
     -0.02149 |       0.00000 |      61.15163 |       0.00973 |       0.68349
     -0.02284 |       0.00000 |      55.54344 |       0.00994 |       0.68328
     -0.02385 |       0.00000 |      49.88602 |       0.00950 |       0.68371
     -0.02501 |       0.00000 |      44.50037 |       0.00987 |       0.68333
Evaluating losses...
     -0.02543 |       0.00000 |      41.85564 |       0.01086 |       0.68236
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00902 |       0.00000 |      85.64711 |       0.00333 |       0.68989
     -0.01531 |       0.00000 |      72.43018 |       0.00829 |       0.68502
     -0.01709 |       0.00000 |      45.43483 |       0.00807 |       0.68522
     -0.01930 |       0.00000 |      22.98137 |       0.00907 |       0.68424
     -0.02218 |       0.00000 |      16.21676 |       0.01021 |       0.68311
     -0.02436 |       0.00000 |      14.75108 |       0.01113 |       0.68221
     -0.02610 |       0.00000 |      13.54681 |       0.01223 |       0.68112
     -0.02728 |       0.00000 |      12.25825 |       0.01303 |       0.68034
     -0.02784 |       0.00000 |      11.14573 |       0.01334 |       0.68003
     -0.02849 |       0.00000 |      10.37018 |       0.01340 |       0.67997
Evaluating losses...
     -0.02890 |       0.00000 |      10.07113 |       0.01385 |       0.67953
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00104 |       0.00000 |      93.85953 |      8.55e-05 |       0.69307
     -0.00429 |       0.00000 |      91.02333 |       0.00082 |       0.69236
     -0.00936 |       0.00000 |      87.73798 |       0.00321 |       0.69002
     -0.01335 |       0.00000 |      83.75428 |       0.00605 |       0.68722
     -0.01540 |       0.00000 |      78.92709 |       0.00745 |       0.68583
     -0.01647 |       0.00000 |      73.37106 |       0.00890 |       0.68440
     -0.01788 |       0.00000 |      67.38108 |       0.00857 |       0.68472
     -0.01925 |       0.00000 |      61.20576 |       0.00882 |       0.68447
     -0.02093 |       0.00000 |      55.11339 |       0.00968 |       0.68363
     -0.02263 |       0.00000 |      49.18424 |       0.01018 |       0.68313
Evaluating losses...
     -0.02321 |       0.00000 |      46.29472 |       0.01099 |       0.68234
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.01232 |       0.00000 |      88.16605 |       0.00461 |       0.68847
     -0.01968 |       0.00000 |      75.43063 |       0.00876 |       0.68431
     -0.02159 |       0.00000 |      48.77044 |       0.00879 |       0.68427
     -0.02439 |       0.00000 |      24.66855 |       0.00973 |       0.68333
     -0.02768 |       0.00000 |      16.76392 |       0.01111 |       0.68195
     -0.03008 |       0.00000 |      14.94151 |       0.01236 |       0.68071
     -0.03139 |       0.00000 |      13.47724 |       0.01350 |       0.67959
     -0.03242 |       0.00000 |      12.06229 |       0.01381 |       0.67929
     -0.03312 |       0.00000 |      10.93304 |       0.01416 |       0.67894
     -0.03355 |       0.00000 |      10.18334 |       0.01451 |       0.67860
Evaluating losses...
     -0.03395 |       0.00000 |       9.89781 |       0.01532 |       0.67780
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00125 |       0.00000 |      86.33978 |       0.00011 |       0.69303
     -0.00524 |       0.00000 |      83.55128 |       0.00115 |       0.69196
     -0.00934 |       0.00000 |      80.32970 |       0.00374 |       0.68938
     -0.01134 |       0.00000 |      76.49206 |       0.00544 |       0.68772
     -0.01315 |       0.00000 |      71.92950 |       0.00737 |       0.68585
     -0.01378 |       0.00000 |      66.72375 |       0.00768 |       0.68554
     -0.01456 |       0.00000 |      61.10363 |       0.00815 |       0.68507
     -0.01536 |       0.00000 |      55.25158 |       0.00783 |       0.68539
     -0.01633 |       0.00000 |      49.57913 |       0.00895 |       0.68428
     -0.01715 |       0.00000 |      44.16364 |       0.00894 |       0.68428
Evaluating losses...
     -0.01780 |       0.00000 |      41.54359 |       0.00873 |       0.68449
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00750 |       0.00000 |      83.15320 |       0.00305 |       0.68998
     -0.01392 |       0.00000 |      69.89423 |       0.00814 |       0.68482
     -0.01616 |       0.00000 |      43.38651 |       0.00825 |       0.68469
     -0.01976 |       0.00000 |      21.83859 |       0.00935 |       0.68358
     -0.02297 |       0.00000 |      15.62313 |       0.01110 |       0.68185
     -0.02574 |       0.00000 |      14.16174 |       0.01240 |       0.68055
     -0.02728 |       0.00000 |      12.91957 |       0.01290 |       0.68006
     -0.02796 |       0.00000 |      11.61764 |       0.01350 |       0.67948
     -0.02857 |       0.00000 |      10.46585 |       0.01338 |       0.67960
     -0.02867 |       0.00000 |       9.63701 |       0.01402 |       0.67897
Evaluating losses...
     -0.02880 |       0.00000 |       9.31152 |       0.01564 |       0.67737
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00140 |       0.00000 |      84.32919 |       0.00013 |       0.69302
     -0.00544 |       0.00000 |      81.70689 |       0.00102 |       0.69215
     -0.01130 |       0.00000 |      78.45866 |       0.00439 |       0.68882
     -0.01491 |       0.00000 |      74.36389 |       0.00728 |       0.68598
     -0.01554 |       0.00000 |      69.48770 |       0.00824 |       0.68505
     -0.01663 |       0.00000 |      64.10631 |       0.00903 |       0.68427
     -0.01798 |       0.00000 |      58.49514 |       0.00837 |       0.68491
     -0.01968 |       0.00000 |      52.89897 |       0.00941 |       0.68388
     -0.02129 |       0.00000 |      47.38292 |       0.01024 |       0.68307
     -0.02266 |       0.00000 |      42.10096 |       0.01078 |       0.68254
Evaluating losses...
     -0.02289 |       0.00000 |      39.55902 |       0.01239 |       0.68097
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00848 |       0.00000 |      87.37868 |       0.00387 |       0.68933
     -0.01457 |       0.00000 |      74.53916 |       0.00814 |       0.68512
     -0.01634 |       0.00000 |      48.03864 |       0.00808 |       0.68517
     -0.01837 |       0.00000 |      24.44617 |       0.00906 |       0.68420
     -0.02044 |       0.00000 |      17.04923 |       0.00975 |       0.68352
     -0.02234 |       0.00000 |      15.63136 |       0.01061 |       0.68266
     -0.02362 |       0.00000 |      14.57283 |       0.01203 |       0.68127
     -0.02515 |       0.00000 |      13.40151 |       0.01266 |       0.68066
     -0.02545 |       0.00000 |      12.35264 |       0.01277 |       0.68054
     -0.02607 |       0.00000 |      11.55378 |       0.01326 |       0.68007
Evaluating losses...
     -0.02681 |       0.00000 |      11.21743 |       0.01447 |       0.67888
-----------------------------

  "{} != {}".format(self.training_env, self.eval_env))


Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00191 |       0.00000 |      85.57808 |       0.00012 |       0.69302
     -0.00779 |       0.00000 |      82.83141 |       0.00130 |       0.69183
     -0.01424 |       0.00000 |      79.84222 |       0.00492 |       0.68823
     -0.01779 |       0.00000 |      76.42071 |       0.00784 |       0.68534
     -0.01901 |       0.00000 |      72.52492 |       0.00870 |       0.68450
     -0.01982 |       0.00000 |      68.14018 |       0.00907 |       0.68412
     -0.02117 |       0.00000 |      63.38713 |       0.00913 |       0.68405
     -0.02237 |       0.00000 |      58.46276 |       0.00962 |       0.68357
     -0.02379 |       0.00000 |      53.48033 |       0.01000 |       0.68320
     -0.02514 |       0.00000 |      48.47550 |       0.01054 |       0.68266
Evaluating losses...
     -0.02595 |       0.00000 |      45.98987 |       0.01097 |       0.68223
-----------------------------

In [20]:
df.head()

Unnamed: 0,Seed,BNN reward,BNN std,DNN reward,DNN std
0,600.0,189.033,19.244737,181.186,30.742241
