<img src="stoovo1024.png" width="400px" height="400px">

In [7]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam


# Name of the Gym environment to train on. It is also interpolated into the
# filename used when the trained weights are saved further down.
ENV_NAME = 'CartPole-v0'


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
# Fixed seeds for both NumPy's global RNG and the environment
# (presumably so that repeated runs are comparable).
np.random.seed(123)
env.seed(123)

# Number of actions in the environment's action space; used below as the
# width of the model's output layer and passed to the agent.
nb_actions = env.action_space.n
# First entry of the observation shape.
# NOTE(review): obs_dim is not referenced anywhere else in the visible code.
obs_dim = env.observation_space.shape[0]

# Option 1: simple model -- one Dense layer with `nb_actions` outputs followed
# by a softmax. The Flatten layer takes inputs of shape (1,) + observation
# shape (presumably the leading 1 matches window_length=1 of the agent memory)
# and collapses them to a flat vector for the Dense layer.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

# Option 2: deep network -- three hidden Dense(16) + ReLU layers in front of
# the same softmax output. Uncomment this block (and comment out Option 1) to
# use it instead.
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))


# model.summary() prints the layer table itself and returns None, so wrapping
# it in print() emitted a stray "None" line after the table.
model.summary()


# Finally, we configure and compile our agent. Note that compile() is called
# without arguments here: no Keras optimizer or metrics are passed to it.
# NOTE(review): EpisodeParameterMemory and CEMAgent are not imported in the
# visible code -- presumably they come from keras-rl (rl.memory and
# rl.agents.cem) and were imported in an earlier cell; confirm, otherwise this
# raises NameError.
memory = EpisodeParameterMemory(limit=1000, window_length=1)

cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Okay, now it's time to learn something! Rendering is turned off during
# training (visualize=False) because visualizing slows down training quite a
# lot. You can always safely abort the training prematurely using Ctrl + C.
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)

# After training is done, we save the best weights. The filename embeds
# ENV_NAME, and overwrite=True replaces any existing file of that name.
cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes, this time with rendering on.
cem.test(env, nb_episodes=5, visualize=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 10        
_________________________________________________________________
activation_9 (Activation)    (None, 2)                 0         
Total params: 10
Trainable params: 10
Non-trainable params: 0
_________________________________________________________________
None
Training for 100000 steps ...
    13/100000: episode: 1, duration: 0.092s, episode steps: 13, steps per second: 141, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: -0.107 [-1.319, 0.772], mean_best_reward: --
    24/100000: episode: 2, duration: 0.007s, episode steps: 11, steps per second: 1535, episode reward: 11.000, mean reward: 1.000 

   652/100000: episode: 35, duration: 0.065s, episode steps: 87, steps per second: 1338, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: -0.164 [-1.532, 1.254], mean_best_reward: --
   672/100000: episode: 36, duration: 0.020s, episode steps: 20, steps per second: 1011, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.350 [0.000, 1.000], mean observation: 0.053 [-1.201, 1.977], mean_best_reward: --
   716/100000: episode: 37, duration: 0.035s, episode steps: 44, steps per second: 1261, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.568 [0.000, 1.000], mean observation: -0.055 [-2.395, 1.395], mean_best_reward: --
   728/100000: episode: 38, duration: 0.008s, episode steps: 12, steps per second: 1480, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.107 [-0.774, 1.524], mean_best_reward: --
   742/100000: epi

  1651/100000: episode: 76, duration: 0.021s, episode steps: 13, steps per second: 629, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: 0.125 [-0.935, 1.534], mean_best_reward: --
  1686/100000: episode: 77, duration: 0.031s, episode steps: 35, steps per second: 1133, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: -0.053 [-1.702, 1.962], mean_best_reward: --
  1707/100000: episode: 78, duration: 0.026s, episode steps: 21, steps per second: 822, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.619 [0.000, 1.000], mean observation: -0.041 [-1.700, 1.027], mean_best_reward: --
  1720/100000: episode: 79, duration: 0.013s, episode steps: 13, steps per second: 986, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.094 [-1.725, 1.021], mean_best_reward: --
  1729/100000: episo

  2418/100000: episode: 113, duration: 0.015s, episode steps: 13, steps per second: 847, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.385 [0.000, 1.000], mean observation: 0.120 [-0.756, 1.251], mean_best_reward: --
  2433/100000: episode: 114, duration: 0.015s, episode steps: 15, steps per second: 1032, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.733 [0.000, 1.000], mean observation: -0.093 [-2.383, 1.529], mean_best_reward: --
  2465/100000: episode: 115, duration: 0.022s, episode steps: 32, steps per second: 1427, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.063 [-0.842, 1.785], mean_best_reward: --
  2474/100000: episode: 116, duration: 0.008s, episode steps: 9, steps per second: 1198, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.222 [0.000, 1.000], mean observation: 0.138 [-1.203, 1.869], mean_best_reward: --
  2485/100000: epi

  3220/100000: episode: 148, duration: 0.020s, episode steps: 21, steps per second: 1054, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.381 [0.000, 1.000], mean observation: 0.091 [-0.990, 1.892], mean_best_reward: --
  3233/100000: episode: 149, duration: 0.010s, episode steps: 13, steps per second: 1257, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.769 [0.000, 1.000], mean observation: -0.113 [-2.431, 1.383], mean_best_reward: --
  3277/100000: episode: 150, duration: 0.025s, episode steps: 44, steps per second: 1777, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.591 [0.000, 1.000], mean observation: -0.077 [-2.491, 1.554], mean_best_reward: --
  3290/100000: episode: 151, duration: 0.009s, episode steps: 13, steps per second: 1433, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.094 [-1.175, 1.901], mean_best_reward: 65.500000
  3306/

  4359/100000: episode: 182, duration: 0.046s, episode steps: 50, steps per second: 1090, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.164 [-1.251, 0.554], mean_best_reward: --
  4371/100000: episode: 183, duration: 0.008s, episode steps: 12, steps per second: 1593, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.099 [-1.697, 0.968], mean_best_reward: --
  4409/100000: episode: 184, duration: 0.022s, episode steps: 38, steps per second: 1760, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.130 [-0.863, 0.282], mean_best_reward: --
  4443/100000: episode: 185, duration: 0.019s, episode steps: 34, steps per second: 1782, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.441 [0.000, 1.000], mean observation: 0.060 [-0.968, 1.695], mean_best_reward: --
  4470/100000

  5605/100000: episode: 220, duration: 0.057s, episode steps: 39, steps per second: 684, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: 0.058 [-0.609, 1.541], mean_best_reward: --
  5650/100000: episode: 221, duration: 0.041s, episode steps: 45, steps per second: 1103, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.035 [-1.272, 0.834], mean_best_reward: --
  5669/100000: episode: 222, duration: 0.012s, episode steps: 19, steps per second: 1556, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: -0.108 [-1.201, 0.768], mean_best_reward: --
  5688/100000: episode: 223, duration: 0.012s, episode steps: 19, steps per second: 1620, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.579 [0.000, 1.000], mean observation: -0.077 [-1.431, 0.824], mean_best_reward: --
  5718/100000:

  6956/100000: episode: 255, duration: 0.037s, episode steps: 38, steps per second: 1017, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.553 [0.000, 1.000], mean observation: 0.124 [-0.452, 0.821], mean_best_reward: --
  7000/100000: episode: 256, duration: 0.029s, episode steps: 44, steps per second: 1531, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: 0.014 [-1.402, 1.181], mean_best_reward: --
  7068/100000: episode: 257, duration: 0.056s, episode steps: 68, steps per second: 1215, episode reward: 68.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.141 [-1.222, 1.548], mean_best_reward: --
  7105/100000: episode: 258, duration: 0.029s, episode steps: 37, steps per second: 1258, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.459 [0.000, 1.000], mean observation: -0.089 [-0.880, 0.604], mean_best_reward: --
  7139/100000: 

  8314/100000: episode: 293, duration: 0.061s, episode steps: 53, steps per second: 867, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.180 [-0.940, 1.350], mean_best_reward: --
  8349/100000: episode: 294, duration: 0.038s, episode steps: 35, steps per second: 916, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.457 [0.000, 1.000], mean observation: 0.057 [-0.815, 1.562], mean_best_reward: --
  8362/100000: episode: 295, duration: 0.015s, episode steps: 13, steps per second: 854, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: -0.119 [-1.208, 0.734], mean_best_reward: --
  8374/100000: episode: 296, duration: 0.011s, episode steps: 12, steps per second: 1061, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.104 [-0.996, 1.666], mean_best_reward: --
  8393/100000: epi

  9403/100000: episode: 328, duration: 0.030s, episode steps: 41, steps per second: 1388, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.119 [-0.876, 0.434], mean_best_reward: --
  9415/100000: episode: 329, duration: 0.011s, episode steps: 12, steps per second: 1085, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.113 [-0.968, 1.510], mean_best_reward: --
  9468/100000: episode: 330, duration: 0.032s, episode steps: 53, steps per second: 1652, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.453 [0.000, 1.000], mean observation: -0.179 [-1.143, 0.540], mean_best_reward: --
  9499/100000: episode: 331, duration: 0.021s, episode steps: 31, steps per second: 1472, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.548 [0.000, 1.000], mean observation: -0.040 [-1.729, 1.129], mean_best_reward: --
  9518/100000

 10560/100000: episode: 365, duration: 0.043s, episode steps: 55, steps per second: 1294, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.061 [-1.348, 0.672], mean_best_reward: --
 10597/100000: episode: 366, duration: 0.026s, episode steps: 37, steps per second: 1445, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: 0.047 [-0.770, 1.197], mean_best_reward: --
 10655/100000: episode: 367, duration: 0.039s, episode steps: 58, steps per second: 1500, episode reward: 58.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.569 [0.000, 1.000], mean observation: 0.082 [-2.298, 1.602], mean_best_reward: --
 10683/100000: episode: 368, duration: 0.022s, episode steps: 28, steps per second: 1245, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: -0.057 [-1.427, 0.790], mean_best_reward: --
 10696/100000:

 11905/100000: episode: 401, duration: 0.021s, episode steps: 20, steps per second: 955, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: 0.080 [-0.640, 1.112], mean_best_reward: 86.000000
 11921/100000: episode: 402, duration: 0.012s, episode steps: 16, steps per second: 1330, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.067 [-1.013, 1.575], mean_best_reward: --
 11970/100000: episode: 403, duration: 0.042s, episode steps: 49, steps per second: 1178, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.551 [0.000, 1.000], mean observation: -0.066 [-2.043, 1.012], mean_best_reward: --
 11993/100000: episode: 404, duration: 0.021s, episode steps: 23, steps per second: 1083, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: -0.071 [-1.391, 0.765], mean_best_reward: --
 12084/1

 13193/100000: episode: 437, duration: 0.033s, episode steps: 47, steps per second: 1445, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.426 [0.000, 1.000], mean observation: -0.024 [-1.540, 1.947], mean_best_reward: --
 13261/100000: episode: 438, duration: 0.042s, episode steps: 68, steps per second: 1600, episode reward: 68.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.067 [-0.718, 1.246], mean_best_reward: --
 13272/100000: episode: 439, duration: 0.009s, episode steps: 11, steps per second: 1174, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: -0.111 [-1.599, 1.014], mean_best_reward: --
 13336/100000: episode: 440, duration: 0.056s, episode steps: 64, steps per second: 1134, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: 0.036 [-0.990, 1.635], mean_best_reward: --
 13415/100000:

 14529/100000: episode: 476, duration: 0.055s, episode steps: 65, steps per second: 1176, episode reward: 65.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.523 [0.000, 1.000], mean observation: 0.057 [-0.835, 0.830], mean_best_reward: --
 14616/100000: episode: 477, duration: 0.098s, episode steps: 87, steps per second: 886, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: 0.039 [-1.276, 1.214], mean_best_reward: --
 14639/100000: episode: 478, duration: 0.022s, episode steps: 23, steps per second: 1027, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.060 [-0.789, 1.226], mean_best_reward: --
 14672/100000: episode: 479, duration: 0.041s, episode steps: 33, steps per second: 800, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: 0.115 [-0.369, 0.767], mean_best_reward: --
 14691/100000: epi

 15818/100000: episode: 512, duration: 0.060s, episode steps: 71, steps per second: 1175, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: -0.051 [-1.390, 0.929], mean_best_reward: --
 15850/100000: episode: 513, duration: 0.024s, episode steps: 32, steps per second: 1346, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: -0.010 [-1.218, 0.993], mean_best_reward: --
 15940/100000: episode: 514, duration: 0.049s, episode steps: 90, steps per second: 1834, episode reward: 90.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: -0.169 [-1.916, 1.018], mean_best_reward: --
 15980/100000: episode: 515, duration: 0.028s, episode steps: 40, steps per second: 1435, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: 0.062 [-0.658, 1.452], mean_best_reward: --
 16020/100000

 17252/100000: episode: 553, duration: 0.025s, episode steps: 31, steps per second: 1258, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: -0.020 [-1.437, 1.119], mean_best_reward: --
 17280/100000: episode: 554, duration: 0.023s, episode steps: 28, steps per second: 1214, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: -0.019 [-1.583, 1.204], mean_best_reward: --
 17310/100000: episode: 555, duration: 0.018s, episode steps: 30, steps per second: 1654, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.106 [-1.369, 0.433], mean_best_reward: --
 17343/100000: episode: 556, duration: 0.021s, episode steps: 33, steps per second: 1584, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.087 [-0.668, 1.806], mean_best_reward: --
 17379/100000

 18361/100000: episode: 590, duration: 0.056s, episode steps: 56, steps per second: 998, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.446 [0.000, 1.000], mean observation: -0.203 [-1.075, 0.625], mean_best_reward: --
 18395/100000: episode: 591, duration: 0.021s, episode steps: 34, steps per second: 1649, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.088 [-0.424, 1.226], mean_best_reward: --
 18426/100000: episode: 592, duration: 0.017s, episode steps: 31, steps per second: 1860, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: 0.054 [-0.787, 1.205], mean_best_reward: --
 18442/100000: episode: 593, duration: 0.009s, episode steps: 16, steps per second: 1740, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.088 [-0.755, 1.506], mean_best_reward: --
 18503/100000: e

 19537/100000: episode: 625, duration: 0.016s, episode steps: 15, steps per second: 943, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.087 [-0.947, 1.444], mean_best_reward: --
 19578/100000: episode: 626, duration: 0.033s, episode steps: 41, steps per second: 1230, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.121 [-0.872, 0.371], mean_best_reward: --
 19597/100000: episode: 627, duration: 0.022s, episode steps: 19, steps per second: 865, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: 0.112 [-0.576, 1.151], mean_best_reward: --
 19617/100000: episode: 628, duration: 0.021s, episode steps: 20, steps per second: 935, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.028 [-1.375, 1.904], mean_best_reward: --
 19781/100000: epi

 20833/100000: episode: 659, duration: 0.025s, episode steps: 26, steps per second: 1029, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: 0.087 [-0.591, 0.929], mean_best_reward: --
 20848/100000: episode: 660, duration: 0.013s, episode steps: 15, steps per second: 1165, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.092 [-0.992, 1.483], mean_best_reward: --
 20927/100000: episode: 661, duration: 0.044s, episode steps: 79, steps per second: 1778, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: -0.083 [-0.927, 0.871], mean_best_reward: --
 21012/100000: episode: 662, duration: 0.054s, episode steps: 85, steps per second: 1587, episode reward: 85.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.506 [0.000, 1.000], mean observation: 0.160 [-1.468, 1.708], mean_best_reward: --
 21088/100000: 

 22173/100000: episode: 695, duration: 0.024s, episode steps: 28, steps per second: 1170, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.024 [-1.194, 0.829], mean_best_reward: --
 22187/100000: episode: 696, duration: 0.012s, episode steps: 14, steps per second: 1211, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: -0.128 [-1.307, 0.749], mean_best_reward: --
 22272/100000: episode: 697, duration: 0.046s, episode steps: 85, steps per second: 1862, episode reward: 85.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: 0.061 [-0.951, 1.166], mean_best_reward: --
 22326/100000: episode: 698, duration: 0.038s, episode steps: 54, steps per second: 1425, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.080 [-0.733, 1.103], mean_best_reward: --
 22368/100000:

 23815/100000: episode: 733, duration: 0.031s, episode steps: 29, steps per second: 940, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: 0.087 [-0.817, 1.243], mean_best_reward: --
 23869/100000: episode: 734, duration: 0.036s, episode steps: 54, steps per second: 1492, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.191 [-1.153, 0.629], mean_best_reward: --
 23922/100000: episode: 735, duration: 0.039s, episode steps: 53, steps per second: 1346, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: 0.052 [-1.173, 1.607], mean_best_reward: --
 23949/100000: episode: 736, duration: 0.022s, episode steps: 27, steps per second: 1204, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: 0.072 [-0.585, 1.073], mean_best_reward: --
 24010/100000: e

 25008/100000: episode: 771, duration: 0.010s, episode steps: 13, steps per second: 1279, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.231 [0.000, 1.000], mean observation: 0.118 [-1.365, 2.279], mean_best_reward: --
 25025/100000: episode: 772, duration: 0.013s, episode steps: 17, steps per second: 1332, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.412 [0.000, 1.000], mean observation: 0.080 [-0.775, 1.419], mean_best_reward: --
 25041/100000: episode: 773, duration: 0.010s, episode steps: 16, steps per second: 1552, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.090 [-1.170, 0.770], mean_best_reward: --
 25061/100000: episode: 774, duration: 0.012s, episode steps: 20, steps per second: 1665, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.086 [-0.762, 1.595], mean_best_reward: --
 25085/100000: 

 26255/100000: episode: 810, duration: 0.032s, episode steps: 40, steps per second: 1238, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: 0.093 [-0.754, 1.475], mean_best_reward: --
 26293/100000: episode: 811, duration: 0.028s, episode steps: 38, steps per second: 1369, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.020 [-1.444, 1.142], mean_best_reward: --
 26318/100000: episode: 812, duration: 0.017s, episode steps: 25, steps per second: 1431, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.056 [-1.537, 2.305], mean_best_reward: --
 26351/100000: episode: 813, duration: 0.024s, episode steps: 33, steps per second: 1363, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: -0.068 [-1.397, 0.805], mean_best_reward: --
 26375/100000: 

 27885/100000: episode: 851, duration: 0.059s, episode steps: 43, steps per second: 732, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: 0.061 [-0.775, 1.734], mean_best_reward: 97.500000
 27909/100000: episode: 852, duration: 0.026s, episode steps: 24, steps per second: 930, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.542 [0.000, 1.000], mean observation: -0.092 [-1.516, 0.644], mean_best_reward: --
 27931/100000: episode: 853, duration: 0.027s, episode steps: 22, steps per second: 807, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.089 [-0.583, 1.198], mean_best_reward: --
 27976/100000: episode: 854, duration: 0.034s, episode steps: 45, steps per second: 1309, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.092 [-1.255, 0.818], mean_best_reward: --
 28024/100

 29649/100000: episode: 893, duration: 0.037s, episode steps: 40, steps per second: 1081, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.074 [-1.252, 0.588], mean_best_reward: --
 29684/100000: episode: 894, duration: 0.022s, episode steps: 35, steps per second: 1603, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.111 [-1.165, 0.390], mean_best_reward: --
 29768/100000: episode: 895, duration: 0.052s, episode steps: 84, steps per second: 1606, episode reward: 84.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.195 [-1.653, 0.844], mean_best_reward: --
 29790/100000: episode: 896, duration: 0.015s, episode steps: 22, steps per second: 1510, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.103 [-0.397, 1.061], mean_best_reward: --
 29815/100000

<keras.callbacks.History at 0x7f3b54052490>