In [1]:
from __future__ import division
import argparse

from PIL import Image
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import Adam
import keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

Using TensorFlow backend.


In [2]:
import matplotlib.pyplot as plt

import sys

from gym_unity.envs import UnityEnv

%matplotlib inline

# Show which interpreter this kernel is running on.
print("Python version:")
print(sys.version)

# The ML-Agents Toolkit requires Python 3, so stop right away on anything older.
if sys.version_info < (3,):
    raise Exception("ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3")


Python version:
3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 14:01:38) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [3]:
class BallVecProcessor(Processor):
    """Processor that adapts the DQN agent's discrete outputs to the 3DBall environment.

    The agent chooses one of 42 discrete action indices, whereas the
    environment takes a two-component continuous action. This processor
    translates between the two, reduces the environment's info dict to a
    rewards entry, and clips rewards to [-1, 1].
    """

    def process_action(self, action):
        """Convert a discrete action index into a two-component action vector.

        Each index drives exactly one component and leaves the other at zero:
        indices 0..20 sweep the second component from -1.0 to 1.0 in steps of
        0.1, and indices 21..41 sweep the first component over the same range.

        # Arguments
            action: Discrete action index, expected to lie in 0..41.

        # Returns
            list: `[first, second]`, the two action components.
        """
        # 0.0 for indices 0..20 and 1.0 for indices 21..41; used as an on/off mask.
        drives_first = np.floor(action / 21)
        # Indices 21..41 -> -1.0..1.0 (masked to zero for indices 0..20).
        first = drives_first * ((action - 20) / 10 - 1.1)
        # Indices 0..20 -> -1.0..1.0 (masked to zero for indices 21..41).
        second = (1 - drives_first) * ((action - 20) / 10 + 1)
        return [first, second]

    def process_info(self, info):
        """Processes the info as obtained from the environment for use in an agent and
        returns it.

        Only the `rewards` attribute of the second entry's value is kept; it is
        stored under the key `1`.

        # Arguments
            info (dict): An info as obtained by the environment. It must hold
                exactly two entries, and the value of the second entry must
                expose a `rewards` attribute.

        # Returns
            dict: `{1: rewards}` built from the second entry of `info`.

        # Raises
            ValueError: If `info` does not contain exactly two entries.
        """
        entries = list(info.items())
        if len(entries) != 2:
            raise ValueError(
                "process_info expects an info dict with exactly 2 entries, got %d"
                % len(entries))
        # NOTE(review): presumably the second entry is the Unity brain info
        # supplied by gym_unity -- confirm against the installed version.
        _, reward_source = entries[1]
        return {1: reward_source.rewards}

    def process_reward(self, reward):
        """Clip the reward to the range [-1, 1].

        # Arguments
            reward: Reward as obtained from the environment.

        # Returns
            The reward limited to the interval [-1.0, 1.0].
        """
        return np.clip(reward, -1., 1.)
    

In [4]:
# Size of the discrete action set exposed to the agent: two action components
# with 21 levels each (-1.0 to 1.0 in steps of 0.1), one component per action.
nb_actions = 42

# Launch the Unity environment binary and wrap it as a gym environment that
# uses vector (non-visual) observations.
env_name = "mlagents/envs/3DBall_128"
env = UnityEnv(env_name, use_visual=False, worker_id=0)
print(env)

INFO:mlagents.envs:
'Ball3DAcademy' started successfully!
Unity Academy name: Ball3DAcademy
        Number of Brains: 1
        Number of External Brains : 1
        Reset Parameters :
		
Unity brain name: Ball3DBrain
        Number of Visual Observations (per agent): 1
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): [2]
        Vector Action descriptions: , 
INFO:gym_unity:1 agents within environment.


<UnityEnv instance>


In [5]:
# Next, we build a very simple model: the single-step observation window is
# flattened, passed through two hidden layers of 16 ReLU units, and mapped to
# one linear output (Q-value) per discrete action.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                144       
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 42)                714       
_________________________________________________________________
activation_3 (Activation)    (None, 42)                0         
Total para

In [6]:
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
processor = BallVecProcessor()
# window_length=1 matches the leading (1,) in the model's input shape.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
# Pass nb_actions instead of repeating the literal so the agent's action count
# can never drift from the size of the model's output layer.
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1000,
               target_model_update=1e-2, policy=policy, processor=processor)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [7]:
# Time to learn something! Rendering is switched off (visualize=False) because
# visualizing slows training down quite a lot; verbose=2 logs one line per
# episode. Training can be aborted prematurely with Ctrl + C.
dqn.fit(
    env,
    nb_steps=100000,
    visualize=False,
    verbose=2,
)

Training for 100000 steps ...
    20/100000: episode: 1, duration: 0.545s, episode steps: 20, steps per second: 37, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.055 [-1.000, 1.000], mean observation: -0.313 [-7.848, 4.000], loss: --, mean_absolute_error: --, mean_q: --
    43/100000: episode: 2, duration: 0.236s, episode steps: 23, steps per second: 97, episode reward: 1.200, mean reward: 0.052 [-1.000, 0.100], mean action: 0.048 [-1.000, 0.700], mean observation: -0.256 [-7.848, 4.000], loss: --, mean_absolute_error: --, mean_q: --
    60/100000: episode: 3, duration: 0.200s, episode steps: 17, steps per second: 85, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: -0.071 [-0.900, 0.800], mean observation: -0.655 [-7.848, 4.000], loss: --, mean_absolute_error: --, mean_q: --
    83/100000: episode: 4, duration: 0.261s, episode steps: 23, steps per second: 88, episode reward: 1.200, mean reward: 0.052 [-1.000, 0.100], mean action: -0.

   710/100000: episode: 32, duration: 0.213s, episode steps: 21, steps per second: 99, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: 0.086 [-0.900, 1.000], mean observation: -0.288 [-6.867, 4.000], loss: --, mean_absolute_error: --, mean_q: --
   737/100000: episode: 33, duration: 0.295s, episode steps: 27, steps per second: 91, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: 0.026 [-1.000, 0.900], mean observation: -0.428 [-7.848, 4.000], loss: --, mean_absolute_error: --, mean_q: --
   755/100000: episode: 34, duration: 0.172s, episode steps: 18, steps per second: 104, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: 0.017 [-0.900, 1.000], mean observation: -0.411 [-7.848, 4.000], loss: --, mean_absolute_error: --, mean_q: --
   777/100000: episode: 35, duration: 0.225s, episode steps: 22, steps per second: 98, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: 0.014 [-1.000, 1.000], mean ob

  1455/100000: episode: 63, duration: 0.274s, episode steps: 21, steps per second: 77, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: 0.074 [-0.900, 1.000], mean observation: -0.245 [-6.867, 4.000], loss: 0.128503, mean_absolute_error: 1.179777, mean_q: 1.618593
  1477/100000: episode: 64, duration: 0.239s, episode steps: 22, steps per second: 92, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: 0.032 [-1.000, 1.000], mean observation: -0.274 [-7.848, 4.000], loss: 0.136981, mean_absolute_error: 1.262225, mean_q: 1.719616
  1496/100000: episode: 65, duration: 0.210s, episode steps: 19, steps per second: 90, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: 0.037 [-0.900, 1.000], mean observation: 0.344 [-7.848, 4.000], loss: 0.123840, mean_absolute_error: 1.310185, mean_q: 1.775242
  1516/100000: episode: 66, duration: 0.254s, episode steps: 20, steps per second: 79, episode reward: 0.900, mean reward: 0.045 [-1.000,

  2089/100000: episode: 92, duration: 0.244s, episode steps: 19, steps per second: 78, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: 0.071 [-1.000, 1.000], mean observation: -0.282 [-7.848, 4.000], loss: 0.245230, mean_absolute_error: 2.774495, mean_q: 3.376879
  2107/100000: episode: 93, duration: 0.197s, episode steps: 18, steps per second: 91, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: -0.011 [-0.900, 0.800], mean observation: 0.061 [-7.848, 4.000], loss: 0.250006, mean_absolute_error: 2.801099, mean_q: 3.406243
  2125/100000: episode: 94, duration: 0.213s, episode steps: 18, steps per second: 85, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: 0.089 [-1.000, 0.900], mean observation: 0.134 [-7.848, 4.000], loss: 0.301995, mean_absolute_error: 2.804128, mean_q: 3.436416
  2144/100000: episode: 95, duration: 0.237s, episode steps: 19, steps per second: 80, episode reward: 0.800, mean reward: 0.042 [-1.000,

  2731/100000: episode: 121, duration: 0.459s, episode steps: 27, steps per second: 59, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: -0.078 [-1.000, 0.900], mean observation: -0.290 [-7.848, 4.000], loss: 0.372216, mean_absolute_error: 4.362435, mean_q: 5.118945
  2751/100000: episode: 122, duration: 0.306s, episode steps: 20, steps per second: 65, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.108 [-1.000, 0.800], mean observation: -0.374 [-7.848, 4.000], loss: 0.371143, mean_absolute_error: 4.453413, mean_q: 5.227214
  2775/100000: episode: 123, duration: 0.447s, episode steps: 24, steps per second: 54, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: 0.006 [-1.000, 0.900], mean observation: 0.173 [-7.848, 4.000], loss: 0.319024, mean_absolute_error: 4.324211, mean_q: 5.050910
  2793/100000: episode: 124, duration: 0.388s, episode steps: 18, steps per second: 46, episode reward: 0.700, mean reward: 0.039 [-

  3386/100000: episode: 150, duration: 0.312s, episode steps: 26, steps per second: 83, episode reward: 1.500, mean reward: 0.058 [-1.000, 0.100], mean action: -0.008 [-1.000, 1.000], mean observation: 0.044 [-6.867, 4.000], loss: 0.314006, mean_absolute_error: 5.949769, mean_q: 6.726308
  3409/100000: episode: 151, duration: 0.283s, episode steps: 23, steps per second: 81, episode reward: 1.200, mean reward: 0.052 [-1.000, 0.100], mean action: -0.072 [-1.000, 0.900], mean observation: -0.043 [-6.867, 4.000], loss: 0.384753, mean_absolute_error: 6.078969, mean_q: 6.856194
  3434/100000: episode: 152, duration: 0.311s, episode steps: 25, steps per second: 80, episode reward: 1.400, mean reward: 0.056 [-1.000, 0.100], mean action: 0.012 [-1.000, 1.000], mean observation: 0.237 [-7.848, 4.000], loss: 0.368386, mean_absolute_error: 5.902097, mean_q: 6.705259
  3449/100000: episode: 153, duration: 0.158s, episode steps: 15, steps per second: 95, episode reward: 0.400, mean reward: 0.027 [-1

  4055/100000: episode: 179, duration: 0.250s, episode steps: 20, steps per second: 80, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.070 [-1.000, 1.000], mean observation: -0.007 [-7.848, 4.000], loss: 0.291306, mean_absolute_error: 6.474248, mean_q: 7.197957
  4075/100000: episode: 180, duration: 0.250s, episode steps: 20, steps per second: 80, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: 0.040 [-1.000, 1.000], mean observation: -0.398 [-7.848, 4.000], loss: 0.238660, mean_absolute_error: 6.819362, mean_q: 7.515215
  4101/100000: episode: 181, duration: 0.400s, episode steps: 26, steps per second: 65, episode reward: 1.500, mean reward: 0.058 [-1.000, 0.100], mean action: -0.069 [-1.000, 0.900], mean observation: -0.372 [-7.848, 4.000], loss: 0.239882, mean_absolute_error: 6.500003, mean_q: 7.183958
  4121/100000: episode: 182, duration: 0.232s, episode steps: 20, steps per second: 86, episode reward: 0.900, mean reward: 0.045 [

  4697/100000: episode: 208, duration: 0.344s, episode steps: 19, steps per second: 55, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: -0.082 [-0.900, 1.000], mean observation: -0.224 [-7.848, 4.000], loss: 0.286980, mean_absolute_error: 6.614457, mean_q: 7.354558
  4715/100000: episode: 209, duration: 0.215s, episode steps: 18, steps per second: 84, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: -0.008 [-0.900, 1.000], mean observation: -0.310 [-7.848, 4.000], loss: 0.330469, mean_absolute_error: 6.691556, mean_q: 7.498895
  4733/100000: episode: 210, duration: 0.210s, episode steps: 18, steps per second: 86, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: 0.019 [-1.000, 1.000], mean observation: -0.282 [-7.848, 4.000], loss: 0.338123, mean_absolute_error: 6.523038, mean_q: 7.359344
  4766/100000: episode: 211, duration: 0.347s, episode steps: 33, steps per second: 95, episode reward: 2.200, mean reward: 0.067 [

  5383/100000: episode: 237, duration: 0.229s, episode steps: 17, steps per second: 74, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: -0.032 [-0.900, 1.000], mean observation: 0.338 [-7.848, 4.000], loss: 0.320723, mean_absolute_error: 6.236575, mean_q: 7.043702
  5431/100000: episode: 238, duration: 0.557s, episode steps: 48, steps per second: 86, episode reward: 3.700, mean reward: 0.077 [-1.000, 0.100], mean action: -0.010 [-1.000, 1.000], mean observation: 0.064 [-7.848, 4.000], loss: 0.352504, mean_absolute_error: 6.288002, mean_q: 7.064980
  5460/100000: episode: 239, duration: 0.397s, episode steps: 29, steps per second: 73, episode reward: 1.800, mean reward: 0.062 [-1.000, 0.100], mean action: 0.003 [-0.900, 1.000], mean observation: 0.344 [-7.848, 4.000], loss: 0.384573, mean_absolute_error: 6.246907, mean_q: 7.014368
  5486/100000: episode: 240, duration: 0.274s, episode steps: 26, steps per second: 95, episode reward: 1.500, mean reward: 0.058 [-1.

  6103/100000: episode: 266, duration: 0.333s, episode steps: 26, steps per second: 78, episode reward: 1.500, mean reward: 0.058 [-1.000, 0.100], mean action: -0.015 [-1.000, 1.000], mean observation: -0.133 [-7.848, 4.000], loss: 0.325188, mean_absolute_error: 6.084696, mean_q: 6.884057
  6127/100000: episode: 267, duration: 0.327s, episode steps: 24, steps per second: 73, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: -0.042 [-1.000, 1.000], mean observation: -0.394 [-6.867, 4.000], loss: 0.379278, mean_absolute_error: 5.841502, mean_q: 6.615740
  6144/100000: episode: 268, duration: 0.232s, episode steps: 17, steps per second: 73, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: -0.179 [-1.000, 0.500], mean observation: 0.067 [-7.848, 4.000], loss: 0.342456, mean_absolute_error: 6.013781, mean_q: 6.770060
  6168/100000: episode: 269, duration: 0.294s, episode steps: 24, steps per second: 82, episode reward: 1.300, mean reward: 0.054 [

  6845/100000: episode: 295, duration: 0.256s, episode steps: 19, steps per second: 74, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: 0.026 [-1.000, 0.900], mean observation: 0.169 [-7.848, 4.000], loss: 0.233917, mean_absolute_error: 6.556154, mean_q: 7.361237
  6866/100000: episode: 296, duration: 0.252s, episode steps: 21, steps per second: 83, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: -0.000 [-0.900, 1.000], mean observation: 0.230 [-7.848, 4.000], loss: 0.352455, mean_absolute_error: 6.542249, mean_q: 7.393505
  6891/100000: episode: 297, duration: 0.294s, episode steps: 25, steps per second: 85, episode reward: 1.400, mean reward: 0.056 [-1.000, 0.100], mean action: 0.098 [-1.000, 1.000], mean observation: -0.152 [-6.867, 4.000], loss: 0.333043, mean_absolute_error: 6.532847, mean_q: 7.408377
  6910/100000: episode: 298, duration: 0.182s, episode steps: 19, steps per second: 104, episode reward: 0.800, mean reward: 0.042 [-1

  7610/100000: episode: 325, duration: 0.460s, episode steps: 41, steps per second: 89, episode reward: 3.000, mean reward: 0.073 [-1.000, 0.100], mean action: -0.059 [-1.000, 1.000], mean observation: 0.076 [-7.848, 4.000], loss: 0.466255, mean_absolute_error: 6.788521, mean_q: 7.773921
  7632/100000: episode: 326, duration: 0.268s, episode steps: 22, steps per second: 82, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: -0.034 [-0.800, 0.800], mean observation: -0.327 [-7.848, 4.000], loss: 0.557780, mean_absolute_error: 7.046698, mean_q: 7.966941
  7658/100000: episode: 327, duration: 0.321s, episode steps: 26, steps per second: 81, episode reward: 1.500, mean reward: 0.058 [-1.000, 0.100], mean action: -0.094 [-1.000, 1.000], mean observation: -0.448 [-7.848, 4.000], loss: 0.447961, mean_absolute_error: 6.886447, mean_q: 7.790299
  7679/100000: episode: 328, duration: 0.240s, episode steps: 21, steps per second: 88, episode reward: 1.000, mean reward: 0.048 [

  8419/100000: episode: 354, duration: 0.483s, episode steps: 43, steps per second: 89, episode reward: 3.200, mean reward: 0.074 [-1.000, 0.100], mean action: -0.045 [-1.000, 1.000], mean observation: 0.123 [-7.848, 4.000], loss: 0.619190, mean_absolute_error: 8.203462, mean_q: 9.294501
  8441/100000: episode: 355, duration: 0.256s, episode steps: 22, steps per second: 86, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: -0.011 [-0.900, 0.800], mean observation: -0.232 [-6.867, 4.000], loss: 0.636292, mean_absolute_error: 8.250793, mean_q: 9.308115
  8462/100000: episode: 356, duration: 0.231s, episode steps: 21, steps per second: 91, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: 0.031 [-1.000, 0.800], mean observation: -0.578 [-7.848, 4.000], loss: 0.982929, mean_absolute_error: 8.113553, mean_q: 9.208385
  8495/100000: episode: 357, duration: 0.347s, episode steps: 33, steps per second: 95, episode reward: 2.200, mean reward: 0.067 [-

  9210/100000: episode: 383, duration: 0.243s, episode steps: 21, steps per second: 87, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: -0.026 [-1.000, 1.000], mean observation: 0.100 [-7.848, 4.000], loss: 0.759447, mean_absolute_error: 8.812023, mean_q: 9.880856
  9226/100000: episode: 384, duration: 0.203s, episode steps: 16, steps per second: 79, episode reward: 0.500, mean reward: 0.031 [-1.000, 0.100], mean action: 0.028 [-0.800, 0.900], mean observation: 0.318 [-8.829, 4.000], loss: 0.650679, mean_absolute_error: 9.127918, mean_q: 10.169336
  9243/100000: episode: 385, duration: 0.214s, episode steps: 17, steps per second: 80, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: -0.088 [-0.900, 0.700], mean observation: 0.475 [-7.848, 4.000], loss: 0.666234, mean_absolute_error: 8.974195, mean_q: 10.047656
  9266/100000: episode: 386, duration: 0.278s, episode steps: 23, steps per second: 83, episode reward: 1.200, mean reward: 0.052 [-

  9969/100000: episode: 412, duration: 0.245s, episode steps: 19, steps per second: 77, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: 0.089 [-0.700, 0.900], mean observation: 0.313 [-7.848, 4.000], loss: 0.881427, mean_absolute_error: 9.691057, mean_q: 10.955565
  9993/100000: episode: 413, duration: 0.255s, episode steps: 24, steps per second: 94, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: 0.025 [-0.800, 1.000], mean observation: -0.302 [-7.848, 4.000], loss: 0.598931, mean_absolute_error: 10.075060, mean_q: 11.328259
 10015/100000: episode: 414, duration: 0.294s, episode steps: 22, steps per second: 75, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: -0.009 [-1.000, 1.000], mean observation: 0.483 [-6.867, 4.000], loss: 0.738967, mean_absolute_error: 9.972093, mean_q: 11.238900
 10045/100000: episode: 415, duration: 0.346s, episode steps: 30, steps per second: 87, episode reward: 1.900, mean reward: 0.063 

 10879/100000: episode: 441, duration: 0.515s, episode steps: 46, steps per second: 89, episode reward: 3.500, mean reward: 0.076 [-1.000, 0.100], mean action: 0.005 [-1.000, 0.900], mean observation: 0.053 [-7.848, 4.000], loss: 0.807442, mean_absolute_error: 9.835869, mean_q: 11.089683
 10906/100000: episode: 442, duration: 0.292s, episode steps: 27, steps per second: 93, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: -0.013 [-1.000, 0.900], mean observation: 0.230 [-7.848, 4.000], loss: 0.885691, mean_absolute_error: 10.113964, mean_q: 11.334802
 10943/100000: episode: 443, duration: 0.433s, episode steps: 37, steps per second: 85, episode reward: 2.600, mean reward: 0.070 [-1.000, 0.100], mean action: -0.035 [-1.000, 1.000], mean observation: -0.326 [-7.848, 4.000], loss: 0.679492, mean_absolute_error: 10.562932, mean_q: 11.728830
 10963/100000: episode: 444, duration: 0.232s, episode steps: 20, steps per second: 86, episode reward: 0.900, mean reward: 0.04

 11724/100000: episode: 470, duration: 0.695s, episode steps: 55, steps per second: 79, episode reward: 4.400, mean reward: 0.080 [-1.000, 0.100], mean action: -0.002 [-1.000, 1.000], mean observation: 0.224 [-7.848, 4.000], loss: 0.844251, mean_absolute_error: 10.353232, mean_q: 11.616868
 11743/100000: episode: 471, duration: 0.208s, episode steps: 19, steps per second: 91, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: -0.005 [-1.000, 1.000], mean observation: -0.314 [-7.848, 4.000], loss: 0.628945, mean_absolute_error: 10.837576, mean_q: 12.098731
 11782/100000: episode: 472, duration: 0.468s, episode steps: 39, steps per second: 83, episode reward: 2.800, mean reward: 0.072 [-1.000, 0.100], mean action: 0.018 [-1.000, 1.000], mean observation: 0.028 [-7.848, 4.000], loss: 0.778363, mean_absolute_error: 10.510346, mean_q: 11.755119
 11834/100000: episode: 473, duration: 0.606s, episode steps: 52, steps per second: 86, episode reward: 4.100, mean reward: 0.0

 12657/100000: episode: 499, duration: 0.274s, episode steps: 20, steps per second: 73, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: 0.030 [-1.000, 0.900], mean observation: -0.402 [-7.848, 4.000], loss: 0.912451, mean_absolute_error: 11.147917, mean_q: 12.440282
 12728/100000: episode: 500, duration: 0.739s, episode steps: 71, steps per second: 96, episode reward: 6.000, mean reward: 0.085 [-1.000, 0.100], mean action: -0.051 [-1.000, 1.000], mean observation: 0.061 [-7.848, 4.000], loss: 0.832443, mean_absolute_error: 11.402916, mean_q: 12.692650
 12748/100000: episode: 501, duration: 0.237s, episode steps: 20, steps per second: 84, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.038 [-1.000, 0.800], mean observation: -0.453 [-6.867, 4.000], loss: 0.795263, mean_absolute_error: 11.113611, mean_q: 12.463690
 12773/100000: episode: 502, duration: 0.289s, episode steps: 25, steps per second: 86, episode reward: 1.400, mean reward: 0.

 13748/100000: episode: 528, duration: 0.574s, episode steps: 47, steps per second: 82, episode reward: 3.600, mean reward: 0.077 [-1.000, 0.100], mean action: 0.029 [-1.000, 1.000], mean observation: -0.160 [-7.848, 4.000], loss: 0.785299, mean_absolute_error: 12.113111, mean_q: 13.568513
 13773/100000: episode: 529, duration: 0.315s, episode steps: 25, steps per second: 79, episode reward: 1.400, mean reward: 0.056 [-1.000, 0.100], mean action: -0.092 [-1.000, 1.000], mean observation: -0.184 [-6.867, 4.000], loss: 1.057480, mean_absolute_error: 12.383994, mean_q: 13.816556
 13823/100000: episode: 530, duration: 0.616s, episode steps: 50, steps per second: 81, episode reward: 3.900, mean reward: 0.078 [-1.000, 0.100], mean action: -0.048 [-1.000, 1.000], mean observation: 0.248 [-7.848, 4.000], loss: 0.892271, mean_absolute_error: 12.345990, mean_q: 13.788200
 13859/100000: episode: 531, duration: 0.406s, episode steps: 36, steps per second: 89, episode reward: 2.500, mean reward: 0.

 14658/100000: episode: 557, duration: 0.371s, episode steps: 33, steps per second: 89, episode reward: 2.200, mean reward: 0.067 [-1.000, 0.100], mean action: -0.027 [-1.000, 1.000], mean observation: 0.200 [-7.848, 4.000], loss: 0.948235, mean_absolute_error: 13.801324, mean_q: 15.423445
 14683/100000: episode: 558, duration: 0.322s, episode steps: 25, steps per second: 78, episode reward: 1.400, mean reward: 0.056 [-1.000, 0.100], mean action: -0.012 [-1.000, 1.000], mean observation: 0.421 [-7.848, 4.000], loss: 0.975089, mean_absolute_error: 13.287296, mean_q: 14.865048
 14725/100000: episode: 559, duration: 0.508s, episode steps: 42, steps per second: 83, episode reward: 3.100, mean reward: 0.074 [-1.000, 0.100], mean action: 0.006 [-0.900, 1.000], mean observation: -0.023 [-7.848, 4.000], loss: 1.330501, mean_absolute_error: 13.577456, mean_q: 15.153834
 14745/100000: episode: 560, duration: 0.234s, episode steps: 20, steps per second: 86, episode reward: 0.900, mean reward: 0.0

 15785/100000: episode: 586, duration: 0.658s, episode steps: 57, steps per second: 87, episode reward: 4.600, mean reward: 0.081 [-1.000, 0.100], mean action: -0.026 [-1.000, 1.000], mean observation: -0.021 [-7.848, 4.000], loss: 1.295850, mean_absolute_error: 15.589671, mean_q: 17.384918
 15814/100000: episode: 587, duration: 0.356s, episode steps: 29, steps per second: 82, episode reward: 1.800, mean reward: 0.062 [-1.000, 0.100], mean action: -0.009 [-1.000, 1.000], mean observation: 0.043 [-7.848, 4.000], loss: 1.591584, mean_absolute_error: 15.500573, mean_q: 17.407429
 15834/100000: episode: 588, duration: 0.250s, episode steps: 20, steps per second: 80, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.035 [-0.900, 1.000], mean observation: 0.127 [-7.848, 4.000], loss: 1.450849, mean_absolute_error: 15.782959, mean_q: 17.722410
 15886/100000: episode: 589, duration: 0.576s, episode steps: 52, steps per second: 90, episode reward: 4.100, mean reward: 0.

 17066/100000: episode: 615, duration: 1.026s, episode steps: 89, steps per second: 87, episode reward: 7.800, mean reward: 0.088 [-1.000, 0.100], mean action: -0.027 [-1.000, 1.000], mean observation: 0.050 [-7.848, 4.000], loss: 1.647439, mean_absolute_error: 17.810278, mean_q: 19.894304
 17093/100000: episode: 616, duration: 0.320s, episode steps: 27, steps per second: 84, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: 0.037 [-1.000, 1.000], mean observation: -0.213 [-6.867, 4.000], loss: 1.794374, mean_absolute_error: 17.827778, mean_q: 19.928944
 17167/100000: episode: 617, duration: 0.883s, episode steps: 74, steps per second: 84, episode reward: 6.300, mean reward: 0.085 [-1.000, 0.100], mean action: 0.003 [-1.000, 1.000], mean observation: -0.106 [-7.848, 4.000], loss: 1.718620, mean_absolute_error: 18.029310, mean_q: 20.081604
 17269/100000: episode: 618, duration: 1.166s, episode steps: 102, steps per second: 87, episode reward: 9.100, mean reward: 0.

 18435/100000: episode: 644, duration: 0.235s, episode steps: 20, steps per second: 85, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: 0.025 [-0.800, 1.000], mean observation: 0.335 [-6.867, 4.000], loss: 1.959140, mean_absolute_error: 20.271912, mean_q: 22.696558
 18492/100000: episode: 645, duration: 0.647s, episode steps: 57, steps per second: 88, episode reward: 4.600, mean reward: 0.081 [-1.000, 0.100], mean action: -0.047 [-1.000, 1.000], mean observation: -0.018 [-7.848, 4.000], loss: 1.996515, mean_absolute_error: 21.005095, mean_q: 23.464884
 18586/100000: episode: 646, duration: 1.059s, episode steps: 94, steps per second: 89, episode reward: 8.300, mean reward: 0.088 [-1.000, 0.100], mean action: 0.025 [-1.000, 1.000], mean observation: 0.067 [-7.848, 4.000], loss: 2.473529, mean_absolute_error: 21.491896, mean_q: 24.052435
 18622/100000: episode: 647, duration: 0.429s, episode steps: 36, steps per second: 84, episode reward: 2.500, mean reward: 0.06

 21169/100000: episode: 673, duration: 0.260s, episode steps: 18, steps per second: 69, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: 0.092 [-0.900, 0.900], mean observation: -0.264 [-7.848, 4.000], loss: 3.424370, mean_absolute_error: 28.444271, mean_q: 31.720207
 21190/100000: episode: 674, duration: 0.246s, episode steps: 21, steps per second: 85, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: 0.048 [-1.000, 1.000], mean observation: 0.092 [-6.867, 4.000], loss: 3.018991, mean_absolute_error: 28.599787, mean_q: 31.898844
 21214/100000: episode: 675, duration: 0.287s, episode steps: 24, steps per second: 84, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: 0.137 [-0.900, 1.000], mean observation: 0.158 [-7.848, 4.000], loss: 2.907934, mean_absolute_error: 28.042849, mean_q: 31.381021
 22215/100000: episode: 676, duration: 11.719s, episode steps: 1001, steps per second: 85, episode reward: 100.100, mean reward: 

 44700/100000: episode: 701, duration: 11.092s, episode steps: 1001, steps per second: 90, episode reward: 100.100, mean reward: 0.100 [0.100, 0.100], mean action: -0.001 [-1.000, 1.000], mean observation: 0.188 [-7.848, 4.000], loss: 19.995596, mean_absolute_error: 107.424484, mean_q: 114.991043
 45701/100000: episode: 702, duration: 11.522s, episode steps: 1001, steps per second: 87, episode reward: 100.100, mean reward: 0.100 [0.100, 0.100], mean action: 0.001 [-1.000, 1.000], mean observation: 0.188 [-7.848, 4.000], loss: 16.973871, mean_absolute_error: 109.072617, mean_q: 116.654312
 46702/100000: episode: 703, duration: 11.819s, episode steps: 1001, steps per second: 85, episode reward: 100.100, mean reward: 0.100 [0.100, 0.100], mean action: 0.001 [-1.000, 1.000], mean observation: 0.186 [-7.848, 4.000], loss: 19.986319, mean_absolute_error: 110.018112, mean_q: 117.658089
 47703/100000: episode: 704, duration: 11.233s, episode steps: 1001, steps per second: 89, episode reward: 1

 62168/100000: episode: 729, duration: 0.724s, episode steps: 59, steps per second: 82, episode reward: 4.800, mean reward: 0.081 [-1.000, 0.100], mean action: 0.065 [-1.000, 1.000], mean observation: 0.078 [-7.848, 4.000], loss: 18.474075, mean_absolute_error: 122.429062, mean_q: 129.112915
 62202/100000: episode: 730, duration: 0.428s, episode steps: 34, steps per second: 79, episode reward: 2.300, mean reward: 0.068 [-1.000, 0.100], mean action: 0.250 [-0.900, 1.000], mean observation: -0.043 [-6.867, 4.000], loss: 16.996952, mean_absolute_error: 121.249725, mean_q: 128.016266
 62333/100000: episode: 731, duration: 1.454s, episode steps: 131, steps per second: 90, episode reward: 12.000, mean reward: 0.092 [-1.000, 0.100], mean action: 0.034 [-1.000, 1.000], mean observation: 0.170 [-6.867, 4.000], loss: 14.893246, mean_absolute_error: 121.722786, mean_q: 128.457687
 62515/100000: episode: 732, duration: 2.138s, episode steps: 182, steps per second: 85, episode reward: 17.100, mean 

 68217/100000: episode: 757, duration: 1.633s, episode steps: 140, steps per second: 86, episode reward: 12.900, mean reward: 0.092 [-1.000, 0.100], mean action: -0.034 [-1.000, 1.000], mean observation: 0.180 [-7.848, 4.000], loss: 13.304839, mean_absolute_error: 114.796333, mean_q: 120.715683
 68252/100000: episode: 758, duration: 0.439s, episode steps: 35, steps per second: 80, episode reward: 2.400, mean reward: 0.069 [-1.000, 0.100], mean action: -0.176 [-1.000, 1.000], mean observation: 0.357 [-6.867, 4.000], loss: 11.894973, mean_absolute_error: 114.671402, mean_q: 120.583649
 68759/100000: episode: 759, duration: 5.916s, episode steps: 507, steps per second: 86, episode reward: 49.600, mean reward: 0.098 [-1.000, 0.100], mean action: 0.019 [-1.000, 1.000], mean observation: 0.168 [-7.848, 4.000], loss: 11.237815, mean_absolute_error: 114.238602, mean_q: 120.023537
 69259/100000: episode: 760, duration: 5.616s, episode steps: 500, steps per second: 89, episode reward: 48.900, me

 76703/100000: episode: 785, duration: 0.747s, episode steps: 65, steps per second: 87, episode reward: 5.400, mean reward: 0.083 [-1.000, 0.100], mean action: 0.093 [-1.000, 1.000], mean observation: 0.077 [-6.946, 4.000], loss: 5.493799, mean_absolute_error: 90.108566, mean_q: 94.534683
 76925/100000: episode: 786, duration: 2.546s, episode steps: 222, steps per second: 87, episode reward: 21.100, mean reward: 0.095 [-1.000, 0.100], mean action: 0.006 [-1.000, 1.000], mean observation: 0.203 [-6.867, 4.000], loss: 5.928614, mean_absolute_error: 89.686546, mean_q: 94.039124
 77505/100000: episode: 787, duration: 6.762s, episode steps: 580, steps per second: 86, episode reward: 56.900, mean reward: 0.098 [-1.000, 0.100], mean action: 0.000 [-1.000, 1.000], mean observation: 0.145 [-6.867, 4.000], loss: 8.002007, mean_absolute_error: 88.640610, mean_q: 93.060844
 77535/100000: episode: 788, duration: 0.381s, episode steps: 30, steps per second: 79, episode reward: 1.900, mean reward: 0.

 80881/100000: episode: 814, duration: 0.493s, episode steps: 24, steps per second: 49, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: 0.060 [-1.000, 1.000], mean observation: -0.417 [-6.867, 4.000], loss: 7.507509, mean_absolute_error: 81.198334, mean_q: 85.241707
 80908/100000: episode: 815, duration: 0.387s, episode steps: 27, steps per second: 70, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: -0.063 [-1.000, 1.000], mean observation: -0.198 [-6.867, 4.000], loss: 7.777591, mean_absolute_error: 80.926270, mean_q: 84.894707
 81165/100000: episode: 816, duration: 3.184s, episode steps: 257, steps per second: 81, episode reward: 24.600, mean reward: 0.096 [-1.000, 0.100], mean action: -0.009 [-1.000, 1.000], mean observation: 0.114 [-6.867, 4.000], loss: 7.158630, mean_absolute_error: 80.639664, mean_q: 84.769371
 81232/100000: episode: 817, duration: 0.893s, episode steps: 67, steps per second: 75, episode reward: 5.600, mean reward: 

 82994/100000: episode: 843, duration: 0.333s, episode steps: 32, steps per second: 96, episode reward: 2.100, mean reward: 0.066 [-1.000, 0.100], mean action: -0.183 [-1.000, 0.900], mean observation: 0.275 [-6.867, 4.000], loss: 4.316245, mean_absolute_error: 76.868958, mean_q: 81.026703
 83115/100000: episode: 844, duration: 1.370s, episode steps: 121, steps per second: 88, episode reward: 11.000, mean reward: 0.091 [-1.000, 0.100], mean action: 0.030 [-1.000, 1.000], mean observation: 0.056 [-7.848, 4.000], loss: 7.702164, mean_absolute_error: 76.815720, mean_q: 80.844482
 83141/100000: episode: 845, duration: 0.304s, episode steps: 26, steps per second: 86, episode reward: 1.500, mean reward: 0.058 [-1.000, 0.100], mean action: -0.065 [-1.000, 1.000], mean observation: -0.550 [-6.867, 4.000], loss: 8.619825, mean_absolute_error: 76.150185, mean_q: 80.220253
 83309/100000: episode: 846, duration: 1.935s, episode steps: 168, steps per second: 87, episode reward: 15.700, mean reward:

 88065/100000: episode: 872, duration: 5.290s, episode steps: 464, steps per second: 88, episode reward: 45.300, mean reward: 0.098 [-1.000, 0.100], mean action: -0.008 [-1.000, 1.000], mean observation: 0.188 [-6.867, 4.000], loss: 6.892467, mean_absolute_error: 72.531960, mean_q: 76.825165
 88135/100000: episode: 873, duration: 0.781s, episode steps: 70, steps per second: 90, episode reward: 5.900, mean reward: 0.084 [-1.000, 0.100], mean action: 0.044 [-1.000, 1.000], mean observation: 0.203 [-6.867, 4.000], loss: 9.304734, mean_absolute_error: 72.021996, mean_q: 76.263512
 88621/100000: episode: 874, duration: 5.299s, episode steps: 486, steps per second: 92, episode reward: 47.500, mean reward: 0.098 [-1.000, 0.100], mean action: -0.013 [-1.000, 1.000], mean observation: 0.146 [-7.848, 4.000], loss: 6.594003, mean_absolute_error: 72.318176, mean_q: 76.635223
 89135/100000: episode: 875, duration: 5.654s, episode steps: 514, steps per second: 91, episode reward: 50.300, mean reward

 93862/100000: episode: 901, duration: 0.807s, episode steps: 67, steps per second: 83, episode reward: 5.600, mean reward: 0.084 [-1.000, 0.100], mean action: -0.131 [-1.000, 1.000], mean observation: 0.113 [-6.867, 4.000], loss: 11.123363, mean_absolute_error: 71.133453, mean_q: 75.843323
 94121/100000: episode: 902, duration: 2.993s, episode steps: 259, steps per second: 87, episode reward: 24.800, mean reward: 0.096 [-1.000, 0.100], mean action: -0.018 [-1.000, 1.000], mean observation: 0.240 [-6.867, 4.000], loss: 7.026450, mean_absolute_error: 72.036125, mean_q: 76.875580
 94964/100000: episode: 903, duration: 9.783s, episode steps: 843, steps per second: 86, episode reward: 83.200, mean reward: 0.099 [-1.000, 0.100], mean action: -0.000 [-1.000, 1.000], mean observation: 0.182 [-7.848, 4.000], loss: 7.735035, mean_absolute_error: 72.468460, mean_q: 77.279922
 95263/100000: episode: 904, duration: 3.435s, episode steps: 299, steps per second: 87, episode reward: 28.800, mean rewa

<keras.callbacks.History at 0x10d743470>

In [8]:
ENV_NAME = '3DBall_128'
# Destination for the trained weights; the name embeds the environment so
# runs against different Unity binaries do not clobber each other.
WEIGHTS_PATH = 'dqn_{}_vec_weights.h5f'.format(ENV_NAME)
# Number of evaluation episodes. Each episode in the recorded run lasted 1001
# steps, so the full evaluation is long; lower this for a quick sanity check.
NB_TEST_EPISODES = 1000

# After training is done, we save the final weights (replacing any existing file).
dqn.save_weights(WEIGHTS_PATH, overwrite=True)

# Finally, evaluate our algorithm for NB_TEST_EPISODES episodes.
dqn.test(env, nb_episodes=NB_TEST_EPISODES, visualize=True)

Testing for 1000 episodes ...
Episode 1: reward: 100.100, steps: 1001
Episode 2: reward: 100.100, steps: 1001
Episode 3: reward: 100.100, steps: 1001
Episode 4: reward: 100.100, steps: 1001
Episode 5: reward: 100.100, steps: 1001
Episode 6: reward: 100.100, steps: 1001
Episode 7: reward: 100.100, steps: 1001
Episode 8: reward: 100.100, steps: 1001
Episode 9: reward: 100.100, steps: 1001
Episode 10: reward: 100.100, steps: 1001
Episode 11: reward: 100.100, steps: 1001
Episode 12: reward: 100.100, steps: 1001
Episode 13: reward: 100.100, steps: 1001
Episode 14: reward: 100.100, steps: 1001
Episode 15: reward: 100.100, steps: 1001
Episode 16: reward: 100.100, steps: 1001
Episode 17: reward: 100.100, steps: 1001
Episode 18: reward: 100.100, steps: 1001
Episode 19: reward: 100.100, steps: 1001
Episode 20: reward: 100.100, steps: 1001
Episode 21: reward: 100.100, steps: 1001
Episode 22: reward: 100.100, steps: 1001
Episode 23: reward: 100.100, steps: 1001
Episode 24: reward: 100.100, steps: 

KeyboardInterrupt: 