In [1]:
from __future__ import division
import argparse

from PIL import Image
import numpy as np
import gym

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute, Input, Concatenate
from keras.optimizers import Adam
import keras.backend as K

from rl.agents.ddpg import DDPGAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint
from rl.random import OrnsteinUhlenbeckProcess

Using TensorFlow backend.


In [2]:
import matplotlib.pyplot as plt
import sys
from gym_unity.envs import UnityEnv

%matplotlib inline

# Show which interpreter the kernel is running on.
print("\n".join(["Python version:", sys.version]))

# The ML-Agents Toolkit does not run on Python 2, so stop here on older interpreters.
if sys.version_info < (3,):
    raise Exception("ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3")


Python version:
3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 14:01:38) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [3]:
class BallVecProcessor(Processor):
    """keras-rl ``Processor`` used with the vector-observation 3DBall environment.

    Actions pass through untouched, the ``info`` dict returned by the
    environment is reduced to a single entry holding the step rewards, and
    rewards are clipped to ``[-1, 1]``.
    """

    def process_action(self, action):
        """Return ``action`` unchanged."""
        return action

    def process_info(self, info):
        """Reduce the environment's ``info`` dict to ``{1: <rewards>}``.

        Args:
            info: Mapping returned by the environment's ``step``. It must
                either contain a ``"brain_info"`` entry or consist of exactly
                two entries whose second value carries a ``rewards``
                attribute.

        Returns:
            A one-entry dict mapping the key ``1`` to that object's
            ``rewards`` attribute.

        Raises:
            ValueError: If ``info`` has no ``"brain_info"`` entry and does not
                contain exactly two entries.
        """
        # NOTE(review): gym_unity is expected to publish the step's BrainInfo
        # under "brain_info" -- confirm against the installed version. Looking
        # it up by key avoids depending on dict ordering and entry count.
        if "brain_info" in info:
            brain_info = info["brain_info"]
        else:
            # Fallback that mirrors the previous positional behaviour: the
            # object of interest is the value of the second of two entries.
            entries = list(info.values())
            if len(entries) != 2:
                raise ValueError(
                    "expected info to contain 'brain_info' or exactly two "
                    "entries, got {} entries".format(len(entries))
                )
            brain_info = entries[1]
        return {1: brain_info.rewards}

    def process_reward(self, reward):
        """Clip ``reward`` to the closed interval ``[-1, 1]``."""
        return np.clip(reward, -1., 1.)

In [4]:
env_name = "mlagents/envs/3DBall_128"  # Name of the Unity environment binary to launch
env = UnityEnv(env_name, worker_id=0, use_visual=False)

# Take the action dimensionality from the environment instead of hard-coding
# it, so the networks built from nb_actions always match the loaded binary.
# NOTE(review): relies on the continuous action space being a 1-D gym Box.
nb_actions = env.action_space.shape[0]
print(str(env))

INFO:mlagents.envs:
'Ball3DAcademy' started successfully!
Unity Academy name: Ball3DAcademy
        Number of Brains: 1
        Number of External Brains : 1
        Reset Parameters :
		
Unity brain name: Ball3DBrain
        Number of Visual Observations (per agent): 1
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): [2]
        Vector Action descriptions: , 
INFO:gym_unity:1 agents within environment.


<UnityEnv instance>


In [5]:
# Actor network: flattened observation -> three Dense(16)+ReLU blocks ->
# nb_actions linear outputs.
# The leading (1,) in the input shape is an extra length-1 axis in front of
# the observation shape; Flatten removes it again.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for hidden_block in range(3):
    actor.add(Dense(16))
    actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('linear'))
# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
actor.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                144       
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

In [6]:
# Critic network: takes an action and an observation, concatenates them and
# maps the result through three Dense(32)+ReLU blocks to a single linear output.
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
for hidden_block in range(3):
    x = Dense(32)(x)
    x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
critic.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observation_input (InputLayer)  (None, 1, 8)         0                                            
__________________________________________________________________________________________________
action_input (InputLayer)       (None, 2)            0                                            
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 8)            0           observation_input[0][0]          
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 10)           0           action_input[0][0]               
                                                                 flatten_2[0][0]                  
__________

In [7]:
# Collaborators handed to the agent: the info/reward processor, the replay
# memory and the Ornstein-Uhlenbeck random process (one value per action).
processor = BallVecProcessor()
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(
    size=nb_actions,
    theta=.15,
    mu=0.,
    sigma=.3,
)

# Wire the actor and critic into a DDPG agent, then compile it with Adam.
agent = DDPGAgent(
    nb_actions=nb_actions,
    actor=actor,
    critic=critic,
    critic_action_input=action_input,
    memory=memory,
    processor=processor,
    random_process=random_process,
    nb_steps_warmup_critic=100,
    nb_steps_warmup_actor=100,
    gamma=.99,
    target_model_update=1e-3,
)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

In [8]:
# Train the agent on the environment for 50000 steps, logging once per episode.
agent.fit(
    env,
    nb_steps=50000,
    visualize=True,
    verbose=2,
)



Training for 50000 steps ...
    19/50000: episode: 1, duration: 0.563s, episode steps: 19, steps per second: 34, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: -0.074 [-0.965, 0.706], mean observation: 0.189 [-6.867, 4.000], loss: --, mean_absolute_error: --, mean_q: --
    36/50000: episode: 2, duration: 0.172s, episode steps: 17, steps per second: 99, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: -0.085 [-1.028, 0.495], mean observation: 0.120 [-7.848, 4.000], loss: --, mean_absolute_error: --, mean_q: --
    56/50000: episode: 3, duration: 0.198s, episode steps: 20, steps per second: 101, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.068 [-0.960, 0.323], mean observation: 0.153 [-7.848, 4.000], loss: --, mean_absolute_error: --, mean_q: --
    72/50000: episode: 4, duration: 0.162s, episode steps: 16, steps per second: 99, episode reward: 0.500, mean reward: 0.031 [-1.000, 0.100], mean action: -0.102 [-

   551/50000: episode: 30, duration: 0.318s, episode steps: 19, steps per second: 60, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: -0.028 [-1.821, 1.601], mean observation: 0.356 [-7.848, 4.000], loss: 0.006909, mean_absolute_error: 0.071849, mean_q: -0.291069
   570/50000: episode: 31, duration: 0.240s, episode steps: 19, steps per second: 79, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: 0.193 [-1.509, 3.074], mean observation: 0.262 [-5.886, 4.000], loss: 0.008931, mean_absolute_error: 0.083821, mean_q: -0.292774
   584/50000: episode: 32, duration: 0.173s, episode steps: 14, steps per second: 81, episode reward: 0.300, mean reward: 0.021 [-1.000, 0.100], mean action: 0.221 [-1.217, 1.842], mean observation: 0.152 [-8.829, 4.000], loss: 0.006739, mean_absolute_error: 0.074450, mean_q: -0.305965
   604/50000: episode: 33, duration: 0.303s, episode steps: 20, steps per second: 66, episode reward: 0.900, mean reward: 0.045 [-1.000, 0

  1072/50000: episode: 59, duration: 0.303s, episode steps: 23, steps per second: 76, episode reward: 1.200, mean reward: 0.052 [-1.000, 0.100], mean action: 0.063 [-1.594, 4.268], mean observation: 0.407 [-4.905, 4.000], loss: 0.004265, mean_absolute_error: 0.057198, mean_q: -0.242825
  1089/50000: episode: 60, duration: 0.215s, episode steps: 17, steps per second: 79, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: 0.328 [-1.451, 3.833], mean observation: 0.307 [-6.867, 4.000], loss: 0.004426, mean_absolute_error: 0.055560, mean_q: -0.243024
  1106/50000: episode: 61, duration: 0.243s, episode steps: 17, steps per second: 70, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: -0.030 [-0.909, 1.130], mean observation: 0.343 [-7.848, 4.000], loss: 0.004848, mean_absolute_error: 0.059053, mean_q: -0.249043
  1122/50000: episode: 62, duration: 0.215s, episode steps: 16, steps per second: 74, episode reward: 0.500, mean reward: 0.031 [-1.000, 0

  1640/50000: episode: 89, duration: 0.208s, episode steps: 15, steps per second: 72, episode reward: 0.400, mean reward: 0.027 [-1.000, 0.100], mean action: -0.039 [-1.493, 1.771], mean observation: 0.175 [-8.829, 4.000], loss: 0.003033, mean_absolute_error: 0.049859, mean_q: -0.187453
  1654/50000: episode: 90, duration: 0.170s, episode steps: 14, steps per second: 83, episode reward: 0.300, mean reward: 0.021 [-1.000, 0.100], mean action: -0.604 [-3.255, 1.647], mean observation: -0.088 [-8.829, 4.000], loss: 0.003116, mean_absolute_error: 0.045873, mean_q: -0.187394
  1673/50000: episode: 91, duration: 0.246s, episode steps: 19, steps per second: 77, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: -0.452 [-2.306, 0.983], mean observation: 0.386 [-6.867, 4.000], loss: 0.003749, mean_absolute_error: 0.055552, mean_q: -0.211366
  1708/50000: episode: 92, duration: 0.418s, episode steps: 35, steps per second: 84, episode reward: 2.400, mean reward: 0.069 [-1.000

  2162/50000: episode: 118, duration: 0.246s, episode steps: 21, steps per second: 85, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: -0.611 [-4.824, 1.825], mean observation: 0.427 [-6.867, 4.000], loss: 0.001629, mean_absolute_error: 0.038884, mean_q: -0.150627
  2180/50000: episode: 119, duration: 0.282s, episode steps: 18, steps per second: 64, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: -0.259 [-1.440, 1.131], mean observation: 0.297 [-7.848, 4.000], loss: 0.002074, mean_absolute_error: 0.040838, mean_q: -0.167313
  2208/50000: episode: 120, duration: 0.425s, episode steps: 28, steps per second: 66, episode reward: 1.700, mean reward: 0.061 [-1.000, 0.100], mean action: -0.139 [-2.838, 4.244], mean observation: 0.422 [-5.206, 4.000], loss: 0.002217, mean_absolute_error: 0.045703, mean_q: -0.165250
  2223/50000: episode: 121, duration: 0.210s, episode steps: 15, steps per second: 72, episode reward: 0.400, mean reward: 0.027 [-1.

  2744/50000: episode: 147, duration: 0.515s, episode steps: 40, steps per second: 78, episode reward: 2.900, mean reward: 0.073 [-1.000, 0.100], mean action: -0.018 [-2.356, 3.845], mean observation: 0.231 [-6.049, 4.000], loss: 0.001625, mean_absolute_error: 0.038019, mean_q: -0.141427
  2760/50000: episode: 148, duration: 0.201s, episode steps: 16, steps per second: 80, episode reward: 0.500, mean reward: 0.031 [-1.000, 0.100], mean action: -0.312 [-2.560, 1.547], mean observation: 0.235 [-7.848, 4.000], loss: 0.001633, mean_absolute_error: 0.037217, mean_q: -0.121739
  2777/50000: episode: 149, duration: 0.246s, episode steps: 17, steps per second: 69, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: -0.094 [-2.028, 1.780], mean observation: 0.316 [-8.829, 4.000], loss: 0.001640, mean_absolute_error: 0.036327, mean_q: -0.114495
  2795/50000: episode: 150, duration: 0.233s, episode steps: 18, steps per second: 77, episode reward: 0.700, mean reward: 0.039 [-1.

  3319/50000: episode: 176, duration: 0.376s, episode steps: 29, steps per second: 77, episode reward: 1.800, mean reward: 0.062 [-1.000, 0.100], mean action: -0.970 [-5.917, 1.428], mean observation: 0.006 [-7.848, 4.000], loss: 0.001603, mean_absolute_error: 0.037435, mean_q: -0.075260
  3335/50000: episode: 177, duration: 0.195s, episode steps: 16, steps per second: 82, episode reward: 0.500, mean reward: 0.031 [-1.000, 0.100], mean action: 0.393 [-0.925, 3.074], mean observation: 0.219 [-8.829, 4.000], loss: 0.001898, mean_absolute_error: 0.041823, mean_q: -0.086818
  3349/50000: episode: 178, duration: 0.194s, episode steps: 14, steps per second: 72, episode reward: 0.300, mean reward: 0.021 [-1.000, 0.100], mean action: -0.238 [-2.613, 1.810], mean observation: 0.124 [-8.829, 4.000], loss: 0.002138, mean_absolute_error: 0.041991, mean_q: -0.103798
  3370/50000: episode: 179, duration: 0.285s, episode steps: 21, steps per second: 74, episode reward: 1.000, mean reward: 0.048 [-1.0

  3879/50000: episode: 205, duration: 0.214s, episode steps: 18, steps per second: 84, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: 0.132 [-1.945, 2.461], mean observation: 0.299 [-6.867, 4.000], loss: 0.001928, mean_absolute_error: 0.037160, mean_q: -0.049734
  3897/50000: episode: 206, duration: 0.248s, episode steps: 18, steps per second: 73, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: 0.625 [-1.166, 3.508], mean observation: 0.407 [-7.848, 4.000], loss: 0.002177, mean_absolute_error: 0.046302, mean_q: -0.066154
  3921/50000: episode: 207, duration: 0.290s, episode steps: 24, steps per second: 83, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: 0.460 [-1.595, 4.111], mean observation: 0.369 [-6.867, 4.000], loss: 0.001670, mean_absolute_error: 0.042444, mean_q: -0.019599
  3936/50000: episode: 208, duration: 0.200s, episode steps: 15, steps per second: 75, episode reward: 0.400, mean reward: 0.027 [-1.000

  4526/50000: episode: 234, duration: 0.217s, episode steps: 17, steps per second: 78, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: 0.294 [-1.442, 2.670], mean observation: 0.147 [-7.848, 4.000], loss: 0.001570, mean_absolute_error: 0.038816, mean_q: 0.028305
  4541/50000: episode: 235, duration: 0.188s, episode steps: 15, steps per second: 80, episode reward: 0.400, mean reward: 0.027 [-1.000, 0.100], mean action: 0.211 [-1.289, 2.757], mean observation: 0.135 [-7.848, 4.000], loss: 0.002073, mean_absolute_error: 0.041216, mean_q: 0.033524
  4556/50000: episode: 236, duration: 0.180s, episode steps: 15, steps per second: 83, episode reward: 0.400, mean reward: 0.027 [-1.000, 0.100], mean action: 0.205 [-1.209, 2.219], mean observation: 0.179 [-7.848, 4.000], loss: 0.001875, mean_absolute_error: 0.040248, mean_q: 0.026000
  4573/50000: episode: 237, duration: 0.240s, episode steps: 17, steps per second: 71, episode reward: 0.600, mean reward: 0.035 [-1.000, 0

  5310/50000: episode: 263, duration: 0.502s, episode steps: 42, steps per second: 84, episode reward: 3.100, mean reward: 0.074 [-1.000, 0.100], mean action: -0.700 [-5.396, 1.932], mean observation: 0.021 [-6.867, 4.000], loss: 0.001406, mean_absolute_error: 0.037123, mean_q: 0.083875
  5335/50000: episode: 264, duration: 0.307s, episode steps: 25, steps per second: 81, episode reward: 1.400, mean reward: 0.056 [-1.000, 0.100], mean action: 0.207 [-1.948, 2.688], mean observation: 0.492 [-7.848, 4.000], loss: 0.001425, mean_absolute_error: 0.036631, mean_q: 0.075698
  5361/50000: episode: 265, duration: 0.307s, episode steps: 26, steps per second: 85, episode reward: 1.500, mean reward: 0.058 [-1.000, 0.100], mean action: -0.232 [-3.469, 1.384], mean observation: 0.319 [-7.848, 4.000], loss: 0.001157, mean_absolute_error: 0.034817, mean_q: 0.071092
  5421/50000: episode: 266, duration: 0.720s, episode steps: 60, steps per second: 83, episode reward: 4.900, mean reward: 0.082 [-1.000,

  6078/50000: episode: 292, duration: 0.224s, episode steps: 18, steps per second: 80, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: -1.573 [-4.796, 0.974], mean observation: -0.447 [-6.867, 4.000], loss: 0.001576, mean_absolute_error: 0.040698, mean_q: 0.086575
  6105/50000: episode: 293, duration: 0.337s, episode steps: 27, steps per second: 80, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: -0.671 [-4.193, 2.094], mean observation: 0.082 [-7.848, 4.000], loss: 0.001603, mean_absolute_error: 0.039181, mean_q: 0.116837
  6129/50000: episode: 294, duration: 0.320s, episode steps: 24, steps per second: 75, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: -1.074 [-7.467, 2.308], mean observation: -0.082 [-7.877, 4.000], loss: 0.001622, mean_absolute_error: 0.039686, mean_q: 0.111774
  6152/50000: episode: 295, duration: 0.360s, episode steps: 23, steps per second: 64, episode reward: 1.200, mean reward: 0.052 [-1.0

  6836/50000: episode: 321, duration: 0.316s, episode steps: 24, steps per second: 76, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: -0.247 [-8.584, 3.296], mean observation: 0.160 [-7.848, 4.000], loss: 0.001936, mean_absolute_error: 0.044554, mean_q: 0.181534
  6874/50000: episode: 322, duration: 0.461s, episode steps: 38, steps per second: 82, episode reward: 2.700, mean reward: 0.071 [-1.000, 0.100], mean action: 0.001 [-3.771, 2.783], mean observation: 0.256 [-6.867, 4.000], loss: 0.001706, mean_absolute_error: 0.040841, mean_q: 0.172711
  6910/50000: episode: 323, duration: 0.431s, episode steps: 36, steps per second: 83, episode reward: 2.500, mean reward: 0.069 [-1.000, 0.100], mean action: 0.137 [-2.671, 4.257], mean observation: 0.341 [-8.572, 4.000], loss: 0.001972, mean_absolute_error: 0.041971, mean_q: 0.180111
  6942/50000: episode: 324, duration: 0.391s, episode steps: 32, steps per second: 82, episode reward: 2.100, mean reward: 0.066 [-1.000, 

  7688/50000: episode: 350, duration: 0.435s, episode steps: 33, steps per second: 76, episode reward: 2.200, mean reward: 0.067 [-1.000, 0.100], mean action: -0.343 [-2.462, 2.431], mean observation: 0.365 [-6.867, 4.000], loss: 0.001929, mean_absolute_error: 0.042831, mean_q: 0.250880
  7712/50000: episode: 351, duration: 0.288s, episode steps: 24, steps per second: 83, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: 0.340 [-2.426, 2.521], mean observation: 0.414 [-7.848, 4.000], loss: 0.001725, mean_absolute_error: 0.040854, mean_q: 0.246035
  7739/50000: episode: 352, duration: 0.337s, episode steps: 27, steps per second: 80, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: 0.042 [-3.835, 3.000], mean observation: 0.345 [-6.867, 4.000], loss: 0.001851, mean_absolute_error: 0.041023, mean_q: 0.269702
  7777/50000: episode: 353, duration: 0.482s, episode steps: 38, steps per second: 79, episode reward: 2.700, mean reward: 0.071 [-1.000, 

  8489/50000: episode: 379, duration: 0.429s, episode steps: 36, steps per second: 84, episode reward: 2.500, mean reward: 0.069 [-1.000, 0.100], mean action: -0.681 [-6.210, 2.563], mean observation: 0.185 [-7.008, 4.000], loss: 0.001843, mean_absolute_error: 0.041885, mean_q: 0.290378
  8506/50000: episode: 380, duration: 0.221s, episode steps: 17, steps per second: 77, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: 0.200 [-2.811, 2.715], mean observation: 0.286 [-7.848, 4.000], loss: 0.001881, mean_absolute_error: 0.042224, mean_q: 0.293369
  8562/50000: episode: 381, duration: 0.677s, episode steps: 56, steps per second: 83, episode reward: 4.500, mean reward: 0.080 [-1.000, 0.100], mean action: -0.688 [-5.607, 1.818], mean observation: -0.035 [-6.867, 4.000], loss: 0.001991, mean_absolute_error: 0.042714, mean_q: 0.308220
  8607/50000: episode: 382, duration: 0.542s, episode steps: 45, steps per second: 83, episode reward: 3.400, mean reward: 0.076 [-1.000

  9469/50000: episode: 409, duration: 0.469s, episode steps: 38, steps per second: 81, episode reward: 2.700, mean reward: 0.071 [-1.000, 0.100], mean action: 0.021 [-2.514, 3.983], mean observation: -0.098 [-5.886, 4.000], loss: 0.004038, mean_absolute_error: 0.051644, mean_q: 0.347328
  9519/50000: episode: 410, duration: 0.599s, episode steps: 50, steps per second: 83, episode reward: 3.900, mean reward: 0.078 [-1.000, 0.100], mean action: -0.524 [-4.633, 2.280], mean observation: 0.002 [-7.647, 4.000], loss: 0.002730, mean_absolute_error: 0.049133, mean_q: 0.319492
  9543/50000: episode: 411, duration: 0.292s, episode steps: 24, steps per second: 82, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: 0.543 [-2.937, 3.699], mean observation: 0.428 [-7.848, 4.000], loss: 0.002836, mean_absolute_error: 0.050087, mean_q: 0.314313
  9568/50000: episode: 412, duration: 0.303s, episode steps: 25, steps per second: 83, episode reward: 1.400, mean reward: 0.056 [-1.000,

 10516/50000: episode: 438, duration: 0.559s, episode steps: 44, steps per second: 79, episode reward: 3.300, mean reward: 0.075 [-1.000, 0.100], mean action: -0.406 [-5.425, 4.727], mean observation: -0.017 [-6.867, 4.000], loss: 0.003814, mean_absolute_error: 0.053369, mean_q: 0.413866
 10531/50000: episode: 439, duration: 0.271s, episode steps: 15, steps per second: 55, episode reward: 0.400, mean reward: 0.027 [-1.000, 0.100], mean action: -0.086 [-2.122, 2.904], mean observation: -0.081 [-8.829, 4.000], loss: 0.003229, mean_absolute_error: 0.053396, mean_q: 0.384396
 10565/50000: episode: 440, duration: 0.464s, episode steps: 34, steps per second: 73, episode reward: 2.300, mean reward: 0.068 [-1.000, 0.100], mean action: -0.790 [-5.171, 2.769], mean observation: -0.250 [-6.867, 4.000], loss: 0.002833, mean_absolute_error: 0.047977, mean_q: 0.416815
 10589/50000: episode: 441, duration: 0.310s, episode steps: 24, steps per second: 77, episode reward: 1.300, mean reward: 0.054 [-1.

 11573/50000: episode: 467, duration: 0.968s, episode steps: 60, steps per second: 62, episode reward: 4.900, mean reward: 0.082 [-1.000, 0.100], mean action: -0.047 [-3.615, 5.150], mean observation: -0.050 [-6.867, 4.000], loss: 0.003528, mean_absolute_error: 0.054517, mean_q: 0.449229
 11618/50000: episode: 468, duration: 0.664s, episode steps: 45, steps per second: 68, episode reward: 3.400, mean reward: 0.076 [-1.000, 0.100], mean action: 0.015 [-3.739, 3.740], mean observation: -0.070 [-5.886, 4.000], loss: 0.004612, mean_absolute_error: 0.057369, mean_q: 0.465316
 11644/50000: episode: 469, duration: 0.347s, episode steps: 26, steps per second: 75, episode reward: 1.500, mean reward: 0.058 [-1.000, 0.100], mean action: 0.374 [-3.181, 2.557], mean observation: 0.360 [-7.848, 4.000], loss: 0.003870, mean_absolute_error: 0.058295, mean_q: 0.467622
 11691/50000: episode: 470, duration: 0.632s, episode steps: 47, steps per second: 74, episode reward: 3.600, mean reward: 0.077 [-1.000

 12630/50000: episode: 496, duration: 1.223s, episode steps: 79, steps per second: 65, episode reward: 6.800, mean reward: 0.086 [-1.000, 0.100], mean action: -0.298 [-4.499, 4.216], mean observation: -0.012 [-5.886, 4.000], loss: 0.003472, mean_absolute_error: 0.055519, mean_q: 0.536090
 12668/50000: episode: 497, duration: 0.501s, episode steps: 38, steps per second: 76, episode reward: 2.700, mean reward: 0.071 [-1.000, 0.100], mean action: -0.142 [-3.510, 4.357], mean observation: -0.230 [-6.867, 4.000], loss: 0.003991, mean_absolute_error: 0.057848, mean_q: 0.541203
 12708/50000: episode: 498, duration: 0.501s, episode steps: 40, steps per second: 80, episode reward: 2.900, mean reward: 0.073 [-1.000, 0.100], mean action: 0.235 [-3.811, 6.886], mean observation: -0.283 [-6.867, 4.000], loss: 0.004902, mean_absolute_error: 0.058685, mean_q: 0.582951
 12731/50000: episode: 499, duration: 0.282s, episode steps: 23, steps per second: 82, episode reward: 1.200, mean reward: 0.052 [-1.0

 13636/50000: episode: 525, duration: 0.219s, episode steps: 17, steps per second: 77, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: 0.263 [-3.380, 4.839], mean observation: -0.664 [-7.848, 4.000], loss: 0.005684, mean_absolute_error: 0.061124, mean_q: 0.615824
 13658/50000: episode: 526, duration: 0.257s, episode steps: 22, steps per second: 86, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: 0.004 [-4.218, 3.403], mean observation: -0.559 [-5.886, 4.000], loss: 0.004939, mean_absolute_error: 0.065661, mean_q: 0.562076
 13755/50000: episode: 527, duration: 1.173s, episode steps: 97, steps per second: 83, episode reward: 8.600, mean reward: 0.089 [-1.000, 0.100], mean action: -0.184 [-6.617, 3.957], mean observation: -0.095 [-7.799, 4.000], loss: 0.005646, mean_absolute_error: 0.065017, mean_q: 0.605561
 13858/50000: episode: 528, duration: 1.235s, episode steps: 103, steps per second: 83, episode reward: 9.200, mean reward: 0.089 [-1.0

 17207/50000: episode: 554, duration: 0.885s, episode steps: 68, steps per second: 77, episode reward: 5.700, mean reward: 0.084 [-1.000, 0.100], mean action: -1.541 [-11.890, 3.219], mean observation: 0.196 [-7.840, 4.000], loss: 0.004788, mean_absolute_error: 0.061731, mean_q: 0.939795
 17411/50000: episode: 555, duration: 2.620s, episode steps: 204, steps per second: 78, episode reward: 19.300, mean reward: 0.095 [-1.000, 0.100], mean action: -0.618 [-8.501, 3.657], mean observation: 0.139 [-7.848, 4.000], loss: 0.005130, mean_absolute_error: 0.062289, mean_q: 0.935893
 17433/50000: episode: 556, duration: 0.271s, episode steps: 22, steps per second: 81, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: 0.431 [-4.175, 3.564], mean observation: 0.401 [-6.867, 4.000], loss: 0.006855, mean_absolute_error: 0.069155, mean_q: 0.970358
 17543/50000: episode: 557, duration: 1.374s, episode steps: 110, steps per second: 80, episode reward: 9.900, mean reward: 0.090 [-1.

 20867/50000: episode: 583, duration: 1.292s, episode steps: 94, steps per second: 73, episode reward: 8.300, mean reward: 0.088 [-1.000, 0.100], mean action: -0.714 [-10.355, 4.059], mean observation: 0.187 [-8.397, 4.000], loss: 0.007652, mean_absolute_error: 0.074607, mean_q: 1.212699
 20980/50000: episode: 584, duration: 1.482s, episode steps: 113, steps per second: 76, episode reward: 10.200, mean reward: 0.090 [-1.000, 0.100], mean action: -1.098 [-10.217, 7.260], mean observation: -0.093 [-7.848, 4.000], loss: 0.006833, mean_absolute_error: 0.070672, mean_q: 1.220605
 21080/50000: episode: 585, duration: 1.332s, episode steps: 100, steps per second: 75, episode reward: 8.900, mean reward: 0.089 [-1.000, 0.100], mean action: -1.372 [-9.992, 3.729], mean observation: 0.067 [-7.848, 4.000], loss: 0.006614, mean_absolute_error: 0.070518, mean_q: 1.234887
 21115/50000: episode: 586, duration: 0.514s, episode steps: 35, steps per second: 68, episode reward: 2.400, mean reward: 0.069 [

 28318/50000: episode: 612, duration: 0.281s, episode steps: 23, steps per second: 82, episode reward: 1.200, mean reward: 0.052 [-1.000, 0.100], mean action: 1.276 [-4.464, 12.024], mean observation: -0.503 [-7.848, 4.000], loss: 0.010114, mean_absolute_error: 0.077654, mean_q: 1.926618
 29191/50000: episode: 613, duration: 10.284s, episode steps: 873, steps per second: 85, episode reward: 86.200, mean reward: 0.099 [-1.000, 0.100], mean action: -0.503 [-17.255, 11.090], mean observation: 0.116 [-5.886, 4.000], loss: 0.010875, mean_absolute_error: 0.083014, mean_q: 1.952337
 29212/50000: episode: 614, duration: 0.256s, episode steps: 21, steps per second: 82, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: 0.035 [-6.421, 6.875], mean observation: -0.477 [-6.867, 4.000], loss: 0.008374, mean_absolute_error: 0.076653, mean_q: 1.967785
 29237/50000: episode: 615, duration: 0.301s, episode steps: 25, steps per second: 83, episode reward: 1.400, mean reward: 0.056 [

 32523/50000: episode: 641, duration: 1.248s, episode steps: 106, steps per second: 85, episode reward: 9.500, mean reward: 0.090 [-1.000, 0.100], mean action: -0.318 [-10.423, 8.796], mean observation: 0.186 [-6.867, 4.000], loss: 0.067069, mean_absolute_error: 0.159861, mean_q: 2.273108
 33360/50000: episode: 642, duration: 10.412s, episode steps: 837, steps per second: 80, episode reward: 82.600, mean reward: 0.099 [-1.000, 0.100], mean action: 0.013 [-9.574, 9.684], mean observation: 0.204 [-5.886, 4.000], loss: 0.040729, mean_absolute_error: 0.126971, mean_q: 2.323241
 33401/50000: episode: 643, duration: 0.549s, episode steps: 41, steps per second: 75, episode reward: 3.000, mean reward: 0.073 [-1.000, 0.100], mean action: 1.488 [-8.198, 18.514], mean observation: 0.110 [-5.886, 4.000], loss: 0.034498, mean_absolute_error: 0.130672, mean_q: 2.367242
 33467/50000: episode: 644, duration: 0.880s, episode steps: 66, steps per second: 75, episode reward: 5.500, mean reward: 0.083 [-1

 34918/50000: episode: 670, duration: 0.333s, episode steps: 23, steps per second: 69, episode reward: 1.200, mean reward: 0.052 [-1.000, 0.100], mean action: -6.483 [-35.002, 13.720], mean observation: 0.163 [-7.848, 4.000], loss: 0.023502, mean_absolute_error: 0.113261, mean_q: 2.485149
 34994/50000: episode: 671, duration: 1.097s, episode steps: 76, steps per second: 69, episode reward: 6.500, mean reward: 0.086 [-1.000, 0.100], mean action: -0.929 [-18.343, 15.518], mean observation: -0.189 [-8.194, 4.000], loss: 0.031065, mean_absolute_error: 0.120190, mean_q: 2.476488
 35056/50000: episode: 672, duration: 0.851s, episode steps: 62, steps per second: 73, episode reward: 5.100, mean reward: 0.082 [-1.000, 0.100], mean action: 0.368 [-13.429, 19.363], mean observation: -0.210 [-8.178, 4.000], loss: 0.033603, mean_absolute_error: 0.122716, mean_q: 2.509124
 35085/50000: episode: 673, duration: 0.372s, episode steps: 29, steps per second: 78, episode reward: 1.800, mean reward: 0.062 

 36496/50000: episode: 699, duration: 0.527s, episode steps: 39, steps per second: 74, episode reward: 2.800, mean reward: 0.072 [-1.000, 0.100], mean action: -12.154 [-134.172, 10.128], mean observation: 0.431 [-8.724, 4.000], loss: 0.034680, mean_absolute_error: 0.122147, mean_q: 2.631930
 36523/50000: episode: 700, duration: 0.415s, episode steps: 27, steps per second: 65, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: -17.717 [-92.413, 4.958], mean observation: 0.276 [-6.867, 4.000], loss: 0.030619, mean_absolute_error: 0.113758, mean_q: 2.692477
 36621/50000: episode: 701, duration: 1.323s, episode steps: 98, steps per second: 74, episode reward: 8.700, mean reward: 0.089 [-1.000, 0.100], mean action: -5.296 [-105.164, 16.082], mean observation: 0.235 [-6.867, 4.000], loss: 0.042000, mean_absolute_error: 0.128913, mean_q: 2.641666
 36672/50000: episode: 702, duration: 0.642s, episode steps: 51, steps per second: 79, episode reward: 4.000, mean reward: 0.07

 38260/50000: episode: 728, duration: 0.711s, episode steps: 54, steps per second: 76, episode reward: 4.300, mean reward: 0.080 [-1.000, 0.100], mean action: -8.291 [-89.423, 10.564], mean observation: 0.262 [-6.867, 4.000], loss: 0.028006, mean_absolute_error: 0.111408, mean_q: 2.787579
 38287/50000: episode: 729, duration: 0.350s, episode steps: 27, steps per second: 77, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: -0.109 [-35.068, 30.088], mean observation: -0.670 [-8.704, 4.000], loss: 0.033792, mean_absolute_error: 0.123873, mean_q: 2.736282
 38308/50000: episode: 730, duration: 0.273s, episode steps: 21, steps per second: 77, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: -7.347 [-55.597, 17.902], mean observation: -0.371 [-7.848, 4.000], loss: 0.024022, mean_absolute_error: 0.104003, mean_q: 2.883602
 38332/50000: episode: 731, duration: 0.308s, episode steps: 24, steps per second: 78, episode reward: 1.300, mean reward: 0.054

 39674/50000: episode: 757, duration: 0.296s, episode steps: 22, steps per second: 74, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: -8.887 [-90.316, 7.253], mean observation: 0.113 [-6.867, 4.000], loss: 0.028356, mean_absolute_error: 0.115115, mean_q: 2.835270
 39701/50000: episode: 758, duration: 0.379s, episode steps: 27, steps per second: 71, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: -7.283 [-80.736, 20.031], mean observation: 0.452 [-5.886, 4.000], loss: 0.021650, mean_absolute_error: 0.107723, mean_q: 2.895331
 39756/50000: episode: 759, duration: 0.758s, episode steps: 55, steps per second: 73, episode reward: 4.400, mean reward: 0.080 [-1.000, 0.100], mean action: -7.801 [-154.423, 10.536], mean observation: -0.035 [-6.805, 4.000], loss: 0.038875, mean_absolute_error: 0.123527, mean_q: 2.901737
 39789/50000: episode: 760, duration: 0.472s, episode steps: 33, steps per second: 70, episode reward: 2.200, mean reward: 0.067 

 42627/50000: episode: 786, duration: 8.851s, episode steps: 720, steps per second: 81, episode reward: 70.900, mean reward: 0.098 [-1.000, 0.100], mean action: -0.637 [-90.445, 18.721], mean observation: 0.130 [-6.200, 4.000], loss: 0.027259, mean_absolute_error: 0.106750, mean_q: 3.088026
 42673/50000: episode: 787, duration: 0.544s, episode steps: 46, steps per second: 85, episode reward: 3.500, mean reward: 0.076 [-1.000, 0.100], mean action: -6.647 [-112.164, 24.557], mean observation: 0.040 [-5.886, 4.000], loss: 0.026549, mean_absolute_error: 0.107299, mean_q: 3.061661
 43043/50000: episode: 788, duration: 4.366s, episode steps: 370, steps per second: 85, episode reward: 35.900, mean reward: 0.097 [-1.000, 0.100], mean action: -0.744 [-98.096, 19.333], mean observation: 0.150 [-7.993, 4.000], loss: 0.023993, mean_absolute_error: 0.102078, mean_q: 3.111101
 43072/50000: episode: 789, duration: 0.344s, episode steps: 29, steps per second: 84, episode reward: 1.800, mean reward: 0.

NameError: name 'ENV_NAME' is not defined

In [10]:
# Finally, evaluate the trained agent: 1000 test episodes, each capped at
# 1001 steps.
agent.test(
    env,
    nb_episodes=1000,
    visualize=True,
    nb_max_episode_steps=1001,
)

Testing for 5 episodes ...
Episode 1: reward: 20.000, steps: 200
Episode 2: reward: 20.000, steps: 200
Episode 3: reward: 20.000, steps: 200
Episode 4: reward: 20.000, steps: 200
Episode 5: reward: 20.000, steps: 200


<keras.callbacks.History at 0x11de4d470>

In [11]:
# Persist the trained agent's weights; the environment name is embedded in
# the file name and any existing file is overwritten.
ENV_NAME = '3DBall_128'
agent.save_weights(
    'ddpg_{}_vec_weights.h5f'.format(ENV_NAME),
    overwrite=True,
)