In [1]:
from __future__ import division
import argparse
import time
from PIL import Image
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import Adam
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import LSTM, GRU
import keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

Using TensorFlow backend.


In [2]:
import matplotlib.pyplot as plt
import sys
from gym_unity.envs import UnityEnv

%matplotlib inline

print("Python version:")
print(sys.version)

# check Python version
if (sys.version_info[0] < 3):
    raise Exception("ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3")
    
INPUT_SHAPE = (80, 80, 3)
WINDOW_LENGTH = 4
mode = 'train'
weights = 'None'

Python version:
3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 14:01:38) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [3]:
class BallVecProcessor(Processor):
    def process_observation(self, observation):
        assert observation.ndim == 3  # (height, width, channel)
        img = np.array(observation)
        img = img[24:104,24:104]    
        processed_observation = img
        return processed_observation.astype('uint8')  # saves storage in experience memory

    def process_action(self, action):
        action = [np.floor(action/21)*((action-20)/10-1.1), (1-np.floor(action/21))*((action-20)/10+1)]
        return action
    
    def process_info(self, info):
        key, value = info.items()
        key = value[0]
        value = value[1].rewards
        info = {key: value}
        
        """Processes the info as obtained from the environment for use in an agent and
        returns it.

        # Arguments
            info (dict): An info as obtained by the environment

        # Returns
            Info obtained by the environment processed
        """
        return info

    def process_reward(self, reward):
        return np.clip(reward, -1., 1.)
    

In [4]:
env_name = "mlagents/envs/3DBall_128"  # Name of the Unity environment binary to launch
env = UnityEnv(env_name, worker_id=0, use_visual=True)

nb_actions = 42
print(str(env))

INFO:mlagents.envs:
'Ball3DAcademy' started successfully!
Unity Academy name: Ball3DAcademy
        Number of Brains: 1
        Number of External Brains : 1
        Reset Parameters :
		
Unity brain name: Ball3DBrain
        Number of Visual Observations (per agent): 1
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): [2]
        Vector Action descriptions: , 
INFO:gym_unity:1 agents within environment.


<UnityEnv instance>


In [5]:
# Next, we build a very simple model.
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
model = Sequential()
if K.image_dim_ordering() == 'tf':
    # (width, height, channels)
    model.add(Permute((1, 2, 3, 4), input_shape=input_shape))
elif K.image_dim_ordering() == 'th':
    # (channels, width, height)
    model.add(Permute((1, 2, 3, 4), input_shape=input_shape))
else:
    raise RuntimeError('Unknown image_dim_ordering.')

model.add(TimeDistributed(Convolution2D(32, 8, 8, subsample=(4,4), activation='relu'), input_shape=(input_shape)))
model.add(TimeDistributed(Convolution2D(64, 4, 4, subsample=(2,2), activation='relu')))
model.add(TimeDistributed(Convolution2D(64, 3, 3, activation='relu')))
model.add(TimeDistributed(Flatten()))
model.add(LSTM(512,  activation='tanh'))
model.add(Dense(output_dim=nb_actions, activation='linear'))
    
print(model.summary())

  del sys.path[0]
  
  from ipykernel import kernelapp as app


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_1 (Permute)          (None, 4, 80, 80, 3)      0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 4, 19, 19, 32)     6176      
_________________________________________________________________
time_distributed_2 (TimeDist (None, 4, 8, 8, 64)       32832     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 4, 6, 6, 64)       36928     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 4, 2304)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               5769216   
_________________________________________________________________
dense_1 (Dense)              (None, 42)                21546     
Total para



# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
processor = BallVecProcessor()
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.01, value_test=.005,
                              nb_steps=1000000)

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=5000, gamma=.99, target_model_update=10000,
               train_interval=1, delta_clip=1.)
dqn.compile(Adam(lr=1e-6), metrics=['mae'])


In [6]:
processor = BallVecProcessor()
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)

policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=42, memory=memory, nb_steps_warmup=10000,
               target_model_update=1e-2, policy=policy, processor = processor)
dqn.compile(Adam(lr=1e-6), metrics=['mae'])

In [7]:
# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
#ENV_NAME = '3DBall_vis1'
#dqn.load_weights('dqn_{}_weights.h5f'.format(ENV_NAME))
dqn.fit(env, nb_steps=1750000, visualize=True, verbose=2)


Training for 1750000 steps ...
      20/1750000: episode: 1, duration: 0.735s, episode steps: 20, steps per second: 27, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.090 [-1.000, 0.800], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
      41/1750000: episode: 2, duration: 0.366s, episode steps: 21, steps per second: 57, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: -0.124 [-1.000, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
      60/1750000: episode: 3, duration: 0.333s, episode steps: 19, steps per second: 57, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: 0.018 [-0.900, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
      80/1750000: episode: 4, duration: 0.400s, episode steps: 20, steps per second: 50, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean acti

     818/1750000: episode: 32, duration: 0.452s, episode steps: 27, steps per second: 60, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: -0.007 [-0.900, 0.900], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
     835/1750000: episode: 33, duration: 0.286s, episode steps: 17, steps per second: 60, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: -0.091 [-1.000, 0.900], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
     862/1750000: episode: 34, duration: 0.517s, episode steps: 27, steps per second: 52, episode reward: 1.600, mean reward: 0.059 [-1.000, 0.100], mean action: -0.020 [-0.900, 0.900], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
     890/1750000: episode: 35, duration: 0.488s, episode steps: 28, steps per second: 57, episode reward: 1.700, mean reward: 0.061 [-1.000, 0.100], mean action: -0.066 [-1.000, 1.000]

    1512/1750000: episode: 63, duration: 0.872s, episode steps: 19, steps per second: 22, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: -0.026 [-1.000, 0.700], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    1531/1750000: episode: 64, duration: 0.419s, episode steps: 19, steps per second: 45, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: -0.129 [-1.000, 0.800], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    1548/1750000: episode: 65, duration: 3.211s, episode steps: 17, steps per second: 5, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: 0.015 [-1.000, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    1570/1750000: episode: 66, duration: 4.040s, episode steps: 22, steps per second: 5, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: -0.102 [-1.000, 1.000], m

    2152/1750000: episode: 94, duration: 22.411s, episode steps: 17, steps per second: 1, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: 0.112 [-0.800, 1.000], mean observation: 0.003 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    2195/1750000: episode: 95, duration: 37.010s, episode steps: 43, steps per second: 1, episode reward: 3.200, mean reward: 0.074 [-1.000, 0.100], mean action: 0.008 [-1.000, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    2217/1750000: episode: 96, duration: 0.684s, episode steps: 22, steps per second: 32, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: 0.077 [-1.000, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    2262/1750000: episode: 97, duration: 0.746s, episode steps: 45, steps per second: 60, episode reward: 3.400, mean reward: 0.076 [-1.000, 0.100], mean action: -0.051 [-0.900, 1.000], m

    2903/1750000: episode: 125, duration: 0.368s, episode steps: 22, steps per second: 60, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: 0.077 [-0.900, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    2923/1750000: episode: 126, duration: 0.350s, episode steps: 20, steps per second: 57, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.063 [-1.000, 0.800], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    2942/1750000: episode: 127, duration: 0.336s, episode steps: 19, steps per second: 56, episode reward: 0.800, mean reward: 0.042 [-1.000, 0.100], mean action: -0.124 [-1.000, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    2962/1750000: episode: 128, duration: 0.349s, episode steps: 20, steps per second: 57, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: 0.017 [-1.000, 0.90

    3587/1750000: episode: 156, duration: 0.274s, episode steps: 17, steps per second: 62, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: 0.132 [-0.700, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    3608/1750000: episode: 157, duration: 0.349s, episode steps: 21, steps per second: 60, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: 0.067 [-0.700, 1.000], mean observation: 0.003 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    3626/1750000: episode: 158, duration: 0.324s, episode steps: 18, steps per second: 56, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: 0.072 [-0.700, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    3646/1750000: episode: 159, duration: 0.343s, episode steps: 20, steps per second: 58, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.048 [-1.000, 1.000

    4245/1750000: episode: 187, duration: 0.415s, episode steps: 24, steps per second: 58, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: 0.046 [-0.900, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    4270/1750000: episode: 188, duration: 0.452s, episode steps: 25, steps per second: 55, episode reward: 1.400, mean reward: 0.056 [-1.000, 0.100], mean action: 0.028 [-1.000, 0.900], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    4295/1750000: episode: 189, duration: 0.512s, episode steps: 25, steps per second: 49, episode reward: 1.400, mean reward: 0.056 [-1.000, 0.100], mean action: 0.036 [-1.000, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    4317/1750000: episode: 190, duration: 0.394s, episode steps: 22, steps per second: 56, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: 0.043 [-0.800, 1.000]

    4963/1750000: episode: 218, duration: 0.412s, episode steps: 24, steps per second: 58, episode reward: 1.300, mean reward: 0.054 [-1.000, 0.100], mean action: 0.085 [-1.000, 1.000], mean observation: 0.003 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    4983/1750000: episode: 219, duration: 0.365s, episode steps: 20, steps per second: 55, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.093 [-0.900, 0.900], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    5000/1750000: episode: 220, duration: 0.300s, episode steps: 17, steps per second: 57, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: 0.103 [-1.000, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    5023/1750000: episode: 221, duration: 0.391s, episode steps: 23, steps per second: 59, episode reward: 1.200, mean reward: 0.052 [-1.000, 0.100], mean action: 0.004 [-1.000, 0.900

    5640/1750000: episode: 249, duration: 0.619s, episode steps: 35, steps per second: 57, episode reward: 2.400, mean reward: 0.069 [-1.000, 0.100], mean action: -0.054 [-1.000, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    5683/1750000: episode: 250, duration: 0.731s, episode steps: 43, steps per second: 59, episode reward: 3.200, mean reward: 0.074 [-1.000, 0.100], mean action: -0.064 [-1.000, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    5711/1750000: episode: 251, duration: 0.463s, episode steps: 28, steps per second: 60, episode reward: 1.700, mean reward: 0.061 [-1.000, 0.100], mean action: -0.023 [-1.000, 1.000], mean observation: 0.002 [0.000, 1.000], loss: --, mean_absolute_error: --, mean_q: --
    5737/1750000: episode: 252, duration: 0.487s, episode steps: 26, steps per second: 53, episode reward: 1.500, mean reward: 0.058 [-1.000, 0.100], mean action: -0.013 [-1.000, 1.

<keras.callbacks.History at 0x120bc8080>

In [8]:
ENV_NAME = '3DBall_128'
# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...


KeyboardInterrupt: 

In [None]:
ENV_NAME = '3DBall_128'
if mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that you can the built-in Keras callbacks!
    weights_filename = 'dqn_{}_weights.h5f'.format(ENV_NAME)
    checkpoint_weights_filename = 'dqn_' + env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(env_name)
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)]
    #callbacks += [FileLogger(log_filename, interval=100)]
    dqn.fit(env, callbacks=callbacks, nb_steps=1750000, visualize=True)

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)
    
elif mode == 'test':
    weights_filename = 'dqn_{}_weights.h5f'.format(env_name)
    if weights:
        weights_filename = weights
    dqn.load_weights(weights_filename)
    dqn.test(env, nb_episodes=10, visualize=True)