In [1]:
from __future__ import division
import argparse

from PIL import Image
import numpy as np
import gym

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute, Input, Concatenate
from keras.optimizers import Adam
import keras.backend as K

from rl.agents.ddpg import DDPGAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint
from rl.random import OrnsteinUhlenbeckProcess

Using TensorFlow backend.


In [2]:
import matplotlib.pyplot as plt
import sys
from gym_unity.envs import UnityEnv

%matplotlib inline

print("Python version:")
print(sys.version)

# check Python version
if (sys.version_info[0] < 3):
    raise Exception("ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3")
    
INPUT_SHAPE = (128, 128)
WINDOW_LENGTH = 1

Python version:
3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 14:01:38) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [3]:
class BallVecProcessor(Processor):
    def process_observation(self, observation):
        #print(np.shape(observation))
        assert observation.ndim == 3  # (height, width, channel)
        img = np.array(observation)
        processed_observation = np.mean(img, axis=2)
        assert processed_observation.shape == INPUT_SHAPE
        #print(np.shape(processed_observation))
        return processed_observation.astype('uint8')  # saves storage in experience memory
    def process_action(self, action):
        return action
    def process_info(self, info):
        key, value = info.items()
        key = 1
        value = value[1].rewards
        info = {key: value}
        return info
    def process_reward(self, reward):
        return np.clip(reward, -1., 1.) 

In [4]:
env_name = "mlagents/envs/3DBall_128"  # Name of the Unity environment binary to launch
env = UnityEnv(env_name, worker_id=0, use_visual=True)

nb_actions = 2
print(str(env))

INFO:mlagents.envs:
'Ball3DAcademy' started successfully!
Unity Academy name: Ball3DAcademy
        Number of Brains: 1
        Number of External Brains : 1
        Reset Parameters :
		
Unity brain name: Ball3DBrain
        Number of Visual Observations (per agent): 1
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): [2]
        Vector Action descriptions: , 
INFO:gym_unity:1 agents within environment.


<UnityEnv instance>


In [5]:
# Next, we build a very simple model.
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
actor = Sequential()
if K.image_dim_ordering() == 'tf':
    # (width, height, channels)
    actor.add(Permute((2, 3, 1), input_shape=input_shape))
elif K.image_dim_ordering() == 'th':
    # (channels, width, height)
    actor.add(Permute((1, 2, 3), input_shape=input_shape))
else:
    raise RuntimeError('Unknown image_dim_ordering.')
    
actor.add(Convolution2D(32, (8, 8), strides=(4, 4)))
actor.add(Activation('relu'))
actor.add(Convolution2D(64, (4, 4), strides=(2, 2)))
actor.add(Activation('relu'))
actor.add(Convolution2D(64, (3, 3), strides=(1, 1)))
actor.add(Flatten())
actor.add(Dense(512))
actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('linear'))
print(actor.summary()) 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_1 (Permute)          (None, 128, 128, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 31, 31, 32)        2080      
_________________________________________________________________
activation_1 (Activation)    (None, 31, 31, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 14, 14, 64)        32832     
_________________________________________________________________
activation_2 (Activation)    (None, 14, 14, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 12, 12, 64)        36928     
_________________________________________________________________
flatten_1 (Flatten)          (None, 9216)              0         
__________

In [6]:
input_shape1 = (WINDOW_LENGTH,) + INPUT_SHAPE

action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=input_shape1, name='observation_input')

if K.image_dim_ordering() == 'tf':
    # (width, height, channels)
    observation_x = Permute((2, 3, 1), input_shape=input_shape)(observation_input)
elif K.image_dim_ordering() == 'th':
    # (channels, width, height)
    observation_x = Permute((1, 2, 3), input_shape=input_shape)(observation_input)
else:
    raise RuntimeError('Unknown image_dim_ordering.')

observation_x = Convolution2D(32, (8, 8), strides=(4, 4))(observation_x)
observation_x = Activation('relu')(observation_x)
observation_x = Convolution2D(64, (4, 4), strides=(2, 2))(observation_x)
observation_x = Activation('relu')(observation_x)
observation_x = Convolution2D(64, (3, 3), strides=(1, 1))(observation_x)
observation_x = Activation('relu')(observation_x)
flattened_observation = Flatten()(observation_x)
x = Concatenate()([action_input, flattened_observation])
x = Dense(512)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observation_input (InputLayer)  (None, 1, 128, 128)  0                                            
__________________________________________________________________________________________________
permute_2 (Permute)             (None, 128, 128, 1)  0           observation_input[0][0]          
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 31, 31, 32)   2080        permute_2[0][0]                  
__________________________________________________________________________________________________
activation_5 (Activation)       (None, 31, 31, 32)   0           conv2d_4[0][0]                   
__________________________________________________________________________________________________
conv2d_5 (

In [7]:
processor = BallVecProcessor()
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3, processor = processor)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

In [8]:
agent.fit(env, nb_steps=1750000, visualize=True, verbose=2)

Training for 1000 steps ...
  25/1000: episode: 1, duration: 0.887s, episode steps: 25, steps per second: 28, episode reward: 1.400, mean reward: 0.056 [-1.000, 0.100], mean action: -0.056 [-0.183, 0.070], mean observation: 0.000 [0.000, 0.000], loss: --, mean_absolute_error: --, mean_q: --
  55/1000: episode: 2, duration: 0.354s, episode steps: 30, steps per second: 85, episode reward: 1.900, mean reward: 0.063 [-1.000, 0.100], mean action: 0.119 [-0.090, 0.356], mean observation: 0.000 [0.000, 0.000], loss: --, mean_absolute_error: --, mean_q: --
  75/1000: episode: 3, duration: 0.226s, episode steps: 20, steps per second: 88, episode reward: 0.900, mean reward: 0.045 [-1.000, 0.100], mean action: -0.009 [-0.129, 0.117], mean observation: 0.000 [0.000, 0.000], loss: --, mean_absolute_error: --, mean_q: --
  96/1000: episode: 4, duration: 0.266s, episode steps: 21, steps per second: 79, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: 0.025 [-0.047, 0.082], mean

 742/1000: episode: 31, duration: 6.967s, episode steps: 22, steps per second: 3, episode reward: 1.100, mean reward: 0.050 [-1.000, 0.100], mean action: 0.056 [-0.342, 0.458], mean observation: 0.000 [0.000, 0.000], loss: 0.028599, mean_absolute_error: 0.111199, mean_q: 0.087859
 760/1000: episode: 32, duration: 5.700s, episode steps: 18, steps per second: 3, episode reward: 0.700, mean reward: 0.039 [-1.000, 0.100], mean action: 0.144 [-0.007, 0.280], mean observation: 0.000 [0.000, 0.000], loss: 0.017872, mean_absolute_error: 0.051970, mean_q: 0.141372
 781/1000: episode: 33, duration: 6.710s, episode steps: 21, steps per second: 3, episode reward: 1.000, mean reward: 0.048 [-1.000, 0.100], mean action: 0.047 [-0.114, 0.235], mean observation: 0.000 [0.000, 0.000], loss: 0.022071, mean_absolute_error: 0.085105, mean_q: 0.108696
 798/1000: episode: 34, duration: 5.468s, episode steps: 17, steps per second: 3, episode reward: 0.600, mean reward: 0.035 [-1.000, 0.100], mean action: 0.0

<keras.callbacks.History at 0x109d93898>

In [9]:
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)

Testing for 5 episodes ...
Episode 1: reward: 1.500, steps: 26
Episode 2: reward: 1.000, steps: 21
Episode 3: reward: 1.200, steps: 23
Episode 4: reward: 1.000, steps: 21
Episode 5: reward: 1.000, steps: 21


<keras.callbacks.History at 0x102598978>

In [None]:
# After training is done, we save the final weights.
ENV_NAME = '3DBall_128'
agent.save_weights('ddpg_{}_vis_weights.h5f'.format(ENV_NAME), overwrite=True)