From b7d17e60580c761172f792f192c5a52875a661bf Mon Sep 17 00:00:00 2001
From: Alfredo Solano Martinez
Date: Wed, 17 Jan 2018 10:42:24 +0900
Subject: [PATCH 1/2] Initial support for multiple observations

---
 python/ppo/history.py | 15 ++++++++++++++-
 python/ppo/models.py  | 29 +++++++++++++++++------------
 python/ppo/trainer.py | 12 ++++++++----
 3 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/python/ppo/history.py b/python/ppo/history.py
index 547e805318..4d3a0003f9 100755
--- a/python/ppo/history.py
+++ b/python/ppo/history.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-history_keys = ['states', 'observations', 'actions', 'rewards', 'action_probs', 'epsilons',
+history_keys = ['states', 'actions', 'rewards', 'action_probs', 'epsilons',
                 'value_estimates', 'advantages', 'discounted_returns']
 
 
@@ -44,6 +44,8 @@ def empty_local_history(agent_dict):
     """
     for key in history_keys:
         agent_dict[key] = []
+    for i, _ in enumerate(key for key in agent_dict.keys() if key.startswith('observations')):
+        agent_dict['observations%d' % i] = []
     return agent_dict
 
 
@@ -55,6 +57,8 @@ def vectorize_history(agent_dict):
     """
     for key in history_keys:
         agent_dict[key] = np.array(agent_dict[key])
+    for key in (key for key in agent_dict.keys() if key.startswith('observations')):
+        agent_dict[key] = np.array(agent_dict[key])
     return agent_dict
 
 
@@ -70,6 +74,8 @@ def empty_all_history(agent_info):
         history_dict[agent] = empty_local_history(history_dict[agent])
         history_dict[agent]['cumulative_reward'] = 0
         history_dict[agent]['episode_steps'] = 0
+        for i, _ in enumerate(agent_info.observations):
+            history_dict[agent]['observations%d' % i] = []
     return history_dict
 
 
@@ -82,6 +88,8 @@ def append_history(global_buffer, local_buffer=None):
     """
     for key in history_keys:
         global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
+    for key in (key for key in local_buffer.keys() if key.startswith('observations')):
+        global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
     return global_buffer
 
 
@@ -94,6 +102,8 @@ def set_history(global_buffer, local_buffer=None):
     """
     for key in history_keys:
         global_buffer[key] = np.copy(local_buffer[key])
+    for key in (key for key in local_buffer.keys() if key.startswith('observations')):
+        global_buffer[key] = np.array(local_buffer[key])
     return global_buffer
 
 
@@ -108,4 +118,7 @@ def shuffle_buffer(global_buffer):
     for key in history_keys:
         if len(global_buffer[key]) > 0:
             global_buffer[key] = global_buffer[key][s]
+    for key in (key for key in global_buffer.keys() if key.startswith('observations')):
+        if len(global_buffer[key]) > 0:
+            global_buffer[key] = global_buffer[key][s]
     return global_buffer
diff --git a/python/ppo/models.py b/python/ppo/models.py
index 7dd8ec194b..fd60ce0686 100755
--- a/python/ppo/models.py
+++ b/python/ppo/models.py
@@ -61,6 +61,7 @@ def export_graph(model_path, env_name="env", target_nodes="action,value_estimate
 class PPOModel(object):
     def __init__(self):
         self.normalize = False
+        self.observation_in = []
 
     def create_global_steps(self):
         """Creates TF ops to track and increment global training step."""
@@ -89,11 +90,11 @@ def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, act
         else:
             c_channels = 3
 
-        self.observation_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
-                                             name='observation_0')
+        self.observation_in.append(tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
+                                                  name='observation_%d' % len(self.observation_in)))
         streams = []
         for i in range(num_streams):
-            self.conv1 = tf.layers.conv2d(self.observation_in, 16, kernel_size=[8, 8], strides=[4, 4],
+            self.conv1 = tf.layers.conv2d(self.observation_in[-1], 16, kernel_size=[8, 8], strides=[4, 4],
                                           use_bias=False, activation=activation)
             self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                                           use_bias=False, activation=activation)
@@ -213,10 +214,12 @@ def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
         self.create_reward_encoder()
 
         hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
-        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
+        encoders = []
+        for i in range(brain.number_observations):
+            height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+            bw = brain.camera_resolutions[i]['blackAndWhite']
+            encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers))
+        hidden_visual = [tf.concat(encoders, axis=1)]
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
@@ -275,10 +278,12 @@ def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_la
         self.normalize = normalize
 
         hidden_state, hidden_visual, hidden = None, None, None
-        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
+        encoders = []
+        for i in range(brain.number_observations):
+            height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+            bw = brain.camera_resolutions[i]['blackAndWhite']
+            encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0])
+        hidden_visual = [tf.concat(encoders, axis=1)]
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
@@ -294,7 +299,7 @@ def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_la
         elif hidden_visual is None and hidden_state is not None:
             hidden = hidden_state
         elif hidden_visual is not None and hidden_state is not None:
-            hidden = tf.concat([hidden_visual, hidden_state], axis=1)
+            hidden = tf.concat([hidden_visual[0], hidden_state], axis=1)
 
         a_size = brain.action_space_size
 
diff --git a/python/ppo/trainer.py b/python/ppo/trainer.py
index 4a4e495298..8f1062a3bc 100755
--- a/python/ppo/trainer.py
+++ b/python/ppo/trainer.py
@@ -57,7 +57,8 @@ def take_action(self, info, env, brain_name, steps, normalize):
             epsi = np.random.randn(len(info.states), env.brains[brain_name].action_space_size)
             feed_dict[self.model.epsilon] = epsi
         if self.use_observations:
-            feed_dict[self.model.observation_in] = np.vstack(info.observations)
+            for i, _ in enumerate(info.observations):
+                feed_dict[self.model.observation_in[i]] = info.observations[i]
         if self.use_states:
             feed_dict[self.model.state_in] = info.states
         if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states and normalize:
@@ -91,7 +92,8 @@ def add_experiences(self, info, next_info, epsi, actions, a_dist, value):
             idx = info.agents.index(agent)
             if not info.local_done[idx]:
                 if self.use_observations:
-                    history['observations'].append([info.observations[0][idx]])
+                    for i, _ in enumerate(info.observations):
+                        history['observations%d' % i].append([info.observations[i][idx]])
                 if self.use_states:
                     history['states'].append(info.states[idx])
                 if self.is_continuous:
@@ -120,7 +122,8 @@ def process_experiences(self, info, time_horizon, gamma, lambd):
                 else:
                     feed_dict = {self.model.batch_size: len(info.states)}
                     if self.use_observations:
-                        feed_dict[self.model.observation_in] = np.vstack(info.observations)
+                        for i, _ in enumerate(info.observations):
+                            feed_dict[self.model.observation_in[i]] = info.observations[i]
                     if self.use_states:
                         feed_dict[self.model.state_in] = info.states
                     value_next = self.sess.run(self.model.value, feed_dict)[l]
@@ -176,7 +179,8 @@ def update_model(self, batch_size, num_epoch):
                 if self.use_states:
                     feed_dict[self.model.state_in] = np.vstack(training_buffer['states'][start:end])
                 if self.use_observations:
-                    feed_dict[self.model.observation_in] = np.vstack(training_buffer['observations'][start:end])
+                    for i, _ in enumerate(self.model.observation_in):
+                        feed_dict[self.model.observation_in[i]] = np.vstack(training_buffer['observations%d' % i][start:end])
                 v_loss, p_loss, _ = self.sess.run([self.model.value_loss, self.model.policy_loss,
                                                    self.model.update_batch], feed_dict=feed_dict)
                 total_v += v_loss

From 6173d17a76029990e5644cf0fcddccc88fcf273b Mon Sep 17 00:00:00 2001
From: Alfredo Solano Martinez
Date: Thu, 18 Jan 2018 11:17:56 +0900
Subject: [PATCH 2/2] Fix PPO for continuous control

---
 python/ppo/models.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/ppo/models.py b/python/ppo/models.py
index fd60ce0686..be985b6e4d 100755
--- a/python/ppo/models.py
+++ b/python/ppo/models.py
@@ -219,7 +219,7 @@ def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
             height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
             bw = brain.camera_resolutions[i]['blackAndWhite']
             encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers))
-        hidden_visual = [tf.concat(encoders, axis=1)]
+        hidden_visual = tf.concat(encoders, axis=2)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
@@ -283,7 +283,7 @@ def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_la
             height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
             bw = brain.camera_resolutions[i]['blackAndWhite']
             encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0])
-        hidden_visual = [tf.concat(encoders, axis=1)]
+        hidden_visual = tf.concat(encoders, axis=1)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
@@ -299,7 +299,7 @@ def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_la
         elif hidden_visual is None and hidden_state is not None:
             hidden = hidden_state
         elif hidden_visual is not None and hidden_state is not None:
-            hidden = tf.concat([hidden_visual[0], hidden_state], axis=1)
+            hidden = tf.concat([hidden_visual, hidden_state], axis=1)
 
         a_size = brain.action_space_size
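
Note: the sketch below is a minimal, self-contained illustration of the per-camera pattern these patches introduce: one observation placeholder per camera on the model, one 'observations%d' history key per camera, and each feed entry filled from info.observations[i]. It uses plain NumPy with hypothetical stand-ins (FakeModel, FakeBrainInfo) in place of the real TensorFlow model and BrainInfo, so it is only an approximation of the patched code paths.

import numpy as np

# Hypothetical stand-ins for the real TF model and BrainInfo; only the fields
# used below exist, so this runs without TensorFlow or Unity.
class FakeModel:
    def __init__(self, num_cameras):
        # Mirrors PPOModel.observation_in: one placeholder handle per camera, in order.
        self.observation_in = ['observation_%d' % i for i in range(num_cameras)]

class FakeBrainInfo:
    def __init__(self, observations):
        # Mirrors BrainInfo.observations: one array per camera,
        # shaped [num_agents, height, width, channels].
        self.observations = observations

num_agents, cameras = 4, [(84, 84, 3), (32, 32, 1)]
info = FakeBrainInfo([np.zeros((num_agents,) + res, dtype=np.float32) for res in cameras])
model = FakeModel(len(cameras))

# take_action / process_experiences pattern: feed every camera, not just camera 0.
feed_dict = {}
for i, _ in enumerate(info.observations):
    feed_dict[model.observation_in[i]] = info.observations[i]

# add_experiences pattern: per-camera history keys instead of a single 'observations' key.
history = {'observations%d' % i: [] for i, _ in enumerate(info.observations)}
for idx in range(num_agents):
    for i, _ in enumerate(info.observations):
        history['observations%d' % i].append([info.observations[i][idx]])

# update_model pattern: stack each camera's buffered frames back into a batch.
batch = {i: np.vstack(history['observations%d' % i]) for i, _ in enumerate(info.observations)}
print({i: batch[i].shape for i in batch})  # {0: (4, 84, 84, 3), 1: (4, 32, 32, 1)}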