15 changes: 14 additions & 1 deletion python/ppo/history.py
@@ -1,6 +1,6 @@
import numpy as np

-history_keys = ['states', 'observations', 'actions', 'rewards', 'action_probs', 'epsilons',
+history_keys = ['states', 'actions', 'rewards', 'action_probs', 'epsilons',
'value_estimates', 'advantages', 'discounted_returns']


@@ -44,6 +44,8 @@ def empty_local_history(agent_dict):
"""
for key in history_keys:
agent_dict[key] = []
+for i, _ in enumerate(key for key in agent_dict.keys() if key.startswith('observations')):
+    agent_dict['observations%d' % i] = []
return agent_dict


@@ -55,6 +57,8 @@ def vectorize_history(agent_dict):
"""
for key in history_keys:
agent_dict[key] = np.array(agent_dict[key])
+for key in (key for key in agent_dict.keys() if key.startswith('observations')):
+    agent_dict[key] = np.array(agent_dict[key])
return agent_dict


@@ -70,6 +74,8 @@ def empty_all_history(agent_info):
history_dict[agent] = empty_local_history(history_dict[agent])
history_dict[agent]['cumulative_reward'] = 0
history_dict[agent]['episode_steps'] = 0
+for i, _ in enumerate(agent_info.observations):
+    history_dict[agent]['observations%d' % i] = []
return history_dict


@@ -82,6 +88,8 @@ def append_history(global_buffer, local_buffer=None):
"""
for key in history_keys:
global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
+for key in (key for key in local_buffer.keys() if key.startswith('observations')):
+    global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
return global_buffer


@@ -94,6 +102,8 @@ def set_history(global_buffer, local_buffer=None):
"""
for key in history_keys:
global_buffer[key] = np.copy(local_buffer[key])
+for key in (key for key in local_buffer.keys() if key.startswith('observations')):
+    global_buffer[key] = np.array(local_buffer[key])
return global_buffer


@@ -108,4 +118,7 @@ def shuffle_buffer(global_buffer):
for key in history_keys:
if len(global_buffer[key]) > 0:
global_buffer[key] = global_buffer[key][s]
+for key in (key for key in global_buffer.keys() if key.startswith('observations')):
+    if len(global_buffer[key]) > 0:
+        global_buffer[key] = global_buffer[key][s]
return global_buffer
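
The history.py changes replace the single 'observations' key with one dynamically named key per camera ('observations0', 'observations1', ...), which the generic loops above then discover by prefix. The following minimal sketch is not part of the PR (buffer contents and shapes are invented) and only illustrates how those keys behave:

import numpy as np

# Hypothetical buffer for an agent with two cameras, using the
# 'observations%d' naming convention introduced in this diff.
local_buffer = {'states': [], 'observations0': [], 'observations1': []}

for _ in range(3):                                                 # three fake steps
    local_buffer['states'].append(np.zeros(8))                     # flat state vector
    local_buffer['observations0'].append(np.zeros((84, 84, 3)))    # RGB camera frame
    local_buffer['observations1'].append(np.zeros((32, 32, 1)))    # black-and-white camera frame

# Generic code can now find every visual stream by prefix, exactly as
# vectorize_history and shuffle_buffer do above.
for key in (k for k in local_buffer if k.startswith('observations')):
    local_buffer[key] = np.array(local_buffer[key])
    print(key, local_buffer[key].shape)   # e.g. observations0 (3, 84, 84, 3)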
27 changes: 16 additions & 11 deletions python/ppo/models.py
@@ -61,6 +61,7 @@ def export_graph(model_path, env_name="env", target_nodes="action,value_estimate
class PPOModel(object):
def __init__(self):
self.normalize = False
+self.observation_in = []

def create_global_steps(self):
"""Creates TF ops to track and increment global training step."""
@@ -89,11 +90,11 @@ def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, act
else:
c_channels = 3

-self.observation_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
-                                     name='observation_0')
+self.observation_in.append(tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
+                                           name='observation_%d' % len(self.observation_in)))
streams = []
for i in range(num_streams):
-self.conv1 = tf.layers.conv2d(self.observation_in, 16, kernel_size=[8, 8], strides=[4, 4],
+self.conv1 = tf.layers.conv2d(self.observation_in[-1], 16, kernel_size=[8, 8], strides=[4, 4],
use_bias=False, activation=activation)
self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
use_bias=False, activation=activation)
@@ -213,10 +214,12 @@ def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
self.create_reward_encoder()

hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
-if brain.number_observations > 0:
-    height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-    bw = brain.camera_resolutions[0]['blackAndWhite']
-    hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
+encoders = []
+for i in range(brain.number_observations):
+    height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+    bw = brain.camera_resolutions[i]['blackAndWhite']
+    encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers))
+hidden_visual = tf.concat(encoders, axis=2)
if brain.state_space_size > 0:
s_size = brain.state_space_size
if brain.state_space_type == "continuous":
@@ -275,10 +278,12 @@ def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_la
self.normalize = normalize

hidden_state, hidden_visual, hidden = None, None, None
-if brain.number_observations > 0:
-    height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-    bw = brain.camera_resolutions[0]['blackAndWhite']
-    hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
+encoders = []
+for i in range(brain.number_observations):
+    height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+    bw = brain.camera_resolutions[i]['blackAndWhite']
+    encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0])
+hidden_visual = tf.concat(encoders, axis=1)
if brain.state_space_size > 0:
s_size = brain.state_space_size
if brain.state_space_type == "continuous":
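
In models.py, observation_in becomes a list: each call to create_visual_encoder appends a placeholder named observation_%d, builds a conv stack on the newest entry, and the callers concatenate one encoder output per camera. The sketch below is only an illustration of that bookkeeping pattern under the TF1-style graph API the repo uses; the class, layer sizes, and the two-camera brain are invented, and the real method also builds multiple value/policy streams and deeper stacks.

import tensorflow as tf  # TF1-style graph API, matching what the diff uses


class VisualEncoderSketch(object):
    """Illustrative stand-in for PPOModel's per-camera placeholder handling."""

    def __init__(self):
        self.observation_in = []  # one placeholder per camera, as in the PR

    def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, activation):
        c_channels = 1 if bw else 3
        self.observation_in.append(
            tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels],
                           dtype=tf.float32,
                           name='observation_%d' % len(self.observation_in)))
        conv = tf.layers.conv2d(self.observation_in[-1], 16, kernel_size=[8, 8],
                                strides=[4, 4], use_bias=False,
                                activation=activation)
        flat = tf.layers.flatten(conv)
        return tf.layers.dense(flat, h_size, activation=activation)


# Hypothetical brain with an RGB and a black-and-white camera: the two calls
# register placeholders named observation_0 and observation_1, and their
# encodings are concatenated into a single visual hidden layer.
model = VisualEncoderSketch()
encoders = [model.create_visual_encoder(84, 84, False, 256, tf.nn.elu),
            model.create_visual_encoder(32, 32, True, 256, tf.nn.elu)]
hidden_visual = tf.concat(encoders, axis=1)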
12 changes: 8 additions & 4 deletions python/ppo/trainer.py
@@ -57,7 +57,8 @@ def take_action(self, info, env, brain_name, steps, normalize):
epsi = np.random.randn(len(info.states), env.brains[brain_name].action_space_size)
feed_dict[self.model.epsilon] = epsi
if self.use_observations:
-feed_dict[self.model.observation_in] = np.vstack(info.observations)
+for i, _ in enumerate(info.observations):
+    feed_dict[self.model.observation_in[i]] = info.observations[i]
if self.use_states:
feed_dict[self.model.state_in] = info.states
if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states and normalize:
@@ -91,7 +92,8 @@ def add_experiences(self, info, next_info, epsi, actions, a_dist, value):
idx = info.agents.index(agent)
if not info.local_done[idx]:
if self.use_observations:
-history['observations'].append([info.observations[0][idx]])
+for i, _ in enumerate(info.observations):
+    history['observations%d' % i].append([info.observations[i][idx]])
if self.use_states:
history['states'].append(info.states[idx])
if self.is_continuous:
@@ -120,7 +122,8 @@ def process_experiences(self, info, time_horizon, gamma, lambd):
else:
feed_dict = {self.model.batch_size: len(info.states)}
if self.use_observations:
-feed_dict[self.model.observation_in] = np.vstack(info.observations)
+for i, _ in enumerate(info.observations):
+    feed_dict[self.model.observation_in[i]] = info.observations[i]
if self.use_states:
feed_dict[self.model.state_in] = info.states
value_next = self.sess.run(self.model.value, feed_dict)[l]
@@ -176,7 +179,8 @@ def update_model(self, batch_size, num_epoch):
if self.use_states:
feed_dict[self.model.state_in] = np.vstack(training_buffer['states'][start:end])
if self.use_observations:
-feed_dict[self.model.observation_in] = np.vstack(training_buffer['observations'][start:end])
+for i, _ in enumerate(self.model.observation_in):
+    feed_dict[self.model.observation_in[i]] = np.vstack(training_buffer['observations%d' % i][start:end])
v_loss, p_loss, _ = self.sess.run([self.model.value_loss, self.model.policy_loss,
self.model.update_batch], feed_dict=feed_dict)
total_v += v_loss
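
The trainer changes follow the same pattern on the feeding side: instead of vstack-ing every frame into one placeholder, each camera's batch is fed to its own entry in model.observation_in, and experiences are appended under the matching 'observations%d' key. Below is a hedged sketch of that wiring; the model and info objects are stand-ins for illustration, not the repo's classes.

def build_observation_feed(model, info):
    """Feed one array per camera, mirroring take_action and process_experiences.

    Assumes model.observation_in[i] is the placeholder for camera i and
    info.observations[i] is a batch shaped [num_agents, height, width, channels].
    """
    feed_dict = {}
    for i, obs in enumerate(info.observations):
        feed_dict[model.observation_in[i]] = obs
    return feed_dict


def append_observations(history, info, idx):
    """Store agent idx's frame from every camera under its 'observations%d' key,
    mirroring add_experiences above."""
    for i, obs in enumerate(info.observations):
        history['observations%d' % i].append([obs[idx]])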