diff --git a/python/unityagents/brain.py b/python/unityagents/brain.py
index 2188291018..d2b16d0fcb 100755
--- a/python/unityagents/brain.py
+++ b/python/unityagents/brain.py
@@ -3,8 +3,8 @@ class BrainInfo:
     def __init__(self, visual_observation, vector_observation, text_observations, memory=None,
-                 reward=None, agents=None, local_done=None,
-                 vector_action=None, text_action=None, max_reached=None):
+                 reward=None, agents=None, local_done=None,
+                 vector_action=None, text_action=None, max_reached=None):
         """
         Describes experience at current step of all agents linked to a brain.
         """
@@ -49,10 +49,10 @@ def __str__(self):
         Vector Action space type: {5}
         Vector Action space size (per agent): {6}
         Vector Action descriptions: {7}'''.format(self.brain_name,
-                                                  str(self.number_visual_observations),
-                                                  self.vector_observation_space_type,
-                                                  str(self.vector_observation_space_size),
-                                                  str(self.num_stacked_vector_observations),
-                                                  self.vector_action_space_type,
-                                                  str(self.vector_action_space_size),
-                                                  ', '.join(self.vector_action_descriptions))
+                                                  str(self.number_visual_observations),
+                                                  self.vector_observation_space_type,
+                                                  str(self.vector_observation_space_size),
+                                                  str(self.num_stacked_vector_observations),
+                                                  self.vector_action_space_type,
+                                                  str(self.vector_action_space_size),
+                                                  ', '.join(self.vector_action_descriptions))
diff --git a/python/unitytrainers/ppo/trainer.py b/python/unitytrainers/ppo/trainer.py
index f50b927ad1..92c36d5e29 100755
--- a/python/unitytrainers/ppo/trainer.py
+++ b/python/unitytrainers/ppo/trainer.py
@@ -8,7 +8,7 @@
 import numpy as np
 import tensorflow as tf
 
-from unityagents import AllBrainInfo
+from unityagents import AllBrainInfo, BrainInfo
 from unitytrainers.buffer import Buffer
 from unitytrainers.ppo.models import PPOModel
 from unitytrainers.trainer import UnityTrainerException, Trainer
@@ -196,22 +196,61 @@ def take_action(self, all_brain_info: AllBrainInfo):
         else:
             return run_out[self.model.output], None, None, run_out
 
+    def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
+        """
+        Constructs a BrainInfo which contains the most recent previous experiences for all agents
+        which correspond to the agents in a provided next_info.
+        :BrainInfo next_info: A t+1 BrainInfo.
+        :return: curr_info: Reconstructed BrainInfo to match agents of next_info.
+ """ + visual_observations = [[]] + vector_observations = [] + text_observations = [] + memories = [] + rewards = [] + local_dones = [] + max_reacheds = [] + agents = [] + prev_vector_actions = [] + prev_text_actions = [] + for agent_id in next_info.agents: + agent_brain_info = self.training_buffer[agent_id].last_brain_info + agent_index = agent_brain_info.agents.index(agent_id) + if agent_brain_info is None: + agent_brain_info = next_info + for i in range(len(next_info.visual_observations)): + visual_observations[i].append(agent_brain_info.visual_observations[i][agent_index]) + vector_observations.append(agent_brain_info.vector_observations[agent_index]) + text_observations.append(agent_brain_info.text_observations[agent_index]) + if self.use_recurrent: + memories.append(agent_brain_info.memories[agent_index]) + rewards.append(agent_brain_info.rewards[agent_index]) + local_dones.append(agent_brain_info.local_done[agent_index]) + max_reacheds.append(agent_brain_info.max_reached[agent_index]) + agents.append(agent_brain_info.agents[agent_index]) + prev_vector_actions.append(agent_brain_info.previous_vector_actions[agent_index]) + prev_text_actions.append(agent_brain_info.previous_text_actions[agent_index]) + curr_info = BrainInfo(visual_observations, vector_observations, text_observations, memories, rewards, + agents, local_dones, prev_vector_actions, prev_text_actions, max_reacheds) + return curr_info + def generate_intrinsic_rewards(self, curr_info, next_info): """ Generates intrinsic reward used for Curiosity-based training. - :param curr_info: Current BrainInfo. - :param next_info: Next BrainInfo. + :BrainInfo curr_info: Current BrainInfo. + :BrainInfo next_info: Next BrainInfo. :return: Intrinsic rewards for all agents. """ if self.use_curiosity: - if curr_info.agents != next_info.agents: - raise UnityTrainerException("Training with Curiosity-driven exploration" - " and On-Demand Decision making is currently not supported.") - feed_dict = {self.model.batch_size: len(curr_info.vector_observations), self.model.sequence_length: 1} + feed_dict = {self.model.batch_size: len(next_info.vector_observations), self.model.sequence_length: 1} if self.is_continuous_action: feed_dict[self.model.output] = next_info.previous_vector_actions else: feed_dict[self.model.action_holder] = next_info.previous_vector_actions.flatten() + + if curr_info.agents != next_info.agents: + curr_info = self.construct_curr_info(next_info) + if self.use_visual_obs: for i in range(len(curr_info.visual_observations)): feed_dict[self.model.visual_in[i]] = curr_info.visual_observations[i] @@ -262,12 +301,12 @@ def add_experiences(self, curr_all_info: AllBrainInfo, next_all_info: AllBrainIn curr_info = curr_all_info[self.brain_name] next_info = next_all_info[self.brain_name] - intrinsic_rewards = self.generate_intrinsic_rewards(curr_info, next_info) - for agent_id in curr_info.agents: self.training_buffer[agent_id].last_brain_info = curr_info self.training_buffer[agent_id].last_take_action_outputs = take_action_outputs + intrinsic_rewards = self.generate_intrinsic_rewards(curr_info, next_info) + for agent_id in next_info.agents: stored_info = self.training_buffer[agent_id].last_brain_info stored_take_action_outputs = self.training_buffer[agent_id].last_take_action_outputs