In [1]:
# Basics
from Solver import Particle, Perceptron, PerceptronModel, VicsekModel, NeuralNetwork, PerceptronMode, Mode, NeuralSwarmModel

import numpy        as np
import os
import logging
import time
import matplotlib.pyplot as plt

# Logging
# logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import ray
from ray import tune
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.utils import check_env
from ray.rllib.algorithms.maddpg import MADDPGConfig
from ray.tune.registry import register_env
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.algorithms.maddpg import maddpg_tf_policy

from gymnasium.spaces import Box


  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


# Simulation Parameters

In [2]:
# Simulation presets. Each entry is ordered as (N, L, v, noise, r).
settings = dict(
    testing=[1, 10, 0.03, 0.1, 1],
    small=[100, 10, 0.03, 0.1, 1],
    medium=[1000, 10, 0.03, 0.1, 1],
)

# Neighbour-selection mode: RADIUS, FIXED or FIXEDRADIUS (don't use RADIUS)
mode = Mode.FIXEDRADIUS

# Flags
ZDimension = False     # 2D or 3D
seed = False           # Random seed

# Pick a preset and unpack it into the module-level parameters
chosen_settings = settings["small"]
N, L, v, noise, r = chosen_settings

# Number of neighbours; also sizes the observation vector (k_neighbors + 1)
k_neighbors = 5

# Timesteps in an episode
T = 1000

# Custom Multi-Agent Environment

In [3]:
class MultiAgentSimulationEnv(MultiAgentEnv):
    """RLlib multi-agent environment wrapping a ``NeuralSwarmModel`` simulation.

    Each agent (integer id ``0 .. N-1``) submits one scalar action per step,
    which is clipped to ``[minimum, maximum]`` and handed to the simulation as
    that agent's new angle. All agents receive the same reward, the value of
    ``simulation.mean_direction2D()`` after the update.

    The simulation parameters (``N``, ``L``, ``v``, ``noise``, ``r``, ``mode``,
    ``k_neighbors``, ``ZDimension``, ``seed``, ``T``) are read from module-level
    globals, not from the ``config`` argument.
    """

    # Bounds shared by the action space and the observation space.
    minimum = 0.0
    maximum = 2 * np.pi

    def __init__(self, config):
        """Create the spaces and a fresh simulation.

        Args:
            config: Environment config passed in by RLlib. Currently unused.
        """
        super().__init__()
        self.num_agents = N
        self._spaces_in_preferred_format = True
        # Derive the id set from the agent count instead of a hard-coded 100,
        # so the "testing" and "medium" presets stay consistent.
        self._agent_ids = set(range(self.num_agents))

        # We assume the same action space for all agents
        self.action_space = Box(low=self.minimum, high=self.maximum, shape=(), dtype=np.float64)

        # We assume the same observation space for all agents
        self.observation_space = Box(low=self.minimum, high=self.maximum, shape=(k_neighbors + 1,), dtype=np.float64)

        self.simulation = NeuralSwarmModel(N, L, v, noise, r, mode, k_neighbors, ZDimension, seed=seed)
        # Buffer holding the most recent action of every agent.
        self.new_angles = np.zeros(shape=(N,), dtype=np.float64)
        # Number of steps taken in the current episode.
        self.index = 0

    def reset(self, seed=None, options=None):
        """Start a new episode with a freshly constructed simulation.

        Args:
            seed: Accepted for API compatibility; not forwarded to the simulation.
            options: Accepted for API compatibility; unused.

        Returns:
            Tuple ``(observations, infos)``; ``observations`` maps every agent
            id to ``simulation.get_angles(agent_id)`` and ``infos`` is empty.
        """
        # NOTE(review): the simulation is always rebuilt with seed=False here,
        # whereas __init__ passes the module-level `seed` flag - confirm intent.
        self.simulation = NeuralSwarmModel(N, L, v, noise, r, mode, k_neighbors, ZDimension, seed=False)
        self.index = 0
        self.new_angles = np.zeros(shape=(N,), dtype=np.float64)

        observations = {
            agent_id: self.simulation.get_angles(agent_id)
            for agent_id in range(self.num_agents)
        }
        infos = {}
        return observations, infos

    def step(self, action_dict):
        """Apply the agents' actions and advance the simulation by one step.

        Args:
            action_dict: Mapping ``{agent_id: action}``. Agents missing from
                the mapping keep the action they submitted previously.

        Returns:
            Tuple ``(new_obs, rewards, dones, truncated, infos)`` of dicts.
            ``new_obs`` and ``rewards`` are keyed by every agent id. ``dones``
            holds the agents present in ``action_dict`` plus ``"__all__"``.
            ``truncated`` only holds ``"__all__"`` (always ``False``).
            ``infos`` is empty.
        """
        rewards = {}
        new_obs = {}
        dones = {}
        # Episodes are never cut short; RLlib still reads the "__all__" key.
        truncated = {"__all__": False}
        # Infos can be used to provide extra information about an agent's state or action
        infos = {}

        # Evaluated before the counter is incremented, as in the original code.
        episode_over = self.index >= T

        # Collect all actions and set dones
        for agent_id, action in action_dict.items():
            self.new_angles[agent_id] = np.clip(action, self.minimum, self.maximum)
            dones[agent_id] = episode_over

        # Update the simulation
        self.simulation.update_angles(self.new_angles)
        self.simulation.update()
        self.index += 1
        reward = self.simulation.mean_direction2D()

        # Collect observations and rewards (the reward is shared by all agents)
        for agent_id in range(self.num_agents):
            new_obs[agent_id] = self.simulation.get_angles(agent_id)
            rewards[agent_id] = reward

        # Use the step counter rather than all(dones.values()): the latter is
        # True for an empty action_dict and would end the episode immediately.
        dones['__all__'] = episode_over

        return new_obs, rewards, dones, truncated, infos

    def render(self, mode='human'):
        """Plot the current particle positions with matplotlib.

        Marked as NOT YET FUNCTIONAL by the original author. Reads
        ``self.simulation.particles`` and each particle's ``x`` / ``y``.

        Args:
            mode: Render mode; unused.
        """
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.set_xlim(0, L)
        ax.set_ylim(0, L)
        ax.set_aspect('equal')
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title('Simulation')

        for particle in self.simulation.particles:
            ax.plot(particle.x, particle.y, 'o', color='black', markersize=10)

        plt.show()

    def close(self):
        """Clean-up hook; nothing to release."""
        pass

    # Optional methods
    def observation_space_contains(self, observation):
        """Return True if every value of ``{agent_id: observation}`` is valid."""
        return all(self.observation_space.contains(obs) for obs in observation.values())

    def action_space_contains(self, action):
        """Return True if every value of ``{agent_id: action}`` is valid."""
        return all(self.action_space.contains(act) for act in action.values())

    def observation_space_sample(self):
        """Return a random observation for every agent as ``{agent_id: obs}``."""
        return {agent_id: self.observation_space.sample() for agent_id in range(self.num_agents)}

    def action_space_sample(self, action=None):
        """Return a random action for every agent as ``{agent_id: action}``.

        Args:
            action: Ignored. It now defaults to ``None`` so the method can be
                called without arguments.
        """
        return {agent_id: self.action_space.sample() for agent_id in range(self.num_agents)}


# Create and test environment

In [4]:
# Smoke test: drive the environment with random actions for two episodes.
env = MultiAgentSimulationEnv(None)

for i_episode in range(2):
    episode_number = i_episode + 1
    observations, infos = env.reset()
    # Per-agent running sum of rewards for this episode
    total_rewards = dict.fromkeys(observations, 0)
    print(f"Starting episode {episode_number}")

    # Upper bound on the number of steps per episode
    for t in range(T + 1):
        # Optional: Render the environment for visualization
        # env.render()

        # One random action per agent
        actions = {}
        for agent_id in observations:
            actions[agent_id] = env.action_space.sample()

        observations, rewards, dones, truncated, infos = env.step(actions)

        for agent_id in rewards:
            total_rewards[agent_id] += rewards[agent_id]

        print(f"Step {t}... \r", end="")

        if not any(dones.values()):
            continue

        print(f"Step {t} finished")
        # All agents share one reward, so the first entry is representative.
        print(f"Episode {episode_number} finished after {t} timesteps with rewards: {next(iter(rewards.values()))}")
        break

env.close()

Starting episode 1
Step 1000 finished
Episode 1 finished after 1000 timesteps with rewards: 0.07974984115534882
Starting episode 2
Step 1000 finished
Episode 2 finished after 1000 timesteps with rewards: 0.11110731104797063


# Policy Mapping

In the following code, ``policy_mapping_fn(agent_id)`` maps each agent to a policy based on the agent's ID. The selected policy is then used to compute that agent's action.

In this case, a single shared policy is used for all agents. The policy is defined in the ``get_shared_policy`` function. The policy is a simple neural network.



In [5]:
# Map the string name "agent_<i>" to the integer id i for every agent
agent_ids = {f"agent_{i}": i for i in range(N)}

def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Returns the policy that should be used by the agent with the id agent_id.
    In this case, all agents share the same policy.

    Later on, multiple policies can be used for different agents.

    Args:
        agent_id: Id of the agent asking for a policy. Unused, because every
            agent maps to the same policy.
        episode: Optional episode object supplied by RLlib. Unused.
        worker: Optional rollout worker supplied by RLlib. Unused.
        **kwargs: Any further arguments RLlib may pass. Ignored.

    Returns:
        The policy id ``"shared_policy"``.
    """
    # The extra parameters exist only so RLlib can call this function with
    # (agent_id, episode, worker=...) without raising a TypeError.
    return "shared_policy"


def get_shared_policy():
    """Build the policy dict containing the single policy shared by all agents.

    The spaces are taken from the module-level ``env`` instance.

    Returns:
        A dict ``{"shared_policy": PolicySpec}``.
    """
    shared_spec = PolicySpec(
        # Can also be set to None. Should be the same.
        policy_class=maddpg_tf_policy.MADDPGTFPolicy,
        observation_space=env.observation_space,
        action_space=env.action_space,
        # Or maybe config=agent_ids? But then the constructor of MADDPGTFPolicy throws an error
        config={},
    )
    return {"shared_policy": shared_spec}


# Create an individual policy for each agent (for testing)
This doesn't work for some reason. It conflicts with the tensorflow framework. The tensor shapes aren't correct. I'm not sure how to fix this. I think it's because the policy is expecting a batch of observations, but I'm only giving it one observation.

In [6]:
# NOTE(review): this disabled variant is kept for reference only.
# The mapping function below returns names of the form "policy_for_<agent_id>",
# while get_individual_policies registers "policy_for_agent_<i>". With the
# integer agent ids the environment uses, the two never match - reconcile the
# names before re-enabling this cell.
# def policy_mapping_fn(agent_id):
#     """Returns the policy that should be used by the agent with the id agent_id.
#     Here, each agent gets its own policy based on its agent_id."""
#     return "policy_for_{}".format(agent_id)

# def get_individual_policies():
#     """Create a separate policy for each agent."""
#     policies = {}
#     for i in range(N):
#         policy_name = "policy_for_agent_{}".format(i)
#         policies[policy_name] = PolicySpec(
#             policy_class=None,
#             observation_space=env.observation_space,
#             action_space=env.action_space,
#             config={"agent_id": i}
#         )
#     return policies

# Configurations

`config` is not an ordinary ``dict``. To set its values, use the `config.update_from_dict` function.

# Default MADDPG config settings
```python
extra_python_environs_for_driver {}
extra_python_environs_for_worker {}
num_gpus 0
num_cpus_per_worker 1
num_gpus_per_worker 0
_fake_gpus False
num_learner_workers 0
num_gpus_per_learner_worker 0
num_cpus_per_learner_worker 1
local_gpu_idx 0
custom_resources_per_worker {}
placement_strategy PACK
eager_tracing True
eager_max_retraces 20
tf_session_args {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}
local_tf_session_args {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}
torch_compile_learner False
torch_compile_learner_what_to_compile TorchCompileWhatToCompile.FORWARD_TRAIN
torch_compile_learner_dynamo_backend inductor
torch_compile_learner_dynamo_mode None
torch_compile_worker False
torch_compile_worker_dynamo_backend onnxrt
torch_compile_worker_dynamo_mode None
env None
env_config {}
observation_space None
action_space None
env_task_fn None
render_env False
clip_rewards None
normalize_actions True
clip_actions False
disable_env_checking False
_is_atari None
auto_wrap_old_gym_envs True
action_mask_key action_mask
env_runner_cls None
num_envs_per_worker 1
sample_collector <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector'>
sample_async False
enable_connectors True
update_worker_filter_stats True
use_worker_filter_stats True
rollout_fragment_length 100
batch_mode truncate_episodes
remote_worker_envs False
remote_env_batch_wait_ms 0
validate_workers_after_construction True
preprocessor_pref deepmind
observation_filter NoFilter
compress_observations False
enable_tf1_exec_eagerly False
sampler_perf_stats_ema_coef None
gamma 0.99
lr 0.001
grad_clip None
grad_clip_by global_norm
train_batch_size 1024
model {'_disable_preprocessor_api': False, '_disable_action_flattening': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': True, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action': False, 'lstm_use_prev_reward': False, '_time_major': False, 'use_attention': False, 'attention_num_transformer_units': 1, 'attention_dim': 64, 'attention_num_heads': 1, 'attention_head_dim': 32, 'attention_memory_inference': 50, 'attention_memory_training': 50, 'attention_position_wise_mlp_dim': 32, 'attention_init_gru_gate_bias': 2.0, 'attention_use_n_prev_actions': 0, 'attention_use_n_prev_rewards': 0, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': None, 'custom_model_config': {}, 'custom_action_dist': None, 'custom_preprocessor': None, 'encoder_latent_dim': None, 'always_check_shapes': False, 'lstm_use_prev_action_reward': -1, '_use_default_native_models': -1}
optimizer {}
max_requests_in_flight_per_sampler_worker 2
_learner_class None
_enable_learner_api False
explore True
exploration_config {'type': 'StochasticSampling'}
algorithm_config_overrides_per_module {}
policy_map_capacity 100
policy_mapping_fn <function AlgorithmConfig.DEFAULT_POLICY_MAPPING_FN at 0x7f311e7b6320>
policies_to_train None
policy_states_are_swappable False
observation_fn None
count_steps_by env_steps
input_config {}
actions_in_input_normalized False
postprocess_inputs False
shuffle_buffer_size 0
output None
output_config {}
output_compress_columns ['obs', 'new_obs']
output_max_file_size 67108864
offline_sampling False
evaluation_interval None
evaluation_duration 10
evaluation_duration_unit episodes
evaluation_sample_timeout_s 180.0
evaluation_parallel_to_training False
evaluation_config None
off_policy_estimation_methods {}
ope_split_batch_by_episode True
evaluation_num_workers 0
always_attach_evaluation_results False
enable_async_evaluation False
in_evaluation False
sync_filters_on_rollout_workers_timeout_s 60.0
keep_per_episode_custom_metrics False
metrics_episode_collection_timeout_s 60.0
metrics_num_episodes_for_smoothing 100
min_time_s_per_iteration 0
min_train_timesteps_per_iteration 0
min_sample_timesteps_per_iteration 0
export_native_model_files False
checkpoint_trainable_policies_only False
logger_creator None
logger_config None
log_level WARN
log_sys_usage True
fake_sampler False
seed None
ignore_worker_failures False
recreate_failed_workers False
max_num_worker_restarts 1000
delay_between_worker_restarts_s 60.0
restart_failed_sub_environments False
num_consecutive_worker_failures_tolerance 100
worker_health_probe_timeout_s 60
worker_restore_timeout_s 1800
rl_module_spec None
_enable_rl_module_api False
_AlgorithmConfig__prior_exploration_config None
_tf_policy_handles_more_than_one_loss False
_disable_preprocessor_api False
_disable_action_flattening False
_disable_execution_plan_api True
_disable_initialize_loss_from_dummy_batch False
simple_optimizer -1
policy_map_cache -1
worker_cls -1
synchronize_filters -1
replay_sequence_length None
agent_id None
use_local_critic False
use_state_preprocessor False
actor_hiddens [64, 64]
actor_hidden_activation relu
critic_hiddens [64, 64]
critic_hidden_activation relu
n_step 1
good_policy maddpg
adv_policy maddpg
replay_buffer_config {'type': 'MultiAgentReplayBuffer', 'prioritized_replay': -1, 'capacity': 1000000, 'replay_mode': 'lockstep'}
training_intensity None
num_steps_sampled_before_learning_starts 25600
critic_lr 0.01
actor_lr 0.01
target_network_update_freq 0
tau 0.01
actor_feature_reg 0.001
grad_norm_clipping 0.5
input sampler
policies {'default_policy': (None, None, None, None)}
callbacks <class 'ray.rllib.algorithms.callbacks.DefaultCallbacks'>
create_env_on_driver False
custom_eval_function None
framework torch
num_cpus_for_driver 1
num_workers 1
```

In [7]:
# Initialize Ray
# ray.init(local_mode=True)
# ignore_reinit_error makes this cell safe to re-run without restarting the
# kernel; a second plain ray.init() call would raise instead.
ray.init(num_gpus=0, ignore_reinit_error=True)    # Has nicer output

config = MADDPGConfig()

# Register the custom environment. The lambda parameter is named env_config
# so that it does not shadow the algorithm `config` defined above.
register_env("multi_agent_simulation", lambda env_config: MultiAgentSimulationEnv(env_config))

# Select the environment and disable automatic environment checking
config.environment("multi_agent_simulation", disable_env_checking=True)

# Test if the environment is valid
check_env(env, config)

policies = get_shared_policy()
# policies = get_individual_policies()      # Uncomment this line to use individual policies

# `config` is not a plain dict; update_from_dict is used to set its values.
config.update_from_dict({
    "simple_optimizer": True,
    "policies": policies,
    "policy_mapping_fn": policy_mapping_fn,
    "policies_to_train": list(policies.keys()),
    "count_steps_by": "env_steps",
    "framework": "tf",
    "eager_tracing": False,
    "observation_space": env.observation_space,
    "action_space": env.action_space,
})



# Print the configuration (contains all the hyperparameters)
for key, value in config.items():
    print(key, value)

2023-08-12 13:05:43,852	INFO worker.py:1621 -- Started a local Ray instance.


extra_python_environs_for_driver {}
extra_python_environs_for_worker {}
num_gpus 0
num_cpus_per_worker 1
num_gpus_per_worker 0
_fake_gpus False
num_learner_workers 0
num_gpus_per_learner_worker 0
num_cpus_per_learner_worker 1
local_gpu_idx 0
custom_resources_per_worker {}
placement_strategy PACK
eager_tracing False
eager_max_retraces 20
tf_session_args {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}
local_tf_session_args {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}
torch_compile_learner False
torch_compile_learner_what_to_compile TorchCompileWhatToCompile.FORWARD_TRAIN
torch_compile_learner_dynamo_backend inductor
torch_compile_learner_dynamo_mode None
torch_compile_worker False
torch_compile_worker_dynamo_backend onnxrt
torch_compile_worker_dynamo_mode None
env multi_agent_simulation
env_config {}
observation_s

  logger.warn("Casting input x to numpy array.")


# Training

In [8]:
# Stopping criterion handed to Tune
stop = dict(training_iteration=100)

# Launch the MADDPG trial; checkpoints are written every 10 iterations and at
# the end, under the "results" storage path.
results = tune.run(
    "MADDPG",
    name="MADDPG",
    config=config,
    stop=stop,
    storage_path="results",
    checkpoint_freq=10,
    checkpoint_at_end=True,
    verbose=2,
)


2023-08-12 13:05:44,564	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


0,1
Current time:,2023-08-12 13:05:49
Running for:,00:00:04.47
Memory:,6.2/15.5 GiB

Trial name,# failures,error file
MADDPG_multi_agent_simulation_2f001_00000,1,/mnt/c/GitHub/bachelor_thesis23/notebooks/results/MADDPG/MADDPG_multi_agent_simulation_2f001_00000_0_2023-08-12_13-05-44/error.txt

Trial name,status,loc
MADDPG_multi_agent_simulation_2f001_00000,ERROR,


[2m[33m(raylet)[0m bash: /home/renlephy/miniconda3/envs/bachelor/lib/libtinfo.so.6: no version information available (required by bash)
[2m[33m(raylet)[0m bash: /home/renlephy/miniconda3/envs/bachelor/lib/libtinfo.so.6: no version information available (required by bash)
2023-08-12 13:05:49,100	ERROR tune_controller.py:911 -- Trial task failed for trial MADDPG_multi_agent_simulation_2f001_00000
Traceback (most recent call last):
  File "/home/renlephy/miniconda3/envs/bachelor/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/renlephy/miniconda3/envs/bachelor/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/renlephy/miniconda3/envs/bachelor/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/renlephy/miniconda3/envs/bach

Trial name
MADDPG_multi_agent_simulation_2f001_00000


[2m[36m(MADDPG pid=27681)[0m 2023-08-12 13:05:49,090	ERROR actor_manager.py:500 -- Ray error, taking actor 1 out of service. The actor died because of an error raised in its creation task, [36mray::RolloutWorker.__init__()[39m (pid=27739, ip=172.20.85.17, actor_id=d1cd889a3c07b29f5495d77801000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fb5a5254970>)
[2m[36m(MADDPG pid=27681)[0m   File "/home/renlephy/miniconda3/envs/bachelor/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 525, in __init__
[2m[36m(MADDPG pid=27681)[0m     self._update_policy_map(policy_dict=self.policy_dict)
[2m[36m(MADDPG pid=27681)[0m   File "/home/renlephy/miniconda3/envs/bachelor/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1727, in _update_policy_map
[2m[36m(MADDPG pid=27681)[0m     self._build_policy_map(
[2m[36m(MADDPG pid=27681)[0m   File "/home/renlephy/miniconda3/envs/bachelor/lib/python3.10/site-packages

TuneError: ('Trials did not complete', [MADDPG_multi_agent_simulation_2f001_00000])