In [1]:
# Basics
from Solver import Particle, Perceptron, PerceptronModel, VicsekModel, NeuralNetwork, PerceptronMode, Mode, NeuralSwarmModel

import numpy        as np
import os
import logging
import time
import matplotlib.pyplot as plt

# Logging
# logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import ray
from ray import tune
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.utils import check_env
from ray.rllib.algorithms.maddpg import MADDPGConfig
from ray.tune.registry import register_env
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.algorithms.maddpg import maddpg_tf_policy


from gymnasium.spaces import Box


  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
# Simulation settings
settings = {
        #                  N,      L,      v,      noise,  r
        "testing": [       1,      10,     0.03,   0.1,    1],
        "small": [         100,    10,     0.03,   0.1,    1],
        "medium": [        1000,   10,     0.03,   0.1,    1],
    }
    
# Choose between RADIUS, FIXED, FIXEDRADIUS (don't use RADIUS)
mode = Mode.FIXEDRADIUS
# Flags
ZDimension = False     # 2D or 3D
seed = False           # Random seed
# Choose settings
chosen_settings = settings["small"]
N       = chosen_settings[0]
L       = chosen_settings[1]
v       = chosen_settings[2]
noise   = chosen_settings[3]
r       = chosen_settings[4]

k_neighbors = 5
# Timesteps in an episode
T = 1000

In [3]:
class MultiAgentSimulationEnv(MultiAgentEnv):
    minimum = 0.0
    maximum = 2 * np.pi
    
    def __init__(self, config):
        super().__init__()
        self.num_agents = N
        self._spaces_in_preferred_format = True
        self._agent_ids = list(range(self.num_agents))
        
        # We asume the same action space for all agents
        self.action_space = Box(low=self.minimum, high=self.maximum, shape=(), dtype=np.float64)
        
        # We assume the same observation space for all agents
        self.observation_space = Box(low=self.minimum, high=self.maximum, shape=(k_neighbors + 1,), dtype=np.float64)
        
        self.simulation = NeuralSwarmModel(N, L, v, noise, r, mode, k_neighbors, ZDimension, seed=seed)
        self.new_angles = np.zeros(shape=(N,), dtype=np.float64)
        self.index = 0

    def reset(self, seed=None, options=None):
        # Reset the state of the environment to an initial state
        observations = {}
        infos = {}
        self.simulation = NeuralSwarmModel(N, L, v, noise, r, mode, k_neighbors, ZDimension, seed=False)
        self.index = 0
        self.new_angles = np.zeros(shape=(N,), dtype=np.float64)
        for agent_id in range(self.num_agents):
            observations[agent_id] = self.simulation.get_angles(agent_id)
        return observations, infos

    def step(self, action_dict):
        # Actions for all agents are provided in a dictionary
        
        # Rewards for all agents are provided in a dictionary {agent_id: reward}
        rewards = {}
        # Observations for all agents are provided in a dictionary {agent_id: observation}
        new_obs = {}
        # Dones for all agents are provided in a dictionary {agent_id: done (boolean))}
        dones = {}
        # Truncated for all agents are provided in a dictionary {agent_id: truncated (boolean))}
        # Truncated is used to indicate that the episode was ended early
        truncated = {}
        # Infos for all agents are provided in a dictionary {agent_id: info}
        # Infos can be used to provide extra information about an agent's state or action
        infos = {}
        
        # Collect all actions and set dones
        for agent_id, action in action_dict.items():
            action = np.clip(action, self.minimum, self.maximum)
            self.new_angles[agent_id] = action
            dones[agent_id] = True if self.index >= T else False
            
        # Update the simulation
        self.simulation.update_angles(self.new_angles)
        self.simulation.update()
        self.index += 1
        reward = self.simulation.mean_direction2D()
        
        # Collect observations and rewards
        for agent_id in range(self.num_agents):
            new_obs[agent_id] = self.simulation.get_angles(agent_id)
            rewards[agent_id] = reward

        dones['__all__'] = all(dones.values())  # Ends the episode if all agents are done
        
        return new_obs, rewards, dones, truncated, infos

    def render(self, mode='human'):
        # Optional: For visualization
        # Draw particles with matplotlib
        # Particles are stored in self.simulation.particles . Positions are stored in particles[i].x and particles[i].y
        # NOT YET FUNCTIONAL
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.set_xlim(0, L)
        ax.set_ylim(0, L)
        ax.set_aspect('equal')
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title('Simulation')
        
        for particle in self.simulation.particles:
            ax.plot(particle.x, particle.y, 'o', color='black', markersize=10)
            
        plt.show()

    def close(self):
        # Optional: Clean up. Called at the end of an episode.
        pass
    
    # Optional methods
    def observation_space_contains(self, observation):
        # Check if the observation is a valid observation
        # Obervation is a dictionary {agent_id: observation}
        observations = observation.values()
        return all([self.observation_space.contains(obs) for obs in observations])
    
    def action_space_contains(self, action):
        # Check if the action is a valid action
        # Action is a dictionary {agent_id: action}
        actions = action.values()
        return all([self.action_space.contains(act) for act in actions])
    
    def observation_space_sample(self):
        return {agent_id: self.observation_space.sample() for agent_id in range(self.num_agents)}

    def action_space_sample(self, action):
        return {agent_id: self.action_space.sample() for agent_id in range(self.num_agents)}


In [4]:
env = MultiAgentSimulationEnv(None)

for i_episode in range(2):
    observations, infos = env.reset()
    total_rewards = {agent_id: 0 for agent_id in observations.keys()}
    print(f"Starting episode {i_episode + 1}")
    
    # Max steps per episode
    for t in range(T + 1):
        # Optional: Render the environment for visualization
        # env.render()
        
        # Choose random actions
        actions = {agent_id: env.action_space.sample() for agent_id in observations.keys()}
        
        observations, rewards, dones, truncated, infos = env.step(actions)
        
        for agent_id, reward in rewards.items():
            total_rewards[agent_id] += reward
            
        print(f"Step {t}... \r", end="")
            
        if any(dones.values()):
            print(f"Step {t} finished")
            # The reward is the same for all agents. We just take the first one.
            print(f"Episode {i_episode + 1} finished after {t} timesteps with rewards: {next(iter(rewards.values()))}")
            break

env.close()

Starting episode 1
Step 1000 finished
Episode 1 finished after 1000 timesteps with rewards: 0.027931419821878845
Starting episode 2
Step 1000 finished
Episode 2 finished after 1000 timesteps with rewards: 0.08690546198350288


In [5]:
# Create a dict with all agent_ids
agent_ids = {"agent_" + str(i): i for i in range(N)}

def policy_mapping_fn(agent_id):
    """Returns the policy that should be used by the agent with the id agent_id.
    In this case, all agents share the same policy.
    
    Later on, multiple policies can be used for different agents."""
    return "shared_policy"


def get_shared_policy():
    policies = {
        "shared_policy": PolicySpec(
            policy_class=maddpg_tf_policy.MADDPGTFPolicy,   # Can also be set to None. Should be the same.
            observation_space=env.observation_space, 
            action_space=env.action_space, 
            config={"agent_id": 0}
            # Or maybe config=agent_ids? But then the constructor of MADDPGTFPolicy throws an error
        )
    }
    return policies

def get_individual_policies():
    return {
        "policy_for_agent_" + str(i): PolicySpec(
            policy_class=maddpg_tf_policy.MADDPGTFPolicy,
            observation_space=env.observation_space, 
            action_space=env.action_space, 
            config={"agent_id": i}
        ) for i in range(N)
    }

def policy_mapping_fn_individual(agent_id):
    """Returns the policy that should be used by the agent with the id agent_id.
    In this case, all agents share the same policy.
    
    Later on, multiple policies can be used for different agents."""
    return "policy_for_agent_" + str(agent_id)

def gen_policy(i):
    use_local_critic = False  # or False, depending on your needs
    return (
        maddpg_tf_policy.MADDPGTFPolicy,
        env.observation_space,
        env.action_space,
        {
            "agent_id": i,
            "use_local_critic": use_local_critic,
        }
    )


In [6]:
config = MADDPGConfig()

# Register the custom environment
register_env("multi_agent_simulation", lambda config: MultiAgentSimulationEnv(config))
config.environment("multi_agent_simulation")

# Disable automatic environment checking
config.environment(disable_env_checking=True)

# Test if the environment is valid
check_env(env, config)

# policies = get_shared_policy()
# policies = get_individual_policies()      # Uncomment this line to use individual policies
policies = {"policy_%d" % i: gen_policy(i) for i in range(N)}


config.update_from_dict({
    # "simple_optimizer": True,
    # "policies": policies,
    # "policy_mapping_fn": policy_mapping_fn,
    # "policy_mapping_fn": policy_mapping_fn_individual,
    # "policies_to_train": list(policies.keys()),
    # "count_steps_by": "env_steps",
    "framework": "tf",
    "use_local_critic": False,
    "use_state_preprocessor": True,
    # "eager_tracing": False,
    # "observation_space": env.observation_space,
    # "action_space": env.action_space,
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": lambda i: "policy_%d" % i,
    }
})



<ray.rllib.algorithms.maddpg.maddpg.MADDPGConfig at 0x7f5a5d6933d0>

In [7]:
algo = config.build() 

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2023-08-21 15:15:11,882	INFO worker.py:1621 -- Started a local Ray instance.
2023-08-21 15:15:17,312	ERROR actor_manager.py:500 -- Ray error, taking actor 1 out of service. The actor died because of an error raised in its creation task, [36mray::RolloutWorker.__init__()[39m (pid=6299, ip=172.20.85.17

AttributeError: 'FullyConnectedNetwork' object has no attribute 'last_layer'

In [None]:
algo.train() 

In [None]:
algo.evaluate()