# Projeto de Reinforcement Learning

Environment: Knighs Archers Zombies

---
Esse arquivo é uma cópia dos cnn.py + env_setup.py + DQN.py + PPO.py, mas em versão notebook e

possui uma visualização melhor do dataframe resultado do tuner usando a bibliboteca ipywidgets.

In [1]:
# Para utilizar os algoritmos PPO e DQN
import os
import ray
from ray import tune
from ray.air import session, RunConfig, CheckpointConfig
from ray.air.checkpoint import Checkpoint
from ray.rllib.algorithms.dqn import DQNConfig, DQN
from ray.rllib.algorithms.ppo import PPOConfig, PPO
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv, PettingZooEnv
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.tune.registry import register_env
from torch import nn

# Para preparar o ambiente
from pettingzoo.butterfly import knights_archers_zombies_v10
import supersuit as ss

# Para visualizar resultado do tuner
from ipywidgets import interact, interactive, IntSlider, SelectionSlider, Dropdown, Checkbox
import pandas as pd



In [2]:
ray.init(
    num_cpus=12, # Número máximo de CPUs que o ray.tune pode utilizar
    num_gpus=1 # Número máximo de GPUs que o ray.tune pode utilizar
)
# O número de CPUs e GPUs verdadeiramente utilizados é definido nas 
# configurações num_rollout_workers(cpus) e num_gpus do algoritmo.
# Aviso: "num_rollout_workers" deve ser "num_cpus -1" para deixar 1 CPU para o 
# processo desse script que também roda um rollout_workers

2023-09-11 16:15:38,177	INFO worker.py:1621 -- Started a local Ray instance.


0,1
Python version:,3.9.17
Ray version:,2.6.3


### Definição da rede convolucional

In [3]:
class CNNModelV2(TorchModelV2, nn.Module):
    def __init__(self, obs_space, act_space, num_outputs, *args, **kwargs):
        TorchModelV2.__init__(self, obs_space, act_space, num_outputs, *args, **kwargs)
        nn.Module.__init__(self)
        self.model = nn.Sequential(
            nn.Conv2d(3, 32, [8, 8], stride=(4, 4)),
            nn.ReLU(),
            nn.Conv2d(32, 64, [4, 4], stride=(2, 2)),
            nn.ReLU(),
            nn.Conv2d(64, 64, [3, 3], stride=(1, 1)),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
        )
        self.policy_fn = nn.Linear(512, num_outputs)
        self.value_fn = nn.Linear(512, 1)

    def forward(self, input_dict, state, seq_lens):
        model_out = self.model(input_dict["obs"].permute(0, 3, 1, 2))
        self._value_out = self.value_fn(model_out)
        return self.policy_fn(model_out), state

    def value_function(self):
        return self._value_out.flatten()
    
# Registra o modelo para o RAY encontrar e usar
ModelCatalog.register_custom_model("CNNModelV2", CNNModelV2)

### Setup environment

In [4]:
def env_creator(args):
    env = knights_archers_zombies_v10.parallel_env(
        spawn_rate=5,
        num_archers=1, # <--- Single agent env, mas ainda é tratado como multi-agent
        num_knights=0,
        max_zombies=5,
        max_arrows=20,
        killable_knights=True,
        killable_archers=True,
        pad_observation=True,
        line_death=False,
        max_cycles=900,
        vector_state=False, # <--- Observação em formato de imagem
        use_typemasks=False,
    )
    env = ss.color_reduction_v0(env, mode="B")
    env = ss.dtype_v0(env, "float32")
    env = ss.resize_v1(env, x_size=84, y_size=84)
    env = ss.normalize_obs_v0(env, env_min=0, env_max=1)
    env = ss.frame_stack_v1(env, 3)
    return env

# Resgista o custom environment para RAY encontrar pelo nome
# paralell = permite usar mais de 1 cpu para treinar
env_name = "knights_archers_zombies_v10"
register_env(
    env_name, 
    lambda config: ParallelPettingZooEnv(env_creator(config))
)

### Configuração e run do DQN

In [5]:
config = (
    DQNConfig()
    .environment(
        env=env_name,
        disable_env_checking=True # 'True' devido ao erro "not passing checking"
    )
    .rollouts(
        num_rollout_workers=5
    )
    .training(
        n_step = 10,
        lr = 1e-3,
        gamma = 0.95
    )
    .debugging(log_level="ERROR")
    .framework(framework="torch")
    .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))
)
# Epsilon decrescente
config.exploration_config.update({
    "initial_epsilon": 1.5,
    "final_epsilon": 0.01,
    "epsilon_timesteps": 10000,
})

print("Começou run")
trial_response = tune.run(
    "DQN",
    name="dqn_notebook",
    stop={
        # Critério de parada do experimento
        # "timesteps_total": 2500,
        "time_total_s": 10, # 600 segundos = 10 minutos
    },
    checkpoint_freq=100,
    storage_path="~\\ray_results\\" + env_name,
    config=config.to_dict(),
)
print(trial_response)
################################################################################
################### Cria e treina o agente DQN por 1 episódio ##################

# agent = DQN(config=config)
# for episode in range(1):
    # train_result = agent.train()
# print(train_result)
# print("=------------------=")
################################################################################
####### Prints para observar o output do treinamento sem usar tuner.run() ######

# for k in train_result.keys():
#     print(k)
# train_result['config']
# train_result["hist_stats"] # reward por epsisódio
# print("=------------------=")
################################################################################
############# Prints para observar variável retornada por tuner.run ############

# df = trial_response.dataframe()
# for c in df.columns:
#     print(c)
# df['episodes_total']
# trial_response.results['32fe0_00000']
# for k in tuner.results['32fe0_00000'].keys():
#     print(k)

2023-09-11 16:15:43,029	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949




Começou run


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


0,1
Current time:,2023-09-11 16:16:24
Running for:,00:00:40.16
Memory:,11.1/15.9 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQN_knights_archers_zombies_v10_9ac7c_00000,TERMINATED,127.0.0.1:6440,1,12.3801,1000,2.8,4,2,151


[2m[36m(pid=6440)[0m   VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
[2m[36m(pid=16564)[0m   VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
[2m[36m(pid=5148)[0m   VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
[2m[36m(RolloutWorker pid=7712)[0m   prep = cls(observation_space, options)
[2m[36m(RolloutWorker pid=16564)[0m   self._preprocessor = get_preprocessor(obs_space)(
[2m[36m(DQN pid=6440)[0m Trainable.setup took 12.422 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Trial name,agent_timesteps_total,connector_metrics,counters,custom_metrics,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,info,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_sampled_throughput_per_sec,num_env_steps_trained,num_env_steps_trained_this_iter,num_env_steps_trained_throughput_per_sec,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,timers
DQN_knights_archers_zombies_v10_9ac7c_00000,1000,"{'ObsPreprocessorConnector_ms': 0.0, 'StateBufferConnector_ms': 0.010895729064941406, 'ViewRequirementAgentConnector_ms': 0.3677797317504883}","{'num_env_steps_sampled': 1000, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 1000, 'num_agent_steps_trained': 0}",{},151,{},4,2.8,2,5,"{'learner': {}, 'num_env_steps_sampled': 1000, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 1000, 'num_agent_steps_trained': 0}",1000,0,1000,1000,80.8136,0,0,0,0,5,0,0,0,"{'cpu_util_percent': 56.17857142857142, 'ram_util_percent': 69.47142857142858, 'gpu_util_percent0': 0.0, 'vram_util_percent0': 0.0}",{},{},{},"{'mean_raw_obs_processing_ms': 1.4177713821183389, 'mean_inference_ms': 18.287872675046394, 'mean_action_processing_ms': 0.3018571369683564, 'mean_env_wait_ms': 37.67951946353437, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 4.0, 'episode_reward_min': 2.0, 'episode_reward_mean': 2.8, 'episode_len_mean': 151.0, 'episode_media': {}, 'episodes_this_iter': 5, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [4.0, 3.0, 2.0, 3.0, 2.0], 'episode_lengths': [153, 148, 153, 148, 153]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 1.4177713821183389, 'mean_inference_ms': 18.287872675046394, 'mean_action_processing_ms': 0.3018571369683564, 'mean_env_wait_ms': 37.67951946353437, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0, 'connector_metrics': {'ObsPreprocessorConnector_ms': 0.0, 'StateBufferConnector_ms': 0.010895729064941406, 'ViewRequirementAgentConnector_ms': 0.3677797317504883}}","{'training_iteration_time_ms': 612.511, 'sample_time_ms': 603.325}"


2023-09-11 16:16:24,781	INFO tune.py:1148 -- Total run time: 41.75 seconds (40.05 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis object at 0x000001FD0EBF9160>


### Visualização com widgets do dataframe do resultado do tuner

Classe InteractableColumnPrinter criada para facilitar a visualização

In [6]:
class InteractableColumnPrinter:
    df: pd.DataFrame
    initial_items_shown: int
    number_of_items_shown: int
    max_t: int
    t_slider: IntSlider # parametric value

    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.initial_items_shown = 8
        self.number_of_items_shown = self.initial_items_shown
        self.max_t = len(df.columns) // self.number_of_items_shown + len(df.columns) % self.number_of_items_shown
        self.t_slider = IntSlider(min=0, max=self.max_t, step=1, value=0)

    def print_columns(self, t: int = 0, with_values: bool = False):
        assert t >= 0, "t must be not negative"
        assert self.number_of_items_shown > 0, "number_of_items_shown must be positive"

        for c in self.df.columns[
            self.number_of_items_shown*t : 
            self.number_of_items_shown*t + self.number_of_items_shown
        ]:
            print(c, self.df[c].values) if with_values else print(c)
    
    def update_number_of_items_shown(self, new_value: int):
        self.number_of_items_shown = new_value
        self.max_t = len(self.df.columns) // self.number_of_items_shown + len(self.df.columns) % self.number_of_items_shown
        self.t_slider.max = self.max_t
        print(f"Number of items to show: {new_value}")

    def create_widgets(self):
        
        # Cria coleção de widgets
        interact(
            self.update_number_of_items_shown, # Função que roda ao interagir com um widget
            new_value = IntSlider(min=1, max=10,  step=1, value=self.initial_items_shown),  # Cria widget IntSlider
        )

        # Cria coleção de widgets
        interact(
            self.print_columns, # Função que roda ao interagir com um widget
            with_values=Checkbox(value=False), # Cria widget Checkbox
            t = self.t_slider,  # Cria widget IntSlider
        )

column_printer = InteractableColumnPrinter(trial_response.results_df)
column_printer.create_widgets()


interactive(children=(IntSlider(value=8, description='new_value', max=10, min=1), Output()), _dom_classes=('wi…

interactive(children=(IntSlider(value=0, description='t', max=39), Checkbox(value=False, description='with_val…

### Definição e run do PPO

In [7]:
config = (
    PPOConfig()
    .environment(
        env=env_name, 
        disable_env_checking=True # 'True' devido ao erro "not passing checking"
    )
    .rollouts(
        num_rollout_workers=4, 
    )
    .training(
        train_batch_size=512,
        lr=2e-5,
        gamma=0.99,
        lambda_=0.9,
        use_gae=True,
        grad_clip=None,
        entropy_coeff=0.1,
        vf_loss_coeff=0.25,
        sgd_minibatch_size=64,
        num_sgd_iter=10,
    )
    .debugging(log_level="ERROR")
    .framework(framework="torch")
    .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))
)

print("Começou run PPO")
PPO_trial_results = tune.run(
    "PPO",
    name="PPO_notebook",
    stop={
        # Critério de parada do experimento
        "timesteps_total": 5000
    },
    checkpoint_freq=10,
    local_dir="C:\\Users\\victo\\Desktop\\RL\\ray_results\\" + env_name,
    config=config.to_dict(),
    resume="AUTO" # resume: [True, False, "LOCAL", "REMOTE", "PROMPT", "AUTO"] para continuar o treino de onde parou
)

print(PPO_trial_results.results_df)

2023-09-11 16:16:25,116	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


Começou run PPO




0,1
Current time:,2023-09-11 16:21:33
Running for:,00:05:08.32
Memory:,10.7/15.9 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_knights_archers_zombies_v10_b34a8_00000,RUNNING,127.0.0.1:18072,2,258.745,1024,2.5,4,1,141.5


2023-09-11 16:16:25,140	INFO experiment_state.py:407 -- Trying to find and download experiment checkpoint at c://\Users\victo\Desktop\RL\ray_results\knights_archers_zombies_v10\PPO_notebook
 
Please check this error message for potential access problems - if a directory was not found, that is expected at this stage when you're starting a new experiment.
[2m[36m(pid=18072)[0m   VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
[2m[36m(pid=2836)[0m   VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
[2m[36m(pid=6124)[0m   VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
[2m[36m(RolloutWorker pid=2836)[0m   prep = cls(observation_space, options)
[2m[36m(RolloutWorker pid=2836)[0m   self._preprocessor = get_preprocessor(obs_space)(
[2m[36m(PPO pid=18072)[0m Trainable.setup took 11.407 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overhea

Trial name,agent_timesteps_total,connector_metrics,counters,custom_metrics,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,info,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_sampled_throughput_per_sec,num_env_steps_trained,num_env_steps_trained_this_iter,num_env_steps_trained_throughput_per_sec,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,timers
PPO_knights_archers_zombies_v10_b34a8_00000,1024,"{'ObsPreprocessorConnector_ms': 0.03733038902282715, 'StateBufferConnector_ms': 0.0, 'ViewRequirementAgentConnector_ms': 0.6462752819061279}","{'num_env_steps_sampled': 1024, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 1024, 'num_agent_steps_trained': 0}",{},141.5,{},4,2.5,1,3,"{'learner': {'__all__': {'num_agent_steps_trained': 64.0, 'num_env_steps_trained': 512.0, 'total_loss': -0.16900024758651852}, 'default_policy': {'total_loss': -0.16900024758651852, 'policy_loss': -0.0035169731825590135, 'vf_loss': 0.04138172760035559, 'vf_loss_unclipped': 0.04138172760035559, 'vf_explained_var': -0.14492907822132112, 'entropy': 1.7653888776898383, 'mean_kl_loss': 0.0035509729276554935, 'curr_lr': 2e-05, 'curr_entropy_coeff': 0.1, 'curr_kl_coeff': 0.10000000149011612}}, 'num_env_steps_sampled': 1024, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 1024, 'num_agent_steps_trained': 0}",1024,0,1024,512,4.02029,0,0,0,0,4,0,0,0,"{'cpu_util_percent': 22.014193548387095, 'ram_util_percent': 66.60451612903226, 'gpu_util_percent0': 0.0, 'vram_util_percent0': 0.0}",{},{},{},"{'mean_raw_obs_processing_ms': 2.276182176480483, 'mean_inference_ms': 37.044412918307714, 'mean_action_processing_ms': 0.30439941245851604, 'mean_env_wait_ms': 31.383464242043267, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 4.0, 'episode_reward_min': 1.0, 'episode_reward_mean': 2.5, 'episode_len_mean': 141.5, 'episode_media': {}, 'episodes_this_iter': 3, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [2.0, 4.0, 1.0, 3.0], 'episode_lengths': [122, 148, 148, 148]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 2.276182176480483, 'mean_inference_ms': 37.044412918307714, 'mean_action_processing_ms': 0.30439941245851604, 'mean_env_wait_ms': 31.383464242043267, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0, 'connector_metrics': {'ObsPreprocessorConnector_ms': 0.03733038902282715, 'StateBufferConnector_ms': 0.0, 'ViewRequirementAgentConnector_ms': 0.6462752819061279}}","{'training_iteration_time_ms': 129365.762, 'sample_time_ms': 9014.307, 'synch_weights_time_ms': 56.916}"


2023-09-11 16:21:43,578	INFO tune.py:1148 -- Total run time: 318.46 seconds (308.30 seconds for the tuning loop).
Resume experiment with: tune.run(..., resume=True)


             episode_reward_max  episode_reward_min  episode_reward_mean  \
trial_id                                                                   
b34a8_00000                 4.0                 1.0                  2.5   

             episode_len_mean  episodes_this_iter  num_faulty_episodes  \
trial_id                                                                 
b34a8_00000             141.5                   3                    0   

             num_healthy_workers  num_in_flight_async_reqs  \
trial_id                                                     
b34a8_00000                    4                         0   

             num_remote_worker_restarts  num_agent_steps_sampled  ...  \
trial_id                                                          ...   
b34a8_00000                           0                     1024  ...   

             info/learner/default_policy/vf_loss  \
trial_id                                           
b34a8_00000                          