# Treinando modelos com o agente em PPO

Documentação do [PPO](https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html) para referência.

## Importando os pacotes necessários e construíndo os callbacks

In [1]:
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

In [2]:
import gymnasium as gym
print(f"Using gymnasium version {gym.__version__}")


Using gymnasium version 0.29.1


In [3]:
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor

print(f"Using stable-baselines3 version {stable_baselines3.__version__}")

Using stable-baselines3 version 2.2.1


In [4]:
# Define o callback para salvar o melhor modelo durante o treinamento 
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback que salva o melhor modelo baseado na recompensa média de treinamento.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Cria o diretório de log se não existir
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # Avalia o modelo
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                # Calcula a média da recompensa de treinamento
                mean_reward = np.mean(y[-100:])
                if self.verbose > 0:
                    print(f"Num timesteps: {self.num_timesteps}")
                    print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward: {mean_reward:.2f}")

                # Se a média da recompensa for maior que a melhor média de recompensa
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    if self.verbose > 0:
                        print(f"Saving new best model to {self.save_path}.zip")
                    self.model.save(self.save_path)

        return True

In [5]:
from Enviroment.Settings import *
from Enviroment.Manager import Enviroment

## Ambientes para os resultados de baselines

In [6]:
def run_test(env, model, num_sim=10):
    ## Retorna a PB para o modelo treinado
    print("Testing the model...")

    np.random.seed(42)
    seeds = np.random.randint(0, 100_000, num_sim, dtype=int)
    pbs = np.zeros(num_sim)
    reward = np.zeros(num_sim)

    for i, seed in enumerate(seeds):

        print(f"Executando simulação {i+1} de {num_sim}")

        # Reseta o ambiente
        state, info = env.reset(int(seed))

        for reqs in range(MAX_REQS+1):

            if int == type(model):
                alg_heuristic = model
            else:
                alg_heuristic = model.predict(observation=state, deterministic=True)[0]

            state, _, done, trunk, info = env.step(alg_heuristic)

            if (done or trunk) and reward[i] == 0:
                reward[i] = env._reward_episode

        pbs[i] = info['total_number_of_blocks'] / reqs

        print(f"Blocking Probability: {pbs[i]} | Reward: {reward[i]} | Req: {reqs}")

    print(f"\nBlocking Probability: {np.mean(pbs)} | Min: {np.min(pbs)} | Max: {np.max(pbs)} | +- {np.std(pbs)}")
    print(f"Reward: {np.mean(reward)} | Min: {np.min(reward)} | Max: {np.max(reward)} | +- {np.std(reward)}")

    return pbs, reward

### Resultados para RSA

In [7]:
# Avaliando a PB do modelo treinado
enviroment_type_test = {
    "Observation": "availability-vector",
    "Action": "RSA-SAR",
    "Reward": "RL-defaut",
    "StopCond": "MaxReq",
    "StartCond": "Empty"
}

# Cria o ambiente de simulação
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type_test,
    data_folder="RSA_eval",
)

run_test(env, 0, num_sim=10) # Executa o RSA 10 vezes para calcular a PB

print("Done!")

Testing the model...
Executando simulação 1 de 10
Blocking Probability: 0.00268 | Reward: 99464.0 | Req: 100000
Executando simulação 2 de 10
Blocking Probability: 0.00311 | Reward: 99378.0 | Req: 100000
Executando simulação 3 de 10
Blocking Probability: 0.00302 | Reward: 99396.0 | Req: 100000
Executando simulação 4 de 10
Blocking Probability: 0.00275 | Reward: 99450.0 | Req: 100000
Executando simulação 5 de 10
Blocking Probability: 0.00311 | Reward: 99378.0 | Req: 100000
Executando simulação 6 de 10
Blocking Probability: 0.00348 | Reward: 99304.0 | Req: 100000
Executando simulação 7 de 10
Blocking Probability: 0.00304 | Reward: 99392.0 | Req: 100000
Executando simulação 8 de 10
Blocking Probability: 0.00336 | Reward: 99328.0 | Req: 100000
Executando simulação 9 de 10
Blocking Probability: 0.00271 | Reward: 99458.0 | Req: 100000
Executando simulação 10 de 10
Blocking Probability: 0.00349 | Reward: 99302.0 | Req: 100000

Blocking Probability: 0.0030750000000000005 | Min: 0.00268 | Max: 0

### Resultados para SAR

In [8]:
# Cria o ambiente de simulação
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type_test,
    data_folder="SAR_eval",
)

run_test(env, 1, num_sim=10) # Executa o RSA 10 vezes para calcular a PB

print("Done!")

Testing the model...
Executando simulação 1 de 10
Blocking Probability: 0.00089 | Reward: 99822.0 | Req: 100000
Executando simulação 2 de 10
Blocking Probability: 0.00119 | Reward: 99762.0 | Req: 100000
Executando simulação 3 de 10
Blocking Probability: 0.00142 | Reward: 99716.0 | Req: 100000
Executando simulação 4 de 10
Blocking Probability: 0.00095 | Reward: 99810.0 | Req: 100000
Executando simulação 5 de 10
Blocking Probability: 0.00124 | Reward: 99752.0 | Req: 100000
Executando simulação 6 de 10
Blocking Probability: 0.0013 | Reward: 99740.0 | Req: 100000
Executando simulação 7 de 10
Blocking Probability: 0.00144 | Reward: 99712.0 | Req: 100000
Executando simulação 8 de 10
Blocking Probability: 0.00164 | Reward: 99672.0 | Req: 100000
Executando simulação 9 de 10
Blocking Probability: 0.00081 | Reward: 99838.0 | Req: 100000
Executando simulação 10 de 10
Blocking Probability: 0.00151 | Reward: 99698.0 | Req: 100000

Blocking Probability: 0.001239 | Min: 0.00081 | Max: 0.00164 | +- 0.

## Criando o primeiro ambiente para os testes

In [9]:
enviroment_type = {
    "Observation": "availability-vector",
    "Action": "RSA-SAR",
    "Reward": "RL-defaut",
    "StopCond": "40kReqs",
    "StartCond": "Empty"
}

# Cria o ambiente de simulação
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO",
)

LOG_PATH = env.folder_name

env = Monitor(env, LOG_PATH + '\\training\\training')

# Cria o modelo
model = PPO("MlpPolicy", env, verbose=0, tensorboard_log=LOG_PATH + '\\tensorboard\\')

# Cria o callback para salvar o melhor modelo
callback = SaveOnBestTrainingRewardCallback(check_freq=40_000, log_dir=LOG_PATH + "\\training\\", verbose=0)

# Treina o modelo
model.learn(total_timesteps=1_800_000, callback=callback, progress_bar=True)
#model.learn(total_timesteps=1_800_000, callback=callback, progress_bar=True, tb_log_name="first_run")

# Salva o modelo treinado
model.save(LOG_PATH + '\\training\\final_model')


Output()

In [10]:
# Evaluate the agent
# NOTE: If you use wrappers with your environment that modify rewards,
#       this will be reflected here. To evaluate with original rewards,
#       wrap environment in a "Monitor" wrapper before other wrappers.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward: 39894.80 +/- 28.29


In [12]:
model.policy

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=415, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=415, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=2, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)

In [13]:
LOG_PATH

'../logs/PPO_003'

In [14]:
# Cria o ambiente de simulação
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type_test,
    data_folder="PPO_Eval",
)

model = PPO.load(LOG_PATH + '\\training\\best_model.zip')

## Retorna a PB para o modelo treinado
run_test(env, model, num_sim=10)

Testing the model...
Executando simulação 1 de 10
Blocking Probability: 0.00156 | Reward: 99688.0 | Req: 100000
Executando simulação 2 de 10
Blocking Probability: 0.00135 | Reward: 99730.0 | Req: 100000
Executando simulação 3 de 10
Blocking Probability: 0.00137 | Reward: 99726.0 | Req: 100000
Executando simulação 4 de 10
Blocking Probability: 0.00111 | Reward: 99778.0 | Req: 100000
Executando simulação 5 de 10
Blocking Probability: 0.00164 | Reward: 99672.0 | Req: 100000
Executando simulação 6 de 10
Blocking Probability: 0.00135 | Reward: 99730.0 | Req: 100000
Executando simulação 7 de 10
Blocking Probability: 0.00133 | Reward: 99734.0 | Req: 100000
Executando simulação 8 de 10
Blocking Probability: 0.0017 | Reward: 99660.0 | Req: 100000
Executando simulação 9 de 10
Blocking Probability: 0.00118 | Reward: 99764.0 | Req: 100000
Executando simulação 10 de 10
Blocking Probability: 0.00202 | Reward: 99596.0 | Req: 100000

Blocking Probability: 0.001461 | Min: 0.00111 | Max: 0.00202 | +- 0.

(array([0.00156, 0.00135, 0.00137, 0.00111, 0.00164, 0.00135, 0.00133,
        0.0017 , 0.00118, 0.00202]),
 array([99688., 99730., 99726., 99778., 99672., 99730., 99734., 99660.,
        99764., 99596.]))

## Criando o segundo ambiente para os testes

In [10]:
import torch as th

In [8]:
enviroment_type = {
    "Observation": "availability-vector",
    "Action": "Route",
    "Reward": "RL-defaut",
    "StopCond": "40kReqs",
    "StartCond": "Empty"
}

# Cria o ambiente de simulação
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO",
)

LOG_PATH = env.folder_name

env = Monitor(env, LOG_PATH + '\\training\\training')

# Cria o dicionário com as configurações da política da rede. 
policy_kwargs = dict(activation_fn=th.nn.ReLU,
                     net_arch=dict(pi=[1024, 512, 128], vf=[1024, 512, 128]))

# Create the agent
model = PPO("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=0, tensorboard_log=LOG_PATH + '\\tensorboard\\')


# Cria o callback para salvar o melhor modelo
callback = EvalCallback(env, best_model_save_path=LOG_PATH + '\\training\\best_model', log_path=LOG_PATH + '\\training\\logs', eval_freq=40_000, deterministic=True, render=False, verbose=0)

# Treina o modelo
model.learn(total_timesteps=1_800_000, callback=callback, progress_bar=True, tb_log_name="v1_Route")

# Salva o modelo treinado
model.save(LOG_PATH + '\\training\\final_model')


Output()

In [9]:
model.policy

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=415, out_features=1024, bias=True)
      (1): ReLU()
      (2): Linear(in_features=1024, out_features=512, bias=True)
      (3): ReLU()
      (4): Linear(in_features=512, out_features=128, bias=True)
      (5): ReLU()
    )
    (value_net): Sequential(
      (0): Linear(in_features=415, out_features=1024, bias=True)
      (1): ReLU()
      (2): Linear(in_features=1024, out_features=512, bias=True)
      (3): ReLU()
      (4): Linear(in_features=512, out_features=128, bias=True)
      (5): ReLU()
    )
  )
  (action_net): Linear(in_features=128, out_features=3, bias=True)
  (value_net): Linear(

In [10]:
# Evaluate the agent
# NOTE: If you use wrappers with your environment that modify rewards,
#       this will be reflected here. To evaluate with original rewards,
#       wrap environment in a "Monitor" wrapper before other wrappers.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward: 38755.40 +/- 54.02


## Criando o terceiro ambiente para os testes

In [11]:
enviroment_type = {
    "Observation": "availability-vector",
    "Action": "RSA-SAR",
    "Reward": "RL-defaut",
    "StopCond": "40kReqs",
    "StartCond": "Empty"
}

# Cria o ambiente de simulação
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO",
)

LOG_PATH = env.folder_name

env = Monitor(env, LOG_PATH + '\\training\\training')

# Cria o dicionário com as configurações da política da rede. 
policy_kwargs = dict(activation_fn=th.nn.ReLU,
                     net_arch=dict(pi=[1024, 512, 128], vf=[1024, 512, 128]))

# Create the agent
model = PPO("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=0, tensorboard_log=LOG_PATH + '\\tensorboard\\')


# Cria o callback para salvar o melhor modelo
callback = EvalCallback(env, best_model_save_path=LOG_PATH + '\\training\\best_model', log_path=LOG_PATH + '\\training\\logs', eval_freq=40_000, deterministic=True, render=False, verbose=0)

# Treina o modelo
model.learn(total_timesteps=4_800_000, callback=callback, progress_bar=True, tb_log_name="v2_Route")

# Salva o modelo treinado
model.save(LOG_PATH + '\\training\\final_model')


Output()

In [12]:
# Evaluate the agent
# NOTE: If you use wrappers with your environment that modify rewards,
#       this will be reflected here. To evaluate with original rewards,
#       wrap environment in a "Monitor" wrapper before other wrappers.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward: 39832.20 +/- 42.83


In [13]:
LOG_PATH

'../logs/PPO_004'

In [14]:
model.policy

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=415, out_features=1024, bias=True)
      (1): ReLU()
      (2): Linear(in_features=1024, out_features=512, bias=True)
      (3): ReLU()
      (4): Linear(in_features=512, out_features=128, bias=True)
      (5): ReLU()
    )
    (value_net): Sequential(
      (0): Linear(in_features=415, out_features=1024, bias=True)
      (1): ReLU()
      (2): Linear(in_features=1024, out_features=512, bias=True)
      (3): ReLU()
      (4): Linear(in_features=512, out_features=128, bias=True)
      (5): ReLU()
    )
  )
  (action_net): Linear(in_features=128, out_features=2, bias=True)
  (value_net): Linear(

In [20]:
# Cria o ambiente de simulação
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type_test,
    data_folder="PPO_Eval",
)

model = PPO.load(LOG_PATH + '\\training\\best_model\\best_model.zip')

## Retorna a PB para o modelo treinado
run_test(env, model, num_sim=10)

Testing the model...
Executando simulação 1 de 10
Blocking Probability: 0.00068 | Reward: 99864.0 | Req: 100000
Executando simulação 2 de 10
Blocking Probability: 0.00104 | Reward: 99792.0 | Req: 100000
Executando simulação 3 de 10
Blocking Probability: 0.00088 | Reward: 99824.0 | Req: 100000
Executando simulação 4 de 10
Blocking Probability: 0.00085 | Reward: 99830.0 | Req: 100000
Executando simulação 5 de 10
Blocking Probability: 0.00119 | Reward: 99762.0 | Req: 100000
Executando simulação 6 de 10
Blocking Probability: 0.00113 | Reward: 99774.0 | Req: 100000
Executando simulação 7 de 10
Blocking Probability: 0.00099 | Reward: 99802.0 | Req: 100000
Executando simulação 8 de 10
Blocking Probability: 0.0013 | Reward: 99740.0 | Req: 100000
Executando simulação 9 de 10
Blocking Probability: 0.00081 | Reward: 99838.0 | Req: 100000
Executando simulação 10 de 10
Blocking Probability: 0.0014 | Reward: 99720.0 | Req: 100000

Blocking Probability: 0.001027 | Min: 0.00068 | Max: 0.0014 | +- 0.00

(array([0.00068, 0.00104, 0.00088, 0.00085, 0.00119, 0.00113, 0.00099,
        0.0013 , 0.00081, 0.0014 ]),
 array([99864., 99792., 99824., 99830., 99762., 99774., 99802., 99740.,
        99838., 99720.]))

In [8]:
LOG_PATH = '..\logs\PPO_002_good'

In [10]:
# Create the agent
model = PPO.load(LOG_PATH + '\\training\\final_model.zip')

In [21]:
enviroment_type = {
    "Observation": "availability-vector",
    "Action": "RSA-SAR",
    "Reward": "RL-defaut",
    "StopCond": "40kReqs",
    "StartCond": "Empty"
}

# Cria o ambiente de simulação
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO",
)

LOG_PATH = env.folder_name

env = Monitor(env, LOG_PATH + '\\training\\training')

In [22]:
# Cria o callback para salvar o melhor modelo
callback = EvalCallback(env, best_model_save_path=LOG_PATH + '\\training\\best_model', log_path=LOG_PATH + '\\training\\logs', eval_freq=40_000, deterministic=True, render=False, verbose=0)

In [24]:
model.env = env

In [25]:
# Treina o modelo
model.learn(total_timesteps=4_800_000, callback=callback, progress_bar=True, tb_log_name="cont_RSA-SAR", reset_num_timesteps=False)

# Salva o modelo treinado
model.save(LOG_PATH + '\\training\\final_model')

Output()

RuntimeError: Tried to step environment that needs reset



## Teste de informações Extra no Tensorboard

In [7]:
class TensorboardCallback(BaseCallback):
    """
    Custom callback for plotting additional values in tensorboard.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_step(self) -> bool:

        env = self.model.get_env()
        
        # Plotando a probabilidade de bloqueio
        self.logger.record("blocking_probability", env._total_number_of_blocks / env._last_request)


        value = np.random.random()
        self.logger.record("random_value", value)
        return True

In [12]:
enviroment_type = {
    "Observation": "availability-vector",
    "Action": "RSA-SAR",
    "Reward": "RL-defaut",
    "StopCond": "40kReqs",
    "StartCond": "Empty"
}

# Cria o ambiente de simulação
env2 = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO",
)

LOG_PATH2 = env2.folder_name

env2 = Monitor(env2, LOG_PATH2 + '\\training\\training')

# Cria o dicionário com as configurações da política da rede. 
policy_kwargs = dict(activation_fn=th.nn.ReLU,
                     net_arch=dict(pi=[512, 512, 128], vf=[512, 512, 128]))

# Create the agent
model2 = PPO("MlpPolicy", env2, policy_kwargs=policy_kwargs, verbose=0, tensorboard_log=LOG_PATH2 + '\\tensorboard\\')


# Cria o callback para salvar o melhor modelo
callback2 = EvalCallback(env2, best_model_save_path=LOG_PATH2 + '\\training\\best_model', log_path=LOG_PATH2 + '\\training\\logs', eval_freq=20_000, deterministic=True, render=False, verbose=0)
callback22 = TensorboardCallback()


# Treina o modelo
model2.learn(total_timesteps=100_000, callback=[callback2, callback22], progress_bar=True, tb_log_name="v3_Route")

# Salva o modelo treinado
model2.save(LOG_PATH2 + '\\training\\final_model')


Output()

AttributeError: 'DummyVecEnv' object has no attribute '_total_number_of_blocks'