In [1]:
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import torch as th

%matplotlib inline

In [2]:
import gymnasium as gym
print(f"Using gymnasium version {gym.__version__}")

Using gymnasium version 0.29.1


In [3]:
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor

print(f"Using stable-baselines3 version {stable_baselines3.__version__}")

Using stable-baselines3 version 2.2.1


In [4]:
from Enviroment.Settings import *
from Enviroment.Manager import Enviroment

In [12]:
def run_test(env, model, num_sim=10, deterministic=False):
    """Estimate the blocking probability (PB) of a policy over seeded episodes.

    Each simulation resets ``env`` with its own seed, steps it until the
    episode reports termination/truncation (or until ``MAX_REQS + 1`` steps
    have been taken) and records the blocking probability and episode reward.

    :param env: environment exposing the gymnasium ``reset(seed=...)`` /
        ``step(action)`` API, a ``_reward_episode`` attribute and an ``info``
        dict containing ``'total_number_of_blocks'``.
    :param model: either an integer, used as a fixed action at every step, or
        an object with a ``predict(observation=..., deterministic=...)``
        method whose first return value is the action.
    :param num_sim: number of independent simulations (must be >= 1).
    :param deterministic: forwarded to ``model.predict``; defaults to
        ``False`` (the value this function has always used).
    :return: tuple ``(pbs, reward)`` of numpy arrays with one entry per
        simulation.
    :raises ValueError: if ``num_sim`` is smaller than 1.
    """
    if num_sim < 1:
        raise ValueError("num_sim must be a positive integer")

    print("Testing the model...")

    # A private legacy generator yields exactly the seeds that
    # `np.random.seed(42); np.random.randint(...)` would, without clobbering
    # the global NumPy random state used by the rest of the notebook.
    seeds = np.random.RandomState(42).randint(0, 100_000, num_sim, dtype=int)
    pbs = np.zeros(num_sim)
    reward = np.zeros(num_sim)

    # The kind of `model` never changes during the run, so decide once.
    uses_fixed_action = isinstance(model, (int, np.integer))

    for i, seed in enumerate(seeds):

        print(f"Executando simulação {i+1} de {num_sim}")

        # Reset the environment. The seed is passed by keyword because
        # gymnasium's reset() (and the Monitor wrapper) take it keyword-only.
        state, info = env.reset(seed=int(seed))

        reqs = 0  # number of requests (steps) actually processed
        for _ in range(MAX_REQS + 1):

            if uses_fixed_action:
                action = model
            else:
                action = model.predict(observation=state, deterministic=deterministic)[0]

            state, _, done, trunk, info = env.step(action)
            reqs += 1

            if done or trunk:
                # Stop here: stepping a finished episode is undefined and
                # would let post-terminal steps leak into the statistics.
                reward[i] = env._reward_episode
                break

        # Normalise by the requests really processed, not by the loop bound.
        pbs[i] = info['total_number_of_blocks'] / reqs

        print(f"Blocking Probability: {pbs[i]} | Reward: {reward[i]} | Req: {reqs}")

    print(f"\nBlocking Probability: {np.mean(pbs)} | Min: {np.min(pbs)} | Max: {np.max(pbs)} | +- {np.std(pbs)}")
    print(f"Reward: {np.mean(reward)} | Min: {np.min(reward)} | Max: {np.max(reward)} | +- {np.std(reward)}")

    return pbs, reward

In [6]:
# Environment configuration used when evaluating the blocking probability (PB).
enviroment_type_test = dict(
    Observation="availability-vector",
    Action="RSA-SAR",
    Reward="RL-defaut",
    StopCond="MaxReq",
    StartCond="Empty",
)

In [7]:
# Build the evaluation environment (fixed network load of 300).
env = Enviroment(
    enviroment_type=enviroment_type_test,
    data_folder="SAR_eval",
    network_load=300,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
)

# Baseline run: an integer is passed instead of a trained model, so run_test
# applies the constant action 1 at every step of the 10 seeded simulations.
run_test(env, model=1, num_sim=10)

print("Done!")

Testing the model...
Executando simulação 1 de 10
Blocking Probability: 0.01135 | Reward: 97730.0 | Req: 100000
Executando simulação 2 de 10
Blocking Probability: 0.01162 | Reward: 97676.0 | Req: 100000
Executando simulação 3 de 10
Blocking Probability: 0.01227 | Reward: 97546.0 | Req: 100000
Executando simulação 4 de 10
Blocking Probability: 0.01056 | Reward: 97888.0 | Req: 100000
Executando simulação 5 de 10
Blocking Probability: 0.01148 | Reward: 97704.0 | Req: 100000
Executando simulação 6 de 10
Blocking Probability: 0.01365 | Reward: 97270.0 | Req: 100000
Executando simulação 7 de 10
Blocking Probability: 0.01252 | Reward: 97496.0 | Req: 100000
Executando simulação 8 de 10
Blocking Probability: 0.01258 | Reward: 97484.0 | Req: 100000
Executando simulação 9 de 10
Blocking Probability: 0.01118 | Reward: 97764.0 | Req: 100000
Executando simulação 10 de 10
Blocking Probability: 0.01259 | Reward: 97482.0 | Req: 100000

Blocking Probability: 0.011980000000000001 | Min: 0.01056 | Max: 0.

In [7]:
# Environment configuration used for training.
enviroment_type = dict(
    Observation="availability-vector",
    Action="RSA-SAR",
    Reward="RL-defaut",
    StopCond="40kReqs",
    StartCond="Empty",
)

In [8]:
from typing import Callable

def linear_schedule(initial_value: float, final_value: float) -> Callable[[float], float]:
    """
    Linear learning rate schedule from `initial_value` to `final_value`.

    The returned function receives `progress_remaining`, which goes from 1.0
    at the beginning of training down to 0.0 at the end, so it must return
    `initial_value` for 1.0 and `final_value` for 0.0.

    :param initial_value: learning rate at the start of training
        (`progress_remaining == 1`)
    :param final_value: learning rate at the end of training
        (`progress_remaining == 0`)
    :return: schedule function mapping `progress_remaining` to a learning rate
    """

    def func(progress_remaining: float) -> float:
        # Convex combination: exact at both end points and, unlike the
        # previous `initial + (final - initial) * progress_remaining`, not
        # inverted (that form started at `final_value`).
        return progress_remaining * initial_value + (1.0 - progress_remaining) * final_value

    return func

In [10]:
# Build the simulation environment used for training.
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO",
)

LOG_PATH = env.folder_name

print(f"Logs will be saved at {LOG_PATH}")

# All artefacts of this run live under LOG_PATH. Paths are built with
# os.path.join instead of hard-coded '\\' separators so they resolve to real
# sub-directories on every operating system.
training_dir = os.path.join(LOG_PATH, "training")

env = Monitor(env, os.path.join(training_dir, "training"))

# Network configuration for the policy (pi) and value (vf) heads.
policy_kwargs = dict(activation_fn=th.nn.ReLU,
                     net_arch=dict(pi=[1024, 512, 128], vf=[1024, 512, 128]))

# PPO model whose learning rate follows linear_schedule(0.0008, 0.00005).
model = PPO("MlpPolicy", env,
            policy_kwargs=policy_kwargs,
            verbose=0,
            tensorboard_log=os.path.join(LOG_PATH, "tensorboard"),
            learning_rate=linear_schedule(0.0008, 0.00005))


# Callback that periodically evaluates the policy and keeps the best model.
# NOTE(review): the training env itself is passed as the evaluation env, so
# evaluation episodes reset/step the env being trained on and are written to
# the same Monitor file — consider a separate evaluation environment.
callback = EvalCallback(
    env,
    best_model_save_path=os.path.join(training_dir, "best_model"),
    log_path=os.path.join(training_dir, "logs"),
    eval_freq=40_000,
    deterministic=True,
    render=False,
    verbose=0,
)

# Train the model.
model.learn(total_timesteps=12_800_000, callback=callback, progress_bar=True, tb_log_name="V0_RSA-SAR")

# Save the final trained model.
model.save(os.path.join(training_dir, "final_model"))


Logs will be saved at ../logs/PPO_001


Output()

In [16]:
# Load the best checkpoint of run PPO_001. The path is assembled with
# os.path.join so it does not depend on Windows '\\' separators.
model = PPO.load(os.path.join('../logs/PPO_001', 'training', 'best_model', 'best_model.zip'), env=env)

In [9]:
# Build the simulation environment used for training.
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO",
)

LOG_PATH = env.folder_name

print(f"Logs will be saved at {LOG_PATH}")

# All artefacts of this run live under LOG_PATH. Paths are built with
# os.path.join instead of hard-coded '\\' separators so they resolve to real
# sub-directories on every operating system.
training_dir = os.path.join(LOG_PATH, "training")

env = Monitor(env, os.path.join(training_dir, "training"))

# Network configuration for the policy (pi) and value (vf) heads.
policy_kwargs = dict(activation_fn=th.nn.ReLU,
                     net_arch=dict(pi=[1024, 512, 128], vf=[1024, 512, 128]))

# PPO model whose learning rate follows linear_schedule(0.0003, 0.0002).
model = PPO("MlpPolicy", env,
            policy_kwargs=policy_kwargs,
            verbose=0,
            tensorboard_log=os.path.join(LOG_PATH, "tensorboard"),
            learning_rate=linear_schedule(0.0003, 0.0002))


# Callback that periodically evaluates the policy and keeps the best model.
# NOTE(review): the training env itself is passed as the evaluation env, so
# evaluation episodes reset/step the env being trained on and are written to
# the same Monitor file — consider a separate evaluation environment.
callback = EvalCallback(
    env,
    best_model_save_path=os.path.join(training_dir, "best_model"),
    log_path=os.path.join(training_dir, "logs"),
    eval_freq=40_000,
    deterministic=True,
    render=False,
    verbose=0,
)

# Train the model.
model.learn(total_timesteps=5_000_000, callback=callback, progress_bar=True, tb_log_name="V1_RSA-SAR")

# Save the final trained model.
model.save(os.path.join(training_dir, "final_model"))


Logs will be saved at ../logs/PPO_003


Output()

In [10]:
# Load the best checkpoint saved by EvalCallback for the current run. The
# path is assembled with os.path.join so it does not depend on Windows '\\'
# separators.
best_model = PPO.load(os.path.join(LOG_PATH, 'training', 'best_model', 'best_model.zip'), env=env)

In [11]:
# Build the evaluation environment (fixed network load of 300).
env = Enviroment(
    enviroment_type=enviroment_type_test,
    data_folder="Model_003_eval",
    network_load=300,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
)

# Evaluate the loaded best model over 10 seeded simulations to estimate the PB.
run_test(env, model=best_model, num_sim=10)

print("Done!")

Testing the model...
Executando simulação 1 de 10
Blocking Probability: 0.01135 | Reward: 97730.0 | Req: 100000
Executando simulação 2 de 10
Blocking Probability: 0.01162 | Reward: 97676.0 | Req: 100000
Executando simulação 3 de 10
Blocking Probability: 0.01227 | Reward: 97546.0 | Req: 100000
Executando simulação 4 de 10
Blocking Probability: 0.01056 | Reward: 97888.0 | Req: 100000
Executando simulação 5 de 10
Blocking Probability: 0.01148 | Reward: 97704.0 | Req: 100000
Executando simulação 6 de 10
Blocking Probability: 0.01365 | Reward: 97270.0 | Req: 100000
Executando simulação 7 de 10
Blocking Probability: 0.01252 | Reward: 97496.0 | Req: 100000
Executando simulação 8 de 10
Blocking Probability: 0.01258 | Reward: 97484.0 | Req: 100000
Executando simulação 9 de 10
Blocking Probability: 0.01118 | Reward: 97764.0 | Req: 100000
Executando simulação 10 de 10
Blocking Probability: 0.01259 | Reward: 97482.0 | Req: 100000

Blocking Probability: 0.011980000000000001 | Min: 0.01056 | Max: 0.

In [13]:
# Build the evaluation environment (fixed network load of 300).
env = Enviroment(
    enviroment_type=enviroment_type_test,
    data_folder="Model_003_eval",
    network_load=300,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
)

# Evaluate the loaded best model over 10 seeded simulations to estimate the PB.
run_test(env, model=best_model, num_sim=10)

print("Done!")

Testing the model...
Executando simulação 1 de 10
Blocking Probability: 0.01135 | Reward: 97730.0 | Req: 100000
Executando simulação 2 de 10
Blocking Probability: 0.01162 | Reward: 97676.0 | Req: 100000
Executando simulação 3 de 10
Blocking Probability: 0.01227 | Reward: 97546.0 | Req: 100000
Executando simulação 4 de 10
Blocking Probability: 0.01056 | Reward: 97888.0 | Req: 100000
Executando simulação 5 de 10
Blocking Probability: 0.01148 | Reward: 97704.0 | Req: 100000
Executando simulação 6 de 10
Blocking Probability: 0.01365 | Reward: 97270.0 | Req: 100000
Executando simulação 7 de 10
Blocking Probability: 0.01252 | Reward: 97496.0 | Req: 100000
Executando simulação 8 de 10
Blocking Probability: 0.01258 | Reward: 97484.0 | Req: 100000
Executando simulação 9 de 10
Blocking Probability: 0.01118 | Reward: 97764.0 | Req: 100000
Executando simulação 10 de 10
Blocking Probability: 0.01259 | Reward: 97482.0 | Req: 100000

Blocking Probability: 0.011980000000000001 | Min: 0.01056 | Max: 0.

In [14]:
# Training configuration using the "ODD-one-hot" observation.
enviroment_type = dict(
    Observation="ODD-one-hot",
    Action="RSA-SAR",
    Reward="RL-defaut",
    StopCond="40kReqs",
    StartCond="Empty",
)

In [15]:
# Build the simulation environment used for training.
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO",
)

LOG_PATH = env.folder_name

print(f"Logs will be saved at {LOG_PATH}")

# All artefacts of this run live under LOG_PATH. Paths are built with
# os.path.join instead of hard-coded '\\' separators so they resolve to real
# sub-directories on every operating system.
training_dir = os.path.join(LOG_PATH, "training")

env = Monitor(env, os.path.join(training_dir, "training"))

# Network configuration for the policy (pi) and value (vf) heads.
policy_kwargs = dict(activation_fn=th.nn.LeakyReLU,
                     net_arch=dict(pi=[512, 128], vf=[512, 128]))

# PPO model with a constant learning rate (no schedule in this run).
model = PPO("MlpPolicy", env,
            policy_kwargs=policy_kwargs,
            verbose=0,
            tensorboard_log=os.path.join(LOG_PATH, "tensorboard"),
            learning_rate=0.0001)


# Callback that periodically evaluates the policy and keeps the best model.
# NOTE(review): the training env itself is passed as the evaluation env, so
# evaluation episodes reset/step the env being trained on and are written to
# the same Monitor file — consider a separate evaluation environment.
callback = EvalCallback(
    env,
    best_model_save_path=os.path.join(training_dir, "best_model"),
    log_path=os.path.join(training_dir, "logs"),
    eval_freq=40_000,
    deterministic=True,
    render=False,
    verbose=0,
)

# Train the model.
model.learn(total_timesteps=2_000_000, callback=callback, progress_bar=True, tb_log_name="V4_RSA-SAR_ODD")

# Save the final trained model.
model.save(os.path.join(training_dir, "final_model"))


Output()

Logs will be saved at ../logs/PPO_004


In [20]:
# Build the simulation environment used for training.
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO_teste",
)

LOG_PATH = env.folder_name

print(f"Logs will be saved at {LOG_PATH}")

# All artefacts of this run live under LOG_PATH. Paths are built with
# os.path.join instead of hard-coded '\\' separators so they resolve to real
# sub-directories on every operating system.
training_dir = os.path.join(LOG_PATH, "training")

env = Monitor(env, os.path.join(training_dir, "training"))

# Network configuration for the policy (pi) and value (vf) heads.
policy_kwargs = dict(activation_fn=th.nn.Tanh,
                     net_arch=dict(pi=[512, 128], vf=[512, 128]))


# PPO model with a constant learning rate and explicit hyper-parameters.
# NOTE(review): batch_size (512) is larger than n_steps (256) for what looks
# like a single environment, so one rollout holds fewer samples than one
# mini-batch — confirm this combination is intended.
model = PPO("MlpPolicy", env,
            policy_kwargs=policy_kwargs,
            verbose=0,
            tensorboard_log=os.path.join(LOG_PATH, "tensorboard"),
            learning_rate=0.0002,
            n_steps=256,
            batch_size=512,
            n_epochs=10,
            gae_lambda=0.95,
            gamma=0.99,
            clip_range=0.2,
            ent_coef=0.01,
            vf_coef=0.5,
            max_grad_norm=0.5)


# Callback that periodically evaluates the policy (3 episodes per evaluation)
# and keeps the best model.
# NOTE(review): the training env itself is passed as the evaluation env, so
# evaluation episodes reset/step the env being trained on and are written to
# the same Monitor file — consider a separate evaluation environment.
callback = EvalCallback(
    env,
    best_model_save_path=os.path.join(training_dir, "best_model"),
    log_path=os.path.join(training_dir, "logs"),
    eval_freq=40_000,
    deterministic=True,
    render=False,
    verbose=0,
    n_eval_episodes=3,
)

# Train the model.
model.learn(total_timesteps=100_000, callback=callback, progress_bar=True, tb_log_name="V5_RSA-SAR_ODD")

# Save the final trained model.
model.save(os.path.join(training_dir, "final_model"))


Output()

Logs will be saved at ../logs/PPO_teste_005


In [21]:
# Continue training the existing model; reset_num_timesteps=False keeps the
# timestep counter running instead of restarting it for this call.
model.learn(total_timesteps=1_000_000, callback=callback, progress_bar=True, tb_log_name="V5_RSA-SAR_ODD", reset_num_timesteps=False)

# Save the final trained model. The path is assembled with os.path.join so
# it does not depend on Windows '\\' separators.
model.save(os.path.join(LOG_PATH, 'training', 'final_model'))

Output()

In [22]:
# Load the best checkpoint saved by EvalCallback for the current run. The
# path is assembled with os.path.join so it does not depend on Windows '\\'
# separators.
best_model = PPO.load(os.path.join(LOG_PATH, 'training', 'best_model', 'best_model.zip'), env=env)

In [24]:
# Evaluation configuration matching the "ODD-one-hot" observation of the
# trained model.
enviroment_type_test = dict(
    Observation="ODD-one-hot",
    Action="RSA-SAR",
    Reward="RL-defaut",
    StopCond="MaxReq",
    StartCond="Empty",
)


# Build the evaluation environment (fixed network load of 300).
# NOTE(review): data_folder is still "Model_003_eval", the same name used by
# the earlier evaluation cells — confirm it is intended for this model too.
env = Enviroment(
    enviroment_type=enviroment_type_test,
    data_folder="Model_003_eval",
    network_load=300,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
)

# Evaluate the loaded best model over 10 seeded simulations to estimate the PB.
run_test(env, model=best_model, num_sim=10)

print("Done!")

Testing the model...
Executando simulação 1 de 10
Blocking Probability: 0.01158 | Reward: 97684.0 | Req: 100000
Executando simulação 2 de 10
Blocking Probability: 0.01313 | Reward: 97374.0 | Req: 100000
Executando simulação 3 de 10
Blocking Probability: 0.01256 | Reward: 97488.0 | Req: 100000
Executando simulação 4 de 10
Blocking Probability: 0.01163 | Reward: 97674.0 | Req: 100000
Executando simulação 5 de 10
Blocking Probability: 0.01244 | Reward: 97512.0 | Req: 100000
Executando simulação 6 de 10
Blocking Probability: 0.01362 | Reward: 97276.0 | Req: 100000
Executando simulação 7 de 10
Blocking Probability: 0.01318 | Reward: 97364.0 | Req: 100000
Executando simulação 8 de 10
Blocking Probability: 0.01341 | Reward: 97318.0 | Req: 100000
Executando simulação 9 de 10
Blocking Probability: 0.01219 | Reward: 97562.0 | Req: 100000
Executando simulação 10 de 10
Blocking Probability: 0.01349 | Reward: 97302.0 | Req: 100000

Blocking Probability: 0.012723000000000002 | Min: 0.01158 | Max: 0.

In [25]:
# Training configuration switched back to the "availability-vector" observation.
enviroment_type = dict(
    Observation="availability-vector",
    Action="RSA-SAR",
    Reward="RL-defaut",
    StopCond="40kReqs",
    StartCond="Empty",
)

In [None]:
# Build the simulation environment used for training.
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO_param",
)

LOG_PATH = env.folder_name

print(f"Logs will be saved at {LOG_PATH}")

# All artefacts of this run live under LOG_PATH. Paths are built with
# os.path.join instead of hard-coded '\\' separators so they resolve to real
# sub-directories on every operating system.
training_dir = os.path.join(LOG_PATH, "training")

env = Monitor(env, os.path.join(training_dir, "training"))

# Network configuration for the policy (pi) and value (vf) heads.
policy_kwargs = dict(activation_fn=th.nn.Tanh,
                     net_arch=dict(pi=[512, 128], vf=[512, 128]))


# PPO model with a constant learning rate and explicit hyper-parameters.
# NOTE(review): batch_size (2048) is larger than n_steps (128) for what looks
# like a single environment, so one rollout holds fewer samples than one
# mini-batch — confirm this combination is intended.
model = PPO("MlpPolicy", env,
            policy_kwargs=policy_kwargs,
            verbose=0,
            tensorboard_log=os.path.join(LOG_PATH, "tensorboard"),
            learning_rate=0.0004,
            n_steps=128,
            batch_size=2048,
            n_epochs=10,
            gae_lambda=0.95,
            gamma=0.99,
            clip_range=0.05,
            ent_coef=0.05,
            vf_coef=0.5,
            max_grad_norm=0.5)


# Callback that periodically evaluates the policy (3 episodes per evaluation)
# and keeps the best model.
# NOTE(review): the training env itself is passed as the evaluation env, so
# evaluation episodes reset/step the env being trained on and are written to
# the same Monitor file — consider a separate evaluation environment.
callback = EvalCallback(
    env,
    best_model_save_path=os.path.join(training_dir, "best_model"),
    log_path=os.path.join(training_dir, "logs"),
    eval_freq=40_000,
    deterministic=True,
    render=False,
    verbose=0,
    n_eval_episodes=3,
)

# Train the model.
model.learn(total_timesteps=100_000, callback=callback, progress_bar=True, tb_log_name="V5_RSA-SAR_vector")

# Save the final trained model.
model.save(os.path.join(training_dir, "final_model"))
