In [1]:
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

In [2]:
import gymnasium as gym
print(f"Using gymnasium version {gym.__version__}")


Using gymnasium version 0.29.1


In [3]:
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor

print(f"Using stable-baselines3 version {stable_baselines3.__version__}")

Using stable-baselines3 version 2.2.1


In [4]:
from Enviroment.Settings import *
from Enviroment.Manager import Enviroment

In [7]:
import torch as th

In [8]:
enviroment_type = {
    "Observation": "availability-vector",
    "Action": "RSA-SAR",
    "Reward": "RL-defaut",
    "StopCond": "40kReqs",
    "StartCond": "Empty"
}

# Cria o ambiente de simulação
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO",
)

LOG_PATH = env.folder_name

env = Monitor(env, LOG_PATH + '\\training\\training')

# Cria o dicionário com as configurações da política da rede. 
policy_kwargs = dict(activation_fn=th.nn.Tanh,
                     net_arch=dict(pi=[512, 512, 128], vf=[512, 512, 128]))

# Create the agent
model = PPO("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=0, tensorboard_log=LOG_PATH + '\\tensorboard\\', seed=42,
            learning_rate=0.00025, 
            n_steps=256, 
            batch_size=4096, 
            n_epochs=10, 
            gae_lambda=0.95, 
            clip_range=0.2, 
            ent_coef=0.005)


# Cria o callback para salvar o melhor modelo
callback = EvalCallback(env, best_model_save_path=LOG_PATH + '\\training\\best_model', log_path=LOG_PATH + '\\training\\logs', eval_freq=40_000, deterministic=True, render=False, verbose=0)

# Treina o modelo
model.learn(total_timesteps=4_800_000, callback=callback, progress_bar=True, tb_log_name="RL_RSA-SAR_vector_v0")

# Salva o modelo treinado
model.save(LOG_PATH + '\\training\\final_model')


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=256 and n_envs=1)


Output()

In [9]:
enviroment_type = {
    "Observation": "ODD-one-hot",
    "Action": "RSA-SAR",
    "Reward": "RL-defaut",
    "StopCond": "40kReqs",
    "StartCond": "Empty"
}

# Cria o ambiente de simulação
env = Enviroment(
    network_load=LOAD,
    k_routes=K_ROUTES,
    number_of_slots=NUMBER_OF_SLOTS,
    enviroment_type=enviroment_type,
    data_folder="PPO",
)

LOG_PATH = env.folder_name

env = Monitor(env, LOG_PATH + '\\training\\training')

# Cria o dicionário com as configurações da política da rede. 
policy_kwargs = dict(activation_fn=th.nn.Tanh,
                     net_arch=dict(pi=[512, 128], vf=[512, 128]))

# Create the agent
model = PPO("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=0, tensorboard_log=LOG_PATH + '\\tensorboard\\', seed=42,
            learning_rate=0.00005, 
            n_steps=128, 
            batch_size=4096, 
            n_epochs=4,
            gamma=0.9,
            gae_lambda=0.95, 
            clip_range=0.2, 
            ent_coef=0.01)


# Cria o callback para salvar o melhor modelo
callback = EvalCallback(env, best_model_save_path=LOG_PATH + '\\training\\best_model', log_path=LOG_PATH + '\\training\\logs', eval_freq=40_000, deterministic=True, render=False, verbose=0)

# Treina o modelo
model.learn(total_timesteps=5_800_000, callback=callback, progress_bar=True, tb_log_name="RL_RSA-SAR_vector_v0")

# Salva o modelo treinado
model.save(LOG_PATH + '\\training\\final_model')


Output()

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=128 and n_envs=1)
