In [1]:
# Printa a versão do Python
import sys
print(f"Versão do Python: {sys.version}")

Versão do Python: 3.11.7 (tags/v3.11.7:fa7a6f2, Dec  4 2023, 19:24:49) [MSC v.1937 64 bit (AMD64)]


In [2]:
# Importa as bibliotecas
import os
import pickle

import numpy as np
import pandas as pd

import torch as th

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

%matplotlib inline

print(f"Versão do PyTorch: {th.__version__}")

Versão do PyTorch: 2.1.2+cpu


In [3]:
from Enviroment.Settings import *
from Enviroment.Manager import Enviroment

In [4]:
from stable_baselines3 import PPO

In [5]:
from stable_baselines3.common.env_util import make_vec_env, DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Importando o Monitor
from stable_baselines3.common.monitor import Monitor

In [6]:
import gymnasium as gym

print(f"Versão do Gymnasium: {gym.__version__}")

Versão do Gymnasium: 0.29.1


In [7]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

In [9]:
N_TRIALS = 500  # Maximum number of trials
N_JOBS = 1 # Number of jobs to run in parallel
N_STARTUP_TRIALS = 40  # Stop random sampling after N_STARTUP_TRIALS
N_EVALUATIONS = 6  # Number of evaluations during the training
N_TIMESTEPS = int(6e5)  # Training budget
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_ENVS = 5
N_EVAL_EPISODES = 10
TIMEOUT = int(60 * 60 * 24 * 5)  # 5 dias 

enviroment_type = {
    "Observation": "ODD-one-hot",
    "Action": "RSA-SAR",
    "Reward": "RL-defaut",
    "StopCond": "40kReqs",
    "StartCond": "Empty"
}

env = Enviroment(
    network_load=300,
    k_routes=3,
    number_of_slots=128,
    enviroment_type=enviroment_type,
    data_folder="Train_PPO_Optuna_v2",
)

env_eval = Enviroment(
    network_load=300,
    k_routes=3,
    number_of_slots=128,
    enviroment_type=enviroment_type,
    data_folder="Train_PPO_Optuna_v2_e",
)

#env = Monitor(DummyVecEnv(env))

DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "env": env,
}

In [14]:
from typing import Any, Dict
import torch
import torch.nn as nn

def sample_PPO_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for PPO hyperparameters.

    :param trial: Optuna trial object
    :return: The sampled hyperparameters for the given trial.
    """
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    # 16, 32, ... 16384
    n_steps = 2 ** trial.suggest_int("exponent_n_steps", 4, 14)
    batch_size = 2 ** trial.suggest_int("exponent_batch_size", 4, 11)
    # Discount factor between 0.9 and 0.9999
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    ent_coef = trial.suggest_float('ent_coef', 1e-10, 0.1, log=True)
    clip_range = trial.suggest_float('clip_range', 0.1, 0.4)
    vf_coef = trial.suggest_float('vf_coef', 0.25, 0.75)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.01, 10.0, log=True)
    gae_lambda = trial.suggest_float("gae_lambda", 0.8, 0.99, log=True)
    n_epochs = trial.suggest_int("n_epochs", 2, 10)
    clip_range_vf = trial.suggest_float("clip_range_vf", 0.1, 0.4)

    net_arch_c = trial.suggest_categorical("net_arch", ["tiny", "small", "big", "huge"])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "LeakyReLU"])

    # Display true values
    trial.set_user_attr("gamma", gamma)
    trial.set_user_attr("n_steps", n_steps)
    trial.set_user_attr("batch_size", batch_size)

    if net_arch_c == "tiny":
        net_arch = {"pi": [128], "vf": [128]}
    elif net_arch_c == "small":
        net_arch = {"pi": [256, 256], "vf": [256, 256]}
    elif net_arch_c == "big":
        net_arch = {"pi": [512, 256], "vf": [512, 128]}
    elif net_arch_c == "huge":
        net_arch = {"pi": [1024, 512, 256], "vf": [1024, 512, 128]}
        
    activation_fn = {"tanh": nn.Tanh, "LeakyReLU": nn.LeakyReLU}[activation_fn]

    return {
        "learning_rate": learning_rate,
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gamma": gamma,
        "ent_coef": ent_coef,
        "clip_range": clip_range,
        "vf_coef": vf_coef,
        "max_grad_norm": max_grad_norm,
        "gae_lambda": gae_lambda,
        "n_epochs": n_epochs,
        "clip_range_vf": clip_range_vf,
        "policy_kwargs": {
            "net_arch": net_arch,
            "activation_fn": activation_fn,
        },
    }

In [11]:
from stable_baselines3.common.callbacks import EvalCallback

class TrialEvalCallback(EvalCallback):
    """
    Callback used for evaluating and reporting a trial.
    
    :param eval_env: Evaluation environement
    :param trial: Optuna trial object
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq:   Evaluate the agent every ``eval_freq`` call of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose:
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 80_000,
        deterministic: bool = True,
        verbose: int = 0,
    ):

        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Evaluate policy (done in the parent class)
            super()._on_step()
            self.eval_idx += 1
            # Send report to Optuna
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True

In [12]:
def objective(trial: optuna.Trial) -> float:
    """
    Objective function using by Optuna to evaluate
    one configuration (i.e., one set of hyperparameters).

    Given a trial object, it will sample hyperparameters,
    evaluate it and report the result (mean episodic reward after training)

    :param trial: Optuna trial object
    :return: Mean episodic reward after training
    """

    kwargs = DEFAULT_HYPERPARAMS.copy()
    ### YOUR CODE HERE
    # TODO: 
    # 1. Sample hyperparameters and update the default keyword arguments: `kwargs.update(other_params)`
    # 2. Create the evaluation envs
    # 3. Create the `TrialEvalCallback`

    # 1. Sample hyperparameters and update the keyword arguments
    kwargs.update(sample_PPO_params(trial))

    # Create the RL model
    model = PPO(**kwargs)

    # 2. Create envs used for evaluation using `make_vec_env`, `ENV_ID` and `N_EVAL_ENVS`
    #eval_envs = make_vec_env(env, N_EVAL_ENVS)

    # 3. Create the `TrialEvalCallback` callback defined above that will periodically evaluate
    # and report the performance using `N_EVAL_EPISODES` every `EVAL_FREQ`
    # TrialEvalCallback signature:
    # TrialEvalCallback(eval_env, trial, n_eval_episodes, eval_freq, deterministic, verbose)
    eval_callback = TrialEvalCallback(env_eval, trial, N_EVAL_EPISODES, EVAL_FREQ, True)

    ### END OF YOUR CODE

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        #eval_envs.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [16]:
import torch as th

# Set pytorch num threads to 1 for faster training
th.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3
)
# Create the study and start the hyperparameter optimization
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize", study_name="ODD_RSA_RSA_v5", storage="sqlite:///ODD_RSA_RSA.db")

try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    pass

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

print("  User attrs:")
for key, value in trial.user_attrs.items():
    print(f"    {key}: {value}")

# Write report
study.trials_dataframe().to_csv("study_results_a2c_cartpole.csv")

fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

fig1.show()
fig2.show()

[I 2024-02-21 17:44:47,487] A new study created in RDB with name: ODD_RSA_RSA_v5


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=32 and n_envs=1)
[I 2024-02-21 19:28:26,238] Trial 0 finished with value: 38952.2 and parameters: {'lr': 2.6759521992847878e-05, 'exponent_n_steps': 5, 'exponent_batch_size': 10, 'gamma': 0.00014820339123240616, 'ent_coef': 0.00014529563103765725, 'clip_range': 0.12541397183747077, 'vf_coef': 0.38547914570085823, 'max_grad_norm': 0.04992311591926643, 'gae_lambda': 0.8805286591550723, 'n_epochs': 9, 'clip_range_vf': 0.3269632381040989, 'net_arch': 'huge', 'activation_fn': 'LeakyReLU'}. Best is trial 0 with value: 38952.2.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=16 and n_envs=1)
[I 2024-02-21 21:28:12,752] Trial 1 finished with value: 38927.4 and parameters: {'lr': 0.0002607407076119749, 'exponent_n_steps': 4, 'exponent_batch_size': 10, 'gamma': 0.0010992491023240024, 'ent_coef': 2.039937104917831e-08, 'clip_range': 0.33970068445557355, 'vf_coef': 0.69