In [14]:
import gym
env = gym.make('CartPole-v1', render_mode = 'human')

In [15]:
import mlflow

# connect mlflow client to the mlflow server that runs on localhost:5000
MLFLOW_SERVER_URI = 'http://localhost:5000'
mlflow.set_tracking_uri(str(MLFLOW_SERVER_URI))

EXPERIMENT_NAME = 'DQN_24_02_07'
mlflow.set_experiment(EXPERIMENT_NAME)

2024/02/06 14:00:06 INFO mlflow.tracking.fluent: Experiment with name 'DQN_24_02_06' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/108688011859788956', creation_time=1707238806027, experiment_id='108688011859788956', last_update_time=1707238806027, lifecycle_stage='active', name='DQN_24_02_06', tags={}>

In [16]:
import optuna

from config import OPTUNA_DB

study = optuna.create_study(
    study_name=EXPERIMENT_NAME,
    direction='maximize',
    load_if_exists=True,
    storage=f'sqlite:///{OPTUNA_DB}'
)

[I 2024-02-06 14:00:06,136] Using an existing study with name 'DQN_24_02_06' instead of creating a new one.


In [17]:
from optimize_hyperparameters import objective

# we define a lambda function because study.optimize()
# expect the objective function to have only 1 input
# (trial), while our objective function hast 2 extra
# inputs I defined to add flexibility to the script
func = lambda trial: objective(trial,
                               force_linear_model=False,
                               n_episodes_to_train=200)

class CheckHyperparamMeanRewardThreshold:
    def __init__(self, reward_threshold: float):
        self.reward_threshold = reward_threshold

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        if trial.value is not None and trial.value >= self.reward_threshold:
            print((f'Stopping hyperparameter search because trial.value ({trial.value}) '
                   f'hit threshold ({self.reward_threshold})'))
            study.stop()

# Stop hyperparameter search when we hit a perfect mean reward of 500
hyperparam_search_stop_callback = CheckHyperparamMeanRewardThreshold(50.0)

study.optimize(func, n_trials=1000, callbacks=[hyperparam_search_stop_callback], show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

67,586 parameters



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 100/100 [00:07<00:00, 13.81it/s]
100%|██████████| 200/200 [00:35<00:00,  5.61it/s]


Reward mean: 317.04, std: 111.01
Num steps mean: 317.04, std: 111.01


100%|██████████| 1000/1000 [01:12<00:00, 13.70it/s]


[I 2024-02-06 14:01:56,073] Trial 479 finished with value: 322.083 and parameters: {'learning_rate': 6.85863829888384e-05, 'discount_factor': 0.95, 'batch_size': 16, 'memory_size': 50000, 'freq_steps_train': 8, 'freq_steps_update_target': 10, 'n_steps_warm_up_memory': 1000, 'n_gradient_steps': 16, 'nn_hidden_layers': 2, 'max_grad_norm': 10, 'normalize_state': True, 'epsilon_start': 0.9, 'epsilon_end': 0.125869250878496, 'steps_epsilon_decay': 100000, 'seed': 804151958}. Best is trial 229 with value: 500.0.
Stopping hyperparameter search because trial.value (322.083) hit threshold (50.0)


In [18]:
best_trial = study.best_trial

hparams = {k: best_trial.params[k] for k in best_trial.params if k != 'seed'}
#hparams['nn_hidden_layers'] = eval(hparams['nn_hidden_layers']) 
print(hparams)

SEED = best_trial.params['seed']
print('Seed: ', SEED)

{'learning_rate': 6.465531216430013e-05, 'discount_factor': 0.9, 'batch_size': 32, 'memory_size': 100000, 'freq_steps_train': 256, 'freq_steps_update_target': 100, 'n_steps_warm_up_memory': 1000, 'n_gradient_steps': 16, 'nn_hidden_layers_index': 0, 'max_grad_norm': 1, 'normalize_state': True, 'epsilon_start': 0.9, 'epsilon_end': 0.19225553957356353, 'steps_epsilon_decay': 1000}
Seed:  900695048


In [21]:
hparams

{'learning_rate': 6.465531216430013e-05,
 'discount_factor': 0.9,
 'batch_size': 32,
 'memory_size': 100000,
 'freq_steps_train': 256,
 'freq_steps_update_target': 100,
 'n_steps_warm_up_memory': 1000,
 'n_gradient_steps': 16,
 'nn_hidden_layers_index': 0,
 'max_grad_norm': 1,
 'normalize_state': True,
 'epsilon_start': 0.9,
 'epsilon_end': 0.19225553957356353,
 'steps_epsilon_decay': 1000}

In [19]:
from utils import set_seed
set_seed(env, SEED)

from q_agent import QAgent
agent = QAgent(env, **hparams)

from loops import train
train_rewards, train_steps = train(agent, env, n_episodes=200)

TypeError: QAgent.__init__() got an unexpected keyword argument 'nn_hidden_layers_index'

In [None]:
import matplotlib.pyplot as plt
plt.plot(range(1, len(train_rewards) + 1), train_rewards, label='Rewards per Episode')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Rewards per Episode')
plt.legend()
plt.show()

In [None]:
from q_agent import QAgent
from config import SAVED_AGENTS_DIR

# you can find the agent_id for the best run in the MLflow
# dashboard.
# 298 is the value in my case, but you need to check what is your
agent_id = 6

path_to_saved_model = SAVED_AGENTS_DIR / 'CartPole-v1' / str(agent_id)
agent = QAgent.load_from_disk(env, path_to_saved_model)

In [None]:
from loops import evaluate
rewards, steps = evaluate(
    agent, env,
    n_episodes=1000,
    epsilon=0.00
)

In [None]:
plt.plot(range(1, len(rewards) + 1), rewards, label='Rewards per Episode')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Rewards per Episode')
plt.legend()
plt.show()

In [None]:
import numpy as np
reward_avg = np.array(rewards).mean()
reward_std = np.array(rewards).std()
print(f'Reward average {reward_avg:.2f}, std {reward_std:.2f}')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

fig, ax = plt.subplots(figsize = (10, 4))
ax.set_title("Rewards")    
pd.Series(rewards).plot(kind='hist', bins=100)

plt.show()

Seria bom o gráfico de progresso do treino