In [1]:
from bayes_opt import BayesianOptimization, JSONLogger, Events, util

In [2]:
import sys
import os

In [3]:
# Parent of the current working directory. The later `from src...` imports
# rely on this folder being importable (presumably the project root -- confirm).
ROOT_DIR = os.path.dirname(os.path.abspath("."))
# Put ROOT_DIR at the front of sys.path exactly once. Dropping any existing
# occurrence first keeps this cell idempotent when it is re-run.
sys.path[:] = [ROOT_DIR] + [entry for entry in sys.path if entry != ROOT_DIR]

In [4]:
import gym
import copy
import numpy as np
from datetime import datetime
import tensorflow.compat.v1 as tf_v1
# Custom modules
from src.Utils import parse_args, random_seed_gen
from src.Config import POLICIES, REPLAY_BUFFERS, ALGORITHMS, BASE_CONFIG, get_configuration, get_algorithm_from_variant, get_buffer_from_variant, get_policy_from_variant

In [5]:
class Trainable:
    """Callable objective that trains one model per call and returns a score.

    Each call merges the supplied hyper-parameters into the algorithm kwargs,
    builds a buffer, policy and algorithm inside a fresh TensorFlow session,
    runs the algorithm for ``epochs`` epochs and returns a scalar score.
    The algorithm is always the one registered as ``ALGORITHMS["ddqn"]`` and
    the buffer is always ``REPLAY_BUFFERS["replay_buffer"]``.

    Args:
        env_name: Gym environment id passed to ``gym.make``.
        sess_config: Configuration object handed to ``tf_v1.Session``.
        render: Forwarded to the algorithm kwargs.
        summary_dir: Parent directory; each model gets a ``Model <i>`` sub-path.
        display_interval: Forwarded to the algorithm kwargs.
        epochs: Number of epochs passed to ``algo.run``.
        goal_trials: Forwarded to the algorithm kwargs.
        goal_reward: Forwarded to the algorithm kwargs.
        seed: Seed applied to the environment via ``env.seed``.
        policy: Key looked up in ``POLICIES`` to obtain the policy parameters.
    """

    def __init__(self, env_name, *, sess_config, render, summary_dir, display_interval, epochs,
                 goal_trials, goal_reward, seed, policy):
        self.render = render
        self.epochs = epochs
        self.sess_config = sess_config
        self.env = gym.make(env_name)
        self.summary_dir = summary_dir
        self.goal_trials = goal_trials
        self.goal_reward = goal_reward
        self.display_interval = display_interval
        # 1-based counter used to name each trained model / summary sub-directory.
        self.model_i = 1
        # Deep copies so that later mutation cannot leak into the shared config dicts.
        self.buffer_param = copy.deepcopy(REPLAY_BUFFERS.get("replay_buffer"))
        self.policy_param = copy.deepcopy(POLICIES.get(policy))
        self.algorithm_param = {"function": ALGORITHMS["ddqn"]["function"], "kwargs": {}}
        self.algorithm_param["kwargs"].update(render=self.render,
                                              goal_trials=self.goal_trials,
                                              goal_reward=self.goal_reward,
                                              display_interval=self.display_interval,
                                              **BASE_CONFIG,
                                              )
        # NOTE(review): only the environment is seeded here; no TensorFlow graph
        # seed is set in this class -- confirm whether the project helpers do it.
        self.env.seed(seed)

    def __call__(self, **kwargs):
        """Train one model with the given hyper-parameters and return its score.

        Args:
            **kwargs: Hyper-parameters merged into the algorithm kwargs. When
                ``update_interval`` is present it is truncated to ``int``
                first, because the caller may supply it as a float.

        Returns:
            ``mean(algo.epoch_rewards) * (1 + algo.goals_achieved / 10.0)``.

        Raises:
            Whatever the buffer/policy/algorithm construction or training
            raises. Clean-up (buffer clear, graph reset, counter bump) still
            runs in that case.
        """
        if "update_interval" in kwargs:
            kwargs["update_interval"] = int(kwargs["update_interval"])
        self.algorithm_param["kwargs"].update(kwargs)
        config = {
            "policy_param": self.policy_param,
            "buffer_param": self.buffer_param,
            "algorithm_param": self.algorithm_param
        }
        model_name = f"Model {self.model_i}"
        # Create summary directory
        model_summary_dir = os.path.join(self.summary_dir, model_name)
        buffer = None
        try:
            with tf_v1.Session(config=self.sess_config) as sess:
                buffer = get_buffer_from_variant(config)
                policy = get_policy_from_variant(self.env, config)
                algo = get_algorithm_from_variant(sess, self.env, policy, buffer, model_summary_dir, config)
                sess.run(tf_v1.global_variables_initializer())
                print("\n# Training: {}".format(model_name))
                # Run the algorithm for given epochs
                algo.run(epochs=self.epochs)
                result = np.mean(algo.epoch_rewards) * (1 + algo.goals_achieved / 10.0)
                # TODO: Return algorithm goal results in the end
        finally:
            # Runs after the session context has exited, on success and on
            # failure alike, so a crashed trial cannot leave its ops in the
            # default graph or reuse the same "Model <i>" directory next time.
            if buffer is not None:
                buffer.clear()
            tf_v1.reset_default_graph()
            self.model_i += 1
        return result

In [6]:
# Single seed reused for the environment and the optimiser's random_state.
seed = 853
# Also seed NumPy's global RNG so code that draws from np.random is repeatable.
# NOTE(review): TensorFlow's graph-level seed is not set in the visible cells.
np.random.seed(seed)
# NOTE(review): TensorFlow was already imported in an earlier cell, so this may
# be too late to silence messages emitted at import time; it also does not
# affect Python-side deprecation warnings.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Session config shared by every trial: per-process GPU memory fraction of 0.75
# and soft placement enabled.
TF_CONFIG = tf_v1.ConfigProto(gpu_options=tf_v1.GPUOptions(per_process_gpu_memory_fraction=0.75),
                              allow_soft_placement=True)

In [7]:
# --- Experiment identity -----------------------------------------------------
date_time = datetime.now().strftime("%d.%m.%Y %H.%M")
env_name = "CartPole-v0"
# NOTE: `algorithm` is only used to name the summary folder; Trainable itself
# always looks up ALGORITHMS["ddqn"].
algorithm = "ddqn"
policy = "greedy_epsilon"
# Root folder for summaries. Overridable through the SUMMARY_ROOT environment
# variable so the notebook is not tied to one user's home directory; the
# default is the previous hard-coded location.
summary_root = os.environ.get("SUMMARY_ROOT", "/home/raj/summaries")
summary_dir = os.path.join(summary_root, f"{env_name}-{algorithm}-{policy}-{date_time}")
# Create the folder up front so the JSON log can be written no matter what
# the training code does (or does not) create.
os.makedirs(summary_dir, exist_ok=True)

# --- Objective function --------------------------------------------------------
setup_kwargs = dict(
    env_name=env_name,
    sess_config=TF_CONFIG,
    render=False,
    summary_dir=summary_dir,
    display_interval=100,
    epochs=1000,
    goal_trials=100,
    goal_reward=200,
    seed=seed,
    policy=policy,
)

# Search space: (lower, upper) bound per hyper-parameter. `update_interval` is
# sampled as a float and truncated to int inside Trainable.__call__.
param_bounds = {
    "lr": (0.8, 1),
    "df": (0.8, 1),
    "tau": (0.5, 1),
    "update_interval": (1, 25),
}

trainable = Trainable(**setup_kwargs)

# --- Optimiser -----------------------------------------------------------------
optimizer = BayesianOptimization(
            f=trainable,
            pbounds=param_bounds,
            random_state=seed,
)

log_file = os.path.join(summary_dir, "logs.json")
logger = JSONLogger(path=log_file)
# `OPTMIZATION_STEP` (sic) is the attribute name used here; presumably the
# spelling of the installed bayes_opt release -- confirm before "fixing" it.
optimizer.subscribe(Events.OPTMIZATION_STEP, logger)

# Hand-picked starting point. Passed by name so its meaning does not depend on
# the optimiser's internal (alphabetical) parameter ordering; the values are
# the same ones the positional list [0.9, 0.9, 0.88, 4] mapped to.
optimizer.probe(
    params={"df": 0.9, "lr": 0.9, "tau": 0.88, "update_interval": 4},
    lazy=True,
)

optimizer.maximize(
    init_points=10,
    n_iter=40,
    acq="ei",
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

# Training: Model 1
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.0407, 0.0000, total_reward: 154.0, in 20.4456 secs
Epoch: 200, mean_losses: 0.0199, 0.0000, total_reward: 166.0, in 45.7281 secs
Epoch: 300, mean_losses: 0.0198, 0.0000, total_reward: 200.0, in 48.1786 secs
Epoch: 400, mean_losses: 0.0213, 0.0000, total_reward: 171.0, in 50.3367 secs
Epoch: 500, mean_losses: 0.0227, 0.0000, total_reward: 184.0, in 52.1610 secs
Epoch: 600, mean_losses: 0.0238, 0.0000, total_reward: 162.0, in 57.1612 secs
Epoch: 700, mean_losses: 0.0255, 0.0000, total_reward: 200.0, in 58.1074 secs
Epoch: 800, mean_losses: 0.0281, 0.0000, total_reward: 200.0, in 54.5382 secs
Epoch: 900, mean_losses: 0.0304, 0.0000, total_reward: 74.0, in 50.5045 secs
Ep

Epoch: 400, mean_losses: 0.0344, 0.0000, total_reward: 200.0, in 45.9473 secs
Epoch: 500, mean_losses: 0.0354, 0.0000, total_reward: 200.0, in 44.5361 secs
Epoch: 600, mean_losses: 0.0365, 0.0000, total_reward: 200.0, in 44.9812 secs
Epoch: 700, mean_losses: 0.0375, 0.0000, total_reward: 182.0, in 43.8918 secs
Epoch: 800, mean_losses: 0.0380, 0.0000, total_reward: 200.0, in 44.3290 secs
Epoch: 900, mean_losses: 0.0378, 0.0000, total_reward: 200.0, in 43.5160 secs
Epoch: 1000, mean_losses: 0.0372, 0.0000, total_reward: 200.0, in 43.2698 secs
############# Goal Summary ############           
Number of achieved goals: 0
Max mean reward over 100 trials achieved at epoch 144 with reward 190.27

# Training: Model 9
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.1400, 0.0000, total_reward: 11.0, in 2.2657 secs
Epoch: 200, mean_losses: 0.1195, 0.0000, total_reward: 200.0, in 3.9602 secs
Epoch: 300, mean_losses: 0.3030, 0.0000, total_reward: 200.0, i

Epoch: 500, mean_losses: 0.0606, 0.0000, total_reward: 143.0, in 40.8894 secs
Epoch: 600, mean_losses: 0.0594, 0.0000, total_reward: 143.0, in 45.4671 secs
Epoch: 700, mean_losses: 0.0598, 0.0000, total_reward: 154.0, in 45.5927 secs
Epoch: 800, mean_losses: 0.0611, 0.0000, total_reward: 122.0, in 39.7170 secs
Epoch: 900, mean_losses: 0.0630, 0.0000, total_reward: 200.0, in 40.0338 secs
Epoch: 1000, mean_losses: 0.0643, 0.0000, total_reward: 200.0, in 40.8620 secs
############# Goal Summary ############           
Number of achieved goals: 0
Max mean reward over 100 trials achieved at epoch 380 with reward 174.12

# Training: Model 17
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.0230, 0.0000, total_reward: 200.0, in 33.5314 secs
Epoch: 200, mean_losses: 0.0187, 0.0000, total_reward: 123.0, in 52.0828 secs
Epoch: 300, mean_losses: 0.0186, 0.0000, total_reward: 200.0, in 52.6625 secs
Epoch: 400, mean_losses: 0.0196, 0.0000, total_reward: 129.

Epoch: 600, mean_losses: 0.0199, 0.0000, total_reward: 49.0, in 40.9173 secs
Epoch: 700, mean_losses: 0.0210, 0.0000, total_reward: 155.0, in 40.4764 secs
Epoch: 800, mean_losses: 0.0214, 0.0000, total_reward: 200.0, in 44.0903 secs
Epoch: 900, mean_losses: 0.0214, 0.0000, total_reward: 200.0, in 44.1867 secs
Epoch: 1000, mean_losses: 0.0215, 0.0000, total_reward: 191.0, in 40.4146 secs
############# Goal Summary ############           
Number of achieved goals: 0
Max mean reward over 100 trials achieved at epoch 451 with reward 186.42

# Training: Model 25
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.0220, 0.0000, total_reward: 153.0, in 27.1063 secs
Epoch: 200, mean_losses: 0.0181, 0.0000, total_reward: 200.0, in 43.3068 secs
Epoch: 300, mean_losses: 0.0175, 0.0000, total_reward: 100.0, in 44.6102 secs
Epoch: 400, mean_losses: 0.0174, 0.0000, total_reward: 200.0, in 42.5727 secs
Epoch: 500, mean_losses: 0.0170, 0.0000, total_reward: 200.0

Epoch: 700, mean_losses: 0.0340, 0.0000, total_reward: 200.0, in 45.6174 secs
Epoch: 800, mean_losses: 0.0387, 0.0000, total_reward: 200.0, in 40.1444 secs
Epoch: 900, mean_losses: 0.0429, 0.0000, total_reward: 200.0, in 41.1532 secs
Epoch: 1000, mean_losses: 0.0457, 0.0000, total_reward: 167.0, in 38.7179 secs
############# Goal Summary ############           
Number of achieved goals: 0
Max mean reward over 100 trials achieved at epoch 698 with reward 172.69

# Training: Model 33
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.0570, 0.0000, total_reward: 111.0, in 34.0823 secs
Epoch: 200, mean_losses: 0.0572, 0.0000, total_reward: 200.0, in 46.1437 secs
Epoch: 300, mean_losses: 0.0578, 0.0000, total_reward: 179.0, in 40.3954 secs
Epoch: 400, mean_losses: 0.0568, 0.0000, total_reward: 170.0, in 41.4285 secs
Epoch: 500, mean_losses: 0.0539, 0.0000, total_reward: 200.0, in 42.5771 secs
Epoch: 600, mean_losses: 0.0516, 0.0000, total_reward: 110.

Epoch: 800, mean_losses: 0.1057, 0.0000, total_reward: 108.0, in 42.6114 secs
Epoch: 900, mean_losses: 0.1082, 0.0000, total_reward: 200.0, in 42.7939 secs
Epoch: 1000, mean_losses: 0.1104, 0.0000, total_reward: 150.0, in 44.0101 secs
############# Goal Summary ############           
Number of achieved goals: 0
Max mean reward over 100 trials achieved at epoch 644 with reward 160.63

# Training: Model 41
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.0565, 0.0000, total_reward: 200.0, in 37.8635 secs
Epoch: 200, mean_losses: 0.0499, 0.0000, total_reward: 197.0, in 43.4141 secs
Epoch: 300, mean_losses: 0.0471, 0.0000, total_reward: 129.0, in 42.6479 secs
Epoch: 400, mean_losses: 0.0449, 0.0000, total_reward: 146.0, in 43.3179 secs
Epoch: 500, mean_losses: 0.0417, 0.0000, total_reward: 115.0, in 42.4279 secs
Epoch: 600, mean_losses: 0.0387, 0.0000, total_reward: 142.0, in 41.3528 secs
Epoch: 700, mean_losses: 0.0365, 0.0000, total_reward: 169.

Epoch: 900, mean_losses: 0.0749, 0.0000, total_reward: 200.0, in 40.6610 secs
Epoch: 1000, mean_losses: 0.0739, 0.0000, total_reward: 67.0, in 42.1083 secs
############# Goal Summary ############           
Number of achieved goals: 0
Max mean reward over 100 trials achieved at epoch 287 with reward 176.71

# Training: Model 49
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.0170, 0.0000, total_reward: 200.0, in 20.3560 secs
Epoch: 200, mean_losses: 0.0157, 0.0000, total_reward: 157.0, in 47.5421 secs
Epoch: 300, mean_losses: 0.0157, 0.0000, total_reward: 155.0, in 44.0347 secs
Epoch: 400, mean_losses: 0.0153, 0.0000, total_reward: 200.0, in 38.0384 secs
Epoch: 500, mean_losses: 0.0149, 0.0000, total_reward: 200.0, in 41.6793 secs
Epoch: 600, mean_losses: 0.0146, 0.0000, total_reward: 200.0, in 42.9326 secs
Epoch: 700, mean_losses: 0.0147, 0.0000, total_reward: 200.0, in 45.8003 secs
Epoch: 800, mean_losses: 0.0149, 0.0000, total_reward: 100.0

In [8]:
# Display the optimiser's `max` entry: a dict with the 'target' value and the
# 'params' that produced it (presumably the best observation so far).
optimizer.max

{'target': 178.804,
 'params': {'df': 0.9955604903914581,
  'lr': 0.9888760668246095,
  'tau': 0.6709917974498818,
  'update_interval': 23.761169610884398}}

In [12]:
# Continue the search: feed the JSON log written so far back into the
# optimiser, then run another round of maximisation.
old_log = log_file
# `util` is the bayes_opt.util module imported in the first cell, so this is
# the very same function without a second, mid-notebook import statement.
load_logs = util.load_logs

load_logs(optimizer, logs=[old_log])

resume_settings = dict(init_points=1, n_iter=40, acq="ei")
optimizer.maximize(**resume_settings)


# Training: Model 102
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.0093, 0.0000, total_reward: 10.0, in 3.4066 secs
Epoch: 200, mean_losses: 0.0054, 0.0000, total_reward: 10.0, in 2.2112 secs
Epoch: 300, mean_losses: 0.0039, 0.0000, total_reward: 9.0, in 2.7361 secs
Epoch: 400, mean_losses: 0.0031, 0.0000, total_reward: 8.0, in 2.3423 secs
Epoch: 500, mean_losses: 0.0026, 0.0000, total_reward: 9.0, in 2.4739 secs
Epoch: 600, mean_losses: 0.0023, 0.0000, total_reward: 10.0, in 2.2023 secs
Epoch: 700, mean_losses: 0.0020, 0.0000, total_reward: 9.0, in 2.4996 secs
Epoch: 800, mean_losses: 0.0018, 0.0000, total_reward: 10.0, in 2.5489 secs
Epoch: 900, mean_losses: 0.0017, 0.0000, total_reward: 10.0, in 2.6727 secs
Epoch: 1000, mean_losses: 0.0015, 0.0000, total_reward: 10.0, in 2.4655 secs
############# Goal Summary ############           
Number of achieved goals: 0
Max mean reward over 100 trials achieved at epoch 100 with reward 11.68

# Tr

Epoch: 300, mean_losses: 0.0068, 0.0000, total_reward: 8.0, in 2.2097 secs
Epoch: 400, mean_losses: 0.0053, 0.0000, total_reward: 10.0, in 2.5402 secs
Epoch: 500, mean_losses: 0.0044, 0.0000, total_reward: 9.0, in 2.3982 secs
Epoch: 600, mean_losses: 0.0037, 0.0000, total_reward: 8.0, in 2.5030 secs
Epoch: 700, mean_losses: 0.0032, 0.0000, total_reward: 9.0, in 2.1927 secs
Epoch: 800, mean_losses: 0.0029, 0.0000, total_reward: 9.0, in 2.6984 secs
Epoch: 900, mean_losses: 0.0026, 0.0000, total_reward: 9.0, in 2.5180 secs
Epoch: 1000, mean_losses: 0.0024, 0.0000, total_reward: 9.0, in 2.3800 secs
############# Goal Summary ############           
Number of achieved goals: 0
Max mean reward over 100 trials achieved at epoch 100 with reward 10.73

# Training: Model 111
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.0196, 0.0000, total_reward: 10.0, in 2.6236 secs
Epoch: 200, mean_losses: 0.0108, 0.0000, total_reward: 9.0, in 2.4337 secs
Epoch: 30

Epoch: 800, mean_losses: 0.0016, 0.0000, total_reward: 11.0, in 3.7001 secs
Epoch: 900, mean_losses: 0.0014, 0.0000, total_reward: 22.0, in 3.3967 secs
Epoch: 1000, mean_losses: 0.0013, 0.0000, total_reward: 11.0, in 3.7367 secs
############# Goal Summary ############           
Number of achieved goals: 0
Max mean reward over 100 trials achieved at epoch 104 with reward 40.09

# Training: Model 127
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.0011, 0.0000, total_reward: 15.0, in 3.0049 secs
Epoch: 200, mean_losses: 0.0006, 0.0000, total_reward: 9.0, in 2.9733 secs
Epoch: 300, mean_losses: 0.0004, 0.0000, total_reward: 8.0, in 2.3458 secs
Epoch: 400, mean_losses: 0.0003, 0.0000, total_reward: 10.0, in 3.8572 secs
Epoch: 500, mean_losses: 0.0003, 0.0000, total_reward: 10.0, in 2.4038 secs
Epoch: 600, mean_losses: 0.0002, 0.0000, total_reward: 9.0, in 2.2601 secs
Epoch: 700, mean_losses: 0.0002, 0.0000, total_reward: 8.0, in 2.5548 secs
Epoch


# Training: Model 135
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_losses: 0.0154, 0.0000, total_reward: 10.0, in 2.3663 secs
Epoch: 200, mean_losses: 0.0083, 0.0000, total_reward: 11.0, in 2.5028 secs
Epoch: 300, mean_losses: 0.0058, 0.0000, total_reward: 10.0, in 2.1434 secs
Epoch: 400, mean_losses: 0.0045, 0.0000, total_reward: 10.0, in 2.1437 secs
Epoch: 500, mean_losses: 0.0037, 0.0000, total_reward: 10.0, in 2.5037 secs
Epoch: 600, mean_losses: 0.0032, 0.0000, total_reward: 10.0, in 2.2527 secs
Epoch: 700, mean_losses: 0.0028, 0.0000, total_reward: 9.0, in 2.5427 secs
Epoch: 800, mean_losses: 0.0025, 0.0000, total_reward: 10.0, in 2.3149 secs
Epoch: 900, mean_losses: 0.0023, 0.0000, total_reward: 10.0, in 2.6310 secs
Epoch: 1000, mean_losses: 0.0021, 0.0000, total_reward: 8.0, in 2.6125 secs
############# Goal Summary ############           
Number of achieved goals: 0
Max mean reward over 100 trials achieved at epoch 100 with reward 10.17

# 