In [6]:
import os
import sys
import gym
import numpy as np
import configparser
from datetime import datetime
from itertools import product
# Import Tensorflow
import tensorflow as tf
import tensorflow.compat.v1 as tf_v1

# Insert module root directory to sys.path

In [1]:
ROOT_DIR = os.path.abspath(os.path.join(os.path.abspath(""), '..'))
sys.path.insert(0, ROOT_DIR)
from run import MountainCar_v0
from src import ReplayBuffer
from src.Utils import get_logger, eval_dict_values

This function generates **init_kwargs** and **train_kwargs**. *init_kwargs* contains keyworded-arguments to pass while initializing the agent whereas *train_kwargs* contains keyworded-arguments to pass while training the agent.

def parameter_generator_old(eps_decays, exp_intervals, target_update_steps, explore_ratios, ddqns, seeds):
    for eps_decay in eps_decays:
        for exp_interval in exp_intervals:
            for target_update_step in target_update_steps:
                for exp_ratio in explore_ratios:
                    for ddqn in ddqns:
                        for seed in seeds:
                            init_kwargs = {"eps_decay": eps_decay, "explore_exploit_interval":exp_interval, "DDQN":ddqn}
                            train_kwargs = {"target_update_steps":target_update_step, "explore_ratio":exp_ratio}
                            yield seed, init_kwargs, train_kwargs

def parameter_generator(eps_decays, exp_intervals, target_update_steps, explore_ratios, ddqns, seeds):
    init_parameters = product(eps_decays, exp_intervals, ddqns)
    train_parameters = product(target_update_steps, explore_ratios)
    for para in product(init_parameters, train_parameters, seeds):
        init_args, train_args, seed = para
        init_kwargs = {"eps_decay": init_args[0], "explore_exploit_interval":init_args[1], "DDQN":init_args[2]}
        train_kwargs = {"target_update_steps":train_args[0], "explore_ratio":train_args[1]}
        yield seed, init_kwargs, train_kwargs

The following function receives parameters like tensorflow **sess**, OpenAI gym **env**, model id and keyworded arguments to initialize and train the agent.  

The function performs following tasks:
- Creates a *ReplayBuffer* to store transitions **(state, action, reward, next_state, done)** of the agent.
- Creates an *agent*
- Creates tf.train.Saver object to save/restore agent model (as checkpoints)
- Trains the agent for given number of episodes, measures goal achievements, logs the summaries of each epoch (average loss, epoch length, maximum position, total epoch reward) in Tensorboard and returns the final summaries
- Saves trained model
- Plots the summaries using matplotlib if **plot_result** is set to True.
- Returns only the goal summary (number of achieved goals, first achieved goal, goal with most reward) where the format of each goal is *(epoch and epoch reward)*

"""
Agent init:
1. Fixed parameters: sess, env, eps, lr, df, tau, render, (change later) [mem, batch_size]
2. Variable parameters: eps_decay, explore_exploit_interval, DDQN

Agent train:
1. Fixed parameters: NUM_EPISODES, display_every, goal_trials, goal_reward
2. Variable parameters: target_update_steps, explore_ratio
"""
def run(sess, env, model_dir, summ_dir, model_i, init_kwargs, train_kwargs, plot_result=False):
    model_name = "Model {}".format(model_i)
    print("# Training: {}".format(model_name))
    summ_dir = os.path.join(summ_dir, model_name)
    # create replay buffer, agent and (model checkpoint) saver
    mem = ReplayBuffer(160_000)
    agent = MountainCar(sess, env, mem, batch_size=100, eps=(1, 0.001), lr=0.97, 
                           df=0.99, tau=1, render=render, summ_dir=summ_dir, 
                           **init_kwargs)
    # TODO: Increase this size
    saver = tf.train.Saver(max_to_keep=100)
    sess.run(tf.global_variables_initializer())
    results= agent.train(NUM_EPISODES, display_every=DISPLAY_RATE,
                         goal_trials=100, goal_reward=-110.0,
                         **train_kwargs)
    saver.save(sess, os.path.join(model_dir, model_name))
    *results, goal_summary = results
    if plot_result:
        for p, plt_name in zip(results, ("Losses", "Rewards", "Max pos", "Epoch length")):
            plt.plot(p)
            plt.xlabel('Episodes')
            plt.ylabel(plt_name)
            plt.show()
            plt.close("all")
    return goal_summary

# Explore logs
**8th Sept 2019 - 22:53** 
- eps_decay:(0.1, 0.5, 3), 
- explore_interval:(5, 20, 4), 
- target_update_steps:(5, 20, 4), 
- explore_ratios:(0.1, 0.5, 3), 
- random seeds:(100, 1000, 2)

**12th Sept 2019 - 23:47** 
- eps_decay:(0.1, 0.5, 5), 
- explore_interval:(10, 40, 4), 
- target_update_steps:(10, 1), 
- explore_ratios:(0.25, 1), 
- random seeds:(100, 1000, 2)

In [None]:
LOG_DIR = "log"
SUMM_DIR = "summaries"
CONFIG_DIR = "config"
ENV_NAME = "MountainCar-v0"
SEEDS = np.random.randint(100, 1000, size=1, dtype=np.uint16)
TF_CONFIG = tf_v1.ConfigProto(gpu_options=tf_v1.GPUOptions(per_process_gpu_memory_fraction=0.5), 
                              allow_soft_placement=True)

In [None]:
if __name__ == "__main__":
    # Default summary directoy, log and config file
    date_time = datetime.now().strftime("%d.%m.%Y %H.%M")
    summ_dir = os.path.join(SUMM_DIR, "{} {}".format(ENV_NAME, date_time))
    log_file = os.path.join(summ_dir, LOG_DIR, "Results {} {}.log".format(ENV_NAME, date_time))
    config_file = os.path.join(CONFIG_DIR, "MountainCar-v0.ini")
    # Parse configuration from config file
    config_parser = configparser.ConfigParser()
    """
    Somehow in Jupyter Notebook config_parser keeps looking in ROOT_DIR\notebooks\config_dir 
    instead of ROOT_DIR\config_dir, even though the ROOT_DIR is in sys.path. 
    Therefore, explicit addressing is required in Jupyter Notebook.
    """
    config_parser.read(os.path.join(ROOT_DIR, config_file))
    # Load configurations from config file
    init_kwargs = eval_dict_values(config_parser["init_kwargs"])
    train_kwargs = eval_dict_values(config_parser["train_kwargs"])
    mem_size = eval(config_parser["others"].get("mem_size", 50_000))
    # Setup logger
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    logger = get_logger(log_file)
    # Create environment and replay buffer
    env = gym.make(ENV_NAME)
    mem = ReplayBuffer(mem_size)
    if record_interval > 0:
        # Wrap environment with Monitor wrapper to record videos
        env = gym.wrappers.Monitor(env, os.path.join(summ_dir, "videos"), force=True,
                                   video_callable=lambda epoch: not epoch%record_interval)
    # Testing model
    if test_model_chkpt is not None:
        # Override goal_trials and display_every parameters
        train_kwargs["goal_trials"] = 1
        train_kwargs["display_every"] = train_kwargs["epochs"]/10
    for init_kwargs, train_kwargs in 
    # Run the program
    MountainCar_v0.run(env, SEEDS, mem, logger, summ_dir, init_kwargs, train_kwargs, 
                       log_init_kwargs, log_train_kwargs, plot_result, sess_config=TF_CONFIG, 
                       test_model_chkpt=test_model_chkpt)

The agent is trained with different combinations of the following parameters:

In [4]:
DDQNS = [True]
EPS_DECAYS = np.linspace(0.1, 0.5, num=5, dtype=np.float16)
EXPLORE_EXPLOIT_INTERVALS = np.linspace(10, 40, num=4, dtype=np.uint16)
TARGET_UPDATE_STEPS = [10]#np.linspace(5, 20, num=4, dtype=np.uint16)
EXPLORE_RATIOS = [0.25]#np.linspace(0.1, 0.5, num=3, dtype=np.float16)
SEEDS = np.random.randint(100, 1000, size=2, dtype=np.uint16)

In [1]:
def parameter_generator(eps_decays, exp_intervals, target_update_steps, explore_ratios, ddqns):
    init_parameters = product(eps_decays, ddqns)
    train_parameters = product(target_update_steps, exp_intervals, explore_ratios)
    for param in product(init_parameters, train_parameters):
        init_args, train_args= param
        init_kwargs = {"eps_decay": init_args[0], 
                       "ddqn":init_args[1]}
        train_kwargs = {"target_update_steps":train_args[0], 
                        "explore_exploit_interval":init_args[1], 
                        "explore_ratio":train_args[2]}
        yield init_kwargs, train_kwargs

In [7]:
for k1, k2 in parameter_generator(EPS_DECAYS, EXPLORE_EXPLOIT_INTERVALS, TARGET_UPDATE_STEPS, EXPLORE_RATIOS, DDQNS):
    print(k1, k2)

{'eps_decay': 0.1, 'ddqn': True} {'target_update_steps': 10, 'explore_exploit_interval': True, 'explore_ratio': 0.25}
{'eps_decay': 0.1, 'ddqn': True} {'target_update_steps': 10, 'explore_exploit_interval': True, 'explore_ratio': 0.25}
{'eps_decay': 0.1, 'ddqn': True} {'target_update_steps': 10, 'explore_exploit_interval': True, 'explore_ratio': 0.25}
{'eps_decay': 0.1, 'ddqn': True} {'target_update_steps': 10, 'explore_exploit_interval': True, 'explore_ratio': 0.25}
{'eps_decay': 0.2, 'ddqn': True} {'target_update_steps': 10, 'explore_exploit_interval': True, 'explore_ratio': 0.25}
{'eps_decay': 0.2, 'ddqn': True} {'target_update_steps': 10, 'explore_exploit_interval': True, 'explore_ratio': 0.25}
{'eps_decay': 0.2, 'ddqn': True} {'target_update_steps': 10, 'explore_exploit_interval': True, 'explore_ratio': 0.25}
{'eps_decay': 0.2, 'ddqn': True} {'target_update_steps': 10, 'explore_exploit_interval': True, 'explore_ratio': 0.25}
{'eps_decay': 0.3, 'ddqn': True} {'target_update_steps':

# Explore plan
1. Set parameters as far apart as possible.
2. Determine parameter ranges that produce reasonably good results
3. Set parameters within this ranges or explore new parameter range
4. Repeat step 2  

# Further plan
- Create class to explore hyperparameters. 
- The class includes parameter_generator and run methods.
- It would probably have a context manager option to save and load trained parameters information
- Store and plot achieved goal information

In [3]:
class Hyperparameter_Checker:
    def __init__(self, env, mem, model_dir, summ_dir, log_file, start_index=1, saver_max_size=100, 
                 batch_size=100, display_every=100, epochs=1000, render=False):
        self.env = env
        self.mem = mem
        self.saver_max_size = saver_max_size
        self.display_every = display_every
        self.start_index = start_index
        self.batch_size = batch_size
        self.epochs = epochs
        self.render = render
        self.model_dir = model_dir
        self.summ_dir = summ_dir
        self.log_file = log_file
        self.logger = get_logger(self.log_file)
        
    @staticmethod
    def parameter_generator(eps_decays, exp_intervals, target_update_steps, explore_ratios, ddqns, seeds):
        init_parameters = product(eps_decays, exp_intervals, ddqns)
        train_parameters = product(target_update_steps, explore_ratios)
        for param in product(init_parameters, train_parameters, seeds):
            init_args, train_args, seed = param
            init_kwargs = {"eps_decay": init_args[0], "explore_exploit_interval":init_args[1], "ddqn":init_args[2]}
            train_kwargs = {"target_update_steps":train_args[0], "explore_ratio":train_args[1]}
            yield seed, init_kwargs, train_kwargs
    
    def run(self, sess, model_i, init_kwargs, train_kwargs, seed=None, plot_result=False):
        model_name = "Model {}".format(model_i)
        print("# Training: {}".format(model_name))
        summ_dir = os.path.join(self.summ_dir, model_name)
        agent = MountainCar(sess, self.env, self.mem, batch_size=self.batch_size, eps_limits=(1, 0.001), 
                               lr=0.97, df=0.99, tau=1, render=self.render, summ_dir=summ_dir, 
                               **init_kwargs)
        saver = tf.train.Saver(max_to_keep=self.saver_max_size)
        sess.run(tf.global_variables_initializer())
        results= agent.train(self.epochs, display_every=self.display_every, 
                             goal_trials=100, goal_reward=-110.0, 
                             **train_kwargs)
        saver.save(sess, os.path.join(self.model_dir, model_name))
        *results, goal_summary = results
        if plot_result:
            for p, plt_name in zip(results, ("Losses", "Rewards", "Max pos", "Epoch length")):
                plt.plot(p)
                plt.xlabel('Episodes')
                plt.ylabel(plt_name)
                plt.show()
                plt.close("all")
        parameter_dict = {"seed":seed, **init_kwargs, **train_kwargs}
        agent.log(self.logger, model_i, parameter_dict, goal_summary)
    
    def check_parameters(self, *args, plot_result=False):
        para_gen = self.parameter_generator(*args)
        for model_i, (seed, init_kwargs, train_kwargs) in enumerate(para_gen, start=1):
            if model_i < self.start_index:
                continue                        # Skip parameters until the model_i >= start_index 
            self.env.seed(int(seed))
            with tf.Session(config=TF_CONFIG) as sess:
                self.run(sess, model_i, seed=seed, init_kwargs=init_kwargs,
                         train_kwargs=train_kwargs, plot_result=plot_result)
            tf.compat.v1.reset_default_graph()
            self.mem.clear()
             """
            parameter_str = dict2str(parameter_dict)
            self.logger.debug("Model {:<2} - {}".format(model_i, parameter_str))
            num_goals, first_goal, max_goal = goal_summary
            first_goal_epoch, first_goal_reward = first_goal 
            max_goal_epoch, max_goal_reward = max_goal
            self.logger.debug("Goals achieved: {}".format(num_goals))
            if num_goals:
                self.logger.info("First goal achieved: {:.2f} mean reward at {} epoch.".format(first_goal_reward, first_goal_epoch))
            self.logger.info("Max goal achieved: {:.2f} mean reward at {} epoch.\n".format(max_goal_reward, max_goal_epoch))"""

Global variables

In [4]:
LOG_DIR = "Log"
ENV_NAME = "MountainCar-v0"
DISPLAY_RATE = 250
NUM_EPISODES = 1000
render = False
TF_CONFIG = tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.25), 
                           allow_soft_placement=True)
if not os.path.exists(LOG_DIR):
    os.mkdir(LOG_DIR)

Following tasks are performed in the block below:
- Generate different combinations of parameters
- Initialize the agent with parameters in **init_kwargs**
- Train the agent with parameters in **train_kwargs**
- Reset tensorflow graph (to reuse the graph to train future models)
- Log goal summary

In [5]:
if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    mem = ReplayBuffer(50_000)
    """model_dir = r'models\Model 08.09.2019 23.05'
    summ_dir = r'summaries\MountainCar-v0 08.09.2019 23.05'
    log_file = r'Log\Results 08.09.2019 23.05.txt'"""
    date_time = datetime.now().strftime("%d.%m.%Y %H.%M")
    model_dir = os.path.join("models", "Model {}".format(date_time))
    summ_dir = os.path.join("summaries", "{} {}".format(ENV_NAME, date_time))
    log_file = os.path.join(LOG_DIR, "Results {}.txt".format(date_time))
    checker = Hyperparameter_Checker(env, mem, model_dir, summ_dir, log_file, start_index=1, 
                                     saver_max_size=300, display_every=DISPLAY_RATE, epochs=NUM_EPISODES,
                                     render=render)
    checker.check_parameters(EPS_DECAYS, EXPLORE_EXPLOIT_INTERVALS, TARGET_UPDATE_STEPS, 
                             EXPLORE_RATIOS, DDQNS, SEEDS)

W0913 00:06:05.869809  8472 deprecation.py:506] From c:\users\raj k\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0913 00:06:05.979811  8472 deprecation.py:323] From c:\users\raj k\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\losses\losses_impl.py:121: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


# Training: Model 1
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 2.9343, total_reward: -143.0, max_pos: 0.5059,  in 266.3583 secs
Epoch: 500, mean_loss: 0.8678, total_reward: -103.0, max_pos: 0.5135,  in 184.2229 secs
Epoch: 750, mean_loss: 0.1018, total_reward: -200.0, max_pos: -0.0846,  in 159.7277 secs
Epoch: 1000, mean_loss: 0.2040, total_reward: -97.0, max_pos: 0.5052,  in 155.1219 secs


First goal achieved: -109.99 mean reward at 703 epoch.
Max goal achieved: -105.18 mean reward at 890 epoch.



# Training: Model 2
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.5008, total_reward: -174.0, max_pos: 0.5044,  in 213.1094 secs
Epoch: 500, mean_loss: 0.1313, total_reward: -116.0, max_pos: 0.5105,  in 168.3576 secs
Epoch: 750, mean_loss: 0.1695, total_reward: -171.0, max_pos: 0.5053,  in 163.2149 secs
Epoch: 1000, mean_loss: 0.1052, total_reward: -140.0, max_pos: 0.5369,  in 158.5637 secs


First goal achieved: -109.88 mean reward at 486 epoch.
Max goal achieved: -105.55 mean reward at 556 epoch.



# Training: Model 3
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.4848, total_reward: -150.0, max_pos: 0.5369,  in 215.4989 secs
Epoch: 500, mean_loss: 0.4096, total_reward: -98.0, max_pos: 0.5168,  in 182.8998 secs
Epoch: 750, mean_loss: 0.1913, total_reward: -111.0, max_pos: 0.5251,  in 166.8978 secs
Epoch: 1000, mean_loss: 0.2037, total_reward: -104.0, max_pos: 0.5201,  in 157.7732 secs


First goal achieved: -109.71 mean reward at 908 epoch.
Max goal achieved: -106.78 mean reward at 918 epoch.



# Training: Model 4
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 3.2807, total_reward: -200.0, max_pos: 0.2555,  in 270.0422 secs
Epoch: 500, mean_loss: 0.5327, total_reward: -160.0, max_pos: 0.5369,  in 189.8401 secs
Epoch: 750, mean_loss: 0.0716, total_reward: -106.0, max_pos: 0.5073,  in 165.2046 secs
Epoch: 1000, mean_loss: 0.0712, total_reward: -121.0, max_pos: 0.5109,  in 153.0079 secs


First goal achieved: -109.47 mean reward at 691 epoch.
Max goal achieved: -103.39 mean reward at 778 epoch.



# Training: Model 5
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 0.5877, total_reward: -116.0, max_pos: 0.5241,  in 200.4850 secs
Epoch: 500, mean_loss: 0.1541, total_reward: -92.0, max_pos: 0.5115,  in 154.5413 secs
Epoch: 750, mean_loss: 0.1527, total_reward: -103.0, max_pos: 0.5069,  in 154.7825 secs
Epoch: 1000, mean_loss: 0.1285, total_reward: -195.0, max_pos: 0.5369,  in 156.8226 secs


First goal achieved: -109.99 mean reward at 440 epoch.
Max goal achieved: -102.40 mean reward at 696 epoch.



# Training: Model 6
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.4924, total_reward: -176.0, max_pos: 0.5214,  in 230.0499 secs
Epoch: 500, mean_loss: 0.1782, total_reward: -115.0, max_pos: 0.5454,  in 187.6773 secs
Epoch: 750, mean_loss: 0.0849, total_reward: -112.0, max_pos: 0.5268,  in 165.8811 secs
Epoch: 1000, mean_loss: 0.0916, total_reward: -116.0, max_pos: 0.5168,  in 157.3814 secs


First goal achieved: -109.86 mean reward at 544 epoch.
Max goal achieved: -103.87 mean reward at 926 epoch.



# Training: Model 7
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 2.2192, total_reward: -149.0, max_pos: 0.5407,  in 223.2527 secs
Epoch: 500, mean_loss: 0.1585, total_reward: -87.0, max_pos: 0.5045,  in 164.2207 secs
Epoch: 750, mean_loss: 0.0476, total_reward: -103.0, max_pos: 0.5000,  in 146.5226 secs
Epoch: 1000, mean_loss: 0.0486, total_reward: -96.0, max_pos: 0.5133,  in 153.2426 secs


First goal achieved: -109.72 mean reward at 491 epoch.
Max goal achieved: -100.69 mean reward at 686 epoch.



# Training: Model 8
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.5654, total_reward: -169.0, max_pos: 0.5408,  in 234.7432 secs
Epoch: 500, mean_loss: 0.1637, total_reward: -105.0, max_pos: 0.5196,  in 168.6585 secs
Epoch: 750, mean_loss: 0.0591, total_reward: -103.0, max_pos: 0.5121,  in 147.9427 secs
Epoch: 1000, mean_loss: 0.2665, total_reward: -107.0, max_pos: 0.5139,  in 168.5768 secs


First goal achieved: -109.98 mean reward at 487 epoch.
Max goal achieved: -103.02 mean reward at 740 epoch.



# Training: Model 9
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 2.0339, total_reward: -128.0, max_pos: 0.5366,  in 234.0919 secs
Epoch: 500, mean_loss: 0.1968, total_reward: -106.0, max_pos: 0.5113,  in 176.2980 secs
Epoch: 750, mean_loss: 0.1848, total_reward: -157.0, max_pos: 0.5058,  in 169.4055 secs
Epoch: 1000, mean_loss: 0.3459, total_reward: -121.0, max_pos: 0.5117,  in 180.3913 secs


Max goal achieved: -112.45 mean reward at 993 epoch.



# Training: Model 10
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.0492, total_reward: -106.0, max_pos: 0.5109,  in 200.4614 secs
Epoch: 500, mean_loss: 0.1386, total_reward: -110.0, max_pos: 0.5383,  in 163.3435 secs
Epoch: 750, mean_loss: 0.2454, total_reward: -109.0, max_pos: 0.5275,  in 156.7363 secs
Epoch: 1000, mean_loss: 0.1114, total_reward: -106.0, max_pos: 0.5186,  in 166.6664 secs


First goal achieved: -109.92 mean reward at 494 epoch.
Max goal achieved: -101.88 mean reward at 662 epoch.



# Training: Model 11
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 0.7892, total_reward: -114.0, max_pos: 0.5144,  in 192.4873 secs
Epoch: 500, mean_loss: 0.0716, total_reward: -90.0, max_pos: 0.5061,  in 154.9714 secs
Epoch: 750, mean_loss: 0.0675, total_reward: -137.0, max_pos: 0.5053,  in 161.7462 secs
Epoch: 1000, mean_loss: 0.1319, total_reward: -94.0, max_pos: 0.5006,  in 157.7278 secs


First goal achieved: -109.84 mean reward at 453 epoch.
Max goal achieved: -102.72 mean reward at 538 epoch.



# Training: Model 12
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.0747, total_reward: -105.0, max_pos: 0.5075,  in 210.3209 secs
Epoch: 500, mean_loss: 0.1347, total_reward: -107.0, max_pos: 0.5281,  in 162.1171 secs
Epoch: 750, mean_loss: 0.0576, total_reward: -103.0, max_pos: 0.5255,  in 147.5844 secs
Epoch: 1000, mean_loss: 0.6476, total_reward: -117.0, max_pos: 0.5275,  in 195.4626 secs


First goal achieved: -109.85 mean reward at 457 epoch.
Max goal achieved: -100.81 mean reward at 641 epoch.



# Training: Model 13
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 2.1481, total_reward: -110.0, max_pos: 0.5246,  in 235.7889 secs
Epoch: 500, mean_loss: 0.2566, total_reward: -88.0, max_pos: 0.5078,  in 175.0849 secs
Epoch: 750, mean_loss: 0.1032, total_reward: -106.0, max_pos: 0.5166,  in 169.8058 secs
Epoch: 1000, mean_loss: 0.1056, total_reward: -108.0, max_pos: 0.5251,  in 156.6637 secs


First goal achieved: -109.68 mean reward at 828 epoch.
Max goal achieved: -105.23 mean reward at 922 epoch.



# Training: Model 14
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.2501, total_reward: -111.0, max_pos: 0.5012,  in 209.8535 secs
Epoch: 500, mean_loss: 0.1115, total_reward: -146.0, max_pos: 0.5092,  in 168.2120 secs
Epoch: 750, mean_loss: 0.2076, total_reward: -122.0, max_pos: 0.5339,  in 172.9032 secs
Epoch: 1000, mean_loss: 0.2238, total_reward: -114.0, max_pos: 0.5046,  in 183.3610 secs


Max goal achieved: -110.37 mean reward at 477 epoch.



# Training: Model 15
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 2.6961, total_reward: -110.0, max_pos: 0.5398,  in 230.5918 secs
Epoch: 500, mean_loss: 0.1538, total_reward: -104.0, max_pos: 0.5080,  in 173.7329 secs
Epoch: 750, mean_loss: 0.1374, total_reward: -119.0, max_pos: 0.5135,  in 159.9844 secs
Epoch: 1000, mean_loss: 0.2237, total_reward: -121.0, max_pos: 0.5007,  in 168.6535 secs


First goal achieved: -109.47 mean reward at 561 epoch.
Max goal achieved: -107.73 mean reward at 565 epoch.



# Training: Model 16
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.7333, total_reward: -138.0, max_pos: 0.5058,  in 219.7741 secs
Epoch: 500, mean_loss: 0.1340, total_reward: -150.0, max_pos: 0.5133,  in 198.3640 secs
Epoch: 750, mean_loss: 0.1831, total_reward: -106.0, max_pos: 0.5064,  in 177.4350 secs
Epoch: 1000, mean_loss: 0.1559, total_reward: -112.0, max_pos: 0.5367,  in 166.5221 secs


First goal achieved: -109.84 mean reward at 827 epoch.
Max goal achieved: -109.35 mean reward at 834 epoch.



# Training: Model 17
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.6865, total_reward: -173.0, max_pos: 0.5152,  in 233.8512 secs
Epoch: 500, mean_loss: 0.1890, total_reward: -87.0, max_pos: 0.5113,  in 177.8812 secs
Epoch: 750, mean_loss: 0.4426, total_reward: -154.0, max_pos: 0.5257,  in 169.4072 secs
Epoch: 1000, mean_loss: 0.4804, total_reward: -100.0, max_pos: 0.5076,  in 167.5443 secs


Max goal achieved: -110.01 mean reward at 705 epoch.



# Training: Model 18
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 2.2643, total_reward: -176.0, max_pos: 0.5263,  in 229.7552 secs
Epoch: 500, mean_loss: 0.2520, total_reward: -119.0, max_pos: 0.5231,  in 165.0447 secs
Epoch: 750, mean_loss: 0.0940, total_reward: -117.0, max_pos: 0.5141,  in 168.8520 secs
Epoch: 1000, mean_loss: 0.7671, total_reward: -162.0, max_pos: 0.5244,  in 189.3947 secs


First goal achieved: -109.71 mean reward at 508 epoch.
Max goal achieved: -107.19 mean reward at 581 epoch.



# Training: Model 19
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.0889, total_reward: -200.0, max_pos: 0.4243,  in 222.6695 secs
Epoch: 500, mean_loss: 0.1186, total_reward: -133.0, max_pos: 0.5243,  in 177.7401 secs
Epoch: 750, mean_loss: 0.2015, total_reward: -106.0, max_pos: 0.5291,  in 159.1947 secs
Epoch: 1000, mean_loss: 0.1583, total_reward: -95.0, max_pos: 0.5147,  in 169.4934 secs


First goal achieved: -109.76 mean reward at 539 epoch.
Max goal achieved: -101.00 mean reward at 844 epoch.



# Training: Model 20
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.7926, total_reward: -137.0, max_pos: 0.5151,  in 220.9041 secs
Epoch: 500, mean_loss: 0.1246, total_reward: -105.0, max_pos: 0.5022,  in 170.5050 secs
Epoch: 750, mean_loss: 0.1329, total_reward: -105.0, max_pos: 0.5145,  in 165.8094 secs
Epoch: 1000, mean_loss: 0.2084, total_reward: -148.0, max_pos: 0.5369,  in 189.8364 secs


First goal achieved: -109.69 mean reward at 518 epoch.
Max goal achieved: -105.44 mean reward at 558 epoch.



# Training: Model 21
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.7768, total_reward: -154.0, max_pos: 0.5268,  in 235.0237 secs
Epoch: 500, mean_loss: 0.1768, total_reward: -101.0, max_pos: 0.5146,  in 175.5240 secs
Epoch: 750, mean_loss: 0.1073, total_reward: -104.0, max_pos: 0.5052,  in 148.3885 secs
Epoch: 1000, mean_loss: 0.2064, total_reward: -100.0, max_pos: 0.5096,  in 179.0574 secs


First goal achieved: -109.82 mean reward at 525 epoch.
Max goal achieved: -100.46 mean reward at 697 epoch.



# Training: Model 22
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 0.5865, total_reward: -113.0, max_pos: 0.5050,  in 209.7948 secs
Epoch: 500, mean_loss: 0.0752, total_reward: -105.0, max_pos: 0.5103,  in 156.4617 secs
Epoch: 750, mean_loss: 0.0561, total_reward: -105.0, max_pos: 0.5027,  in 154.7993 secs
Epoch: 1000, mean_loss: 0.1047, total_reward: -108.0, max_pos: 0.5182,  in 163.7310 secs


First goal achieved: -109.91 mean reward at 436 epoch.
Max goal achieved: -99.88 mean reward at 558 epoch.



# Training: Model 23
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 4.5058, total_reward: -200.0, max_pos: -0.4063,  in 277.0203 secs
Epoch: 500, mean_loss: 2.4840, total_reward: -87.0, max_pos: 0.5045,  in 240.2121 secs
Epoch: 750, mean_loss: 0.1131, total_reward: -148.0, max_pos: 0.5359,  in 192.4407 secs
Epoch: 1000, mean_loss: 0.1289, total_reward: -103.0, max_pos: 0.5078,  in 162.4567 secs


First goal achieved: -109.91 mean reward at 970 epoch.
Max goal achieved: -103.75 mean reward at 1000 epoch.



# Training: Model 24
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.9288, total_reward: -161.0, max_pos: 0.5177,  in 236.4940 secs
Epoch: 500, mean_loss: 0.1143, total_reward: -105.0, max_pos: 0.5015,  in 183.2397 secs
Epoch: 750, mean_loss: 0.8264, total_reward: -164.0, max_pos: 0.5369,  in 198.8283 secs
Epoch: 1000, mean_loss: 0.2676, total_reward: -112.0, max_pos: 0.5167,  in 160.3707 secs


First goal achieved: -109.63 mean reward at 956 epoch.
Max goal achieved: -104.43 mean reward at 993 epoch.



# Training: Model 25
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 2.4363, total_reward: -200.0, max_pos: 0.3399,  in 238.4883 secs
Epoch: 500, mean_loss: 0.1119, total_reward: -87.0, max_pos: 0.5045,  in 170.2142 secs
Epoch: 750, mean_loss: 0.2350, total_reward: -200.0, max_pos: -0.1106,  in 163.4048 secs
Epoch: 1000, mean_loss: 0.2373, total_reward: -111.0, max_pos: 0.5128,  in 165.9842 secs


First goal achieved: -109.14 mean reward at 494 epoch.
Max goal achieved: -105.43 mean reward at 577 epoch.



# Training: Model 26
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 3.6653, total_reward: -160.0, max_pos: 0.5284,  in 275.4822 secs
Epoch: 500, mean_loss: 0.6212, total_reward: -144.0, max_pos: 0.5382,  in 190.9317 secs
Epoch: 750, mean_loss: 0.0951, total_reward: -109.0, max_pos: 0.5083,  in 175.0992 secs
Epoch: 1000, mean_loss: 0.1217, total_reward: -110.0, max_pos: 0.5175,  in 169.9652 secs


Max goal achieved: -111.79 mean reward at 818 epoch.



# Training: Model 27
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.8631, total_reward: -155.0, max_pos: 0.5197,  in 242.0336 secs
Epoch: 500, mean_loss: 0.3668, total_reward: -91.0, max_pos: 0.5001,  in 176.9696 secs
Epoch: 750, mean_loss: 0.2484, total_reward: -200.0, max_pos: -0.4794,  in 165.8898 secs
Epoch: 1000, mean_loss: 0.3413, total_reward: -100.0, max_pos: 0.5100,  in 174.1226 secs


First goal achieved: -109.84 mean reward at 541 epoch.
Max goal achieved: -102.75 mean reward at 581 epoch.



# Training: Model 28
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.5805, total_reward: -103.0, max_pos: 0.5217,  in 235.2102 secs
Epoch: 500, mean_loss: 0.2090, total_reward: -117.0, max_pos: 0.5048,  in 176.4830 secs
Epoch: 750, mean_loss: 0.0585, total_reward: -103.0, max_pos: 0.5046,  in 163.2884 secs
Epoch: 1000, mean_loss: 0.1564, total_reward: -111.0, max_pos: 0.5155,  in 179.0756 secs


First goal achieved: -109.87 mean reward at 637 epoch.
Max goal achieved: -106.07 mean reward at 676 epoch.



# Training: Model 29
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 0.9672, total_reward: -109.0, max_pos: 0.5251,  in 201.1118 secs
Epoch: 500, mean_loss: 0.0707, total_reward: -87.0, max_pos: 0.5045,  in 157.5159 secs
Epoch: 750, mean_loss: 0.7580, total_reward: -159.0, max_pos: 0.5093,  in 185.8383 secs
Epoch: 1000, mean_loss: 0.3524, total_reward: -130.0, max_pos: 0.5038,  in 176.1020 secs


First goal achieved: -109.66 mean reward at 460 epoch.
Max goal achieved: -103.33 mean reward at 556 epoch.



# Training: Model 30
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.8634, total_reward: -149.0, max_pos: 0.5429,  in 233.1951 secs
Epoch: 500, mean_loss: 0.2617, total_reward: -111.0, max_pos: 0.5269,  in 178.6010 secs
Epoch: 750, mean_loss: 0.1179, total_reward: -106.0, max_pos: 0.5075,  in 159.9608 secs
Epoch: 1000, mean_loss: 0.0905, total_reward: -123.0, max_pos: 0.5057,  in 178.7412 secs


First goal achieved: -109.74 mean reward at 560 epoch.
Max goal achieved: -105.73 mean reward at 593 epoch.



# Training: Model 31
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 2.2923, total_reward: -109.0, max_pos: 0.5092,  in 247.7342 secs
Epoch: 500, mean_loss: 0.2375, total_reward: -98.0, max_pos: 0.5160,  in 184.4436 secs
Epoch: 750, mean_loss: 0.1104, total_reward: -110.0, max_pos: 0.5089,  in 165.3132 secs
Epoch: 1000, mean_loss: 0.1552, total_reward: -94.0, max_pos: 0.5006,  in 178.4806 secs


First goal achieved: -110.00 mean reward at 753 epoch.
Max goal achieved: -107.77 mean reward at 801 epoch.



# Training: Model 32
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.9353, total_reward: -108.0, max_pos: 0.5093,  in 241.7991 secs
Epoch: 500, mean_loss: 0.5116, total_reward: -116.0, max_pos: 0.5361,  in 169.3740 secs
Epoch: 750, mean_loss: 0.0746, total_reward: -155.0, max_pos: 0.5178,  in 151.7585 secs
Epoch: 1000, mean_loss: 0.0756, total_reward: -155.0, max_pos: 0.5302,  in 163.5846 secs


First goal achieved: -109.82 mean reward at 623 epoch.
Max goal achieved: -103.37 mean reward at 703 epoch.



# Training: Model 33
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.8478, total_reward: -133.0, max_pos: 0.5349,  in 235.5952 secs
Epoch: 500, mean_loss: 0.4156, total_reward: -96.0, max_pos: 0.5207,  in 165.8315 secs
Epoch: 750, mean_loss: 0.0925, total_reward: -104.0, max_pos: 0.5151,  in 155.4280 secs
Epoch: 1000, mean_loss: 0.1992, total_reward: -100.0, max_pos: 0.5082,  in 156.7521 secs


First goal achieved: -109.97 mean reward at 537 epoch.
Max goal achieved: -104.62 mean reward at 811 epoch.



# Training: Model 34
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.9249, total_reward: -172.0, max_pos: 0.5259,  in 242.0428 secs
Epoch: 500, mean_loss: 0.2335, total_reward: -115.0, max_pos: 0.5196,  in 180.6443 secs
Epoch: 750, mean_loss: 0.2346, total_reward: -104.0, max_pos: 0.5182,  in 156.3839 secs
Epoch: 1000, mean_loss: 0.1198, total_reward: -111.0, max_pos: 0.5129,  in 166.7506 secs


First goal achieved: -109.95 mean reward at 559 epoch.
Max goal achieved: -105.26 mean reward at 588 epoch.



# Training: Model 35
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 3.6521, total_reward: -200.0, max_pos: -0.0525,  in 273.1309 secs
Epoch: 500, mean_loss: 1.5010, total_reward: -175.0, max_pos: 0.5369,  in 204.9385 secs
Epoch: 750, mean_loss: 0.1823, total_reward: -123.0, max_pos: 0.5386,  in 169.3977 secs
Epoch: 1000, mean_loss: 0.1580, total_reward: -135.0, max_pos: 0.5058,  in 176.1023 secs


Max goal achieved: -112.61 mean reward at 714 epoch.



# Training: Model 36
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.6841, total_reward: -155.0, max_pos: 0.5397,  in 223.1463 secs
Epoch: 500, mean_loss: 0.0980, total_reward: -114.0, max_pos: 0.5049,  in 170.5037 secs
Epoch: 750, mean_loss: 0.0846, total_reward: -106.0, max_pos: 0.5180,  in 164.3030 secs
Epoch: 1000, mean_loss: 0.0851, total_reward: -108.0, max_pos: 0.5157,  in 148.1344 secs


First goal achieved: -109.48 mean reward at 524 epoch.
Max goal achieved: -102.06 mean reward at 931 epoch.



# Training: Model 37
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 4.5886, total_reward: -200.0, max_pos: -0.2997,  in 274.3156 secs
Epoch: 500, mean_loss: 1.3963, total_reward: -99.0, max_pos: 0.5140,  in 221.3381 secs
Epoch: 750, mean_loss: 0.1167, total_reward: -104.0, max_pos: 0.5012,  in 180.9376 secs
Epoch: 1000, mean_loss: 0.3341, total_reward: -111.0, max_pos: 0.5218,  in 189.4740 secs


Max goal achieved: -119.60 mean reward at 858 epoch.



# Training: Model 38
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 2.7057, total_reward: -189.0, max_pos: 0.5211,  in 243.2824 secs
Epoch: 500, mean_loss: 0.1961, total_reward: -112.0, max_pos: 0.5003,  in 173.3239 secs
Epoch: 750, mean_loss: 0.0547, total_reward: -108.0, max_pos: 0.5149,  in 148.4334 secs
Epoch: 1000, mean_loss: 0.7353, total_reward: -122.0, max_pos: 0.5088,  in 178.0942 secs


First goal achieved: -109.82 mean reward at 516 epoch.
Max goal achieved: -102.39 mean reward at 648 epoch.



# Training: Model 39
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 2.6428, total_reward: -151.0, max_pos: 0.5369,  in 244.2484 secs
Epoch: 500, mean_loss: 0.3937, total_reward: -91.0, max_pos: 0.5164,  in 185.7556 secs
Epoch: 750, mean_loss: 0.5960, total_reward: -104.0, max_pos: 0.5117,  in 191.7711 secs
Epoch: 1000, mean_loss: 0.6848, total_reward: -108.0, max_pos: 0.5162,  in 165.2275 secs


First goal achieved: -109.89 mean reward at 883 epoch.
Max goal achieved: -106.78 mean reward at 918 epoch.



# Training: Model 40
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 250, mean_loss: 1.0653, total_reward: -197.0, max_pos: 0.5369,  in 246.8239 secs
Epoch: 500, mean_loss: 0.2191, total_reward: -126.0, max_pos: 0.5440,  in 180.1326 secs
Epoch: 750, mean_loss: 0.1431, total_reward: -153.0, max_pos: 0.5086,  in 183.2748 secs
Epoch: 1000, mean_loss: 0.3423, total_reward: -142.0, max_pos: 0.5015,  in 202.7662 secs


First goal achieved: -109.53 mean reward at 442 epoch.
Max goal achieved: -102.61 mean reward at 486 epoch.



if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    date_time = datetime.now().strftime("%d.%m.%Y %H.%M")
    model_dir = os.path.join("models", "Model {}".format(date_time))
    summ_dir = os.path.join("summaries", "{} {}".format(ENV_NAME, date_time))
    log_file = os.path.join(LOG_DIR, "Results {}.txt".format(date_time))
    logger = get_logger(log_file)
    para_gen = parameter_generator(EPS_DECAYS, EXPLORE_EXPLOIT_INTERVALS, TARGET_UPDATE_STEPS, 
                                   EXPLORE_RATIOS, DDQNS, SEEDS)
    for model_i, (seed, init_kwargs, train_kwargs) in enumerate(para_gen, start=1):
        env.seed(int(seed))
        with tf.Session(config=TF_CONFIG) as sess:
            goal_summary = run(sess, env, model_dir, summ_dir, model_i, init_kwargs=init_kwargs,
                               train_kwargs=train_kwargs, plot_result=False)
        tf.compat.v1.reset_default_graph()
        # Log results
        parameter_dict = {"seed":seed, **init_kwargs, **train_kwargs} 
        parameter_str = dict2str(parameter_dict)
        logger.debug("Model {:<2} - {}".format(model_i, parameter_str))
        num_goals, first_goal, max_goal = goal_summary
        first_goal_epoch, first_goal_reward = first_goal 
        max_goal_epoch, max_goal_reward = max_goal
        logger.debug("Goals achieved: {}".format(num_goals))
        if num_goals:
            logger.info("First goal achieved: {:.2f} mean reward at {} epoch.".format(first_goal_reward, first_goal_epoch))
        logger.info("Max goal achieved: {:.2f} mean reward at {} epoch.\n".format(max_goal_reward, max_goal_epoch))