In [1]:
import os
import sys
import gym
import numpy as np
import configparser
from datetime import datetime
# Import Tensorflow
import tensorflow as tf
import tensorflow.compat.v1 as tf_v1

## Insert the module root directory into sys.path

In [2]:
# The project root is taken to be the parent of the current working directory.
# It is placed at the front of sys.path ahead of the project imports that follow.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, ROOT_DIR)
from run import MountainCar_v0
from src import ReplayBuffer
from src.Utils import get_logger, eval_dict_values, parameter_generator

The agent is trained with different combinations of the following parameters:

In [3]:
# NOTE: the bare string below is an earlier, disabled parameter sweep. As a
# string expression it has no effect when the cell runs.
"""DDQNS = [True]
EPS_DECAYS = np.linspace(0.1, 0.5, num=2, dtype=np.float16)
EXPLORE_EXPLOIT_INTERVALS = np.linspace(10, 40, num=1, dtype=np.uint16)
TARGET_UPDATE_STEPS = [10]#np.linspace(5, 20, num=4, dtype=np.uint16)
EXPLORE_RATIOS = [0.25]#np.linspace(0.1, 0.5, num=3, dtype=np.float16)"""

# A single random seed drawn from [100, 1000); not fixed, so it differs per run.
SEEDS = np.random.randint(low=100, high=1000, size=1, dtype=np.uint16)

# Candidate values per parameter; each entry is a list/array of options that is
# handed to parameter_generator together with SEEDS.
LOG_INIT_KWARGS = dict(
    eps_decay=[0.25],
    ddqn=[True],
)
LOG_TRAIN_KWARGS = dict(
    target_update_steps=[10],
    # Four evenly spaced values: 10, 20, 30, 40
    explore_exploit_interval=np.linspace(start=10, stop=40, num=4, dtype=np.uint16),
    explore_ratio=[0.25],
)

## Directories

In [4]:
# Directory names used when building the summary, log and config paths.
LOG_DIR, SUMM_DIR, CONFIG_DIR = "log", "summaries", "config"
# Gym environment id; also used to name the summary folder and the .ini config file.
ENV_NAME = "LunarLander-v2"
# Session configuration handed to MountainCar_v0.run as `sess_config`:
# per-process GPU memory fraction set to 0.5, with soft device placement enabled.
TF_CONFIG = tf_v1.ConfigProto(
    allow_soft_placement=True,
    gpu_options=tf_v1.GPUOptions(per_process_gpu_memory_fraction=0.5),
)

## Parameters

In [5]:
# --- Run switches (all forwarded to the training call) ---
epochs = 1000
record_interval = 0                        # > 0 wraps the env in a video-recording Monitor
plot_result = False                        # Plots the result in matplotlib
test_model_chkpt = None                    # Address to a trained model checkpoint

# --- Output locations, stamped with the start time of this run ---
date_time = datetime.now().strftime("%d.%m.%Y %H.%M")
# Root directory needed to be specified explicitly in Jupyter Notebook
summ_dir = os.path.join(
    ROOT_DIR,
    SUMM_DIR,
    "{} {}".format(ENV_NAME, date_time),
)
log_file = os.path.join(
    summ_dir,
    LOG_DIR,
    "Results {} {}.log".format(ENV_NAME, date_time),
)
config_file = os.path.join(
    ROOT_DIR,
    CONFIG_DIR,
    "{}.ini".format(ENV_NAME),
)
# Create the log directory (and the summary directory above it) up front.
os.makedirs(os.path.dirname(log_file), exist_ok=True)

## Load configuration from .ini file

In [6]:
# Read the environment's .ini file into a dictionary.
config_dict = MountainCar_v0.get_configuration(config_file)
# Size passed to the ReplayBuffer constructor.
mem_size = config_dict["others"]["mem_size"]
# The "kwargs" entry unpacks into exactly two values: init and train kwargs.
(init_kwargs, train_kwargs) = config_dict["kwargs"]

## Train agent with different parameter combinations

In [7]:
# Train one model per parameter combination produced by parameter_generator.
# Setup logger
logger = get_logger(log_file)
# Create environment and replay buffer
env = gym.make(ENV_NAME)
try:
    mem = ReplayBuffer(mem_size)
    if record_interval > 0:
        # Wrap environment with Monitor wrapper to record videos
        env = gym.wrappers.Monitor(env, os.path.join(summ_dir, "videos"), force=True,
                                   video_callable=lambda epoch: not epoch%record_interval)
    # NOTE(review): the same `env` and `mem` objects are passed to every model.
    # Whether `run` resets the replay buffer between models is not visible here;
    # confirm that later models do not start from an earlier model's experience.
    para_gen = parameter_generator(SEEDS, LOG_INIT_KWARGS, LOG_TRAIN_KWARGS)
    for model_i, (seed, log_init_kwargs, log_train_kwargs) in enumerate(para_gen, start=1):
        # Run the program
        MountainCar_v0.run(env, seed, mem, logger, summ_dir, epochs, init_kwargs, train_kwargs,
                           log_init_kwargs, log_train_kwargs, plot_result, model_i=model_i,
                           sess_config=TF_CONFIG, test_model_chkpt=test_model_chkpt)
finally:
    # Always release the environment (and the Monitor wrapper, if one was applied),
    # even when a run raises or the cell is interrupted.
    env.close()

W0923 19:47:09.783090  6888 deprecation.py:506] From c:\users\raj k\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0923 19:47:09.868094  6888 deprecation.py:323] From c:\users\raj k\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\losses\losses_impl.py:121: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



# Training: Model 1
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_loss: 239.5242, total_reward: -282.22843337812526, max_pos: 0.0108, in 48.4483 secs
Epoch: 200, mean_loss: 16.4514, total_reward: -151.4818954855577, max_pos: 0.4961, in 55.3526 secs
Epoch: 300, mean_loss: 10.8775, total_reward: -126.65004294687489, max_pos: 0.6367, in 111.3849 secs
Epoch: 400, mean_loss: 1.9833, total_reward: -24.229315551885318, max_pos: 0.1509, in 461.1107 secs
Epoch: 500, mean_loss: 4.1738, total_reward: -78.8606688250731, max_pos: -0.0156, in 253.9051 secs
Epoch: 600, mean_loss: 3.7480, total_reward: 233.16731599812883, max_pos: 0.2051, in 358.5374 secs
Epoch: 700, mean_loss: 2.9127, total_reward: 267.8876050473191, max_pos: -0.0113, in 278.3118 secs
Epoch: 800, mean_loss: 3.2299, total_reward: 276.1420637411585, max_pos: 0.2553, in 207.4290 secs
Epoch: 900, mean_loss: 4.7402, total_reward: -9.67748345662254, max_pos: 0.1752, in 241.1294 secs
Epoch: 1000, mean_los

Goals achieved: 0                   
Max goal achieved: 193.69 mean reward at 706 epoch.




# Training: Model 2
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_loss: 2.3062, total_reward: 259.65591778013294, max_pos: 0.1159, in 374.4870 secs
Epoch: 200, mean_loss: 2.7862, total_reward: 256.65895414028785, max_pos: 0.1121, in 222.9860 secs
Epoch: 300, mean_loss: 3.0576, total_reward: -213.62351627864464, max_pos: 1.0085, in 202.1362 secs
Epoch: 400, mean_loss: 2.3793, total_reward: 289.7979008394663, max_pos: 0.2824, in 167.3108 secs
Epoch: 500, mean_loss: 3.3807, total_reward: 261.26436221447204, max_pos: 0.2551, in 177.8115 secs
Epoch: 600, mean_loss: 5.6725, total_reward: 245.18988444253742, max_pos: -0.0115, in 181.2866 secs
Epoch: 700, mean_loss: 16.1822, total_reward: 232.47315798195683, max_pos: -0.0031, in 198.7743 secs
Epoch: 800, mean_loss: 4.8523, total_reward: 279.91183421028563, max_pos: -0.0094, in 180.2717 secs
Epoch: 900, mean_loss: 4.6107, total_reward: -23.226158115952856, max_pos: 0.2724, in 196.2639 secs
Epoch: 1000, mean_l

Goals achieved: 180                 
First goal achieved: 200.51 mean reward at 416 epoch.
Max goal achieved: 225.35 mean reward at 889 epoch.




# Training: Model 3
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_loss: 3.8022, total_reward: 216.27409477108577, max_pos: 0.2909, in 281.2083 secs
Epoch: 200, mean_loss: 3.7025, total_reward: -1.6766581571069992, max_pos: -0.0116, in 253.4242 secs
Epoch: 300, mean_loss: 4.0811, total_reward: 14.680357645346675, max_pos: 0.0913, in 231.2380 secs
Epoch: 400, mean_loss: 3.2200, total_reward: 259.6470335739588, max_pos: 0.0725, in 170.9964 secs
Epoch: 500, mean_loss: 9.0626, total_reward: 2.9509543291324434, max_pos: 0.4272, in 144.7686 secs
Epoch: 600, mean_loss: 4.4533, total_reward: 280.13076761265427, max_pos: 0.2607, in 169.9572 secs
Epoch: 700, mean_loss: 4.0695, total_reward: 267.3314366435777, max_pos: 0.0713, in 160.8366 secs
Epoch: 800, mean_loss: 4.6023, total_reward: 266.0288157933463, max_pos: 0.1489, in 200.3412 secs
Epoch: 900, mean_loss: 590.4313, total_reward: -532.2342748875957, max_pos: 1.0009, in 112.0710 secs
Epoch: 1000, mean_loss:

Goals achieved: 129                 
First goal achieved: 203.52 mean reward at 576 epoch.
Max goal achieved: 228.39 mean reward at 591 epoch.




# Training: Model 4
Goal: Get average reward of 200.00 over 100 consecutive trials!
Epoch: 100, mean_loss: 3.7568, total_reward: 233.2728108638775, max_pos: 0.2318, in 275.6173 secs
Epoch: 200, mean_loss: 6.9572, total_reward: 274.8320874202782, max_pos: -0.0094, in 185.4408 secs
Epoch: 300, mean_loss: 6.4433, total_reward: 253.25315875605784, max_pos: 0.0913, in 199.2541 secs
Epoch: 400, mean_loss: 3.5313, total_reward: 263.69593056773044, max_pos: 0.1313, in 192.8135 secs
Epoch: 500, mean_loss: 3.3126, total_reward: 258.4125882483935, max_pos: 0.5170, in 181.8703 secs
Epoch: 600, mean_loss: 4.7623, total_reward: 267.5559501910953, max_pos: -0.0011, in 163.3849 secs
Epoch: 700, mean_loss: 4.1168, total_reward: 269.90417802828676, max_pos: 0.2061, in 145.1792 secs
Epoch: 800, mean_loss: 5.5852, total_reward: 8.164281709849291, max_pos: 0.2140, in 156.9217 secs
Epoch: 900, mean_loss: 5.5782, total_reward: 290.1668602428755, max_pos: 0.2944, in 156.0644 secs
Epoch: 1000, mean_loss: 7.68

Goals achieved: 316                 
First goal achieved: 201.63 mean reward at 470 epoch.
Max goal achieved: 234.33 mean reward at 903 epoch.

