In [1]:
import os
import sys
import gym
import numpy as np
import configparser
from datetime import datetime
# Import Tensorflow
import tensorflow as tf
import tensorflow.compat.v1 as tf_v1

## Insert the module root directory into sys.path

In [2]:
# Resolve the parent of the current working directory and put it first on the
# import path, so that the project packages `run` and `src` imported below can
# be found.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, ROOT_DIR)

from run import MountainCar_v0
from src import ReplayBuffer
from src.Utils import get_logger, eval_dict_values, parameter_generator

The agent is trained with different combinations of the following parameters:

In [3]:
# Seed(s) for the training runs. The draw is not itself seeded, so the values
# differ between executions of this cell unless NumPy's global RNG was seeded
# beforehand.
SEEDS = np.random.randint(100, 1000, size=1, dtype=np.uint16)
# Candidate values for the swept parameters, keyed by keyword-argument name.
# Both dictionaries are handed to `parameter_generator` together with SEEDS.
LOG_INIT_KWARGS = {"eps_decay": np.linspace(0.1, 0.5, num=2, dtype=np.float16),
                   "ddqn": [True]}
# NOTE: with num=1, linspace yields only the start value, so
# "explore_exploit_interval" currently holds the single candidate 10.
LOG_TRAIN_KWARGS = {"target_update_steps": [10],
                    "explore_exploit_interval": np.linspace(10, 40, num=1, dtype=np.uint16),
                    "explore_ratio": [0.25]}

## Directories

In [4]:
# Sub-directory names used when building the output and config paths.
LOG_DIR, SUMM_DIR, CONFIG_DIR = "log", "summaries", "config"
# Gym environment id.
ENV_NAME = "MountainCar-v0"
# TensorFlow (v1 API) session configuration: request half of the GPU memory
# for this process and enable soft device placement.
TF_CONFIG = tf_v1.ConfigProto(
    allow_soft_placement=True,
    gpu_options=tf_v1.GPUOptions(per_process_gpu_memory_fraction=0.5),
)

## Parameters

In [5]:
# --- Run options ---
epochs = 1000
# 0 leaves the environment unwrapped; a value > 0 makes the training cell wrap
# it with the video-recording Monitor.
record_interval = 0
test_model_chkpt = None    # Address to a trained model checkpoint
plot_result = False        # Plots the result in matplotlib

# --- Output / config locations ---
# Timestamp that makes the summary directory and log file unique per run.
date_time = datetime.now().strftime("%d.%m.%Y %H.%M")
# Paths are anchored at ROOT_DIR: the root directory has to be given
# explicitly when running inside a Jupyter notebook.
config_file = os.path.join(ROOT_DIR, CONFIG_DIR, "MountainCar-v0.ini")
summ_dir = os.path.join(ROOT_DIR, SUMM_DIR, "{} {}".format(ENV_NAME, date_time))
log_file = os.path.join(summ_dir, LOG_DIR, "Results {} {}.log".format(ENV_NAME, date_time))
# Create the directory that will hold the log file (and, implicitly, summ_dir).
os.makedirs(os.path.join(summ_dir, LOG_DIR), exist_ok=True)

## Load configuration from .ini file

In [6]:
# Read the .ini file through the project helper. The result is indexed below by
# the keys "kwargs" and "others".
config_dict = MountainCar_v0.get_configuration(config_file)
# "kwargs" unpacks into exactly two objects, which are later forwarded to
# MountainCar_v0.run as `init_kwargs` and `train_kwargs`.
# NOTE(review): presumably agent-constructor and training keyword arguments -- confirm.
init_kwargs, train_kwargs = config_dict["kwargs"]
# Size passed to ReplayBuffer when the training cell creates the buffer.
mem_size = config_dict["others"]["mem_size"]

## Train agent with different parameter combinations

In [None]:
# Train one model per parameter combination produced by `parameter_generator`.
# Setup logger
logger = get_logger(log_file)
# Create environment and replay buffer
env = gym.make(ENV_NAME)
try:
    # NOTE(review): this single buffer instance is passed to every run below;
    # whether it is cleared between models depends on MountainCar_v0.run -- confirm.
    mem = ReplayBuffer(mem_size)
    if record_interval > 0:
        # Wrap environment with Monitor wrapper to record videos; an episode is
        # recorded when its index is a multiple of record_interval.
        env = gym.wrappers.Monitor(env, os.path.join(summ_dir, "videos"), force=True,
                                   video_callable=lambda epoch: epoch % record_interval == 0)
    para_gen = parameter_generator(SEEDS, LOG_INIT_KWARGS, LOG_TRAIN_KWARGS)
    # model_i numbers the models starting from 1.
    for model_i, (seed, log_init_kwargs, log_train_kwargs) in enumerate(para_gen, start=1):
        # Run the program
        MountainCar_v0.run(env, seed, mem, logger, summ_dir, epochs, init_kwargs, train_kwargs,
                           log_init_kwargs, log_train_kwargs, plot_result, model_i=model_i,
                           sess_config=TF_CONFIG, test_model_chkpt=test_model_chkpt)
finally:
    # Always release the environment (and the Monitor wrapper, if any), even
    # when training fails or the cell is interrupted part-way through a long run.
    env.close()

W0918 20:34:26.090476  6096 deprecation.py:506] From c:\users\raj k\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0918 20:34:26.159483  6096 deprecation.py:323] From c:\users\raj k\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\losses\losses_impl.py:121: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



# Training: Model 1
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 100, mean_loss: 2.2852, total_reward: -131.0, max_pos: 0.5131, in 83.0421 secs
Epoch: 200, mean_loss: 1.0264, total_reward: -141.0, max_pos: 0.5161, in 67.2969 secs
Epoch: 300, mean_loss: 0.5350, total_reward: -105.0, max_pos: 0.5041, in 63.1733 secs
Epoch: 400, mean_loss: 0.6702, total_reward: -108.0, max_pos: 0.5199, in 57.2305 secs
Epoch: 500, mean_loss: 0.0757, total_reward: -92.0, max_pos: 0.5048, in 56.5376 secs
Epoch: 600, mean_loss: 0.0733, total_reward: -85.0, max_pos: 0.5183, in 55.4536 secs
Epoch: 700, mean_loss: 0.0672, total_reward: -85.0, max_pos: 0.5061, in 61.8077 secs
Epoch: 800, mean_loss: 0.0756, total_reward: -91.0, max_pos: 0.5141, in 62.9434 secs
Epoch: 900, mean_loss: 0.0868, total_reward: -89.0, max_pos: 0.5030, in 63.6945 secs
Epoch: 1000, mean_loss: 0.1804, total_reward: -87.0, max_pos: 0.5105, in 55.2383 secs
                                                  

Goals achieved: 297                 
First goal achieved: -109.98 mean reward at 361 epoch.
Max goal achieved: -102.52 mean reward at 1000 epoch.




# Training: Model 2
Goal: Get average reward of -110.00 over 100 consecutive trials!
Epoch: 100, mean_loss: 3.0779, total_reward: -200.0, max_pos: -0.3309, in 103.0931 secs
Epoch: 200, mean_loss: 2.1854, total_reward: -175.0, max_pos: 0.5369, in 85.0755 secs
Epoch: 300, mean_loss: 1.1283, total_reward: -103.0, max_pos: 0.5197, in 74.9167 secs
Epoch: 400, mean_loss: 0.8847, total_reward: -119.0, max_pos: 0.5224, in 65.6842 secs
Epoch: 500, mean_loss: 0.5534, total_reward: -89.0, max_pos: 0.5104, in 59.7909 secs
Epoch: 600, mean_loss: 0.1124, total_reward: -85.0, max_pos: 0.5190, in 55.0037 secs
Epoch: 700, mean_loss: 0.1832, total_reward: -85.0, max_pos: 0.5052, in 65.0482 secs
Epoch: 800, mean_loss: 0.2949, total_reward: -143.0, max_pos: 0.5070, in 69.8976 secs
Epoch: 900, mean_loss: 0.2101, total_reward: -87.0, max_pos: 0.5019, in 63.7277 secs
Training agent. Please be patient...