# APEX - Mountain car

## Import libraries, start Ray and create the agent

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gym
import ray
import time

#from ray.rllib.ppo import PPOAgent, DEFAULT_CONFIG
#from ray.rllib.a3c import A3CAgent, DEFAULT_CONFIG
#from ray.rllib.dqn import DQNAgent, DEFAULT_CONFIG

from ray.rllib.dqn.apex import ApexAgent, APEX_DEFAULT_CONFIG
#from ray.rllib.ddpg.apex import ApexDDPGAgent, APEX_DDPG_DEFAULT_CONFIG # no API so far


# Bring up a local Ray runtime with 8 CPUs, 1 GPU and a 1 GiB object store.
ray.init(
    num_workers=0,
    num_cpus=8,
    num_gpus=1,
    object_store_memory=1024 ** 3,  # 1 GiB, i.e. 1073741824 bytes
)

# Fixed pause between ray.init() and building the agent
# (presumably lets the Ray processes finish starting -- TODO confirm it is needed).
time.sleep(3)

# Start from a copy of the Ape-X defaults so the library-level dict is not
# modified, then apply the Ape-X DQN overrides used for this experiment.
config = APEX_DEFAULT_CONFIG.copy()

apex_dqn_overrides = (
    ("lr", 2e-4),
    ("num_workers", 4),
    ("buffer_size", 50000),
    ("learning_starts", 500),
    ("train_batch_size", 128),
    ("target_network_update_freq", 5000),
)
for option, setting in apex_dqn_overrides:
    config[option] = setting

# Build the Ape-X agent for MountainCar-v0.
# Earlier experiments swapped in PPOAgent / A3CAgent / DQNAgent (with their own
# DEFAULT_CONFIG) and the 'CartPole-v0' environment using this same pattern.
agent = ApexAgent(config, 'MountainCar-v0')

Process STDOUT and STDERR is being redirected to /tmp/raylogs/.
Waiting for redis server at 127.0.0.1:55878 to respond...
Waiting for redis server at 127.0.0.1:30080 to respond...
Starting local scheduler with the following resources: {'CPU': 8, 'GPU': 1}.

View the web UI at http://localhost:8889/notebooks/ray_ui98800.ipynb?token=5b6a7fa6354bc850611e9e6c3c58694f4488a7b905d1cea3

Created LogSyncer for /root/ray_results/2018-07-02_11-39-349uiosgsf -> None
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Observation shape is (2,)
Not using any observation preprocessor.


## Configure Redis memory limits

In [2]:
# Create a Redis client just for config Redis.
import redis

# Ray's primary Redis client is reachable through the global worker. Its
# 'RedisShards' list is read here as "host:port" strings, one per shard.
r_primary = ray.worker.global_worker.redis_client

shard_addresses = r_primary.lrange('RedisShards', 0, -1)
if not shard_addresses:
    raise RuntimeError("No entries found in 'RedisShards'; has ray.init() been run?")

# Parse the port from the first shard's address itself instead of from the
# printed representation of the whole list. The old str(...)-based parsing
# only worked for exactly one shard and depended on the list's repr format.
first_shard = shard_addresses[0]
if isinstance(first_shard, bytes):
    first_shard = first_shard.decode("ascii")
running_port = int(first_shard.rsplit(":", 1)[1])
print ("Using port: {}".format(running_port))

# Dedicated client used only to change that shard's configuration.
redis_client1 = redis.StrictRedis(host="127.0.0.1", port=running_port)

# Cap the shard at 1 GiB and evict with the allkeys-lru policy,
# sampling 3 keys per eviction.
redis_client1.config_set("maxmemory", "1024mb")  # 1024 MB (1 GiB) limit
redis_client1.config_set("maxmemory-policy", "allkeys-lru")
redis_client1.config_set("maxmemory-samples", "3")

# A second shard, if one exists, can be configured the same way through
# another StrictRedis client pointed at its port.

# Read the settings back to confirm they were applied.
for setting_name in ("maxmemory", "maxmemory-policy", "maxmemory-samples"):
    print(redis_client1.config_get(setting_name))

Using port: 30080
{'maxmemory': '1073741824'}
{'maxmemory-policy': 'allkeys-lru'}
{'maxmemory-samples': '3'}


## Repeat training loop

In [3]:
import time
from datetime import datetime

print ("Current Time: {}".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

# time_list holds one wall-clock timestamp taken before the loop plus one taken
# right after every agent.train() call. time_delta_list holds the gap between
# consecutive timestamps, so it has exactly one entry per finished iteration.
time_list = [time.time()]
time_delta_list = []

num_iters = 500      # number of agent.train() calls
save_chkpt = False   # set True to checkpoint every 10 iterations and at the end

for i in range(num_iters):
    result = agent.train()

    # Record the timestamp immediately after training so the delta describes
    # THIS iteration. Previously the delta was computed before the timestamp
    # was appended, so each line reported the previous iteration's duration
    # and the last iteration was never recorded.
    time_list.append(time.time())
    time_delta = time_list[-1] - time_list[-2]
    time_delta_list.append(time_delta)

    if i == 0:
        # Dump the full first result once to show every available field.
        print(result)
        print("\n")
    else:
        print("[ {} / {} ] iteration, loop time: {:.2f}, Avg. Reward: {}".format(
            i + 1, num_iters, time_delta, result.episode_reward_mean))

    # Save a checkpoint every 10 training iterations.
    # When enabled, the save time is counted in the next iteration's delta.
    if save_chkpt and (i + 1) % 10 == 0:
        checkpoint = agent.save()
        print("checkpoint saved at", checkpoint)


print ("Current Time: {}".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
print("Training done.\n")

# Final checkpoint so the last state is kept whatever num_iters is.
if save_chkpt:
    checkpoint = agent.save()
    print("checkpoint saved at", checkpoint)

Current Time: 2018-07-02 11:40:42
TrainingResult(timesteps_total=25050, done=None, info={'min_exploration': 0.0006553600000000003, 'max_exploration': 0.4, 'num_target_updates': 17, 'num_steps_trained': 92032, 'num_steps_sampled': 25050, 'sample_throughput': 3812.055, 'train_throughput': 16514.994, 'num_weight_syncs': 61}, episode_reward_mean=-200.0, episode_len_mean=200.0, episodes_total=134, mean_accuracy=None, mean_validation_accuracy=None, mean_loss=None, neg_mean_loss=None, experiment_id='a3fa9ed9ee0e4f479067ec71baa604f6', training_iteration=1, timesteps_this_iter=25050, time_this_iter_s=6.713785171508789, time_total_s=6.713785171508789, pid=4142, date='2018-07-02_11-40-49', timestamp=1530502849, hostname='09a1146da8fb', node_ip='172.17.0.2', config={'dueling': True, 'double_q': True, 'hiddens': [256], 'n_step': 3, 'model': {}, 'gamma': 0.99, 'env_config': {}, 'schedule_max_timesteps': 100000, 'timesteps_per_iteration': 25000, 'exploration_fraction': 0.1, 'exploration_final_eps': 0

[ 101 / 500 ] iteration, loop time: 6.42, Avg. Reward: -111.61
[ 102 / 500 ] iteration, loop time: 6.70, Avg. Reward: -109.92
[ 103 / 500 ] iteration, loop time: 6.39, Avg. Reward: -110.23
[ 104 / 500 ] iteration, loop time: 6.46, Avg. Reward: -112.68
[ 105 / 500 ] iteration, loop time: 6.49, Avg. Reward: -111.01
[ 106 / 500 ] iteration, loop time: 6.53, Avg. Reward: -109.95
[ 107 / 500 ] iteration, loop time: 6.51, Avg. Reward: -106.63
[ 108 / 500 ] iteration, loop time: 6.53, Avg. Reward: -108.78
[ 109 / 500 ] iteration, loop time: 6.36, Avg. Reward: -112.51
[ 110 / 500 ] iteration, loop time: 6.37, Avg. Reward: -113.8
[ 111 / 500 ] iteration, loop time: 6.38, Avg. Reward: -110.11
[ 112 / 500 ] iteration, loop time: 6.51, Avg. Reward: -109.35
[ 113 / 500 ] iteration, loop time: 6.49, Avg. Reward: -110.11
[ 114 / 500 ] iteration, loop time: 6.42, Avg. Reward: -114.45
[ 115 / 500 ] iteration, loop time: 6.50, Avg. Reward: -115.72
[ 116 / 500 ] iteration, loop time: 6.49, Avg. Reward: -

[ 232 / 500 ] iteration, loop time: 6.51, Avg. Reward: -112.43
[ 233 / 500 ] iteration, loop time: 6.56, Avg. Reward: -110.07
[ 234 / 500 ] iteration, loop time: 6.48, Avg. Reward: -110.84
[ 235 / 500 ] iteration, loop time: 6.45, Avg. Reward: -111.26
[ 236 / 500 ] iteration, loop time: 6.47, Avg. Reward: -109.15
[ 237 / 500 ] iteration, loop time: 6.55, Avg. Reward: -110.04
[ 238 / 500 ] iteration, loop time: 6.64, Avg. Reward: -109.79
[ 239 / 500 ] iteration, loop time: 6.35, Avg. Reward: -109.09
[ 240 / 500 ] iteration, loop time: 6.46, Avg. Reward: -108.82
[ 241 / 500 ] iteration, loop time: 6.48, Avg. Reward: -110.65
[ 242 / 500 ] iteration, loop time: 6.44, Avg. Reward: -111.87
[ 243 / 500 ] iteration, loop time: 6.46, Avg. Reward: -111.42
[ 244 / 500 ] iteration, loop time: 6.54, Avg. Reward: -110.4
[ 245 / 500 ] iteration, loop time: 6.39, Avg. Reward: -108.01
[ 246 / 500 ] iteration, loop time: 6.50, Avg. Reward: -108.81


KeyboardInterrupt: 

## Reload saved model

In [3]:
# Build the evaluation agent from its own copy of the training configuration,
# so later edits to `config` do not affect it. The copy was previously created
# but never used; the agent was constructed from `config` directly.
trained_config = config.copy()

# The agent must be created for the same environment the checkpoint was
# trained on ('CartPole-v0' was used in earlier experiments).
test_agent = ApexAgent(trained_config, 'MountainCar-v0')

# Checkpoint written by an earlier training run; point this at your own
# agent.save() output when re-running the notebook.
checkpoint = '/root/ray_results/2018-06-29_02-55-05mez6ivqi/checkpoint-1560'
test_agent.restore(checkpoint)

Created LogSyncer for /root/ray_results/2018-06-29_10-18-54wws10o3e -> None
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Observation shape is (2,)
Not using any observation preprocessor.
INFO:tensorflow:Restoring parameters from /root/ray_results/2018-06-29_02-55-05mez6ivqi/checkpoint-1560


## Play in the gym env

In [9]:
# Roll out one episode with the restored agent and print the total reward.
# ('CartPole-v0' can be substituted here if the agent was trained on it.)
env = gym.make('MountainCar-v0')

# try/finally guarantees the environment is closed even if compute_action()
# or step() raises part-way through the episode.
try:
    state = env.reset()
    done = False
    cumulative_reward = 0

    while not done:
        # Uncomment to watch the episode: env.render()
        action = test_agent.compute_action(state)
        state, reward, done, _ = env.step(action)
        cumulative_reward += reward

    print(cumulative_reward)
finally:
    env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
-87.0
