# Env Setup

In [1]:
!pip install -U "ray[rllib]==1.11.0"

Collecting ray[rllib]==1.11.0
  Downloading ray-1.11.0-cp37-cp37m-manylinux2014_x86_64.whl (52.7 MB)
[K     |████████████████████████████████| 52.7 MB 160 kB/s 
Collecting grpcio<=1.43.0,>=1.28.1
  Downloading grpcio-1.43.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 43.3 MB/s 
Collecting redis>=3.5.0
  Downloading redis-4.2.2-py3-none-any.whl (226 kB)
[K     |████████████████████████████████| 226 kB 51.0 MB/s 
Collecting lz4
  Downloading lz4-4.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 45.7 MB/s 
Collecting tensorboardX>=1.9
  Downloading tensorboardX-2.5-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 38.8 MB/s 
Collecting async-timeout>=4.0.2
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting deprecated>=1.2.3
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Installin

In [2]:
# Visualizer setup
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1


In [3]:
import gym
import torch

import numpy as np
from collections import namedtuple
import random, math
import matplotlib.pyplot as plt
from ray.rllib.models import ModelCatalog

from ray.rllib.env.env_context import EnvContext
import time
import os

from gym.spaces import Discrete, Box
from ray import tune
import ray
import ray.rllib.agents.ppo as ppo
import ray.rllib.agents.sac as sac

In [4]:
from google.colab import drive

# Mount Google Drive; the Tune run later in this notebook writes its results
# under this mount point.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


# Visualization

In [5]:
# How to get gym AI visuals in colab
# https://colab.research.google.com/github/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class_12_01_ai_gym.ipynb#scrollTo=T9RpF49oOsZj

from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Start a hidden (visible=0) virtual display of 1400x900 before any env.render() call.
VIRTUAL_SCREEN_SIZE = (1400, 900)
display = Display(size=VIRTUAL_SCREEN_SIZE, visible=0)
display.start()

def show_video():
  """Embed the first recorded episode video inline in the notebook.

  Looks for ``*.mp4`` files in the ``video/`` directory (the directory
  ``wrap_env`` hands to the Monitor wrapper), base64-encodes the first match
  and displays it as an HTML5 ``<video>`` element. Prints
  "Could not find video" when no recording exists.
  """
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode inside a context manager: the handle is closed
    # deterministically and no write permission on the file is required.
    with open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  """Wrap ``env`` in a Monitor that records into ``./video`` and return the wrapper.

  ``force=True`` is passed through to Monitor unchanged.
  """
  return Monitor(env, './video', force=True)

# Test with random actions 

In [6]:
# Use the first CUDA device when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device: ", device)

# Create the environment wrapped with the video-recording Monitor.
game =  "CartPole-v1"
env =  wrap_env(gym.make(game))
print("game: ", game)

# Observation vector length and number of discrete actions.
num_states = env.observation_space.shape[0]
num_actions = env.action_space.n
print("States size:", num_states, "Actions size: ",  num_actions)

device:  cuda:0
game:  CartPole-v1
States size: 4 Actions size:  2


In [7]:
env.reset()

# Play one episode with randomly sampled actions, rendering every step so the
# Monitor wrapper has frames to record.
while True:
    env.render()
    state, reward, done, _ = env.step(env.action_space.sample())
    if done:
        break

# Close the environment, then show the recorded episode.
env.close()
show_video()

# Use Ray Tune for hyperparameter search

In [8]:
import json
def evaluation_fn(result):
    """Return the ``'episode_reward_mean'`` entry of a training result dict."""
    mean_reward = result['episode_reward_mean']
    return mean_reward


def objective_fn(config, checkpoint_dir = "/content/drive/MyDrive/INM707/cartpole_v1"):
    """Tune trainable: train a SAC trainer for up to 10 iterations and report its score.

    Args:
        config: RLlib trainer configuration, passed unchanged to ``sac.SACTrainer``.
        checkpoint_dir: Directory holding a previously written "checkpoint"
            JSON file (``{"step": <int>}``). If the file exists, the loop
            resumes at ``step + 1``; otherwise training starts at iteration 0.

    Each iteration runs one ``trainer.train()`` step, writes the step index
    into a Tune checkpoint directory, and reports ``iterations`` and
    ``mean_reward`` (the result's ``episode_reward_mean``) back to Tune.

    NOTE(review): only the step counter is checkpointed, not the trainer
    weights, so a resumed trial continues the iteration count with a freshly
    initialised policy. Confirm whether that is intended.
    """
    start = 0
    if checkpoint_dir:
        state_path = os.path.join(checkpoint_dir, "checkpoint")
        # Only resume when a checkpoint file is actually there; a directory
        # without one (e.g. the default path on a fresh run) means "start over"
        # instead of raising FileNotFoundError.
        if os.path.isfile(state_path):
            with open(state_path) as f:
                state = json.load(f)
            start = state["step"] + 1

    trainer = sac.SACTrainer(config = config)

    try:
        for i in range(start, 10):
            # Perform one iteration of training the policy with SAC
            result = trainer.train()
            intermediate_score = evaluation_fn(result)

            # A distinct name avoids shadowing the `checkpoint_dir` parameter.
            with tune.checkpoint_dir(step=i) as step_dir:
                path = os.path.join(step_dir, "checkpoint")
                with open(path, "w") as f:
                    json.dump({"step": i}, f)

            # Feed the score back to Tune.
            tune.report(iterations=i, mean_reward=intermediate_score)
    finally:
        # Release the trainer's resources once the loop ends or fails.
        trainer.stop()

## config

In [9]:
# Shut down any Ray instance left over from an earlier run of this cell, then
# start a new one limited to 2 CPUs and 1 GPU.
ray.shutdown()
ray.init(num_gpus=1, num_cpus= 2)

{'address': '172.28.0.2:63104',
 'gcs_address': '172.28.0.2:63104',
 'metrics_export_port': 58045,
 'node_id': '655ed2088c7835f9b34efb4a447867ad44969bcca3ed4c4aa9df2069',
 'node_ip_address': '172.28.0.2',
 'object_store_address': '/tmp/ray/session_2022-04-22_11-19-54_557197_72/sockets/plasma_store',
 'raylet_ip_address': '172.28.0.2',
 'raylet_socket_name': '/tmp/ray/session_2022-04-22_11-19-54_557197_72/sockets/raylet',
 'redis_address': None,
 'session_dir': '/tmp/ray/session_2022-04-22_11-19-54_557197_72',
 'webui_url': None}

In [10]:
# Start from a copy of RLlib's SAC defaults and override the entries below.
config = sac.DEFAULT_CONFIG.copy()

config.update({
    "framework": "torch",
    # The only searched hyperparameter: Tune expands this into one trial per value.
    "train_batch_size": tune.grid_search([32, 64]),
    "target_network_update_freq": 32,  # tune.grid_search([16, 32])
    "env": 'CartPole-v0',
    "gamma": 0.95,  # tune.uniform(0, 1)

    # used from tuned values : https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/sac/cartpole-sac.yaml
    "tau": 1,
    "optimization": {
        "actor_learning_rate": 0.005,
        "critic_learning_rate": 0.005,
        "entropy_learning_rate": 0.0001,
    },
    "no_done_at_end": False,

    "num_gpus": 1,
})

In [11]:
# Run the search: maximise the reported `mean_reward`, stop a trial once it
# reaches 200, and give each trial one GPU. With num_samples=3 the two-value
# grid in `config` is repeated three times (six trials in total).
analysis = tune.run(
    objective_fn,
    config=config,
    metric="mean_reward",
    mode="max",
    stop={"mean_reward": 200},
    num_samples=3,
    resources_per_trial={'gpu': 1},
    local_dir="/content/drive/MyDrive/INM707/cartpole_v1",
)


Trial name,status,loc,train_batch_size
objective_fn_CartPole-v0_31265_00000,RUNNING,172.28.0.2:730,32
objective_fn_CartPole-v0_31265_00001,PENDING,,64
objective_fn_CartPole-v0_31265_00002,PENDING,,32
objective_fn_CartPole-v0_31265_00003,PENDING,,64
objective_fn_CartPole-v0_31265_00004,PENDING,,32
objective_fn_CartPole-v0_31265_00005,PENDING,,64


[2m[36m(objective_fn pid=730)[0m 2022-04-22 11:20:23,278	INFO simple_q.py:155 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(objective_fn pid=730)[0m 2022-04-22 11:20:23,278	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc,train_batch_size
objective_fn_CartPole-v0_31265_00000,RUNNING,172.28.0.2:730,32
objective_fn_CartPole-v0_31265_00001,PENDING,,64
objective_fn_CartPole-v0_31265_00002,PENDING,,32
objective_fn_CartPole-v0_31265_00003,PENDING,,64
objective_fn_CartPole-v0_31265_00004,PENDING,,32
objective_fn_CartPole-v0_31265_00005,PENDING,,64


Trial name,status,loc,train_batch_size
objective_fn_CartPole-v0_31265_00000,RUNNING,172.28.0.2:730,32
objective_fn_CartPole-v0_31265_00001,PENDING,,64
objective_fn_CartPole-v0_31265_00002,PENDING,,32
objective_fn_CartPole-v0_31265_00003,PENDING,,64
objective_fn_CartPole-v0_31265_00004,PENDING,,32
objective_fn_CartPole-v0_31265_00005,PENDING,,64


[2m[36m(objective_fn pid=730)[0m 2022-04-22 11:20:35,420	INFO trainable.py:130 -- Trainable.setup took 12.143 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Trial name,status,loc,train_batch_size
objective_fn_CartPole-v0_31265_00000,RUNNING,172.28.0.2:730,32
objective_fn_CartPole-v0_31265_00001,PENDING,,64
objective_fn_CartPole-v0_31265_00002,PENDING,,32
objective_fn_CartPole-v0_31265_00003,PENDING,,64
objective_fn_CartPole-v0_31265_00004,PENDING,,32
objective_fn_CartPole-v0_31265_00005,PENDING,,64


Result for objective_fn_CartPole-v0_31265_00000:
  date: 2022-04-22_11-20-38
  done: false
  experiment_id: ec91b729e16c48e4ba9b9ab5e639b03f
  hostname: 9b9552d30cc8
  iterations: 0
  iterations_since_restore: 1
  mean_reward: 23.3125
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 31.530434782608697
    ram_util_percent: 26.717391304347835
  pid: 730
  should_checkpoint: true
  time_since_restore: 15.434986352920532
  time_this_iter_s: 15.434986352920532
  time_total_s: 15.434986352920532
  timestamp: 1650626438
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '31265_00000'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00000,RUNNING,172.28.0.2:730,32,2.0,16.837,1.0,22.7714
objective_fn_CartPole-v0_31265_00001,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00002,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,


Result for objective_fn_CartPole-v0_31265_00000:
  date: 2022-04-22_11-20-44
  done: false
  experiment_id: ec91b729e16c48e4ba9b9ab5e639b03f
  hostname: 9b9552d30cc8
  iterations: 4
  iterations_since_restore: 5
  mean_reward: 23.1625
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 53.400000000000006
    ram_util_percent: 32.8
  pid: 730
  should_checkpoint: true
  time_since_restore: 21.04369568824768
  time_this_iter_s: 1.3929686546325684
  time_total_s: 21.04369568824768
  timestamp: 1650626444
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '31265_00000'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00000,RUNNING,172.28.0.2:730,32,6.0,22.4142,5.0,24.0
objective_fn_CartPole-v0_31265_00001,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00002,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,


Result for objective_fn_CartPole-v0_31265_00000:
  date: 2022-04-22_11-20-49
  done: false
  experiment_id: ec91b729e16c48e4ba9b9ab5e639b03f
  hostname: 9b9552d30cc8
  iterations: 8
  iterations_since_restore: 9
  mean_reward: 26.20689655172414
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 55.95
    ram_util_percent: 32.8
  pid: 730
  should_checkpoint: true
  time_since_restore: 26.635223865509033
  time_this_iter_s: 1.4194719791412354
  time_total_s: 26.635223865509033
  timestamp: 1650626449
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: '31265_00000'
  
Result for objective_fn_CartPole-v0_31265_00000:
  date: 2022-04-22_11-20-51
  done: true
  experiment_id: ec91b729e16c48e4ba9b9ab5e639b03f
  experiment_tag: 0_train_batch_size=32
  hostname: 9b9552d30cc8
  iterations: 9
  iterations_since_restore: 10
  mean_reward: 27.079545454545453
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 53.93333333333333
    ram_util_percent: 32.8
  pid: 730
  should_checkpoin

Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00001,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00002,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795


[2m[36m(objective_fn pid=729)[0m 2022-04-22 11:20:57,425	INFO simple_q.py:155 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(objective_fn pid=729)[0m 2022-04-22 11:20:57,425	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00001,RUNNING,172.28.0.2:729,64,,,,
objective_fn_CartPole-v0_31265_00002,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795




Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00001,RUNNING,172.28.0.2:729,64,,,,
objective_fn_CartPole-v0_31265_00002,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795


Result for objective_fn_CartPole-v0_31265_00001:
  date: 2022-04-22_11-21-04
  done: false
  experiment_id: 925fa028b7f5499bad6e3f08af39e672
  hostname: 9b9552d30cc8
  iterations: 0
  iterations_since_restore: 1
  mean_reward: 21.3768115942029
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 53.3090909090909
    ram_util_percent: 27.809090909090912
  pid: 729
  should_checkpoint: true
  time_since_restore: 7.431562185287476
  time_this_iter_s: 7.431562185287476
  time_total_s: 7.431562185287476
  timestamp: 1650626464
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '31265_00001'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00001,RUNNING,172.28.0.2:729,64,3.0,10.3196,2.0,22.4865
objective_fn_CartPole-v0_31265_00002,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795


Result for objective_fn_CartPole-v0_31265_00001:
  date: 2022-04-22_11-21-10
  done: false
  experiment_id: 925fa028b7f5499bad6e3f08af39e672
  hostname: 9b9552d30cc8
  iterations: 4
  iterations_since_restore: 5
  mean_reward: 23.2375
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 53.099999999999994
    ram_util_percent: 32.4
  pid: 729
  should_checkpoint: true
  time_since_restore: 13.232813358306885
  time_this_iter_s: 1.4647431373596191
  time_total_s: 13.232813358306885
  timestamp: 1650626470
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '31265_00001'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00001,RUNNING,172.28.0.2:729,64,6.0,14.7058,5.0,23.7654
objective_fn_CartPole-v0_31265_00002,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795


Result for objective_fn_CartPole-v0_31265_00001:
  date: 2022-04-22_11-21-16
  done: false
  experiment_id: 925fa028b7f5499bad6e3f08af39e672
  hostname: 9b9552d30cc8
  iterations: 8
  iterations_since_restore: 9
  mean_reward: 25.44578313253012
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 62.75
    ram_util_percent: 32.4
  pid: 729
  should_checkpoint: true
  time_since_restore: 19.11180806159973
  time_this_iter_s: 1.4231224060058594
  time_total_s: 19.11180806159973
  timestamp: 1650626476
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: '31265_00001'
  
Result for objective_fn_CartPole-v0_31265_00001:
  date: 2022-04-22_11-21-18
  done: true
  experiment_id: 925fa028b7f5499bad6e3f08af39e672
  experiment_tag: 1_train_batch_size=64
  hostname: 9b9552d30cc8
  iterations: 9
  iterations_since_restore: 10
  mean_reward: 27.523809523809526
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 50.0
    ram_util_percent: 32.4
  pid: 729
  should_checkpoint: true
  time_

Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00002,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00002,RUNNING,172.28.0.2:876,32,,,,
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238


[2m[36m(objective_fn pid=876)[0m 2022-04-22 11:21:25,666	INFO simple_q.py:155 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(objective_fn pid=876)[0m 2022-04-22 11:21:25,666	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00002,RUNNING,172.28.0.2:876,32,,,,
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238




Result for objective_fn_CartPole-v0_31265_00002:
  date: 2022-04-22_11-21-32
  done: false
  experiment_id: 0a2b43fc999d470ea1d8ed8770ab1c72
  hostname: 9b9552d30cc8
  iterations: 0
  iterations_since_restore: 1
  mean_reward: 22.62121212121212
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 56.45454545454545
    ram_util_percent: 27.845454545454547
  pid: 876
  should_checkpoint: true
  time_since_restore: 7.355276107788086
  time_this_iter_s: 7.355276107788086
  time_total_s: 7.355276107788086
  timestamp: 1650626492
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '31265_00002'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00002,RUNNING,172.28.0.2:876,32,2.0,8.80542,1.0,22.0417
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238


Result for objective_fn_CartPole-v0_31265_00002:
  date: 2022-04-22_11-21-38
  done: false
  experiment_id: 0a2b43fc999d470ea1d8ed8770ab1c72
  hostname: 9b9552d30cc8
  iterations: 4
  iterations_since_restore: 5
  mean_reward: 22.08139534883721
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 53.8
    ram_util_percent: 32.3
  pid: 876
  should_checkpoint: true
  time_since_restore: 12.874264240264893
  time_this_iter_s: 1.3385100364685059
  time_total_s: 12.874264240264893
  timestamp: 1650626498
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '31265_00002'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00002,RUNNING,172.28.0.2:876,32,6.0,14.2243,5.0,21.8791
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238


Result for objective_fn_CartPole-v0_31265_00002:
  date: 2022-04-22_11-21-44
  done: false
  experiment_id: 0a2b43fc999d470ea1d8ed8770ab1c72
  hostname: 9b9552d30cc8
  iterations: 8
  iterations_since_restore: 9
  mean_reward: 21.85
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 60.85
    ram_util_percent: 32.3
  pid: 876
  should_checkpoint: true
  time_since_restore: 18.580829858779907
  time_this_iter_s: 1.556041955947876
  time_total_s: 18.580829858779907
  timestamp: 1650626504
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: '31265_00002'
  
Result for objective_fn_CartPole-v0_31265_00002:
  date: 2022-04-22_11-21-45
  done: true
  experiment_id: 0a2b43fc999d470ea1d8ed8770ab1c72
  experiment_tag: 2_train_batch_size=32
  hostname: 9b9552d30cc8
  iterations: 9
  iterations_since_restore: 10
  mean_reward: 21.64
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 57.5
    ram_util_percent: 32.3
  pid: 876
  should_checkpoint: true
  time_since_restore: 19.993245

Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00003,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00003,RUNNING,172.28.0.2:932,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64


[2m[36m(objective_fn pid=932)[0m 2022-04-22 11:21:53,568	INFO simple_q.py:155 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(objective_fn pid=932)[0m 2022-04-22 11:21:53,568	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00003,RUNNING,172.28.0.2:932,64,,,,
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64




Result for objective_fn_CartPole-v0_31265_00003:
  date: 2022-04-22_11-22-01
  done: false
  experiment_id: 0ccaecd8f6304b329fc83ebb5a0ebb36
  hostname: 9b9552d30cc8
  iterations: 0
  iterations_since_restore: 1
  mean_reward: 23.58730158730159
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 35.38181818181818
    ram_util_percent: 27.736363636363638
  pid: 932
  should_checkpoint: true
  time_since_restore: 7.4820942878723145
  time_this_iter_s: 7.4820942878723145
  time_total_s: 7.4820942878723145
  timestamp: 1650626521
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '31265_00003'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00003,RUNNING,172.28.0.2:932,64,2.0,8.98483,1.0,23.6418
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64


Result for objective_fn_CartPole-v0_31265_00003:
  date: 2022-04-22_11-22-06
  done: false
  experiment_id: 0ccaecd8f6304b329fc83ebb5a0ebb36
  hostname: 9b9552d30cc8
  iterations: 4
  iterations_since_restore: 5
  mean_reward: 23.6375
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 25.0
    ram_util_percent: 32.4
  pid: 932
  should_checkpoint: true
  time_since_restore: 13.437954187393188
  time_this_iter_s: 1.4390008449554443
  time_total_s: 13.437954187393188
  timestamp: 1650626526
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '31265_00003'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00003,RUNNING,172.28.0.2:932,64,6.0,14.8751,5.0,23.5238
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64


Result for objective_fn_CartPole-v0_31265_00003:
  date: 2022-04-22_11-22-12
  done: false
  experiment_id: 0ccaecd8f6304b329fc83ebb5a0ebb36
  hostname: 9b9552d30cc8
  iterations: 8
  iterations_since_restore: 9
  mean_reward: 23.885416666666668
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 54.35
    ram_util_percent: 32.4
  pid: 932
  should_checkpoint: true
  time_since_restore: 19.270718097686768
  time_this_iter_s: 1.4814393520355225
  time_total_s: 19.270718097686768
  timestamp: 1650626532
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: '31265_00003'
  
Result for objective_fn_CartPole-v0_31265_00003:
  date: 2022-04-22_11-22-14
  done: true
  experiment_id: 0ccaecd8f6304b329fc83ebb5a0ebb36
  experiment_tag: 3_train_batch_size=64
  hostname: 9b9552d30cc8
  iterations: 9
  iterations_since_restore: 10
  mean_reward: 23.6
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 54.5
    ram_util_percent: 32.4
  pid: 932
  should_checkpoint: true
  time_since_resto

Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00004,PENDING,,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10.0,20.7395,9.0,23.6


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00004,RUNNING,172.28.0.2:987,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10.0,20.7395,9.0,23.6


[2m[36m(objective_fn pid=987)[0m 2022-04-22 11:22:21,690	INFO simple_q.py:155 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(objective_fn pid=987)[0m 2022-04-22 11:22:21,691	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00004,RUNNING,172.28.0.2:987,32,,,,
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10.0,20.7395,9.0,23.6




Result for objective_fn_CartPole-v0_31265_00004:
  date: 2022-04-22_11-22-29
  done: false
  experiment_id: 169748c34ab34b65b91c7e9980ab6cac
  hostname: 9b9552d30cc8
  iterations: 0
  iterations_since_restore: 1
  mean_reward: 23.265625
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 55.66363636363637
    ram_util_percent: 27.700000000000003
  pid: 987
  should_checkpoint: true
  time_since_restore: 7.4664952754974365
  time_this_iter_s: 7.4664952754974365
  time_total_s: 7.4664952754974365
  timestamp: 1650626549
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '31265_00004'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00004,RUNNING,172.28.0.2:987,32,2.0,8.86316,1.0,23.1884
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10.0,20.7395,9.0,23.6


Result for objective_fn_CartPole-v0_31265_00004:
  date: 2022-04-22_11-22-34
  done: false
  experiment_id: 169748c34ab34b65b91c7e9980ab6cac
  hostname: 9b9552d30cc8
  iterations: 4
  iterations_since_restore: 5
  mean_reward: 23.22222222222222
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 53.25
    ram_util_percent: 32.4
  pid: 987
  should_checkpoint: true
  time_since_restore: 13.003074407577515
  time_this_iter_s: 1.3582394123077393
  time_total_s: 13.003074407577515
  timestamp: 1650626554
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '31265_00004'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00004,RUNNING,172.28.0.2:987,32,6.0,14.4382,5.0,24.3659
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10.0,20.7395,9.0,23.6


Result for objective_fn_CartPole-v0_31265_00004:
  date: 2022-04-22_11-22-40
  done: false
  experiment_id: 169748c34ab34b65b91c7e9980ab6cac
  hostname: 9b9552d30cc8
  iterations: 8
  iterations_since_restore: 9
  mean_reward: 26.476744186046513
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 26.6
    ram_util_percent: 32.4
  pid: 987
  should_checkpoint: true
  time_since_restore: 18.63246178627014
  time_this_iter_s: 1.3811180591583252
  time_total_s: 18.63246178627014
  timestamp: 1650626560
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: '31265_00004'
  
Result for objective_fn_CartPole-v0_31265_00004:
  date: 2022-04-22_11-22-41
  done: true
  experiment_id: 169748c34ab34b65b91c7e9980ab6cac
  experiment_tag: 4_train_batch_size=32
  hostname: 9b9552d30cc8
  iterations: 9
  iterations_since_restore: 10
  mean_reward: 27.06896551724138
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 26.25
    ram_util_percent: 32.4
  pid: 987
  should_checkpoint: true
  time_

Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00005,PENDING,,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10.0,20.7395,9.0,23.6
objective_fn_CartPole-v0_31265_00004,TERMINATED,172.28.0.2:987,32,10.0,20.0546,9.0,27.069


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00005,RUNNING,172.28.0.2:1044,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10.0,20.7395,9.0,23.6
objective_fn_CartPole-v0_31265_00004,TERMINATED,172.28.0.2:987,32,10.0,20.0546,9.0,27.069


[2m[36m(objective_fn pid=1044)[0m 2022-04-22 11:22:49,558	INFO simple_q.py:155 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(objective_fn pid=1044)[0m 2022-04-22 11:22:49,558	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00005,RUNNING,172.28.0.2:1044,64,,,,
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10.0,28.091,9.0,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10.0,20.5998,9.0,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10.0,19.9932,9.0,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10.0,20.7395,9.0,23.6
objective_fn_CartPole-v0_31265_00004,TERMINATED,172.28.0.2:987,32,10.0,20.0546,9.0,27.069




Result for objective_fn_CartPole-v0_31265_00005:
  date: 2022-04-22_11-22-56
  done: false
  experiment_id: 8e5d6a4433154388bb0b0b947179691f
  hostname: 9b9552d30cc8
  iterations: 0
  iterations_since_restore: 1
  mean_reward: 23.015384615384615
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 53.58181818181818
    ram_util_percent: 27.8
  pid: 1044
  should_checkpoint: true
  time_since_restore: 7.4021642208099365
  time_this_iter_s: 7.4021642208099365
  time_total_s: 7.4021642208099365
  timestamp: 1650626576
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '31265_00005'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00005,RUNNING,172.28.0.2:1044,64,2,8.87254,1,23.1471
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10,28.091,9,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10,20.5998,9,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10,19.9932,9,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10,20.7395,9,23.6
objective_fn_CartPole-v0_31265_00004,TERMINATED,172.28.0.2:987,32,10,20.0546,9,27.069


Result for objective_fn_CartPole-v0_31265_00005:
  date: 2022-04-22_11-23-02
  done: false
  experiment_id: 8e5d6a4433154388bb0b0b947179691f
  hostname: 9b9552d30cc8
  iterations: 4
  iterations_since_restore: 5
  mean_reward: 24.57894736842105
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 51.9
    ram_util_percent: 32.4
  pid: 1044
  should_checkpoint: true
  time_since_restore: 13.265152931213379
  time_this_iter_s: 1.4505198001861572
  time_total_s: 13.265152931213379
  timestamp: 1650626582
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '31265_00005'
  


Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00005,RUNNING,172.28.0.2:1044,64,6,14.7271,5,24.5789
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10,28.091,9,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10,20.5998,9,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10,19.9932,9,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10,20.7395,9,23.6
objective_fn_CartPole-v0_31265_00004,TERMINATED,172.28.0.2:987,32,10,20.0546,9,27.069


Result for objective_fn_CartPole-v0_31265_00005:
  date: 2022-04-22_11-23-08
  done: false
  experiment_id: 8e5d6a4433154388bb0b0b947179691f
  hostname: 9b9552d30cc8
  iterations: 8
  iterations_since_restore: 9
  mean_reward: 26.987179487179485
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 55.75
    ram_util_percent: 32.4
  pid: 1044
  should_checkpoint: true
  time_since_restore: 19.128239393234253
  time_this_iter_s: 1.485896348953247
  time_total_s: 19.128239393234253
  timestamp: 1650626588
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: '31265_00005'
  
Result for objective_fn_CartPole-v0_31265_00005:
  date: 2022-04-22_11-23-10
  done: true
  experiment_id: 8e5d6a4433154388bb0b0b947179691f
  experiment_tag: 5_train_batch_size=64
  hostname: 9b9552d30cc8
  iterations: 9
  iterations_since_restore: 10
  mean_reward: 29.6
  node_ip: 172.28.0.2
  perf:
    cpu_util_percent: 52.25
    ram_util_percent: 32.4
  pid: 1044
  should_checkpoint: true
  time_since_res

Trial name,status,loc,train_batch_size,iter,total time (s),iterations,mean_reward
objective_fn_CartPole-v0_31265_00000,TERMINATED,172.28.0.2:730,32,10,28.091,9,27.0795
objective_fn_CartPole-v0_31265_00001,TERMINATED,172.28.0.2:729,64,10,20.5998,9,27.5238
objective_fn_CartPole-v0_31265_00002,TERMINATED,172.28.0.2:876,32,10,19.9932,9,21.64
objective_fn_CartPole-v0_31265_00003,TERMINATED,172.28.0.2:932,64,10,20.7395,9,23.6
objective_fn_CartPole-v0_31265_00004,TERMINATED,172.28.0.2:987,32,10,20.0546,9,27.069
objective_fn_CartPole-v0_31265_00005,TERMINATED,172.28.0.2:1044,64,10,20.58,9,29.6


2022-04-22 11:23:10,661	INFO tune.py:639 -- Total run time: 171.64 seconds (170.87 seconds for the tuning loop).


In [12]:
# Display the local filesystem path of the best checkpoint recorded on `analysis`.
analysis.best_checkpoint.local_path

'/content/drive/MyDrive/INM707/cartpole_v1/objective_fn_2022-04-22_11-20-19/objective_fn_CartPole-v0_31265_00005_5_train_batch_size=64_2022-04-22_11-22-17/checkpoint_000009/'

## Save best params in a file

In [13]:
# Save the best hyperparameter configuration to <experiment dir>/best_params.json.
import json
import os
#path = '/content/drive/MyDrive/INM707/cartpole_v1/objective_fn_2022-04-21_20-53-06'

# The checkpoint lives at <experiment>/<trial>/<checkpoint>/, so the experiment
# directory is two levels above it. normpath strips a trailing slash first, so
# the result is the same whether or not local_path ends with "/".
checkpoint_dir = os.path.normpath(analysis.best_checkpoint.local_path)
path = os.path.dirname(os.path.dirname(checkpoint_dir))

fname = 'best_params.json'
full_path = os.path.join(path, fname)

# Serialize the config itself (not its str() repr) so json.load() returns a
# dict directly. default=str stringifies values JSON cannot represent
# (e.g. classes or callables inside an RLlib config).
json_dict = json.dumps(analysis.best_config, indent=2, default=str)

# The context manager closes the file even if the write fails.
with open(full_path, "w") as f:
    f.write(json_dict)


In [14]:
# Take the best configuration reported by the analysis object as the training config.
# NOTE(review): the "Train model on best params" cell later rebinds
# model_train_config to a hand-written SAC config, so if that cell runs, this
# value is not the one passed to tune.run.
model_train_config = analysis.best_config

#Use hyper parameters to train model 

#Load best params to config

In [15]:
# Disabled cell: everything below is wrapped in a string literal, so none of it
# runs; the cell only evaluates to (and displays) the string itself.
# NOTE(review): if re-enabled, this reads best_params.json from a hard-coded
# earlier run directory and rebuilds a dict by calling eval() on the first 74
# characters of the loaded text. eval() on file contents executes arbitrary
# code, and the fixed slice length only fits one particular saved config.
"""
import json
fname = '/content/drive/MyDrive/INM707/cartpole_v1/objective_fn_2022-04-21_20-53-06' + '/' + 'best_params.json'
print(fname)
with open(fname) as json_file:
    config_dict = json.load(json_file)

type(config_dict)
print(config_dict[:74])
temp = config_dict[:74] + '}'
temp
Dict = eval(temp)
Dict
"""

"\nimport json\nfname = '/content/drive/MyDrive/INM707/cartpole_v1/objective_fn_2022-04-21_20-53-06' + '/' + 'best_params.json'\nprint(fname)\nwith open(fname) as json_file:\n    config_dict = json.load(json_file)\n\ntype(config_dict)\nprint(config_dict[:74])\ntemp = config_dict[:74] + '}'\ntemp\nDict = eval(temp)\nDict\n"

In [16]:
# Use the first CUDA GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device: ", device)

device:  cuda:0


In [17]:
# Shut down the current Ray instance, then start a new one configured with
# 2 CPUs and 1 GPU. The value returned by ray.init() is the cell's output.
# NOTE(review): num_gpus=1 is hard-coded; presumably it matches the GPU runtime
# reported by the device check in the previous cell -- confirm before running
# on a CPU-only host.
ray.shutdown()
ray.init(num_cpus= 2, num_gpus=1)

{'address': '172.28.0.2:65139',
 'gcs_address': '172.28.0.2:65139',
 'metrics_export_port': 45907,
 'node_id': '01ad1c272f3b12502817e9b0eecfe816da5503d5e15135c85d8756c3',
 'node_ip_address': '172.28.0.2',
 'object_store_address': '/tmp/ray/session_2022-04-22_11-23-45_238073_72/sockets/plasma_store',
 'raylet_ip_address': '172.28.0.2',
 'raylet_socket_name': '/tmp/ray/session_2022-04-22_11-23-45_238073_72/sockets/raylet',
 'redis_address': None,
 'session_dir': '/tmp/ray/session_2022-04-22_11-23-45_238073_72',
 'webui_url': None}

## Train model on best params

In [None]:
# Build the SAC configuration used for the final CartPole training run:
# start from a copy of RLlib's SAC defaults and override selected keys.
# NOTE(review): these values are hand-written and replace whatever
# model_train_config was bound to earlier (analysis.best_config).
config = sac.DEFAULT_CONFIG.copy()

config.update({
    "framework": "torch",
    # Trailing comments keep the search spaces that were previously attached
    # to these keys, for reference only.
    "train_batch_size": 32,             # tune.grid_search([32, 64])
    "target_network_update_freq": 32,   # tune.grid_search([16, 32])
    "env": 'CartPole-v0',
    "gamma": 0.95,                      # tune.uniform(0, 1)
    # The remaining algorithm settings follow the tuned example at
    # https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/sac/cartpole-sac.yaml
    "tau": 1,
    "optimization": {
        "actor_learning_rate": 0.005,
        "critic_learning_rate": 0.005,
        "entropy_learning_rate": 0.0001,
    },
    "no_done_at_end": False,
    "num_gpus": 1,
})

# Name read by the tune.run cell that follows.
model_train_config = config

In [19]:
# Train SAC with model_train_config until the mean episode reward reaches 100.
# Results and checkpoints are written under the Drive folder given as local_dir.
model_train = tune.run(
    "SAC",
    config=model_train_config,
    stop={"episode_reward_mean": 100},
    local_dir="/content/drive/MyDrive/INM707/cartpole_v1",
    # Periodic checkpoint every 10 training iterations.
    checkpoint_freq=10,
    # Also checkpoint when the trial stops. Without this, the policy that
    # actually met the stop criterion is only saved if the final iteration
    # happens to be a multiple of checkpoint_freq.
    checkpoint_at_end=True,
)

Trial name,status,loc
SAC_CartPole-v0_64065_00000,PENDING,


[2m[36m(SACTrainer pid=1191)[0m 2022-04-22 11:28:57,649	INFO simple_q.py:155 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(SACTrainer pid=1191)[0m 2022-04-22 11:28:57,650	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191




Trial name,status,loc
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 1500
  custom_metrics: {}
  date: 2022-04-22_11-29-05
  done: false
  episode_len_mean: 19.945945945945947
  episode_media: {}
  episode_reward_max: 59.0
  episode_reward_mean: 19.945945945945947
  episode_reward_min: 9.0
  episodes_this_iter: 74
  episodes_total: 74
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 1500
    learner:
      default_policy:
        learner_stats:
          actor_loss: -0.693173885345459
          alpha_loss: 0.0
          alpha_value: 0.9999000430107117
          critic_loss: 1.136523723602295
          log_alpha_value: -9.999928442994133e-05
          max_q: 0.007835425436496735
          mean_q: 3.0429117032326758e-05
          min_q: -0.008325004018843174
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 1.6365234851837158
        td_error:
        - 1.6570167541503906
        - 1.657269

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,4,7.62176,1800,21,74,9,21


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 1900
  custom_metrics: {}
  date: 2022-04-22_11-29-10
  done: false
  episode_len_mean: 21.53409090909091
  episode_media: {}
  episode_reward_max: 74.0
  episode_reward_mean: 21.53409090909091
  episode_reward_min: 9.0
  episodes_this_iter: 3
  episodes_total: 88
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 1896
    learner:
      default_policy:
        learner_stats:
          actor_loss: -15.130887985229492
          alpha_loss: -0.0026242779567837715
          alpha_value: 1.0164546966552734
          critic_loss: 1.1045565605163574
          log_alpha_value: 0.01632075384259224
          max_q: 17.777990341186523
          mean_q: 14.18322467803955
          min_q: 6.03477144241333
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 1.441554069519043
        td_error:
        - 0.9637947082519531
        - 0.4495

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,8,13.2462,2200,23.2308,118,9,23.2308


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 2300
  custom_metrics: {}
  date: 2022-04-22_11-29-16
  done: false
  episode_len_mean: 25.0
  episode_media: {}
  episode_reward_max: 186.0
  episode_reward_mean: 25.0
  episode_reward_min: 9.0
  episodes_this_iter: 1
  episodes_total: 92
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 2292
    learner:
      default_policy:
        learner_stats:
          actor_loss: -23.53980255126953
          alpha_loss: -0.01762784644961357
          alpha_value: 1.0868793725967407
          critic_loss: 0.10412608087062836
          log_alpha_value: 0.08331054449081421
          max_q: 26.501941680908203
          mean_q: 22.418792724609375
          min_q: 13.94131851196289
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.3841220438480377
        td_error:
        - 0.2720518112182617
        - 0.41339588165283203
        - 



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,12,18.9306,2600,26.3617,186,9,26.3617


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 2700
  custom_metrics: {}
  date: 2022-04-22_11-29-22
  done: false
  episode_len_mean: 27.822916666666668
  episode_media: {}
  episode_reward_max: 186.0
  episode_reward_mean: 27.822916666666668
  episode_reward_min: 9.0
  episodes_this_iter: 2
  episodes_total: 96
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 2688
    learner:
      default_policy:
        learner_stats:
          actor_loss: -25.811466217041016
          alpha_loss: -0.028140783309936523
          alpha_value: 1.1466240882873535
          critic_loss: 0.08577572554349899
          log_alpha_value: 0.13682204484939575
          max_q: 28.801040649414062
          mean_q: 24.496112823486328
          min_q: 1.042578101158142
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.3031642436981201
        td_error:
        - 0.1274404525756836
        - 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,16,24.6995,3000,29.1735,186,9,29.1735


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 3100
  custom_metrics: {}
  date: 2022-04-22_11-29-28
  done: false
  episode_len_mean: 30.8989898989899
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 30.8989898989899
  episode_reward_min: 9.0
  episodes_this_iter: 1
  episodes_total: 99
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 3084
    learner:
      default_policy:
        learner_stats:
          actor_loss: -27.769115447998047
          alpha_loss: -0.033786505460739136
          alpha_value: 1.1954078674316406
          critic_loss: 0.5148612260818481
          log_alpha_value: 0.17848742008209229
          max_q: 30.79395866394043
          mean_q: 26.408573150634766
          min_q: -1.8198519945144653
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.8397101163864136
        td_error:
        - 0.24925708770751953
        - 0.4

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,20,30.8218,3400,33.74,200,9,33.74


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 3500
  custom_metrics: {}
  date: 2022-04-22_11-29-34
  done: false
  episode_len_mean: 33.74
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 33.74
  episode_reward_min: 9.0
  episodes_this_iter: 0
  episodes_total: 101
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 3480
    learner:
      default_policy:
        learner_stats:
          actor_loss: -28.280086517333984
          alpha_loss: -0.04742606729269028
          alpha_value: 1.2417259216308594
          critic_loss: 0.19299380481243134
          log_alpha_value: 0.21650227904319763
          max_q: 31.777469635009766
          mean_q: 26.38027000427246
          min_q: -1.6051890850067139
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.4713609218597412
        td_error:
        - 0.5497503280639648
        - 0.305117130279541
       

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,23,35.1507,3700,36.04,200,10,36.04


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 3900
  custom_metrics: {}
  date: 2022-04-22_11-29-40
  done: false
  episode_len_mean: 37.43
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 37.43
  episode_reward_min: 10.0
  episodes_this_iter: 1
  episodes_total: 104
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 3876
    learner:
      default_policy:
        learner_stats:
          actor_loss: -27.158424377441406
          alpha_loss: -0.05506812781095505
          alpha_value: 1.2889723777770996
          critic_loss: 0.31911706924438477
          log_alpha_value: 0.25384533405303955
          max_q: 32.76799011230469
          mean_q: 25.41404151916504
          min_q: -2.055750846862793
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.5557035207748413
        td_error:
        - 0.10335731506347656
        - 0.35408496856689453
     

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,27,40.863,4100,39.54,200,10,39.54


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 4300
  custom_metrics: {}
  date: 2022-04-22_11-29-46
  done: false
  episode_len_mean: 40.8
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 40.8
  episode_reward_min: 10.0
  episodes_this_iter: 1
  episodes_total: 107
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 4272
    learner:
      default_policy:
        learner_stats:
          actor_loss: -29.262386322021484
          alpha_loss: -0.04950116574764252
          alpha_value: 1.338577151298523
          critic_loss: 0.07721477001905441
          log_alpha_value: 0.2916072905063629
          max_q: 32.52165222167969
          mean_q: 27.678775787353516
          min_q: -0.7006803750991821
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.26557132601737976
        td_error:
        - 0.34064483642578125
        - 0.17381954193115234
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,31,46.53,4500,41.98,200,10,41.98


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 4700
  custom_metrics: {}
  date: 2022-04-22_11-29-52
  done: false
  episode_len_mean: 43.76
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 43.76
  episode_reward_min: 10.0
  episodes_this_iter: 0
  episodes_total: 109
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 4668
    learner:
      default_policy:
        learner_stats:
          actor_loss: -31.03997802734375
          alpha_loss: -0.05389728769659996
          alpha_value: 1.3898886442184448
          critic_loss: 0.10213753581047058
          log_alpha_value: 0.3292236626148224
          max_q: 33.89621353149414
          mean_q: 29.403568267822266
          min_q: 0.3479779064655304
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.3447718322277069
        td_error:
        - 0.5214138031005859
        - 0.04577445983886719
       

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,35,52.3026,4900,46.21,200,10,46.21


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 5100
  custom_metrics: {}
  date: 2022-04-22_11-29-58
  done: false
  episode_len_mean: 47.69
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 47.69
  episode_reward_min: 10.0
  episodes_this_iter: 0
  episodes_total: 112
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 5097
    learner:
      default_policy:
        learner_stats:
          actor_loss: -30.546188354492188
          alpha_loss: -0.04619600623846054
          alpha_value: 1.439666986465454
          critic_loss: 0.37244075536727905
          log_alpha_value: 0.36441177129745483
          max_q: 33.92542266845703
          mean_q: 29.19916343688965
          min_q: -6.883475303649902
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.6432874798774719
        td_error:
        - 2.7196872234344482
        - 1.3237552642822266
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,39,57.9987,5300,49.98,200,10,49.98


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 5500
  custom_metrics: {}
  date: 2022-04-22_11-30-03
  done: false
  episode_len_mean: 51.88
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 51.88
  episode_reward_min: 10.0
  episodes_this_iter: 1
  episodes_total: 115
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 5493
    learner:
      default_policy:
        learner_stats:
          actor_loss: -31.533496856689453
          alpha_loss: -0.05966329574584961
          alpha_value: 1.4908634424209595
          critic_loss: 0.12645795941352844
          log_alpha_value: 0.3993554413318634
          max_q: 34.438255310058594
          mean_q: 29.854110717773438
          min_q: -3.0694546699523926
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.3285748362541199
        td_error:
        - 0.426788330078125
        - 0.2347412109375
        -

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,43,63.6548,5700,53.74,200,10,53.74


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 5900
  custom_metrics: {}
  date: 2022-04-22_11-30-09
  done: false
  episode_len_mean: 55.49
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 55.49
  episode_reward_min: 10.0
  episodes_this_iter: 1
  episodes_total: 117
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 5889
    learner:
      default_policy:
        learner_stats:
          actor_loss: -32.24004364013672
          alpha_loss: -0.057980868965387344
          alpha_value: 1.5439108610153198
          critic_loss: 0.15970872342586517
          log_alpha_value: 0.43431875109672546
          max_q: 35.297706604003906
          mean_q: 30.534988403320312
          min_q: 0.6766139268875122
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.4544411897659302
        td_error:
        - 0.6729755401611328
        - 0.20105934143066406
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,47,69.4129,6100,57.35,200,10,57.35


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 6300
  custom_metrics: {}
  date: 2022-04-22_11-30-15
  done: false
  episode_len_mean: 58.9
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 58.9
  episode_reward_min: 10.0
  episodes_this_iter: 1
  episodes_total: 119
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 6285
    learner:
      default_policy:
        learner_stats:
          actor_loss: -33.281246185302734
          alpha_loss: -0.07927262037992477
          alpha_value: 1.5980865955352783
          critic_loss: 0.18126851320266724
          log_alpha_value: 0.4688071012496948
          max_q: 36.60114288330078
          mean_q: 31.4232120513916
          min_q: -0.47088325023651123
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.475597083568573
        td_error:
        - 0.21914291381835938
        - 0.36392784118652344
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,51,75.0938,6500,60.12,200,10,60.12


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 6700
  custom_metrics: {}
  date: 2022-04-22_11-30-21
  done: false
  episode_len_mean: 61.99
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 61.99
  episode_reward_min: 10.0
  episodes_this_iter: 0
  episodes_total: 121
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 6681
    learner:
      default_policy:
        learner_stats:
          actor_loss: -34.16441345214844
          alpha_loss: -0.06779667735099792
          alpha_value: 1.656657338142395
          critic_loss: 0.16031304001808167
          log_alpha_value: 0.5048019289970398
          max_q: 36.413089752197266
          mean_q: 32.273075103759766
          min_q: 3.7523717880249023
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.43103519082069397
        td_error:
        - 0.20372390747070312
        - 0.18954086303710938
     

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,55,80.8422,6900,63.81,200,10,63.81


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 7100
  custom_metrics: {}
  date: 2022-04-22_11-30-27
  done: false
  episode_len_mean: 65.71
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 65.71
  episode_reward_min: 10.0
  episodes_this_iter: 0
  episodes_total: 123
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 7077
    learner:
      default_policy:
        learner_stats:
          actor_loss: -34.69989013671875
          alpha_loss: -0.0801135003566742
          alpha_value: 1.7183386087417603
          critic_loss: 0.11272567510604858
          log_alpha_value: 0.5413578748703003
          max_q: 38.356689453125
          mean_q: 32.88288879394531
          min_q: 0.0748869776725769
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.3697460889816284
        td_error:
        - 0.4449748992919922
        - 0.10638809204101562
        - 0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,59,86.603,7300,67.41,200,10,67.41


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 7500
  custom_metrics: {}
  date: 2022-04-22_11-30-33
  done: false
  episode_len_mean: 69.94
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 69.94
  episode_reward_min: 10.0
  episodes_this_iter: 1
  episodes_total: 126
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 7473
    learner:
      default_policy:
        learner_stats:
          actor_loss: -33.70117950439453
          alpha_loss: -0.0948144868016243
          alpha_value: 1.7808464765548706
          critic_loss: 0.1109597235918045
          log_alpha_value: 0.5770887732505798
          max_q: 38.70362854003906
          mean_q: 31.604780197143555
          min_q: -0.6113134622573853
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.32219019532203674
        td_error:
        - 0.3938426971435547
        - 0.06513214111328125
       

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,63,92.312,7700,70.95,200,10,70.95


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 7900
  custom_metrics: {}
  date: 2022-04-22_11-30-39
  done: false
  episode_len_mean: 72.49
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 72.49
  episode_reward_min: 10.0
  episodes_this_iter: 0
  episodes_total: 128
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 7869
    learner:
      default_policy:
        learner_stats:
          actor_loss: -35.555686950683594
          alpha_loss: -0.06471273303031921
          alpha_value: 1.8444453477859497
          critic_loss: 0.09306246042251587
          log_alpha_value: 0.6121786236763
          max_q: 39.11838150024414
          mean_q: 33.64411926269531
          min_q: 0.1460491120815277
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.3186926245689392
        td_error:
        - 0.22791481018066406
        - 0.1588115692138672
        - 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,67,98.0386,8100,74.11,200,10,74.11


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 8300
  custom_metrics: {}
  date: 2022-04-22_11-30-44
  done: false
  episode_len_mean: 75.52
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 75.52
  episode_reward_min: 10.0
  episodes_this_iter: 0
  episodes_total: 130
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 8298
    learner:
      default_policy:
        learner_stats:
          actor_loss: -36.47197341918945
          alpha_loss: -0.08248919993638992
          alpha_value: 1.9155776500701904
          critic_loss: 0.08165724575519562
          log_alpha_value: 0.6500191688537598
          max_q: 39.39724349975586
          mean_q: 34.30792236328125
          min_q: 6.887767791748047
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.2769050598144531
        td_error:
        - 0.07639312744140625
        - 0.7818593978881836
        -

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,71,103.726,8500,77.4,200,10,77.4


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 8700
  custom_metrics: {}
  date: 2022-04-22_11-30-50
  done: false
  episode_len_mean: 78.64
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 78.64
  episode_reward_min: 10.0
  episodes_this_iter: 0
  episodes_total: 132
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 8694
    learner:
      default_policy:
        learner_stats:
          actor_loss: -37.407806396484375
          alpha_loss: -0.06378810107707977
          alpha_value: 1.9833673238754272
          critic_loss: 0.5300891995429993
          log_alpha_value: 0.6847960948944092
          max_q: 40.250038146972656
          mean_q: 35.65925598144531
          min_q: -2.036884069442749
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.7896920442581177
        td_error:
        - 0.9185676574707031
        - 0.5241184234619141
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,75,109.377,8900,81.82,200,10,81.82


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 9100
  custom_metrics: {}
  date: 2022-04-22_11-30-56
  done: false
  episode_len_mean: 83.42
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 83.42
  episode_reward_min: 10.0
  episodes_this_iter: 1
  episodes_total: 135
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 9090
    learner:
      default_policy:
        learner_stats:
          actor_loss: -38.40892791748047
          alpha_loss: -0.07201479375362396
          alpha_value: 2.058306932449341
          critic_loss: 0.0343947596848011
          log_alpha_value: 0.7218837141990662
          max_q: 40.57490158081055
          mean_q: 36.47382354736328
          min_q: -1.2646217346191406
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.16920554637908936
        td_error:
        - 0.14851951599121094
        - 0.06881141662597656
       

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,79,115.043,9300,85.28,200,10,85.28


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 9500
  custom_metrics: {}
  date: 2022-04-22_11-31-02
  done: false
  episode_len_mean: 86.51
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 86.51
  episode_reward_min: 11.0
  episodes_this_iter: 1
  episodes_total: 138
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 9486
    learner:
      default_policy:
        learner_stats:
          actor_loss: -37.301971435546875
          alpha_loss: -0.11750349402427673
          alpha_value: 2.1343541145324707
          critic_loss: 0.15111777186393738
          log_alpha_value: 0.7581640481948853
          max_q: 41.931453704833984
          mean_q: 35.30315399169922
          min_q: -0.1788294017314911
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.36939048767089844
        td_error:
        - 0.26288414001464844
        - 0.16121864318847656
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,83,120.651,9700,87.78,200,11,87.78


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 9900
  custom_metrics: {}
  date: 2022-04-22_11-31-08
  done: false
  episode_len_mean: 90.37
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 90.37
  episode_reward_min: 11.0
  episodes_this_iter: 1
  episodes_total: 141
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 9882
    learner:
      default_policy:
        learner_stats:
          actor_loss: -41.701717376708984
          alpha_loss: -0.051272615790367126
          alpha_value: 2.2138099670410156
          critic_loss: 0.0477629154920578
          log_alpha_value: 0.7947149872779846
          max_q: 43.40660858154297
          mean_q: 39.88456726074219
          min_q: 20.588151931762695
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.24158506095409393
        td_error:
        - 0.12551498413085938
        - 0.2086029052734375
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,87,126.467,10100,91.63,200,11,91.63


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 10300
  custom_metrics: {}
  date: 2022-04-22_11-31-13
  done: false
  episode_len_mean: 92.72
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 92.72
  episode_reward_min: 11.0
  episodes_this_iter: 0
  episodes_total: 143
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 10278
    learner:
      default_policy:
        learner_stats:
          actor_loss: -40.07612609863281
          alpha_loss: -0.09876126796007156
          alpha_value: 2.295388698577881
          critic_loss: 0.049409784376621246
          log_alpha_value: 0.8309021592140198
          max_q: 44.29780578613281
          mean_q: 38.06761932373047
          min_q: 1.5398759841918945
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.23932169377803802
        td_error:
        - 0.34998130798339844
        - 0.3790702819824219
     

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,91,132.186,10500,94.54,200,11,94.54


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 10700
  custom_metrics: {}
  date: 2022-04-22_11-31-19
  done: false
  episode_len_mean: 96.09
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 96.09
  episode_reward_min: 11.0
  episodes_this_iter: 0
  episodes_total: 145
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 10674
    learner:
      default_policy:
        learner_stats:
          actor_loss: -43.51719284057617
          alpha_loss: -0.0368126705288887
          alpha_value: 2.3763272762298584
          critic_loss: 0.0766201913356781
          log_alpha_value: 0.8655561804771423
          max_q: 44.65904235839844
          mean_q: 41.723411560058594
          min_q: 25.230871200561523
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.3284885883331299
        td_error:
        - 0.40296173095703125
        - 0.4847278594970703
       

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,RUNNING,172.28.0.2:1191,95,137.836,10900,97.82,200,11,97.82


Result for SAC_CartPole-v0_64065_00000:
  agent_timesteps_total: 11100
  custom_metrics: {}
  date: 2022-04-22_11-31-25
  done: false
  episode_len_mean: 99.7
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 99.7
  episode_reward_min: 11.0
  episodes_this_iter: 0
  episodes_total: 147
  experiment_id: b1f7de8926e041bea0e209f890048691
  hostname: 9b9552d30cc8
  info:
    last_target_update_ts: 11070
    learner:
      default_policy:
        learner_stats:
          actor_loss: -42.99762725830078
          alpha_loss: -0.0908115953207016
          alpha_value: 2.4699864387512207
          critic_loss: 0.09571187198162079
          log_alpha_value: 0.9042126536369324
          max_q: 46.209293365478516
          mean_q: 40.43885803222656
          min_q: 0.9439122676849365
          policy_t: 0.5
          target_entropy: 0.6792842149734497
        mean_td_error: 0.29957807064056396
        td_error:
        - 0.493255615234375
        - 0.2512226104736328
        -

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_CartPole-v0_64065_00000,TERMINATED,172.28.0.2:1191,98,142.222,11200,101.55,200,11,101.55


2022-04-22 11:31:27,686	INFO tune.py:639 -- Total run time: 153.79 seconds (153.30 seconds for the tuning loop).


# Load trained model 

In [20]:
# Checkpoint written at iteration 90 of the SAC CartPole-v0 Tune trial
# 64065_00000, stored on the mounted Google Drive.
checkpoint_file = "/content/drive/MyDrive/INM707/cartpole_v1/SAC/SAC_CartPole-v0_64065_00000_0_2022-04-22_11-28-54/checkpoint_000090/checkpoint-90"
# Alternative checkpoint (iteration 270) from a run dated 2022-04-21, kept for reference:
# "/content/drive/MyDrive/INM707/cartpole_v1/SAC/SAC_CartPole-v0_4e787_00000_0_2022-04-21_22-49-31/checkpoint_000270/checkpoint-270"
# Build a fresh SAC trainer. `model_train_config` comes from an earlier cell
# (presumably the same config the checkpoint was trained with — TODO confirm).
agent = sac.SACTrainer(config=model_train_config)
# Load the saved trainer state into `agent`; the rollout cell below uses it for inference.
agent.restore(checkpoint_file)

2022-04-22 11:33:44,340	INFO simple_q.py:155 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
2022-04-22 11:33:44,342	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2022-04-22 11:33:51,591	INFO trainable.py:496 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/INM707/cartpole_v1/SAC/SAC_CartPole-v0_64065_00000_0_2022-04-22_11-28-54/checkpoint_000090/checkpoint-90
2022-04-22 11:33:51,593	INFO trainable.py:503 -- Current state after restoring: {'_iteration': 90, '_timesteps_total': 5760, '_time_total': 130.7810823917389, '_episodes_total': 144}


In [None]:
import gym

# Roll out a single CartPole-v0 episode with the restored SAC agent and show
# the recorded video. `wrap_env` and `show_video` are notebook helpers defined
# in an earlier cell; `agent` is the trainer restored from the checkpoint.
env = wrap_env(gym.make('CartPole-v0'))
total_reward = 0
done = False

try:
    state = env.reset()
    while not done:
        # Act with the trained policy (not a random sample) so the episode
        # actually evaluates the restored agent.
        action = agent.compute_single_action(state)

        # Gym step API: (observation, reward, done, info).
        state, reward, done, _ = env.step(action)
        total_reward += reward
finally:
    # Always close the environment, even if the rollout raises.
    env.close()

show_video()

In [None]:
# Report the cumulative reward accumulated in `total_reward` by the rollout cell.
print("Total reward:", total_reward)