# Challenge 1 : Cart Pole

## Imports :

In [1]:
import os

# Reinforcement Learning
import gym

# Distributed Learning
import ray
from ray.rllib.agents.ppo import PPOTrainer

# Display
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from IPython.display import Video

# Optional
import warnings
warnings.filterwarnings('ignore')

## Context :

In [2]:
# Instantiate the CartPole-v1 environment used by the exploratory cells below.
env = gym.make("CartPole-v1")

In [3]:
# Display the action space; the recorded cell output is Discrete(2).
env.action_space

Discrete(2)

In [4]:
# Start a new episode and display what reset() returns
# (a 4-element float32 array in the recorded output).
env.reset()

array([-0.00986755,  0.02455244,  0.03689659,  0.00625606], dtype=float32)

## Random Action :

In [5]:
def check_video_folder_sanity(path, video_name):
    """Prepare the output location for a video recording.

    Creates ``path`` when it does not exist yet and deletes any stale
    ``<video_name>.mp4`` / ``<video_name>.meta.json`` files found there, so
    that a new recording never collides with files from a previous run.

    Args:
        path: Directory that will hold the video. A trailing separator is
            optional; an empty string means the current working directory.
        video_name: File name of the video, without extension.

    Returns:
        ``path`` joined with ``video_name`` (no extension appended).
    """
    # os.path.join only inserts a separator when one is missing, so both
    # "videos/cart_pole" and "videos/cart_pole/" resolve to the same file.
    video_path = os.path.join(path, video_name)

    # os.makedirs("") raises FileNotFoundError; an empty path needs no folder.
    if path:
        os.makedirs(path, exist_ok=True)

    for extension in (".mp4", ".meta.json"):
        stale_file = video_path + extension
        if os.path.exists(stale_file):
            os.remove(stale_file)
    return video_path

In [8]:
# Record a short video of an untrained agent that picks actions at random.
video_name = "before_training"
path = "videos/cart_pole/"
random_seed = 42

video_path = check_video_folder_sanity(path, video_name)

env = gym.make("CartPole-v1")
# Only the action sampler is seeded here; env.reset() below is called
# without a seed.
env.action_space.seed(random_seed)
before_video = VideoRecorder(env, video_path + ".mp4", enabled=video_name is not None)

try:
    env.reset()
    for i in range(200):
        env.render()
        before_video.capture_frame()
        observation, reward, done, info = env.step(env.action_space.sample())
        if done:
            # Stepping an environment after the episode has ended is
            # undefined behaviour in gym; start a fresh episode so the
            # remaining frames stay meaningful.
            env.reset()
finally:
    # Always finalise the video file and release the environment, even if
    # rendering or stepping fails part-way through.
    before_video.close()
    env.close()

In [9]:
# Embed the recorded .mp4 in the notebook output via IPython.display.Video.
Video(video_path + ".mp4")

## Train an agent :

In [10]:
# Show how many logical CPUs the OS reports (12 in the recorded output).
os.cpu_count()

12

In [13]:
# (Re)start a local Ray instance so that re-running this cell always begins
# from a clean runtime.
ray.shutdown()
ray.init(
    num_cpus=4,
    include_dashboard=False,
    ignore_reinit_error=True,
    log_to_driver=False,
)

# PPO settings: environment, deep-learning backend and policy network.
config = {
    "env": "CartPole-v1",
    # "tf" selects TensorFlow, "torch" selects PyTorch.
    "framework": "torch",
    "model": {"fcnet_hiddens": [32], "fcnet_activation": "linear"},
}

# Training ends once the mean episode reward reaches this threshold.
stop = {"episode_reward_mean": 475}

# Launch training; a checkpoint is written when the trial finishes and any
# trial error is re-raised immediately.
analysis = ray.tune.run(
    "PPO",
    config=config,
    stop=stop,
    checkpoint_at_end=True,
    fail_fast="raise",
)

Trial name,status,loc
PPO_CartPole-v1_6e22b_00000,PENDING,


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2022-07-23_20-01-22
  done: false
  episode_len_mean: 22.49425287356322
  episode_media: {}
  episode_reward_max: 60.0
  episode_reward_mean: 22.49425287356322
  episode_reward_min: 9.0
  episodes_this_iter: 174
  episodes_total: 174
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.68302429568383
          entropy_coeff: 0.0
          kl: 0.01001952803287165
          policy_loss: -0.011653771173329122
          total_loss: 253.44734116420952
          vf_explained_var:
          - -0.0007393888081423938
          vf_loss: 253.45699172481415
        model: {}
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4000
    nu

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,1,3.83488,4000,22.4943,60,9,22.4943


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2022-07-23_20-01-30
  done: false
  episode_len_mean: 40.39
  episode_media: {}
  episode_reward_max: 147.0
  episode_reward_mean: 40.39
  episode_reward_min: 10.0
  episodes_this_iter: 98
  episodes_total: 413
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6355716730958672
          entropy_coeff: 0.0
          kl: 0.005707622491848895
          policy_loss: -0.016026466732384056
          total_loss: 653.9133229573567
          vf_explained_var:
          - -0.0005345064564608037
          vf_loss: 653.9282075779413
        model: {}
    num_agent_steps_sampled: 12000
    num_agent_steps_trained: 12000
    num_steps_sampled: 12

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,3,11.5605,12000,40.39,147,10,40.39


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 20000
  custom_metrics: {}
  date: 2022-07-23_20-01-38
  done: false
  episode_len_mean: 78.86
  episode_media: {}
  episode_reward_max: 257.0
  episode_reward_mean: 78.86
  episode_reward_min: 14.0
  episodes_this_iter: 39
  episodes_total: 513
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.10000000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5948357629519637
          entropy_coeff: 0.0
          kl: 0.0025950966748144815
          policy_loss: -0.011967614183943438
          total_loss: 1423.2395595099335
          vf_explained_var:
          - 0.0007547324057668447
          vf_loss: 1423.2512715329406
        model: {}
    num_agent_steps_sampled: 20000
    num_agent_steps_trained: 20000
    num_steps_sampled: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,5,19.1648,20000,78.86,257,14,78.86


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 28000
  custom_metrics: {}
  date: 2022-07-23_20-01-45
  done: false
  episode_len_mean: 130.05
  episode_media: {}
  episode_reward_max: 359.0
  episode_reward_mean: 130.05
  episode_reward_min: 16.0
  episodes_this_iter: 23
  episodes_total: 558
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.025000000000000005
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5895098480486101
          entropy_coeff: 0.0
          kl: 0.003539758422457492
          policy_loss: -0.011443574891816223
          total_loss: 2005.898864155431
          vf_explained_var:
          - -0.04575718939304352
          vf_loss: 2005.9102147256174
        model: {}
    num_agent_steps_sampled: 28000
    num_agent_steps_trained: 28000
    num_steps_sampled: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,7,26.6462,28000,130.05,359,16,130.05


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 36000
  custom_metrics: {}
  date: 2022-07-23_20-01-53
  done: false
  episode_len_mean: 174.75
  episode_media: {}
  episode_reward_max: 410.0
  episode_reward_mean: 174.75
  episode_reward_min: 21.0
  episodes_this_iter: 17
  episodes_total: 598
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.006250000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5856178797701354
          entropy_coeff: 0.0
          kl: 0.0005694413540360578
          policy_loss: -0.010125574309338806
          total_loss: 2034.7880684801328
          vf_explained_var:
          - -0.01390852965414524
          vf_loss: 2034.7981919155327
        model: {}
    num_agent_steps_sampled: 36000
    num_agent_steps_trained: 36000
    num_steps_sampled

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,9,34.2185,36000,174.75,410,21,174.75


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 44000
  custom_metrics: {}
  date: 2022-07-23_20-02-01
  done: false
  episode_len_mean: 210.69
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 210.69
  episode_reward_min: 41.0
  episodes_this_iter: 12
  episodes_total: 627
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0015625000000000003
          cur_lr: 5.0000000000000016e-05
          entropy: 0.567845380754881
          entropy_coeff: 0.0
          kl: 0.0013675496000399015
          policy_loss: -0.011337515876017591
          total_loss: 2306.22373210948
          vf_explained_var:
          - 0.022543713450431824
          vf_loss: 2306.2350606938844
        model: {}
    num_agent_steps_sampled: 44000
    num_agent_steps_trained: 44000
    num_steps_sampled: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,11,41.7598,44000,210.69,500,41,210.69


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 52000
  custom_metrics: {}
  date: 2022-07-23_20-02-08
  done: false
  episode_len_mean: 251.27
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 251.27
  episode_reward_min: 41.0
  episodes_this_iter: 15
  episodes_total: 653
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0003906250000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.572436648158617
          entropy_coeff: 0.0
          kl: 0.0038534951624086947
          policy_loss: -0.010867985549272709
          total_loss: 1767.7551137288413
          vf_explained_var:
          - -0.012719376012682915
          vf_loss: 1767.7659789054624
        model: {}
    num_agent_steps_sampled: 52000
    num_agent_steps_trained: 52000
    num_steps_sample

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,13,49.3525,52000,251.27,500,41,251.27


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 60000
  custom_metrics: {}
  date: 2022-07-23_20-02-16
  done: false
  episode_len_mean: 282.83
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 282.83
  episode_reward_min: 76.0
  episodes_this_iter: 11
  episodes_total: 679
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 9.765625000000002e-05
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5490572565345354
          entropy_coeff: 0.0
          kl: 0.0034496347788371953
          policy_loss: -0.013337810403637348
          total_loss: 1920.6875665808236
          vf_explained_var:
          - -0.03204629197716713
          vf_loss: 1920.700900498257
        model: {}
    num_agent_steps_sampled: 60000
    num_agent_steps_trained: 60000
    num_steps_sampled

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,15,57.1474,60000,282.83,500,76,282.83


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 68000
  custom_metrics: {}
  date: 2022-07-23_20-02-24
  done: false
  episode_len_mean: 308.4
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 308.4
  episode_reward_min: 117.0
  episodes_this_iter: 15
  episodes_total: 703
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.4414062500000005e-05
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5534907524944633
          entropy_coeff: 0.0
          kl: 0.0024791394974940715
          policy_loss: -0.01072845068930458
          total_loss: 1446.6880124656102
          vf_explained_var:
          - -0.02498968318104744
          vf_loss: 1446.6987338814683
        model: {}
    num_agent_steps_sampled: 68000
    num_agent_steps_trained: 68000
    num_steps_sampled

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,17,64.8407,68000,308.4,500,117,308.4


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 76000
  custom_metrics: {}
  date: 2022-07-23_20-02-32
  done: false
  episode_len_mean: 337.48
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 337.48
  episode_reward_min: 117.0
  episodes_this_iter: 9
  episodes_total: 722
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 6.103515625000001e-06
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5295290557927983
          entropy_coeff: 0.0
          kl: 0.0011084930878813989
          policy_loss: -0.009904616856847398
          total_loss: 1521.6032169465095
          vf_explained_var:
          - -0.04113409295678139
          vf_loss: 1521.6131323701593
        model: {}
    num_agent_steps_sampled: 76000
    num_agent_steps_trained: 76000
    num_steps_sample

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,19,72.5299,76000,337.48,500,117,337.48


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 84000
  custom_metrics: {}
  date: 2022-07-23_20-02-40
  done: false
  episode_len_mean: 353.22
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 353.22
  episode_reward_min: 117.0
  episodes_this_iter: 9
  episodes_total: 741
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.5258789062500003e-06
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5318886826435725
          entropy_coeff: 0.0
          kl: 0.0007342701386350035
          policy_loss: -0.007886218454849015
          total_loss: 1287.9996888519615
          vf_explained_var:
          - -0.06459195166826248
          vf_loss: 1288.0075689500377
        model: {}
    num_agent_steps_sampled: 84000
    num_agent_steps_trained: 84000
    num_steps_sampl

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,21,80.4922,84000,353.22,500,117,353.22


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 92000
  custom_metrics: {}
  date: 2022-07-23_20-02-48
  done: false
  episode_len_mean: 384.04
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 384.04
  episode_reward_min: 117.0
  episodes_this_iter: 8
  episodes_total: 758
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.814697265625001e-07
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5362070259227547
          entropy_coeff: 0.0
          kl: 0.0006244662457598532
          policy_loss: -0.007197979413053041
          total_loss: 1129.1043415684853
          vf_explained_var:
          - 0.03205464780330658
          vf_loss: 1129.1115376134073
        model: {}
    num_agent_steps_sampled: 92000
    num_agent_steps_trained: 92000
    num_steps_sampled

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,23,88.9653,92000,384.04,500,117,384.04


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 100000
  custom_metrics: {}
  date: 2022-07-23_20-02-57
  done: false
  episode_len_mean: 406.34
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 406.34
  episode_reward_min: 117.0
  episodes_this_iter: 10
  episodes_total: 777
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 9.536743164062502e-08
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5370557866109315
          entropy_coeff: 0.0
          kl: 0.0017703727857074855
          policy_loss: -0.0065581404954515475
          total_loss: 1013.0510807488554
          vf_explained_var:
          - -0.2734878957271576
          vf_loss: 1013.0576368106309
        model: {}
    num_agent_steps_sampled: 100000
    num_agent_steps_trained: 100000
    num_steps_sa

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,25,97.7007,100000,406.34,500,117,406.34


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 108000
  custom_metrics: {}
  date: 2022-07-23_20-03-06
  done: false
  episode_len_mean: 422.34
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 422.34
  episode_reward_min: 149.0
  episodes_this_iter: 8
  episodes_total: 793
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.3841857910156255e-08
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5294694326898103
          entropy_coeff: 0.0
          kl: 0.0006232533626120386
          policy_loss: -0.005478058094459195
          total_loss: 831.1262467251029
          vf_explained_var:
          - -0.1372043341398239
          vf_loss: 831.1317274442283
        model: {}
    num_agent_steps_sampled: 108000
    num_agent_steps_trained: 108000
    num_steps_sampl

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,27,106.935,108000,422.34,500,149,422.34


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 116000
  custom_metrics: {}
  date: 2022-07-23_20-03-14
  done: false
  episode_len_mean: 436.42
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 436.42
  episode_reward_min: 169.0
  episodes_this_iter: 11
  episodes_total: 814
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 5.960464477539064e-09
          cur_lr: 5.0000000000000016e-05
          entropy: 0.524110984481791
          entropy_coeff: 0.0
          kl: 0.0008735424991992372
          policy_loss: -0.0038363754168473266
          total_loss: 761.9864349037089
          vf_explained_var:
          - -0.2087595909833908
          vf_loss: 761.9902716359785
        model: {}
    num_agent_steps_sampled: 116000
    num_agent_steps_trained: 116000
    num_steps_sampl

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,29,115.067,116000,436.42,500,169,436.42


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 124000
  custom_metrics: {}
  date: 2022-07-23_20-03-22
  done: false
  episode_len_mean: 428.71
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 428.71
  episode_reward_min: 130.0
  episodes_this_iter: 13
  episodes_total: 835
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.490116119384766e-09
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5139690210101425
          entropy_coeff: 0.0
          kl: 0.000624396397560504
          policy_loss: -0.0008645909207482492
          total_loss: 742.7697849806918
          vf_explained_var:
          - -0.2515738606452942
          vf_loss: 742.7706527463852
        model: {}
    num_agent_steps_sampled: 124000
    num_agent_steps_trained: 124000
    num_steps_sampl

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,31,122.99,124000,428.71,500,130,428.71


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 132000
  custom_metrics: {}
  date: 2022-07-23_20-03-31
  done: false
  episode_len_mean: 424.71
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 424.71
  episode_reward_min: 110.0
  episodes_this_iter: 9
  episodes_total: 853
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.725290298461915e-10
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5157842105434787
          entropy_coeff: 0.0
          kl: 0.002063405198760814
          policy_loss: -0.004870772938073803
          total_loss: 514.9122452479537
          vf_explained_var:
          - -0.10558952391147614
          vf_loss: 514.9171175884944
        model: {}
    num_agent_steps_sampled: 132000
    num_agent_steps_trained: 132000
    num_steps_sample

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,33,131.133,132000,424.71,500,110,424.71


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 140000
  custom_metrics: {}
  date: 2022-07-23_20-03-39
  done: false
  episode_len_mean: 417.92
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 417.92
  episode_reward_min: 110.0
  episodes_this_iter: 10
  episodes_total: 872
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 9.313225746154787e-11
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5134287555051106
          entropy_coeff: 0.0
          kl: 0.002227113507155357
          policy_loss: -0.003021643873584527
          total_loss: 536.1781577981928
          vf_explained_var:
          - -0.14695963263511658
          vf_loss: 536.1811797029229
        model: {}
    num_agent_steps_sampled: 140000
    num_agent_steps_trained: 140000
    num_steps_sampl

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,35,139.026,140000,417.92,500,110,417.92


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 148000
  custom_metrics: {}
  date: 2022-07-23_20-03-47
  done: false
  episode_len_mean: 417.03
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 417.03
  episode_reward_min: 110.0
  episodes_this_iter: 10
  episodes_total: 890
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.3283064365386967e-11
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5037030619639222
          entropy_coeff: 0.0
          kl: 0.004103962145961061
          policy_loss: -0.005808253586292267
          total_loss: 509.7463887655607
          vf_explained_var:
          - -0.25097405910491943
          vf_loss: 509.75219681852604
        model: {}
    num_agent_steps_sampled: 148000
    num_agent_steps_trained: 148000
    num_steps_sam

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,37,147.099,148000,417.03,500,110,417.03


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 156000
  custom_metrics: {}
  date: 2022-07-23_20-03-55
  done: false
  episode_len_mean: 427.44
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 427.44
  episode_reward_min: 110.0
  episodes_this_iter: 8
  episodes_total: 906
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 5.820766091346742e-12
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5171368691869961
          entropy_coeff: 0.0
          kl: 0.001407139072591957
          policy_loss: -0.0013227623316549486
          total_loss: 461.7417781337615
          vf_explained_var:
          - -0.055260807275772095
          vf_loss: 461.7431020223966
        model: {}
    num_agent_steps_sampled: 156000
    num_agent_steps_trained: 156000
    num_steps_samp

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,39,155.536,156000,427.44,500,110,427.44


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 164000
  custom_metrics: {}
  date: 2022-07-23_20-04-04
  done: false
  episode_len_mean: 435.13
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 435.13
  episode_reward_min: 110.0
  episodes_this_iter: 8
  episodes_total: 923
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.4551915228366855e-12
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5195916908402597
          entropy_coeff: 0.0
          kl: 0.0010119088870648354
          policy_loss: -4.994449436023671e-05
          total_loss: 518.1564171903876
          vf_explained_var:
          - -0.14341993629932404
          vf_loss: 518.1564622981574
        model: {}
    num_agent_steps_sampled: 164000
    num_agent_steps_trained: 164000
    num_steps_sam

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,41,163.86,164000,435.13,500,110,435.13


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 172000
  custom_metrics: {}
  date: 2022-07-23_20-04-11
  done: false
  episode_len_mean: 458.88
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 458.88
  episode_reward_min: 118.0
  episodes_this_iter: 8
  episodes_total: 940
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.6379788070917137e-13
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5094273452476789
          entropy_coeff: 0.0
          kl: 0.0018134001792102453
          policy_loss: -0.0013510088446319745
          total_loss: 506.42929915561473
          vf_explained_var:
          - -0.07667318731546402
          vf_loss: 506.4306501573132
        model: {}
    num_agent_steps_sampled: 172000
    num_agent_steps_trained: 172000
    num_steps_sa

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,43,171.759,172000,458.88,500,118,458.88


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 180000
  custom_metrics: {}
  date: 2022-07-23_20-04-19
  done: false
  episode_len_mean: 455.31
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 455.31
  episode_reward_min: 118.0
  episodes_this_iter: 10
  episodes_total: 959
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 9.094947017729284e-14
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5135584140977552
          entropy_coeff: 0.0
          kl: 0.0002777872799419324
          policy_loss: 0.00201664367310142
          total_loss: 577.4896444136097
          vf_explained_var:
          - -0.049117304384708405
          vf_loss: 577.4876301457805
        model: {}
    num_agent_steps_sampled: 180000
    num_agent_steps_trained: 180000
    num_steps_sampl

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,45,179.68,180000,455.31,500,118,455.31


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 188000
  custom_metrics: {}
  date: 2022-07-23_20-04-27
  done: false
  episode_len_mean: 462.76
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 462.76
  episode_reward_min: 118.0
  episodes_this_iter: 8
  episodes_total: 976
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.273736754432321e-14
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5172271940977343
          entropy_coeff: 0.0
          kl: 0.0011645050726921682
          policy_loss: -0.0002749759503590163
          total_loss: 500.90231264175907
          vf_explained_var:
          - 0.012865548953413963
          vf_loss: 500.9025879234396
        model: {}
    num_agent_steps_sampled: 188000
    num_agent_steps_trained: 188000
    num_steps_sam

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,47,187.534,188000,462.76,500,118,462.76


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 196000
  custom_metrics: {}
  date: 2022-07-23_20-04-35
  done: false
  episode_len_mean: 467.89
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 467.89
  episode_reward_min: 118.0
  episodes_this_iter: 8
  episodes_total: 992
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 5.6843418860808026e-15
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5177878022514364
          entropy_coeff: 0.0
          kl: 0.0021327557421488277
          policy_loss: -0.00020613595843315123
          total_loss: 534.5553520325692
          vf_explained_var:
          - -0.07692240178585052
          vf_loss: 534.5555581246653
        model: {}
    num_agent_steps_sampled: 196000
    num_agent_steps_trained: 196000
    num_steps_sa

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,49,195.602,196000,467.89,500,118,467.89


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 204000
  custom_metrics: {}
  date: 2022-07-23_20-04-44
  done: false
  episode_len_mean: 460.6
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 460.6
  episode_reward_min: 118.0
  episodes_this_iter: 9
  episodes_total: 1010
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.4210854715202006e-15
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5168829057806281
          entropy_coeff: 0.0
          kl: 0.0014635509966831955
          policy_loss: -0.0008508720825756749
          total_loss: 590.7483871788107
          vf_explained_var:
          - -0.16594508290290833
          vf_loss: 590.7492373517764
        model: {}
    num_agent_steps_sampled: 204000
    num_agent_steps_trained: 204000
    num_steps_samp

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,51,204.007,204000,460.6,500,118,460.6


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 212000
  custom_metrics: {}
  date: 2022-07-23_20-04-52
  done: false
  episode_len_mean: 466.29
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 466.29
  episode_reward_min: 118.0
  episodes_this_iter: 8
  episodes_total: 1026
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.5527136788005016e-16
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5320044406639632
          entropy_coeff: 0.0
          kl: 0.0005547467038799562
          policy_loss: 9.582854046296048e-05
          total_loss: 529.6212682949599
          vf_explained_var:
          - -0.049634259194135666
          vf_loss: 529.6211717421008
        model: {}
    num_agent_steps_sampled: 212000
    num_agent_steps_trained: 212000
    num_steps_sa

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,53,212.486,212000,466.29,500,118,466.29


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 220000
  custom_metrics: {}
  date: 2022-07-23_20-05-01
  done: false
  episode_len_mean: 471.88
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 471.88
  episode_reward_min: 175.0
  episodes_this_iter: 8
  episodes_total: 1042
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 8.881784197001254e-17
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5269133135836611
          entropy_coeff: 0.0
          kl: 0.002458919508846222
          policy_loss: -0.00151736976638917
          total_loss: 499.00465960348805
          vf_explained_var:
          - -0.011730234138667583
          vf_loss: 499.0061767126924
        model: {}
    num_agent_steps_sampled: 220000
    num_agent_steps_trained: 220000
    num_steps_samp

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,55,220.626,220000,471.88,500,175,471.88


Result for PPO_CartPole-v1_6e22b_00000:
  agent_timesteps_total: 228000
  custom_metrics: {}
  date: 2022-07-23_20-05-09
  done: true
  episode_len_mean: 475.73
  episode_media: {}
  episode_reward_max: 500.0
  episode_reward_mean: 475.73
  episode_reward_min: 182.0
  episodes_this_iter: 8
  episodes_total: 1059
  experiment_id: abe6ee00f7dd492fb81d51c8d37b7746
  hostname: LAPTOP-MUL4L8MS
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.2204460492503135e-17
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5208867065047705
          entropy_coeff: 0.0
          kl: 0.0002004759106740692
          policy_loss: -0.0009501318677618939
          total_loss: 382.68431322651526
          vf_explained_var:
          - -0.035070717334747314
          vf_loss: 382.6852656948951
        model: {}
    num_agent_steps_sampled: 228000
    num_agent_steps_trained: 228000
    num_steps_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,RUNNING,192.168.1.12:7416,57,228.562,228000,475.73,500,182,475.73


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_6e22b_00000,TERMINATED,,57,228.562,228000,475.73,500,182,475.73


2022-07-23 20:05:09,773	INFO tune.py:561 -- Total run time: 240.72 seconds (240.05 seconds for the tuning loop).


In [14]:
# Rebuild a PPO trainer from the finished Tune run: pick the trial log
# directory with the best `episode_reward_mean`, then the checkpoint with the
# highest `training_iteration` in it, and load that checkpoint's state.
trial = analysis.get_best_logdir(metric="episode_reward_mean", mode="max")
checkpoint = analysis.get_best_checkpoint(trial, metric="training_iteration", mode="max")
trainer = PPOTrainer(config=config)
trainer.restore(checkpoint)

2022-07-23 20:05:48,772	INFO ppo.py:158 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-07-23 20:05:48,773	INFO trainer.py:726 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2022-07-23 20:05:53,733	INFO trainable.py:382 -- Restored on 192.168.1.12 from checkpoint: C:\Users\brieg\ray_results\PPO\PPO_CartPole-v1_6e22b_00000_0_2022-07-23_20-01-09\checkpoint_000057\checkpoint-57
2022-07-23 20:05:53,734	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 57, '_timesteps_total': None, '_time_total': 228.56157183647156, '_episodes_total': 1059}


## Evaluate the Agent :

In [18]:
# Record one full episode played by the restored agent.
video_name = "after_training"
video_path = check_video_folder_sanity(path, video_name)

# Build a fresh environment: the `env` created for the random-action rollout
# was already closed at the end of that cell, so reusing it here would make
# this cell depend on leftover kernel state.
env = gym.make("CartPole-v1")
after_video = VideoRecorder(env, video_path + ".mp4", enabled=video_name is not None)
try:
    observation = env.reset()
    done = False
    # Step until the environment reports the episode is over.
    while not done:
        env.render()
        after_video.capture_frame()
        action = trainer.compute_single_action(observation)
        observation, reward, done, info = env.step(action)
finally:
    # Always finalise the video file and release the environment, even if the
    # rollout raises part-way through.
    after_video.close()
    env.close()



In [17]:
# Embed the recording of the trained agent in the notebook output.
Video(f"{video_path}.mp4")