# **CartPole - The OpenAI Gym CartPole Environment**

_A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The goal is to prevent the pole from falling over._

https://gym.openai.com/envs/CartPole-v1/


## Random Agent Interacting with the Environment (Before Training)

In [1]:
# Imports required packages

import gym
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from pyvirtualdisplay import Display

import ray
from ray.rllib.agents.ppo import PPOTrainer

In [24]:
# Build the CartPole environment that the recording cells below interact with
ENV_ID = "CartPole-v1"
env = gym.make(ENV_ID)

In [25]:
# Inspects the environment's action space to see which actions the agent can take
env.action_space

Discrete(2)

In [4]:
# Output path for the recording made before any training happens
before_training_video_file = "./before_training.mp4"

# Start an off-screen virtual display so frames can be rendered for the video recorder
display = Display(size=(1400, 900), visible=False)
_ = display.start()

In [26]:
# Records 200 steps of an untrained agent that picks actions uniformly at random.
video = VideoRecorder(env, before_training_video_file)

try:
    # Starts the first episode
    env.reset()

    # Interacts with the environment and records a frame before every step
    for step in range(200):
        env.render()
        video.capture_frame()

        observation, reward, done, info = env.step(env.action_space.sample())

        # Stepping an environment after it has reported done=True is undefined
        # behaviour in gym, so a fresh episode is started as soon as one ends.
        if done:
            env.reset()
finally:
    # Always finalises the video file and releases the renderer, even if a step raises
    video.close()
    env.close()



In [18]:
# Show the pre-training recording inline in the notebook.
# render_mp4 comes from the local render_video module; its result is handed to HTML for display.
from IPython.display import HTML
from render_video import render_mp4

HTML(render_mp4(before_training_video_file))

## Training the Agent

In [27]:
# Trainer settings passed to PPO: environment, deep-learning framework and model options
config = dict(
    env="CartPole-v1",
    framework="torch",  # or "tf" for Tensorflow
    model=dict(
        fcnet_hiddens=[32],
        fcnet_activation="linear",  # other option is "relu"
    ),
)

# Stopping criterion handed to Tune: training ends once this mean episode reward is reached
# NOTE(review): 195 looks like the CartPole-v0 "solved" score - confirm it is the intended target for v1
stop = dict(episode_reward_mean=195)

In [28]:
# Shut down any Ray runtime left over from an earlier run of this notebook, then start a fresh one
ray.shutdown()

ray_init_options = dict(
    num_cpus=8,  # check number of CPU cores available in your computer
    include_dashboard=False,
    ignore_reinit_error=True,
    log_to_driver=False,
)
ray.init(**ray_init_options)

{'node_ip_address': '192.168.226.49',
 'raylet_ip_address': '192.168.226.49',
 'redis_address': '192.168.226.49:6379',
 'object_store_address': '/tmp/ray/session_2021-10-09_09-19-10_969877_72352/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-10-09_09-19-10_969877_72352/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2021-10-09_09-19-10_969877_72352',
 'metrics_export_port': 52744,
 'node_id': '219612646b0e0a46df8a30a0d9a30de1412850b33f2d929527ca939c'}

In [29]:
# Trains PPO (Proximal Policy Optimization) through Tune until the `stop` criterion is met.
# A checkpoint is requested at the end so the trained policy can be restored later.
training_run_options = dict(
    config=config,
    stop=stop,
    checkpoint_at_end=True,
)
analysis = ray.tune.run("PPO", **training_run_options)

Trial name,status,loc
PPO_CartPole-v1_df90f_00000,PENDING,


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2021-10-09_09-19-28
  done: false
  episode_len_mean: 21.641304347826086
  episode_media: {}
  episode_reward_max: 76.0
  episode_reward_mean: 21.641304347826086
  episode_reward_min: 9.0
  episodes_this_iter: 184
  episodes_total: 184
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6841155102816961
          entropy_coeff: 0.0
          kl: 0.009076417564353886
          policy_loss: -0.01536057619478113
          total_loss: 211.85406045400967
          vf_explained_var:
          - -0.0034129756968468428
          vf_loss: 211.86760523806336
        model: {}
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4000


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,1,5.84831,4000,21.6413,76,9,21.6413


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2021-10-09_09-19-34
  done: false
  episode_len_mean: 27.853146853146853
  episode_media: {}
  episode_reward_max: 98.0
  episode_reward_mean: 27.853146853146853
  episode_reward_min: 10.0
  episodes_this_iter: 143
  episodes_total: 327
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6629990611665992
          entropy_coeff: 0.0
          kl: 0.007639680380999725
          policy_loss: -0.01475261773473473
          total_loss: 335.232994333903
          vf_explained_var:
          - 0.005760434083640575
          vf_loss: 335.24621771535567
        model: {}
    num_agent_steps_sampled: 8000
    num_agent_steps_trained: 8000
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,2,11.8674,8000,27.8531,98,10,27.8531


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2021-10-09_09-19-40
  done: false
  episode_len_mean: 37.61904761904762
  episode_media: {}
  episode_reward_max: 121.0
  episode_reward_mean: 37.61904761904762
  episode_reward_min: 9.0
  episodes_this_iter: 105
  episodes_total: 432
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6376500448872966
          entropy_coeff: 0.0
          kl: 0.00621791095632322
          policy_loss: -0.013186882926972322
          total_loss: 494.13562277517013
          vf_explained_var:
          - -0.015388996340334415
          vf_loss: 494.14756729782266
        model: {}
    num_agent_steps_sampled: 12000
    num_agent_steps_trained: 12000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,3,18.0995,12000,37.619,121,9,37.619


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2021-10-09_09-19-46
  done: false
  episode_len_mean: 52.32
  episode_media: {}
  episode_reward_max: 189.0
  episode_reward_mean: 52.32
  episode_reward_min: 12.0
  episodes_this_iter: 71
  episodes_total: 503
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6209945523610679
          entropy_coeff: 0.0
          kl: 0.003941223887489885
          policy_loss: -0.014729969517918684
          total_loss: 907.894780969107
          vf_explained_var:
          - -0.0003590714477468282
          vf_loss: 907.9087213331653
        model: {}
    num_agent_steps_sampled: 16000
    num_agent_steps_trained: 16000
    num_steps_sampled: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,4,23.9882,16000,52.32,189,12,52.32


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 20000
  custom_metrics: {}
  date: 2021-10-09_09-19-52
  done: false
  episode_len_mean: 71.48
  episode_media: {}
  episode_reward_max: 228.0
  episode_reward_mean: 71.48
  episode_reward_min: 12.0
  episodes_this_iter: 42
  episodes_total: 545
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.10000000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6070079863712352
          entropy_coeff: 0.0
          kl: 0.0036110141408952036
          policy_loss: -0.013322689568483701
          total_loss: 1518.5012674639302
          vf_explained_var:
          - -0.007517707068473101
          vf_loss: 1518.5142368439706
        model: {}
    num_agent_steps_sampled: 20000
    num_agent_steps_trained: 20000
    num_steps_sampled

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,5,29.733,20000,71.48,228,12,71.48


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 24000
  custom_metrics: {}
  date: 2021-10-09_09-19-58
  done: false
  episode_len_mean: 91.21
  episode_media: {}
  episode_reward_max: 348.0
  episode_reward_mean: 91.21
  episode_reward_min: 12.0
  episodes_this_iter: 28
  episodes_total: 573
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.05000000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5906560885008946
          entropy_coeff: 0.0
          kl: 0.0022350361641293513
          policy_loss: -0.01266468099508715
          total_loss: 1936.305871188256
          vf_explained_var:
          - -0.027071358636021614
          vf_loss: 1936.3184260542675
        model: {}
    num_agent_steps_sampled: 24000
    num_agent_steps_trained: 24000
    num_steps_sampled: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,6,35.5426,24000,91.21,348,12,91.21


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 28000
  custom_metrics: {}
  date: 2021-10-09_09-20-04
  done: false
  episode_len_mean: 119.79
  episode_media: {}
  episode_reward_max: 385.0
  episode_reward_mean: 119.79
  episode_reward_min: 15.0
  episodes_this_iter: 23
  episodes_total: 596
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.025000000000000005
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5846601717574622
          entropy_coeff: 0.0
          kl: 0.002400839100962113
          policy_loss: -0.012570964627128135
          total_loss: 2280.72630234585
          vf_explained_var:
          - -0.046364810317754745
          vf_loss: 2280.7388110089046
        model: {}
    num_agent_steps_sampled: 28000
    num_agent_steps_trained: 28000
    num_steps_sampled

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,7,41.5271,28000,119.79,385,15,119.79


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 32000
  custom_metrics: {}
  date: 2021-10-09_09-20-10
  done: false
  episode_len_mean: 142.66
  episode_media: {}
  episode_reward_max: 385.0
  episode_reward_mean: 142.66
  episode_reward_min: 27.0
  episodes_this_iter: 22
  episodes_total: 618
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.012500000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5930653524014258
          entropy_coeff: 0.0
          kl: 0.0011443538453852382
          policy_loss: -0.00988304220310222
          total_loss: 2326.873620999244
          vf_explained_var:
          - -0.06786046922206879
          vf_loss: 2326.883485593078
        model: {}
    num_agent_steps_sampled: 32000
    num_agent_steps_trained: 32000
    num_steps_sampled:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,8,47.3675,32000,142.66,385,27,142.66


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 36000
  custom_metrics: {}
  date: 2021-10-09_09-20-16
  done: false
  episode_len_mean: 164.65
  episode_media: {}
  episode_reward_max: 385.0
  episode_reward_mean: 164.65
  episode_reward_min: 27.0
  episodes_this_iter: 21
  episodes_total: 639
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.006250000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5910115479782063
          entropy_coeff: 0.0
          kl: 0.0031674888009952003
          policy_loss: -0.011215650614210836
          total_loss: 1898.4928074334257
          vf_explained_var:
          - -0.018399318680167198
          vf_loss: 1898.5040001653856
        model: {}
    num_agent_steps_sampled: 36000
    num_agent_steps_trained: 36000
    num_steps_samp

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,9,53.3131,36000,164.65,385,27,164.65


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 40000
  custom_metrics: {}
  date: 2021-10-09_09-20-22
  done: false
  episode_len_mean: 178.79
  episode_media: {}
  episode_reward_max: 385.0
  episode_reward_mean: 178.79
  episode_reward_min: 33.0
  episodes_this_iter: 21
  episodes_total: 660
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0031250000000000006
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5934345995226215
          entropy_coeff: 0.0
          kl: 0.001961056911218222
          policy_loss: -0.010255103332981947
          total_loss: 1979.0496925600114
          vf_explained_var:
          - 0.010294581763446331
          vf_loss: 1979.0599411831108
        model: {}
    num_agent_steps_sampled: 40000
    num_agent_steps_trained: 40000
    num_steps_sampl

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,10,59.3444,40000,178.79,385,33,178.79


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 44000
  custom_metrics: {}
  date: 2021-10-09_09-20-28
  done: false
  episode_len_mean: 183.8
  episode_media: {}
  episode_reward_max: 408.0
  episode_reward_mean: 183.8
  episode_reward_min: 13.0
  episodes_this_iter: 20
  episodes_total: 680
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0015625000000000003
          cur_lr: 5.0000000000000016e-05
          entropy: 0.582460597958616
          entropy_coeff: 0.0
          kl: 0.0012425415469802588
          policy_loss: -0.010345979183111139
          total_loss: 1997.9713784494709
          vf_explained_var:
          - 0.01507281418889761
          vf_loss: 1997.981714982884
        model: {}
    num_agent_steps_sampled: 44000
    num_agent_steps_trained: 44000
    num_steps_sampled: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,11,65.3685,44000,183.8,408,13,183.8


Result for PPO_CartPole-v1_df90f_00000:
  agent_timesteps_total: 48000
  custom_metrics: {}
  date: 2021-10-09_09-20-34
  done: true
  episode_len_mean: 196.69
  episode_media: {}
  episode_reward_max: 408.0
  episode_reward_mean: 196.69
  episode_reward_min: 13.0
  episodes_this_iter: 18
  episodes_total: 698
  experiment_id: 835c6130ec3a480fb1cb23e5c4f0294d
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0007812500000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5838937960645204
          entropy_coeff: 0.0
          kl: 0.00041761561195637216
          policy_loss: -0.00932417760242618
          total_loss: 2000.4769756358157
          vf_explained_var:
          - 0.0004732272354885936
          vf_loss: 2000.4863042359711
        model: {}
    num_agent_steps_sampled: 48000
    num_agent_steps_trained: 48000
    num_steps_samp

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,RUNNING,192.168.226.49:103619,12,71.2945,48000,196.69,408,13,196.69


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_df90f_00000,TERMINATED,,12,71.2945,48000,196.69,408,13,196.69


2021-10-09 09:20:34,931	INFO tune.py:561 -- Total run time: 79.43 seconds (78.76 seconds for the tuning loop).


## Creating Video of the Trained Model in Action

**Restoring a trainer instance from the best checkpoint**

In [30]:
# Log directory of the trial with the highest mean episode reward
best_logdir = analysis.get_best_logdir("episode_reward_mean", "max")

# Within that trial, pick the checkpoint with the highest training iteration
best_checkpoint = analysis.get_best_checkpoint(best_logdir, "training_iteration", "max")

# Build a PPO trainer with the same config used for training and load the checkpoint into it
trainer = PPOTrainer(config=config)
trainer.restore(best_checkpoint)

2021-10-09 09:24:01,765	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2021-10-09 09:24:01,766	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2021-10-09 09:24:05,131	INFO trainable.py:383 -- Restored on 192.168.226.49 from checkpoint: /home/pradip/ray_results/PPO/PPO_CartPole-v1_df90f_00000_0_2021-10-09_09-19-15/checkpoint_000012/checkpoint-12
2021-10-09 09:24:05,132	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 12, '_timesteps_total': None, '_time_total': 71.29453778266907, '_episodes_total': 698}


**Creating video**

In [33]:
# Records one full episode played by the restored (trained) policy.
after_training_video_file = "after_training.mp4"
after_video = VideoRecorder(env, after_training_video_file)

try:
    observation = env.reset()
    done = False

    # Runs until the environment reports that the episode is over
    while not done:
        env.render()
        after_video.capture_frame()

        # The restored trainer chooses the action for the current observation
        action = trainer.compute_single_action(observation)
        observation, reward, done, info = env.step(action)
finally:
    # Always finalises the video file and releases the renderer, even if the rollout raises
    after_video.close()
    env.close()

In [34]:
# Show the post-training recording inline in the notebook.
# render_mp4 comes from the local render_video module; its result is handed to HTML for display.
from IPython.display import HTML
from render_video import render_mp4

HTML(render_mp4(after_training_video_file))

## Tuning Hyperparameters

In [35]:
# Search space for tuning PPO on CartPole-v1 (PyTorch)
parameters_search_config = dict(
    env="CartPole-v1",
    framework="torch",

    # hyperparameters tuning specific - the search space

    # Grid search over hidden-layer sizes and activation functions
    model=dict(
        fcnet_hiddens=ray.tune.grid_search([[32], [64]]),
        fcnet_activation=ray.tune.grid_search(["linear", "relu"]),
    ),
    # Learning rate sampled uniformly (linear scale) between 1e-7 and 1e-2
    lr=ray.tune.uniform(1e-7, 1e-2),
)

In [36]:
# Restart Ray so the hyperparameter search begins on a fresh runtime
ray.shutdown()

ray_init_options = dict(
    num_cpus=8,
    include_dashboard=False,
    ignore_reinit_error=True,
    log_to_driver=False,
)
ray.init(**ray_init_options)

{'node_ip_address': '192.168.226.49',
 'raylet_ip_address': '192.168.226.49',
 'redis_address': '192.168.226.49:6379',
 'object_store_address': '/tmp/ray/session_2021-10-09_09-43-05_458820_72352/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-10-09_09-43-05_458820_72352/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2021-10-09_09-43-05_458820_72352',
 'metrics_export_port': 46476,
 'node_id': '726b2a0c5ea256625c83d95460618ad220e4fdce73f9d11e4639d7d6'}

In [37]:
# Runs the hyperparameter search over `parameters_search_config`.
# Trials use the same `stop` criterion as before and are compared on `timesteps_total`
# with mode "min", i.e. the fewest timesteps is considered best.
search_run_options = dict(
    config=parameters_search_config,
    stop=stop,
    num_samples=5,
    metric="timesteps_total",
    mode="min",
)
parameters_search_analysis = ray.tune.run("PPO", **search_run_options)

<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
Result for PPO_CartPole-v1_10bb4_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2021-10-09_09-49-31
  done: false
  episode_len_mean: 23.867469879518072
  episode_media: {}
  episode_reward_max: 75.0
  episode_reward_mean: 23.867469879518072
  episode_reward_min: 8.0
  episodes_this_iter: 166
  episodes_total: 166
  experiment_id: 254202ce3b2741abbe62e47f0c11a341
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 0.004289619884961042
          entropy: 0.6787872771422069
          entropy_coeff: 0.0
          kl: 0.015464462661396247
          policy_loss: -0.025395593806219998
          total_loss: 127.63090472272647
          vf_explained_var:
          - -0.0024547111243009567
          vf_loss: 1

2021-10-09 10:02:59,172	INFO tune.py:561 -- Total run time: 823.19 seconds (822.67 seconds for the tuning loop).


In [38]:
# Shows the configuration of the best trial found by the search
best_hyperparameters = parameters_search_analysis.best_config
print("Best hyperparameters:", best_hyperparameters)

Best hyperparameters: {'env': 'CartPole-v1', 'framework': 'torch', 'model': {'fcnet_hiddens': [64], 'fcnet_activation': 'relu'}, 'lr': 0.003727368901324877}


## Retraining the Agent with Tuned Hyperparameters

In [41]:
# Restart Ray so retraining with the tuned hyperparameters begins on a fresh runtime
ray.shutdown()

ray_init_options = dict(
    num_cpus=8,
    include_dashboard=False,
    ignore_reinit_error=True,
    log_to_driver=False,
)
ray.init(**ray_init_options)

{'node_ip_address': '192.168.226.49',
 'raylet_ip_address': '192.168.226.49',
 'redis_address': '192.168.226.49:6379',
 'object_store_address': '/tmp/ray/session_2021-10-09_10-21-58_359382_72352/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-10-09_10-21-58_359382_72352/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2021-10-09_10-21-58_359382_72352',
 'metrics_export_port': 62678,
 'node_id': '4ad1174836f3ea4edfa6c206dfbda2f26ebea279b39be86307ed22bf'}

In [44]:
# Retrains PPO using the best configuration found by the hyperparameter search.
# A checkpoint is requested at the end so the tuned policy can be restored later.
tuned_run_options = dict(
    config=parameters_search_analysis.best_config,
    stop=stop,
    checkpoint_at_end=True,
)
training_with_tuned_parameters_analysis = ray.tune.run("PPO", **tuned_run_options)

<IPython.core.display.HTML object>
Result for PPO_CartPole-v1_60411_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2021-10-09_10-41-48
  done: false
  episode_len_mean: 21.04232804232804
  episode_media: {}
  episode_reward_max: 73.0
  episode_reward_mean: 21.04232804232804
  episode_reward_min: 9.0
  episodes_this_iter: 189
  episodes_total: 189
  experiment_id: 7f23a769fb404aa7a6cfa49df42b3c7e
  hostname: avita-pura-ns14a6
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 0.0037273689013248767
          entropy: 0.6663280891474858
          entropy_coeff: 0.0
          kl: 0.028486253287748342
          policy_loss: -0.04149572960932248
          total_loss: 76.68353492982926
          vf_explained_var:
          - -0.055846989154815674
          vf_loss: 76.71933348871046
        model: {}
    num_agent_steps_sampled: 4000
    n

2021-10-09 10:42:31,605	INFO tune.py:561 -- Total run time: 55.57 seconds (54.89 seconds for the tuning loop).


In [45]:
# Log directory of the retraining trial with the highest mean episode reward
tuned_logdir = training_with_tuned_parameters_analysis.get_best_logdir("episode_reward_mean", "max")

# Within that trial, pick the checkpoint with the highest training iteration
tuned_checkpoint = training_with_tuned_parameters_analysis.get_best_checkpoint(
    tuned_logdir, "training_iteration", "max"
)

# Build a PPO trainer with the tuned config and load the checkpoint into it
trainer = PPOTrainer(config=parameters_search_analysis.best_config)
trainer.restore(tuned_checkpoint)

2021-10-09 10:44:45,489	INFO trainable.py:383 -- Restored on 192.168.226.49 from checkpoint: /home/pradip/ray_results/PPO/PPO_CartPole-v1_60411_00000_0_2021-10-09_10-41-36/checkpoint_000008/checkpoint-8
2021-10-09 10:44:45,491	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 8, '_timesteps_total': None, '_time_total': 48.188358545303345, '_episodes_total': 429}


**Creating video**

In [46]:
# Records one full episode played by the policy retrained with the tuned hyperparameters.
after_training_with_tuned_hyperparameters_video_file = "after_training_with_tuned_hyperparameters.mp4"
after_training_with_tuned_hyperparameters_video = VideoRecorder(
    env, after_training_with_tuned_hyperparameters_video_file
)

try:
    observation = env.reset()
    done = False

    # Runs until the environment reports that the episode is over
    while not done:
        env.render()
        after_training_with_tuned_hyperparameters_video.capture_frame()

        # The restored trainer chooses the action for the current observation
        action = trainer.compute_single_action(observation)
        observation, reward, done, info = env.step(action)
finally:
    # Always finalises the video file and releases the renderer, even if the rollout raises
    after_training_with_tuned_hyperparameters_video.close()
    env.close()

In [48]:
# Show the recording of the tuned agent inline in the notebook.
# render_mp4 comes from the local render_video module; its result is handed to HTML for display.
from IPython.display import HTML
from render_video import render_mp4

HTML(render_mp4(after_training_with_tuned_hyperparameters_video_file))