# Example notebook for training a PPO agent on CartPole-v1 with RLlib
#### *(using the PyTorch implementation)*

### Import requirements

In [47]:
from ray.rllib.agents.ppo import PPOTrainer
import numpy as np
import matplotlib.pyplot as plt

### Define a config dictionary for our trainer 

In [67]:
# Trainer configuration; passed unchanged to the PPO trainer.
config = dict(
    # Gym-registered environment id (RLlib understands these strings).
    env="CartPole-v1",
    # Eight environment workers ("rollout workers"), each collecting
    # samples in parallel from its own clone of the environment.
    num_workers=8,
    # Deep-learning framework: PyTorch here. "tf2" would select
    # TF2.x eager execution instead.
    framework="torch",
    # Tweaks to the default model RLlib provides automatically from the
    # environment's observation and action spaces.
    model=dict(
        fcnet_hiddens=[64, 64],
        fcnet_activation="relu",
    ),
    # Separate evaluation worker set, used by `trainer.evaluate()`.
    evaluation_num_workers=1,
    # Overrides applied to evaluation runs only: render the env.
    evaluation_config=dict(
        render_env=True,
    ),
)

### Train the agent with our defined config

In [46]:
# Instantiate the PPO trainer from the `config` dictionary.
trainer = PPOTrainer(config=config)

# Run `iterations` training iterations. One iteration consists of parallel
# sample collection by the environment workers, followed by loss
# calculation on the collected batch and a model update. Each result dict
# returned by `trainer.train()` is appended to `log`.
log = []
iterations = 20
for i in range(iterations):
    print("iteration : " + str(i), ", ")
    result = trainer.train()
    log.append(result)
    print('len : ' + str(result['episode_len_mean']))
    # Mean of the per-episode rewards recorded in this result's history.
    episode_rewards = result['hist_stats']['episode_reward']
    print('avg_rev : ' + str(np.mean(episode_rewards)))
    # On iterations 0, 5, 10 and 15, also run an evaluation pass; its
    # return value is discarded here.
    if i % 5 == 0:
        trainer.evaluate()

[2m[36m(pid=50226)[0m 
[2m[36m(pid=50223)[0m 


iteration : 0 , 


[2m[36m(pid=50221)[0m 


len : 22.223463687150836
avg_rev : 22.223463687150836




iteration : 1 , 
len : 29.00735294117647
avg_rev : 29.00735294117647
iteration : 2 , 
len : 47.11
avg_rev : 47.11
iteration : 3 , 
len : 78.14
avg_rev : 78.14
iteration : 4 , 
len : 104.68
avg_rev : 104.68
iteration : 5 , 
len : 129.69
avg_rev : 129.69
iteration : 6 , 
len : 164.62
avg_rev : 164.62
iteration : 7 , 
len : 194.26
avg_rev : 194.26
iteration : 8 , 
len : 219.31
avg_rev : 219.31
iteration : 9 , 
len : 251.01
avg_rev : 251.01
iteration : 10 , 
len : 278.5
avg_rev : 278.5
iteration : 11 , 
len : 305.15
avg_rev : 305.15
iteration : 12 , 
len : 329.61
avg_rev : 329.61
iteration : 13 , 
len : 334.93
avg_rev : 334.93
iteration : 14 , 
len : 350.22
avg_rev : 350.22
iteration : 15 , 
len : 342.34
avg_rev : 342.34
iteration : 16 , 
len : 322.29
avg_rev : 322.29
iteration : 17 , 
len : 318.71
avg_rev : 318.71
iteration : 18 , 
len : 302.14
avg_rev : 302.14
iteration : 19 , 
len : 296.95
avg_rev : 296.95


In [69]:
# Run a final evaluation of the trained trainer. The `config` dictionary
# requests one dedicated evaluation worker with `render_env` enabled for
# evaluation runs. As the last expression of the cell, the returned
# metrics dict is displayed as the cell output.
trainer.evaluate()

{'evaluation': {'episode_reward_max': 500.0,
  'episode_reward_min': 176.0,
  'episode_reward_mean': 344.2,
  'episode_len_mean': 344.2,
  'episode_media': {},
  'episodes_this_iter': 10,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [225.0,
    278.0,
    294.0,
    500.0,
    423.0,
    316.0,
    500.0,
    441.0,
    176.0,
    289.0],
   'episode_lengths': [225, 278, 294, 500, 423, 316, 500, 441, 176, 289]},
  'sampler_perf': {'mean_raw_obs_processing_ms': 0.1307517243951104,
   'mean_inference_ms': 1.1028653226296545,
   'mean_action_processing_ms': 0.06369292459067953,
   'mean_env_wait_ms': 0.10128286618736895,
   'mean_env_render_ms': 7.619497852004665},
  'off_policy_estimator': {},
  'timesteps_this_iter': 0}}