### 1. Import Dependencies

In [1]:
import os
# Open ai gym allows us to build environments and work with pre-existing environments
import gym
# stable_baselines : allows to vectorize environment, train RL agents on multiple environments at the same time.

# Proximal Policy Optimization (PPO) combines ideas from A2C (having multiple workers)
# and TRPO (it uses a trust region to improve the actor)
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
# evaluate_policy : helps to test out how model is actually performing
from stable_baselines3.common.evaluation import evaluate_policy

### 2. Load Environment


In [2]:
# Id of the pre-installed OpenAI Gym environment to use
environment_name = 'CartPole-v1'
# Create the environment instance for that id
env = gym.make(environment_name)

In [3]:
# Show the selected environment id
environment_name

'CartPole-v1'

In [4]:
# Try out the CartPole environment for 5 episodes using random actions.
episodes = 5  # think of an episode as one full game within the environment
for episode in range(1, episodes + 1):
    # reset() starts a new episode; with this gym API it returns an
    # (observation, info) tuple, so unpack it instead of storing the tuple.
    state, info = env.reset()
    done = False  # becomes True once the episode has ended
    score = 0

    while not done:
        # Show the graphical representation of the environment.
        # NOTE(review): with gym >= 0.26 render() only draws when the env was
        # created with a render_mode (e.g. gym.make(..., render_mode='human'))
        # - confirm against the installed version.
        env.render()

        # Pick a random action (0 or 1)
        action = env.action_space.sample()
        # step() applies the action and returns five values in this order:
        # observation, reward, terminated, truncated, info.
        # The original unpacking bound `done` to `terminated` only and stored
        # the `truncated` flag in `info`, so a time-limit truncation would
        # never have ended the loop.
        n_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        # Accumulate the reward collected in this episode
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
# Close the render window
env.close()

Episode:1 Score:30.0
Episode:2 Score:11.0
Episode:3 Score:16.0
Episode:4 Score:23.0
Episode:5 Score:13.0


  gym.logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


In [5]:
# Start a new episode; as the output shows, this returns an (observation, info) tuple
env.reset()

(array([-0.01729826, -0.00530884,  0.02909031, -0.04429241], dtype=float32),
 {})

In [6]:
# Draw a random sample from the observation space
env.observation_space.sample()

array([-2.6321878e+00, -3.0543664e+38, -6.5467753e-03,  2.8911070e+38],
      dtype=float32)

### Understanding the Environment
![Cart Pole](CartPole.png)


In [7]:
# Inspect the action space (Discrete(2): two possible actions)
env.action_space

Discrete(2)

In [8]:
# Sample one random action from the action space
env.action_space.sample()

0

In [9]:
# Inspect the observation space: a Box of four float32 values with lower and upper bounds
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [10]:
# Sample a random observation vector from the observation space
env.observation_space.sample()

array([-3.5914617e+00, -2.3257383e+38,  2.0571604e-01, -1.2499575e+38],
      dtype=float32)

### 3. Train an RL Model
![Train](Train.png)


* Model-free RL uses only the current state values to make a prediction.
* Model-based RL tries to predict the future state of the model in order to generate the best possible action.

#### Choosing Algorithms

* Action Space:
    - Discrete Single Process    : DQN
    - Discrete Multi Processed   : PPO or A2C
    - Continuous Single Process  : SAC or TD3
    - Continuous Multi Processed : PPO or A2C

![Algorithm](Algorithms.png)

#### Understanding Training Metrics

* Evaluation Metrics:
  Ep_len_mean, ep_rew_mean

* Time Metrics:
fps, iterations, time_elapsed, total_timesteps

* Loss Metrics:
entropy_loss, policy_gradient_loss, value_loss

* Other Metrics:
Explained_variance, learning_rate, n_updates



In [11]:
# Create the directories first if they do not exist; training logs are written under this path
log_path = os.path.join('Training','Logs')

In [12]:
# Show the resulting log path
log_path

'Training/Logs'

In [13]:
# Wrap a single CartPole instance in a dummy vectorized environment.
# DummyVecEnv takes a list of callables, each of which creates one environment,
# so the environment is built directly inside the factory.
env = DummyVecEnv([lambda: gym.make(environment_name)])

# An agent's policy is the rule that tells it how to act in the environment.
# 'MlpPolicy' is the multi-layer perceptron policy; Stable Baselines3 also
# offers CnnPolicy and MultiInputPolicy.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device




In [15]:
# Train the PPO agent for 20,000 timesteps
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 4000 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 2643        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005019945 |
|    clip_fraction        | 0.0392      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.558      |
|    explained_variance   | 0.856       |
|    learning_rate        | 0.0003      |
|    loss                 | 2.41        |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.00663    |
|    value_loss           | 27.8        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x2a8b95390>

### 4. Save and Reload Model

In [16]:
# Location where the trained PPO model is saved
PPO_Path = os.path.join('Training','Saved Models','PPO_Model_Cartpole')

In [17]:
# Write the trained model to PPO_Path
model.save(PPO_Path)

In [18]:
# Drop the in-memory model; it is restored from PPO_Path with PPO.load further down
del model

In [20]:
# Show the path the model was saved to
PPO_Path

'Training/Saved Models/PPO_Model_Cartpole'

In [21]:
# `model` was deleted in the cell above, so on a fresh top-to-bottom run it must
# be restored from disk before training can continue (otherwise: NameError).
model = PPO.load(PPO_Path, env=env)
# Continue training the reloaded model. PPO collects whole rollouts, which is
# why the log reports 2048 timesteps even though only 1000 were requested.
model.learn(total_timesteps=1000)

Logging to Training/Logs/PPO_3
-----------------------------
| time/              |      |
|    fps             | 3839 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------


<stable_baselines3.ppo.ppo.PPO at 0x2aad2be50>

In [19]:
# Reload the saved model from PPO_Path and attach the current environment to it
model = PPO.load(PPO_Path,env=env)

  return self.fget.__get__(instance, owner)()


### 5. Evaluation

In [22]:
# Evaluate the policy over 10 episodes; the displayed tuple is (mean reward, std of reward).
# CartPole-v1 is considered solved at an average score of 475 or higher (per the Gym CartPole docs).
evaluate_policy(model,env,n_eval_episodes=10,render=True)

# CartPole gives a reward of 1 for every step that the pole remains upright.
# CartPole-v1 caps an episode at 500 steps, which is why the mean reward in the output is 500.0.
# The episode ends early if the pole tilts more than 12 degrees from vertical or the cart moves more than 2.4 units from center.



(500.0, 0.0)

### 6. Test the Model

In [24]:

# Let the trained agent play a few episodes and report the reward it collects.
episodes = 5
for episode in range(1, episodes + 1):
    # Start a new episode on the vectorized environment
    obs = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        # Ask the model for an action; its second return value is not needed here
        action, _ = model.predict(obs)
        # The vectorized env returns four values: observations, rewards, dones, infos
        obs, reward, done, info = env.step(action)
        score = score + reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:[500.]
Episode:2 Score:[500.]
Episode:3 Score:[500.]
Episode:4 Score:[500.]
Episode:5 Score:[500.]


In [25]:
# Reset the vectorized environment to get a fresh observation
obs = env.reset()

In [29]:
# Ask the trained model which action to take for this observation; the second return value is ignored
action,_= model.predict(obs)

In [30]:
# For comparison: a random action sampled from the action space
env.action_space.sample()

0

In [31]:
# Apply the predicted action; the output shows the four returned values: observations, rewards, dones, infos
env.step(action) # By keeping the pole upright and not letting it fall, we collect a reward of 1 on every step

(array([[ 0.04092013,  0.20921932,  0.04348042, -0.23916155]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{'TimeLimit.truncated': False}])

### 7. Viewing Logs in Tensorboard

In [32]:
# Log directory of one specific training run (PPO_2) inside log_path
training_log_path = os.path.join(log_path,'PPO_2')

In [33]:
# Show the run's log path
training_log_path

'Training/Logs/PPO_2'

In [38]:
# Run in a terminal: tensorboard --logdir='Training/Logs/PPO_2'

Core Metrics to look at:
* Average Reward
* Average Episode length

Training Strategies
* Train for longer
* Hyperparameter Tuning
* Try different algorithms

### 8. Adding a callback to the training Stage

In [39]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [40]:
# Directory passed to the evaluation callback as best_model_save_path
save_path = os.path.join('Training','Saved Models')

In [50]:
# Stop-training callback: ends training once the evaluated mean reward passes
# the given reward threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluate on a dedicated environment instead of the training `env`: running
# evaluation episodes on the training environment resets it in the middle of
# data collection and disturbs the rollouts the agent is learning from.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

eval_callback = EvalCallback(
    eval_env,
    # Run stop_callback every time an evaluation finds a new best mean reward
    callback_on_new_best=stop_callback,
    # How frequently the evaluation runs: every 2000 callback calls
    eval_freq=2000,
    # Whenever a new best model is found, save it under this directory
    best_model_save_path=save_path,
    verbose=1,
)


In [51]:
# Fresh PPO agent to be trained with the evaluation callback
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cpu device


In [52]:
# Train with the evaluation callback attached; training may stop early if stop_callback triggers
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training/Logs/PPO_7
Eval num_timesteps=2000, episode_reward=10.00 +/- 0.00
Episode length: 10.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 10       |
|    mean_reward     | 10       |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 3945 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
Eval num_timesteps=4000, episode_reward=213.00 +/- 81.98
Episode length: 213.00 +/- 81.98
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 213        |
|    mean_reward          | 213        |
| time/                   |            |
|    total_timesteps      | 4000       |
| train/                  |            |
|    approx_kl            | 0.008395

<stable_baselines3.ppo.ppo.PPO at 0x1741c6110>

### 9. Changing Policies

In [53]:
# Custom network architecture: separate stacks for the policy (pi) and the
# value function (vf), each with four hidden layers of 128 units.
# Stable Baselines3 >= 1.8 expects a plain dict here; the older list-wrapped
# form [dict(pi=..., vf=...)] is deprecated and triggers a warning.
net_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])

In [54]:
# PPO agent whose policy is built with the custom net_arch passed through policy_kwargs
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path,policy_kwargs={'net_arch':net_arch})

Using cpu device




In [55]:
# eval_callback was already used to train the previous model and still holds
# that run's best mean reward. Reset it so that "new best" (which triggers the
# stop callback and the best-model save) is judged for this model alone.
eval_callback.best_mean_reward = float('-inf')
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training/Logs/PPO_8




Eval num_timesteps=2000, episode_reward=282.40 +/- 178.07
Episode length: 282.40 +/- 178.07
---------------------------------
| eval/              |          |
|    mean_ep_length  | 282      |
|    mean_reward     | 282      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
New best mean reward!
Stopping training because the mean reward 282.40  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x17c3d6fd0>

### 10. Using an Alternative Algorithm

In [56]:
from stable_baselines3 import DQN

In [57]:
# Same environment, different algorithm: a DQN agent with the MLP policy
model = DQN('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cpu device


In [58]:
# Train the DQN agent for 20,000 timesteps
model.learn(total_timesteps=20000)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.95     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1432     |
|    time_elapsed     | 0        |
|    total_timesteps  | 106      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.554    |
|    n_updates        | 1        |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.917    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1757     |
|    time_elapsed     | 0        |
|    total_timesteps  | 175      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.526    |
|    n_updates        | 18       |
----------------------------------
----------------------------------
| rollout/            | 

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.417    |
| time/               |          |
|    episodes         | 76       |
|    fps              | 2777     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1227     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0879   |
|    n_updates        | 281      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.395    |
| time/               |          |
|    episodes         | 80       |
|    fps              | 2784     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1274     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0688   |
|    n_updates        | 293      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 152      |
|    fps              | 2781     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2056     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0163   |
|    n_updates        | 488      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 156      |
|    fps              | 2778     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2096     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0147   |
|    n_updates        | 498      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 228      |
|    fps              | 2750     |
|    time_elapsed     | 1        |
|    total_timesteps  | 2816     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00505  |
|    n_updates        | 678      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 232      |
|    fps              | 2698     |
|    time_elapsed     | 1        |
|    total_timesteps  | 2856     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00361  |
|    n_updates        | 688      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 304      |
|    fps              | 2670     |
|    time_elapsed     | 1        |
|    total_timesteps  | 3568     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00105  |
|    n_updates        | 866      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 308      |
|    fps              | 2670     |
|    time_elapsed     | 1        |
|    total_timesteps  | 3604     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0017   |
|    n_updates        | 875      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 380      |
|    fps              | 2674     |
|    time_elapsed     | 1        |
|    total_timesteps  | 4307     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00152  |
|    n_updates        | 1051     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 384      |
|    fps              | 2675     |
|    time_elapsed     | 1        |
|    total_timesteps  | 4348     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00197  |
|    n_updates        | 1061     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 456      |
|    fps              | 2676     |
|    time_elapsed     | 1        |
|    total_timesteps  | 5060     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000373 |
|    n_updates        | 1239     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 460      |
|    fps              | 2676     |
|    time_elapsed     | 1        |
|    total_timesteps  | 5099     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000725 |
|    n_updates        | 1249     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 532      |
|    fps              | 2678     |
|    time_elapsed     | 2        |
|    total_timesteps  | 5782     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000417 |
|    n_updates        | 1420     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 536      |
|    fps              | 2678     |
|    time_elapsed     | 2        |
|    total_timesteps  | 5819     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000155 |
|    n_updates        | 1429     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 608      |
|    fps              | 2679     |
|    time_elapsed     | 2        |
|    total_timesteps  | 6502     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000258 |
|    n_updates        | 1600     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 612      |
|    fps              | 2680     |
|    time_elapsed     | 2        |
|    total_timesteps  | 6543     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000215 |
|    n_updates        | 1610     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 684      |
|    fps              | 2659     |
|    time_elapsed     | 2        |
|    total_timesteps  | 7258     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00038  |
|    n_updates        | 1789     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 688      |
|    fps              | 2659     |
|    time_elapsed     | 2        |
|    total_timesteps  | 7302     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000127 |
|    n_updates        | 1800     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 2662     |
|    time_elapsed     | 3        |
|    total_timesteps  | 8015     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000388 |
|    n_updates        | 1978     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 2661     |
|    time_elapsed     | 3        |
|    total_timesteps  | 8055     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00015  |
|    n_updates        | 1988     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 836      |
|    fps              | 2660     |
|    time_elapsed     | 3        |
|    total_timesteps  | 8748     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 9.49e-05 |
|    n_updates        | 2161     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 840      |
|    fps              | 2660     |
|    time_elapsed     | 3        |
|    total_timesteps  | 8787     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 9.61e-05 |
|    n_updates        | 2171     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 912      |
|    fps              | 2662     |
|    time_elapsed     | 3        |
|    total_timesteps  | 9487     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000247 |
|    n_updates        | 2346     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 916      |
|    fps              | 2662     |
|    time_elapsed     | 3        |
|    total_timesteps  | 9526     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000246 |
|    n_updates        | 2356     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 988      |
|    fps              | 2656     |
|    time_elapsed     | 3        |
|    total_timesteps  | 10216    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0314   |
|    n_updates        | 2528     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 992      |
|    fps              | 2656     |
|    time_elapsed     | 3        |
|    total_timesteps  | 10258    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0287   |
|    n_updates        | 2539     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1064     |
|    fps              | 2635     |
|    time_elapsed     | 4        |
|    total_timesteps  | 10964    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0391   |
|    n_updates        | 2715     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1068     |
|    fps              | 2635     |
|    time_elapsed     | 4        |
|    total_timesteps  | 11005    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0348   |
|    n_updates        | 2726     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1140     |
|    fps              | 2631     |
|    time_elapsed     | 4        |
|    total_timesteps  | 11718    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0259   |
|    n_updates        | 2904     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1144     |
|    fps              | 2630     |
|    time_elapsed     | 4        |
|    total_timesteps  | 11756    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0524   |
|    n_updates        | 2913     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1216     |
|    fps              | 2633     |
|    time_elapsed     | 4        |
|    total_timesteps  | 12443    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0194   |
|    n_updates        | 3085     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1220     |
|    fps              | 2633     |
|    time_elapsed     | 4        |
|    total_timesteps  | 12480    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0148   |
|    n_updates        | 3094     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1292     |
|    fps              | 2633     |
|    time_elapsed     | 5        |
|    total_timesteps  | 13176    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0363   |
|    n_updates        | 3268     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1296     |
|    fps              | 2632     |
|    time_elapsed     | 5        |
|    total_timesteps  | 13217    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0294   |
|    n_updates        | 3279     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1368     |
|    fps              | 2633     |
|    time_elapsed     | 5        |
|    total_timesteps  | 13911    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0406   |
|    n_updates        | 3452     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1372     |
|    fps              | 2633     |
|    time_elapsed     | 5        |
|    total_timesteps  | 13949    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0173   |
|    n_updates        | 3462     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1444     |
|    fps              | 2633     |
|    time_elapsed     | 5        |
|    total_timesteps  | 14697    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0539   |
|    n_updates        | 3649     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1448     |
|    fps              | 2633     |
|    time_elapsed     | 5        |
|    total_timesteps  | 14737    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0385   |
|    n_updates        | 3659     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1520     |
|    fps              | 2635     |
|    time_elapsed     | 5        |
|    total_timesteps  | 15528    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0172   |
|    n_updates        | 3856     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1524     |
|    fps              | 2635     |
|    time_elapsed     | 5        |
|    total_timesteps  | 15574    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00784  |
|    n_updates        | 3868     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1596     |
|    fps              | 2637     |
|    time_elapsed     | 6        |
|    total_timesteps  | 16474    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.015    |
|    n_updates        | 4093     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1600     |
|    fps              | 2637     |
|    time_elapsed     | 6        |
|    total_timesteps  | 16510    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0152   |
|    n_updates        | 4102     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1672     |
|    fps              | 2626     |
|    time_elapsed     | 6        |
|    total_timesteps  | 17273    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0397   |
|    n_updates        | 4293     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1676     |
|    fps              | 2626     |
|    time_elapsed     | 6        |
|    total_timesteps  | 17314    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.02     |
|    n_updates        | 4303     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1748     |
|    fps              | 2628     |
|    time_elapsed     | 6        |
|    total_timesteps  | 18173    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0309   |
|    n_updates        | 4518     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1752     |
|    fps              | 2628     |
|    time_elapsed     | 6        |
|    total_timesteps  | 18242    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0167   |
|    n_updates        | 4535     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1824     |
|    fps              | 2628     |
|    time_elapsed     | 7        |
|    total_timesteps  | 18995    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0255   |
|    n_updates        | 4723     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1828     |
|    fps              | 2628     |
|    time_elapsed     | 7        |
|    total_timesteps  | 19044    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0046   |
|    n_updates        | 4735     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1900     |
|    fps              | 2628     |
|    time_elapsed     | 7        |
|    total_timesteps  | 19890    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0266   |
|    n_updates        | 4947     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1904     |
|    fps              | 2629     |
|    time_elapsed     | 7        |
|    total_timesteps  | 19979    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00861  |
|    n_updates        | 4969     |
----------------------------------


<stable_baselines3.dqn.dqn.DQN at 0x17417f050>