In [1]:
import os
import time

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython import display
from torch.autograd import Variable

from envs import make_env
from model import CNNPolicy
from storage import RolloutStorage
from subproc_vec_env import SubprocVecEnv

%matplotlib inline


In [2]:
# Use the GPU when one is present; fall back to the CPU otherwise so the
# notebook still runs on machines without CUDA.
cuda = torch.cuda.is_available()

# logging args
log_dir = './log'
log_interval = 100  # print training statistics every `log_interval` updates

# env args
env_name = 'BreakoutNoFrameskip-v4'
num_processes = 16  # number of environment worker processes run in parallel
seed = 1

# Apply the seed to PyTorch as well (it is otherwise only handed to make_env),
# so that weight initialisation and action sampling are repeatable.
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)

# number of most recent observations stacked into one network input
num_stack = 4

# RMSprop optimizer args
lr = 7e-4
eps = 1e-5
alpha = 0.99

# training args
num_steps = 5      # environment steps collected per worker before each update
num_frames = 10e6  # total number of environment steps across all workers
num_updates = int(num_frames) // num_steps // num_processes

# a2c algorithm args
gamma = 0.99
# NOTE(review): tau is forwarded to rollouts.compute_returns together with a
# False flag; presumably it is the GAE lambda and unused here — confirm in storage.py.
tau = 0.95
max_grad_norm = 0.5      # gradient-norm clipping threshold
value_loss_coef = 0.5    # weight of the value loss in the total loss
entropy_coef = 0.01      # weight of the entropy bonus in the total loss

In [3]:
# Sanity check: show how many optimisation updates the config above implies.
print(num_updates)

125000


In [4]:
# One make_env(...) call per worker rank, all wrapped in a single
# SubprocVecEnv that is bound to `envs`.
envs = SubprocVecEnv(
    [make_env(env_name, seed, rank, log_dir) for rank in range(num_processes)]
)

[2018-01-19 15:15:54,026] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,032] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,037] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,045] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,053] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,059] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,075] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,080] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,085] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,091] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,100] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,106] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,112] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,118] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,132] Making new env: BreakoutNoFrameskip-v4
[2018-01-19 15:15:54,132]

  File "/home/user2/Pytorch-DDPG-A2C-PPO/A2C/subproc_vec_env.py", line 10, in worker
    cmd, data = remote.recv()
  File "/home/user2/Pytorch-DDPG-A2C-PPO/A2C/subproc_vec_env.py", line 10, in worker
    cmd, data = remote.recv()
  File "/home/user2/Pytorch-DDPG-A2C-PPO/A2C/subproc_vec_env.py", line 10, in worker
    cmd, data = remote.recv()
  File "/home/user2/anaconda3/lib/python3.6/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/home/user2/Pytorch-DDPG-A2C-PPO/A2C/subproc_vec_env.py", line 10, in worker
    cmd, data = remote.recv()
  File "/home/user2/anaconda3/lib/python3.6/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/home/user2/anaconda3/lib/python3.6/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/home/user2/anaconda3/lib/python3.6/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/home/user2/anaconda3/lib/python3.6/mult

In [6]:
# Shape of one stacked network input: the leading dimension of a single
# environment observation is multiplied by num_stack, the rest is kept.
obs_shape = [
    envs.observation_space.shape[0] * num_stack,
    *envs.observation_space.shape[1:],
]

In [7]:
# Display the stacked observation shape computed in the previous cell.
obs_shape

[4, 84, 84]

In [8]:
# Build the actor-critic network; obs_shape[0] is the number of input channels
# after frame stacking.
# NOTE(review): the third positional argument (False) presumably selects the
# non-recurrent policy variant — confirm against model.CNNPolicy.
actor_critic = CNNPolicy(obs_shape[0], envs.action_space, False)

In [9]:
# Display the module structure of the freshly built network.
actor_critic

CNNPolicy(
  (conv1): Conv2d (4, 32, kernel_size=(8, 8), stride=(4, 4))
  (conv2): Conv2d (32, 64, kernel_size=(4, 4), stride=(2, 2))
  (conv3): Conv2d (64, 32, kernel_size=(3, 3), stride=(1, 1))
  (linear1): Linear(in_features=1568, out_features=512)
  (critic_linear): Linear(in_features=512, out_features=1)
  (dist): Categorical(
    (linear): Linear(in_features=512, out_features=4)
  )
)

In [10]:
# The network is moved to the GPU (when enabled) before the RMSprop optimizer
# is created over its parameters.
if cuda:
    actor_critic.cuda()
optimizer = optim.RMSprop(
    actor_critic.parameters(),
    lr=lr,
    alpha=alpha,
    eps=eps,
)

In [11]:
# Storage for one rollout of num_steps transitions from every worker.
rollouts = RolloutStorage(
    num_steps,
    num_processes,
    obs_shape,
    envs.action_space,
    actor_critic.state_size,
)
# Zero-initialised buffer holding the current stacked observation of each worker.
current_obs = torch.zeros([num_processes, *obs_shape])

In [12]:
# Display the shape of the stacked-observation buffer: (num_processes, *obs_shape).
current_obs.shape

torch.Size([16, 4, 84, 84])

In [13]:
# Replace the oldest observation in current_obs with the newest one.
def update_current_obs(obs):
    """Push the newest observation into the module-level ``current_obs`` stack.

    The stack is shifted towards index 0 along dimension 1 by the size of one
    observation (``envs.observation_space.shape[0]``), dropping the oldest
    entry, and ``obs`` is written into the freed trailing slots. The update is
    done in place on ``current_obs``; nothing is returned.

    Args:
        obs: NumPy array as returned by ``envs.reset()`` / ``envs.step()``;
            it is converted to a float tensor before being stored.
    """
    shape_dim0 = envs.observation_space.shape[0]
    obs = torch.from_numpy(obs).float()
    if num_stack > 1:
        # Source and destination are overlapping views of the same tensor, so
        # copy the source slice first; an in-place copy between overlapping
        # regions is not well defined.
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:].clone()
    current_obs[:, -shape_dim0:] = obs

In [14]:
# Reset every environment and load the first observations into current_obs.
obs = envs.reset()
update_current_obs(obs)

In [15]:
# Seed the first rollout slot with the initial stacked observations.
rollouts.observations[0].copy_(current_obs)
# Per-worker trackers, one row per process: `final_rewards` is filled from
# `episode_rewards` in the training loop whenever an episode ends.
final_rewards = torch.zeros(num_processes, 1)
episode_rewards = torch.zeros(num_processes, 1)

In [16]:
# When running on the GPU, move the rollout storage there and rebind
# current_obs to its CUDA copy (update_current_obs reads the global at call
# time, so it will write into this new tensor from now on).
if cuda:
    rollouts.cuda()
    current_obs = current_obs.cuda()

In [17]:
# Width of one stored action: 1 for a Discrete action space, otherwise the
# first dimension of the action space's shape.
action_shape = (
    1
    if envs.action_space.__class__.__name__ == "Discrete"
    else envs.action_space.shape[0]
)

In [18]:
# Inspect the first rollout slot: print the shapes, then the tensor types,
# of the stored observation, recurrent state and mask.
obs_test, state_test, mask_test = (
    rollouts.observations[0],
    rollouts.states[0],
    rollouts.masks[0],
)
print(*(t.shape for t in (obs_test, state_test, mask_test)))
print(*(type(t) for t in (obs_test, state_test, mask_test)))

torch.Size([16, 4, 84, 84]) torch.Size([16, 1]) torch.Size([16, 1])
<class 'torch.cuda.FloatTensor'> <class 'torch.cuda.FloatTensor'> <class 'torch.cuda.FloatTensor'>


In [19]:
# Single forward pass through actor_critic.act on the first rollout slot;
# every input is wrapped in a volatile (inference-only) Variable.
value, action, action_log_prob, states = actor_critic.act(
    *(
        Variable(tensor, volatile=True)
        for tensor in (rollouts.observations[0], rollouts.states[0], rollouts.masks[0])
    )
)

  probs = F.softmax(x)
  log_probs = F.log_softmax(x)
  probs = F.softmax(x)


In [None]:
# A2C training loop: for every update, collect num_steps transitions from each
# worker, compute returns, then take one gradient step on the combined loss.
start = time.time()
for j in range(num_updates):
    for step in range(num_steps):
        # Sample actions (volatile=True: inference only, no autograd graph is kept)
        value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                              Variable(rollouts.states[step], volatile=True),
                                                              Variable(rollouts.masks[step], volatile=True))
        # Drop only dimension 1 (when it is a singleton). A bare squeeze()
        # would also collapse the batch axis when num_processes == 1, handing
        # envs.step() an array without one entry per worker.
        cpu_actions = action.data.squeeze(1).cpu().numpy()

        # Observe reward and next obs
        obs, reward, done, info = envs.step(cpu_actions)
        # stack: make sure that reward is a numpy array (convert list to ndarray),
        # then add a trailing axis so it lines up with the (num_processes, 1) trackers
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
        episode_rewards += reward

        # masks is 0.0 for workers whose episode just ended and 1.0 otherwise.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
        # For finished episodes, move the accumulated episode reward into
        # final_rewards and restart the running sum; other rows are untouched.
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
        episode_rewards *= masks
        if cuda:
            masks = masks.cuda()

        # If done then clean the history of observations.
        if current_obs.dim() == 4:
            current_obs *= masks.unsqueeze(2).unsqueeze(2)
        else:
            current_obs *= masks

        # update obs and rollouts
        update_current_obs(obs)
        rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

    # Value estimate for the last stored observation, used to bootstrap the returns
    next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                              Variable(rollouts.states[-1], volatile=True),
                              Variable(rollouts.masks[-1], volatile=True))[0].data

    # NOTE(review): the False flag presumably disables GAE (making tau unused) — confirm in storage.py
    rollouts.compute_returns(next_value, False, gamma, tau)

    # In A2C the values are computed a second time, now with gradients enabled.
    # The data in rollouts must be viewed, because the shape in rollouts is
    # [num_steps, num_processes, x] while actor_critic expects [num, x].
    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                   Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                   Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                   Variable(rollouts.actions.view(-1, action_shape)))

    # compute the loss
    values = values.view(num_steps, num_processes, 1)
    action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

    advantages = Variable(rollouts.returns[:-1]) - values
    value_loss = advantages.pow(2).mean()

    # Re-wrapping advantages.data detaches it, so the policy term does not
    # back-propagate into the value estimates.
    action_loss = -(Variable(advantages.data) * action_log_probs).mean()

    # update model
    optimizer.zero_grad()
    loss = value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef
    loss.backward()
    nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
    optimizer.step()

    rollouts.after_update()
    if j % log_interval == 0:
        end = time.time()
        total_num_steps = (j + 1) * num_processes * num_steps
        print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
        format(j, total_num_steps,
                int(total_num_steps / (end - start)),
                final_rewards.mean(),
                final_rewards.median(),
                final_rewards.min(),
                final_rewards.max(), dist_entropy.data[0],
                value_loss.data[0], action_loss.data[0]))

  probs = F.softmax(x)
  log_probs = F.log_softmax(x)
  probs = F.softmax(x)


Updates 0, num timesteps 80, FPS 757, mean/median reward 0.0/0.0, min/max reward 0.0/0.0, entropy 1.37727, value loss 0.00249, policy loss 0.04527
Updates 100, num timesteps 8080, FPS 1475, mean/median reward 0.2/0.0, min/max reward 0.0/3.0, entropy 1.38524, value loss 0.00074, policy loss -0.01315
Updates 200, num timesteps 16080, FPS 1429, mean/median reward 0.2/0.0, min/max reward 0.0/2.0, entropy 1.38576, value loss 0.02414, policy loss 0.02908
Updates 300, num timesteps 24080, FPS 1404, mean/median reward 0.2/0.0, min/max reward 0.0/1.0, entropy 1.38293, value loss 0.04014, policy loss 0.05686
Updates 400, num timesteps 32080, FPS 1394, mean/median reward 0.1/0.0, min/max reward 0.0/1.0, entropy 1.37919, value loss 0.07455, policy loss 0.13182
Updates 500, num timesteps 40080, FPS 1379, mean/median reward 0.4/0.0, min/max reward 0.0/2.0, entropy 1.37522, value loss 0.01414, policy loss -0.06204
Updates 600, num timesteps 48080, FPS 1387, mean/median reward 0.4/0.0, min/max reward 

Updates 5300, num timesteps 424080, FPS 1584, mean/median reward 1.6/1.0, min/max reward 0.0/5.0, entropy 1.12668, value loss 0.02410, policy loss -0.05357
Updates 5400, num timesteps 432080, FPS 1589, mean/median reward 2.1/2.0, min/max reward 0.0/6.0, entropy 0.99991, value loss 0.04498, policy loss -0.01688
Updates 5500, num timesteps 440080, FPS 1597, mean/median reward 1.6/1.0, min/max reward 0.0/4.0, entropy 1.21710, value loss 0.03181, policy loss -0.04328
Updates 5600, num timesteps 448080, FPS 1600, mean/median reward 1.8/1.0, min/max reward 0.0/5.0, entropy 1.17989, value loss 0.07478, policy loss -0.12069
Updates 5700, num timesteps 456080, FPS 1606, mean/median reward 2.5/2.0, min/max reward 1.0/5.0, entropy 1.17957, value loss 0.07967, policy loss -0.13259
Updates 5800, num timesteps 464080, FPS 1612, mean/median reward 2.8/2.0, min/max reward 0.0/7.0, entropy 1.10268, value loss 0.02526, policy loss -0.00654
Updates 5900, num timesteps 472080, FPS 1619, mean/median reward

Updates 10600, num timesteps 848080, FPS 1789, mean/median reward 5.2/4.0, min/max reward 1.0/13.0, entropy 0.96034, value loss 0.09102, policy loss 0.01129
Updates 10700, num timesteps 856080, FPS 1792, mean/median reward 4.1/4.0, min/max reward 0.0/10.0, entropy 1.01030, value loss 0.15047, policy loss -0.05358
Updates 10800, num timesteps 864080, FPS 1794, mean/median reward 6.5/4.0, min/max reward 1.0/16.0, entropy 0.92909, value loss 0.02393, policy loss 0.02686
Updates 10900, num timesteps 872080, FPS 1797, mean/median reward 4.5/4.0, min/max reward 1.0/9.0, entropy 0.95681, value loss 0.03261, policy loss -0.03601
Updates 11000, num timesteps 880080, FPS 1800, mean/median reward 5.6/5.0, min/max reward 1.0/15.0, entropy 1.04082, value loss 0.05250, policy loss -0.00796
Updates 11100, num timesteps 888080, FPS 1803, mean/median reward 4.8/3.0, min/max reward 1.0/12.0, entropy 0.96239, value loss 0.04218, policy loss 0.00977
Updates 11200, num timesteps 896080, FPS 1806, mean/medi

Updates 15800, num timesteps 1264080, FPS 1879, mean/median reward 5.9/4.0, min/max reward 0.0/20.0, entropy 0.92801, value loss 0.04350, policy loss -0.01925
Updates 15900, num timesteps 1272080, FPS 1880, mean/median reward 7.7/6.0, min/max reward 1.0/33.0, entropy 1.03699, value loss 0.03446, policy loss 0.02422
Updates 16000, num timesteps 1280080, FPS 1882, mean/median reward 7.9/5.0, min/max reward 2.0/30.0, entropy 1.05538, value loss 0.03893, policy loss -0.09876
Updates 16100, num timesteps 1288080, FPS 1884, mean/median reward 9.4/5.0, min/max reward 1.0/28.0, entropy 0.92339, value loss 0.04171, policy loss 0.00754
Updates 16200, num timesteps 1296080, FPS 1887, mean/median reward 5.1/2.0, min/max reward 0.0/15.0, entropy 1.02342, value loss 0.21763, policy loss -0.07242
Updates 16300, num timesteps 1304080, FPS 1888, mean/median reward 7.4/5.0, min/max reward 0.0/18.0, entropy 1.00406, value loss 0.01467, policy loss 0.03208
Updates 16400, num timesteps 1312080, FPS 1890, m

Updates 21000, num timesteps 1680080, FPS 1938, mean/median reward 7.0/5.0, min/max reward 2.0/15.0, entropy 1.12355, value loss 0.01592, policy loss 0.02095
Updates 21100, num timesteps 1688080, FPS 1939, mean/median reward 10.1/6.0, min/max reward 1.0/43.0, entropy 1.01558, value loss 0.02040, policy loss 0.02104
Updates 21200, num timesteps 1696080, FPS 1939, mean/median reward 11.2/7.0, min/max reward 2.0/41.0, entropy 0.93226, value loss 0.52417, policy loss -0.10814
Updates 21300, num timesteps 1704080, FPS 1941, mean/median reward 10.9/7.0, min/max reward 1.0/34.0, entropy 0.99533, value loss 0.01923, policy loss 0.01273
Updates 21400, num timesteps 1712080, FPS 1941, mean/median reward 6.4/4.0, min/max reward 1.0/21.0, entropy 0.97012, value loss 0.04476, policy loss -0.06146
Updates 21500, num timesteps 1720080, FPS 1943, mean/median reward 11.4/8.0, min/max reward 1.0/38.0, entropy 0.94025, value loss 0.32960, policy loss -0.10961
Updates 21600, num timesteps 1728080, FPS 194

Updates 26200, num timesteps 2096080, FPS 1976, mean/median reward 8.5/6.0, min/max reward 0.0/31.0, entropy 1.14230, value loss 0.13724, policy loss -0.14946
Updates 26300, num timesteps 2104080, FPS 1977, mean/median reward 3.8/4.0, min/max reward 0.0/10.0, entropy 1.02393, value loss 0.02663, policy loss -0.00220
Updates 26400, num timesteps 2112080, FPS 1977, mean/median reward 12.1/6.0, min/max reward 0.0/52.0, entropy 0.98174, value loss 0.08838, policy loss -0.06165
Updates 26500, num timesteps 2120080, FPS 1978, mean/median reward 12.8/7.0, min/max reward 0.0/44.0, entropy 0.98822, value loss 0.01652, policy loss -0.03101
Updates 26600, num timesteps 2128080, FPS 1978, mean/median reward 8.6/6.0, min/max reward 1.0/19.0, entropy 1.06595, value loss 1.49266, policy loss 0.32022
Updates 26700, num timesteps 2136080, FPS 1979, mean/median reward 6.6/4.0, min/max reward 0.0/19.0, entropy 0.92445, value loss 0.09814, policy loss 0.00376
Updates 26800, num timesteps 2144080, FPS 1979

Updates 31400, num timesteps 2512080, FPS 2004, mean/median reward 8.1/5.0, min/max reward 0.0/25.0, entropy 1.03937, value loss 0.02516, policy loss -0.00121
Updates 31500, num timesteps 2520080, FPS 2004, mean/median reward 12.3/8.0, min/max reward 0.0/49.0, entropy 1.07477, value loss 1.39824, policy loss -0.00872
Updates 31600, num timesteps 2528080, FPS 2005, mean/median reward 9.4/6.0, min/max reward 0.0/25.0, entropy 1.02698, value loss 0.08703, policy loss 0.01544
Updates 31700, num timesteps 2536080, FPS 2005, mean/median reward 18.1/7.0, min/max reward 0.0/80.0, entropy 0.98136, value loss 0.03342, policy loss -0.00653
Updates 31800, num timesteps 2544080, FPS 2005, mean/median reward 12.9/10.0, min/max reward 1.0/35.0, entropy 1.01920, value loss 0.01965, policy loss -0.00538
Updates 31900, num timesteps 2552080, FPS 2006, mean/median reward 9.9/10.0, min/max reward 0.0/29.0, entropy 0.96252, value loss 0.32404, policy loss -0.22770
Updates 32000, num timesteps 2560080, FPS 

Updates 36600, num timesteps 2928080, FPS 2030, mean/median reward 19.3/13.0, min/max reward 0.0/57.0, entropy 0.77298, value loss 0.44713, policy loss -0.16852
Updates 36700, num timesteps 2936080, FPS 2030, mean/median reward 13.1/7.0, min/max reward 1.0/83.0, entropy 0.84360, value loss 0.08801, policy loss -0.02699
Updates 36800, num timesteps 2944080, FPS 2031, mean/median reward 7.8/3.0, min/max reward 0.0/34.0, entropy 1.14514, value loss 0.02668, policy loss 0.02796
Updates 36900, num timesteps 2952080, FPS 2031, mean/median reward 9.6/6.0, min/max reward 0.0/32.0, entropy 0.96848, value loss 0.11516, policy loss 0.07589
Updates 37000, num timesteps 2960080, FPS 2032, mean/median reward 16.7/9.0, min/max reward 0.0/56.0, entropy 1.00625, value loss 0.05729, policy loss 0.05806
Updates 37100, num timesteps 2968080, FPS 2033, mean/median reward 7.9/4.0, min/max reward 0.0/22.0, entropy 0.92757, value loss 0.13532, policy loss -0.06789
Updates 37200, num timesteps 2976080, FPS 203

Updates 41800, num timesteps 3344080, FPS 2053, mean/median reward 12.2/4.0, min/max reward 0.0/49.0, entropy 0.91097, value loss 0.02017, policy loss 0.01262
Updates 41900, num timesteps 3352080, FPS 2053, mean/median reward 18.4/17.0, min/max reward 1.0/46.0, entropy 0.80788, value loss 0.84357, policy loss 0.00334
Updates 42000, num timesteps 3360080, FPS 2054, mean/median reward 14.3/16.0, min/max reward 0.0/28.0, entropy 0.95520, value loss 0.58406, policy loss -0.18529
Updates 42100, num timesteps 3368080, FPS 2054, mean/median reward 14.4/8.0, min/max reward 0.0/52.0, entropy 1.03195, value loss 0.06186, policy loss 0.00824
Updates 42200, num timesteps 3376080, FPS 2054, mean/median reward 11.4/6.0, min/max reward 2.0/52.0, entropy 0.94373, value loss 0.37547, policy loss -0.06156
Updates 42300, num timesteps 3384080, FPS 2055, mean/median reward 15.5/8.0, min/max reward 4.0/46.0, entropy 0.86267, value loss 0.11401, policy loss 0.09627
Updates 42400, num timesteps 3392080, FPS 

Updates 58300, num timesteps 4664080, FPS 2103, mean/median reward 19.6/16.0, min/max reward 0.0/63.0, entropy 0.62027, value loss 0.12501, policy loss -0.00450
Updates 58400, num timesteps 4672080, FPS 2103, mean/median reward 16.9/5.0, min/max reward 0.0/85.0, entropy 1.05140, value loss 0.04899, policy loss 0.03593
Updates 58500, num timesteps 4680080, FPS 2103, mean/median reward 9.7/5.0, min/max reward 0.0/33.0, entropy 1.09451, value loss 0.03309, policy loss -0.05202
Updates 58600, num timesteps 4688080, FPS 2103, mean/median reward 17.6/12.0, min/max reward 0.0/73.0, entropy 0.94924, value loss 0.54471, policy loss -0.18611
Updates 58700, num timesteps 4696080, FPS 2103, mean/median reward 14.9/12.0, min/max reward 2.0/49.0, entropy 0.95293, value loss 0.05017, policy loss -0.01130
Updates 58800, num timesteps 4704080, FPS 2103, mean/median reward 10.6/6.0, min/max reward 1.0/32.0, entropy 1.01862, value loss 0.07465, policy loss -0.01769
Updates 58900, num timesteps 4712080, F

Updates 63500, num timesteps 5080080, FPS 2108, mean/median reward 11.4/7.0, min/max reward 0.0/50.0, entropy 0.93191, value loss 0.11934, policy loss 0.15172
Updates 63600, num timesteps 5088080, FPS 2108, mean/median reward 17.3/8.0, min/max reward 0.0/86.0, entropy 0.98782, value loss 0.24210, policy loss 0.06463
Updates 63700, num timesteps 5096080, FPS 2108, mean/median reward 9.9/6.0, min/max reward 0.0/37.0, entropy 0.94741, value loss 0.36724, policy loss -0.09425
Updates 63800, num timesteps 5104080, FPS 2108, mean/median reward 8.0/4.0, min/max reward 0.0/37.0, entropy 0.96391, value loss 0.03663, policy loss -0.03640
Updates 63900, num timesteps 5112080, FPS 2109, mean/median reward 17.3/8.0, min/max reward 0.0/67.0, entropy 0.83075, value loss 0.08821, policy loss -0.11966
Updates 64000, num timesteps 5120080, FPS 2109, mean/median reward 14.3/7.0, min/max reward 0.0/36.0, entropy 0.95977, value loss 0.24921, policy loss 0.04732
Updates 64100, num timesteps 5128080, FPS 210

Updates 68700, num timesteps 5496080, FPS 2116, mean/median reward 20.6/9.0, min/max reward 0.0/104.0, entropy 0.94558, value loss 0.07054, policy loss -0.01645
Updates 68800, num timesteps 5504080, FPS 2116, mean/median reward 21.2/9.0, min/max reward 0.0/104.0, entropy 0.99871, value loss 0.11643, policy loss 0.09169
Updates 68900, num timesteps 5512080, FPS 2117, mean/median reward 17.6/7.0, min/max reward 1.0/96.0, entropy 0.92998, value loss 0.01638, policy loss -0.02717
Updates 69000, num timesteps 5520080, FPS 2117, mean/median reward 18.9/7.0, min/max reward 0.0/82.0, entropy 0.90898, value loss 0.04563, policy loss 0.02175
Updates 69100, num timesteps 5528080, FPS 2117, mean/median reward 6.6/3.0, min/max reward 0.0/31.0, entropy 1.00769, value loss 0.02002, policy loss -0.01730
Updates 69200, num timesteps 5536080, FPS 2118, mean/median reward 9.5/4.0, min/max reward 0.0/34.0, entropy 1.01901, value loss 0.01332, policy loss -0.00461
Updates 69300, num timesteps 5544080, FPS 

Updates 73900, num timesteps 5912080, FPS 2126, mean/median reward 10.1/0.0, min/max reward 0.0/70.0, entropy 0.85985, value loss 0.32245, policy loss -0.11392
Updates 74000, num timesteps 5920080, FPS 2127, mean/median reward 9.4/2.0, min/max reward 0.0/39.0, entropy 0.86024, value loss 0.07816, policy loss -0.02951
Updates 74100, num timesteps 5928080, FPS 2127, mean/median reward 25.4/11.0, min/max reward 2.0/59.0, entropy 0.75124, value loss 0.06785, policy loss 0.04836
Updates 74200, num timesteps 5936080, FPS 2127, mean/median reward 21.3/10.0, min/max reward 0.0/86.0, entropy 0.91529, value loss 0.10247, policy loss -0.10198
Updates 74300, num timesteps 5944080, FPS 2127, mean/median reward 9.4/5.0, min/max reward 0.0/50.0, entropy 0.95950, value loss 0.02334, policy loss 0.04261
Updates 74400, num timesteps 5952080, FPS 2127, mean/median reward 13.1/6.0, min/max reward 1.0/51.0, entropy 1.06353, value loss 0.38487, policy loss -0.02611
Updates 74500, num timesteps 5960080, FPS 

Updates 79100, num timesteps 6328080, FPS 2135, mean/median reward 9.1/5.0, min/max reward 0.0/36.0, entropy 0.97228, value loss 0.10460, policy loss -0.02618
Updates 79200, num timesteps 6336080, FPS 2135, mean/median reward 7.5/4.0, min/max reward 0.0/49.0, entropy 0.92150, value loss 0.04655, policy loss -0.01472
Updates 79300, num timesteps 6344080, FPS 2135, mean/median reward 13.0/8.0, min/max reward 0.0/48.0, entropy 1.00232, value loss 0.04810, policy loss 0.04901
Updates 79400, num timesteps 6352080, FPS 2136, mean/median reward 15.3/6.0, min/max reward 0.0/80.0, entropy 0.77696, value loss 0.14421, policy loss 0.04545
Updates 79500, num timesteps 6360080, FPS 2136, mean/median reward 12.2/5.0, min/max reward 0.0/48.0, entropy 0.96945, value loss 0.37632, policy loss -0.14876
Updates 79600, num timesteps 6368080, FPS 2137, mean/median reward 22.1/11.0, min/max reward 0.0/84.0, entropy 0.95012, value loss 0.05050, policy loss -0.00828
Updates 79700, num timesteps 6376080, FPS 2

Updates 84300, num timesteps 6744080, FPS 2145, mean/median reward 10.8/5.0, min/max reward 1.0/31.0, entropy 0.99767, value loss 0.02110, policy loss -0.00078
Updates 84400, num timesteps 6752080, FPS 2145, mean/median reward 16.1/10.0, min/max reward 0.0/88.0, entropy 1.13357, value loss 0.68163, policy loss 0.10927
Updates 84500, num timesteps 6760080, FPS 2145, mean/median reward 13.8/8.0, min/max reward 1.0/41.0, entropy 0.91458, value loss 0.63538, policy loss -0.13415
Updates 84600, num timesteps 6768080, FPS 2145, mean/median reward 10.8/5.0, min/max reward 0.0/57.0, entropy 0.97327, value loss 0.06199, policy loss -0.02055
Updates 84700, num timesteps 6776080, FPS 2145, mean/median reward 20.0/10.0, min/max reward 0.0/73.0, entropy 0.99847, value loss 0.57645, policy loss -0.13259
Updates 84800, num timesteps 6784080, FPS 2145, mean/median reward 14.0/5.0, min/max reward 0.0/61.0, entropy 0.98606, value loss 0.01910, policy loss 0.01117
Updates 84900, num timesteps 6792080, FP

Updates 89500, num timesteps 7160080, FPS 2153, mean/median reward 17.7/9.0, min/max reward 1.0/84.0, entropy 0.93885, value loss 0.01937, policy loss -0.00597
Updates 89600, num timesteps 7168080, FPS 2153, mean/median reward 12.8/4.0, min/max reward 1.0/50.0, entropy 1.01167, value loss 0.14731, policy loss -0.09707
Updates 89700, num timesteps 7176080, FPS 2153, mean/median reward 9.6/6.0, min/max reward 1.0/27.0, entropy 0.99237, value loss 0.39454, policy loss -0.09870
Updates 89800, num timesteps 7184080, FPS 2154, mean/median reward 9.4/4.0, min/max reward 0.0/31.0, entropy 0.88887, value loss 0.05484, policy loss -0.00820
Updates 89900, num timesteps 7192080, FPS 2154, mean/median reward 9.3/2.0, min/max reward 0.0/39.0, entropy 1.07734, value loss 0.02553, policy loss 0.06026
Updates 90000, num timesteps 7200080, FPS 2154, mean/median reward 8.2/2.0, min/max reward 0.0/38.0, entropy 1.12722, value loss 0.04584, policy loss -0.05607
Updates 90100, num timesteps 7208080, FPS 215

Updates 94700, num timesteps 7576080, FPS 2160, mean/median reward 23.4/15.0, min/max reward 0.0/60.0, entropy 1.02873, value loss 0.51163, policy loss -0.23993
Updates 94800, num timesteps 7584080, FPS 2160, mean/median reward 12.1/6.0, min/max reward 0.0/60.0, entropy 0.82523, value loss 0.17770, policy loss -0.10018
Updates 94900, num timesteps 7592080, FPS 2160, mean/median reward 21.2/9.0, min/max reward 0.0/73.0, entropy 0.93764, value loss 0.07765, policy loss -0.00257
Updates 95000, num timesteps 7600080, FPS 2160, mean/median reward 17.2/15.0, min/max reward 1.0/64.0, entropy 1.06789, value loss 0.03623, policy loss 0.00706
Updates 95100, num timesteps 7608080, FPS 2160, mean/median reward 3.7/0.0, min/max reward 0.0/24.0, entropy 1.18036, value loss 0.04843, policy loss -0.06202
Updates 95200, num timesteps 7616080, FPS 2160, mean/median reward 13.9/4.0, min/max reward 0.0/95.0, entropy 1.00080, value loss 0.06347, policy loss 0.02365
Updates 95300, num timesteps 7624080, FPS

Updates 99900, num timesteps 7992080, FPS 2166, mean/median reward 12.8/5.0, min/max reward 1.0/63.0, entropy 0.79771, value loss 0.11179, policy loss -0.07604
Updates 100000, num timesteps 8000080, FPS 2166, mean/median reward 22.2/13.0, min/max reward 0.0/79.0, entropy 1.11580, value loss 0.03935, policy loss -0.01893
Updates 100100, num timesteps 8008080, FPS 2166, mean/median reward 18.4/6.0, min/max reward 0.0/79.0, entropy 0.97517, value loss 0.05914, policy loss 0.00548
Updates 100200, num timesteps 8016080, FPS 2167, mean/median reward 12.6/9.0, min/max reward 0.0/71.0, entropy 0.92168, value loss 0.09048, policy loss -0.01969
Updates 100300, num timesteps 8024080, FPS 2167, mean/median reward 12.3/4.0, min/max reward 0.0/78.0, entropy 0.86774, value loss 0.02876, policy loss 0.03019
Updates 100400, num timesteps 8032080, FPS 2167, mean/median reward 19.3/4.0, min/max reward 0.0/85.0, entropy 0.77724, value loss 0.04856, policy loss -0.08574
Updates 100500, num timesteps 804008

Updates 105100, num timesteps 8408080, FPS 2172, mean/median reward 18.8/5.0, min/max reward 1.0/105.0, entropy 1.17295, value loss 0.01666, policy loss -0.01576
Updates 105200, num timesteps 8416080, FPS 2172, mean/median reward 18.0/5.0, min/max reward 0.0/83.0, entropy 1.05225, value loss 0.01119, policy loss -0.01656
Updates 105300, num timesteps 8424080, FPS 2172, mean/median reward 12.9/5.0, min/max reward 0.0/83.0, entropy 0.72540, value loss 0.06074, policy loss 0.00203
Updates 105400, num timesteps 8432080, FPS 2172, mean/median reward 20.7/7.0, min/max reward 1.0/90.0, entropy 0.96005, value loss 0.13289, policy loss 0.03571
Updates 105500, num timesteps 8440080, FPS 2172, mean/median reward 14.6/6.0, min/max reward 1.0/71.0, entropy 0.84598, value loss 0.05629, policy loss 0.00811
Updates 105600, num timesteps 8448080, FPS 2172, mean/median reward 15.3/8.0, min/max reward 2.0/72.0, entropy 1.04811, value loss 0.30583, policy loss -0.27226
Updates 105700, num timesteps 845608

Updates 110300, num timesteps 8824080, FPS 2177, mean/median reward 17.3/11.0, min/max reward 0.0/76.0, entropy 0.97768, value loss 0.03476, policy loss -0.10614
Updates 110400, num timesteps 8832080, FPS 2176, mean/median reward 11.2/11.0, min/max reward 0.0/45.0, entropy 0.89012, value loss 0.24022, policy loss -0.00213
Updates 110500, num timesteps 8840080, FPS 2176, mean/median reward 14.7/5.0, min/max reward 0.0/81.0, entropy 0.98793, value loss 0.04636, policy loss 0.05622
Updates 110600, num timesteps 8848080, FPS 2177, mean/median reward 20.6/5.0, min/max reward 0.0/103.0, entropy 0.85732, value loss 0.05464, policy loss 0.02509
Updates 110700, num timesteps 8856080, FPS 2177, mean/median reward 19.2/4.0, min/max reward 0.0/103.0, entropy 0.91394, value loss 0.14479, policy loss 0.08934
Updates 110800, num timesteps 8864080, FPS 2177, mean/median reward 10.1/3.0, min/max reward 0.0/56.0, entropy 0.88932, value loss 0.01947, policy loss -0.03315
Updates 110900, num timesteps 887

Updates 115400, num timesteps 9232080, FPS 2180, mean/median reward 25.8/5.0, min/max reward 0.0/99.0, entropy 0.99491, value loss 0.02802, policy loss 0.03407
Updates 115500, num timesteps 9240080, FPS 2180, mean/median reward 16.3/2.0, min/max reward 0.0/90.0, entropy 0.90541, value loss 0.08655, policy loss -0.00077
Updates 115600, num timesteps 9248080, FPS 2180, mean/median reward 11.0/2.0, min/max reward 0.0/42.0, entropy 0.96749, value loss 0.04544, policy loss 0.00986
Updates 115700, num timesteps 9256080, FPS 2180, mean/median reward 18.9/10.0, min/max reward 0.0/50.0, entropy 0.79905, value loss 0.67114, policy loss -0.15880
Updates 115800, num timesteps 9264080, FPS 2181, mean/median reward 24.8/14.0, min/max reward 0.0/100.0, entropy 1.15324, value loss 0.02172, policy loss -0.06898
Updates 115900, num timesteps 9272080, FPS 2181, mean/median reward 10.5/3.0, min/max reward 0.0/37.0, entropy 0.96830, value loss 0.03060, policy loss 0.02087
Updates 116000, num timesteps 9280

Updates 120500, num timesteps 9640080, FPS 2185, mean/median reward 15.6/7.0, min/max reward 0.0/75.0, entropy 1.16816, value loss 0.01599, policy loss 0.03873
Updates 120600, num timesteps 9648080, FPS 2185, mean/median reward 8.1/5.0, min/max reward 0.0/29.0, entropy 0.91804, value loss 0.76804, policy loss 0.04362
Updates 120700, num timesteps 9656080, FPS 2185, mean/median reward 13.5/5.0, min/max reward 0.0/56.0, entropy 1.00234, value loss 0.01911, policy loss -0.03609
Updates 120800, num timesteps 9664080, FPS 2185, mean/median reward 12.5/5.0, min/max reward 0.0/78.0, entropy 0.93060, value loss 0.03610, policy loss 0.03270
Updates 120900, num timesteps 9672080, FPS 2185, mean/median reward 11.2/4.0, min/max reward 0.0/45.0, entropy 0.96137, value loss 0.08656, policy loss 0.03911
Updates 121000, num timesteps 9680080, FPS 2185, mean/median reward 19.4/4.0, min/max reward 0.0/86.0, entropy 0.99571, value loss 0.02908, policy loss -0.05409
Updates 121100, num timesteps 9688080, 

In [None]:
# Persist the trained policy. Create the target directory first so that a
# missing folder cannot make the save fail after the whole training run.
# NOTE(review): this pickles the entire module object (not just a state_dict),
# so loading it requires the same model class to be importable.
model_dir = '../../models/'
os.makedirs(model_dir, exist_ok=True)
torch.save(actor_critic, model_dir + env_name + '.pt')

In [None]:
# NOTE(review): appears to be a leftover scratch cell with no effect on the
# experiment — consider deleting it.
print(2)