In [1]:
import torch
from torch import nn
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from collections import deque
import random
import gym

In [2]:
# Notebook-wide TensorBoard writer; 'TD3' is passed as the log directory.
scalar_writer = SummaryWriter(log_dir='TD3')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
class Critic(nn.Module):
    """Q-network mapping an (observation, action) pair to one scalar value.

    Architecture: concat(obs, action) -> 256 -> ReLU -> 256 -> ReLU -> 1.
    """

    def __init__(self, obs_dim, action_dim):
        super().__init__()
        hidden_units = 256
        # Layers are created in the order l1, l2, l3 so that parameter
        # initialisation consumes the RNG in the same order as before.
        self.l1 = nn.Linear(obs_dim + action_dim, hidden_units)
        self.l2 = nn.Linear(hidden_units, hidden_units)
        self.l3 = nn.Linear(hidden_units, 1)

    def forward(self, obs, action):
        """Return Q(obs, action); inputs are concatenated along dim 1."""
        hidden = torch.cat((obs, action), dim=1)
        for layer in (self.l1, self.l2):
            hidden = torch.relu(layer(hidden))
        return self.l3(hidden)

In [4]:
class Actor(nn.Module):
    """Deterministic policy network.

    The action bounds are treated as symmetric, so the tanh output is simply
    scaled by `max_action` and can be sent to the environment as-is.
    """

    def __init__(self, obs_dim, action_dim, max_action):
        super().__init__()
        hidden_units = 256

        # Creation order l1, l2, l3 is kept so parameter init is unchanged.
        self.l1 = nn.Linear(obs_dim, hidden_units)
        self.l2 = nn.Linear(hidden_units, hidden_units)
        self.l3 = nn.Linear(hidden_units, action_dim)

        self.max_action = max_action

    def forward(self, obs):
        """Return actions in [-max_action, max_action] for a batch of observations."""
        hidden = obs
        for layer in (self.l1, self.l2):
            hidden = torch.relu(layer(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed

In [5]:
class BasicBuffer:
    """Fixed-capacity FIFO replay memory backed by a bounded deque."""

    def __init__(self, max_size):
        self.max_size = max_size
        # Once `max_size` items are stored, appending drops the oldest one.
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        """Store one transition; the reward is wrapped into a length-1 array."""
        self.buffer.append((state, action, np.array([reward]), next_state, done))

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of lists: (states, actions, rewards, next_states, dones),
            aligned by position.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of 5-field transitions into five per-field lists.
        return tuple([transition[field] for transition in picked] for field in range(5))

In [6]:
def update_net(model, target_model, tau=1.):
    """Blend `model`'s parameters into `target_model` in place.

    Each target parameter becomes `tau * source + (1 - tau) * target`.
    With the default `tau=1.` this is a plain copy of the source weights.
    """
    keep = 1.0 - tau
    pairs = zip(target_model.parameters(), model.parameters())
    for target_p, online_p in pairs:
        mixed = online_p.data * tau + target_p.data * keep
        target_p.data.copy_(mixed)

In [7]:
class TD3:
    """Twin Delayed DDPG (TD3) agent.

    Holds one actor, two critics, a target copy of each, their Adam
    optimizers and a replay buffer. `update` performs one training step:
    both critics are regressed onto a clipped double-Q target, and once every
    `delay_step` calls the actor and all three target networks are updated.
    """

    def __init__(self, env, gamma, tau, buffer_maxlen, delay_step, noise_std, noise_bound, critic_lr, actor_lr):
        """Build networks, optimizers and the replay buffer for `env`.

        Args:
            env: environment exposing `observation_space.shape`,
                `action_space.shape` and `action_space.high`.
            gamma: discount factor used in the critic target.
            tau: interpolation factor of the soft target-network update.
            buffer_maxlen: capacity of the replay buffer.
            delay_step: the actor and the targets are updated once every
                `delay_step` calls to `update`.
            noise_std: std of the Gaussian noise added to target actions.
            noise_bound: absolute clip applied to that noise.
            critic_lr: Adam learning rate for both critics.
            actor_lr: Adam learning rate for the actor.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        # Only the first component of the upper bound is read; actions are
        # clamped to [-max_action, max_action] in every dimension below.
        self.max_action = float(env.action_space.high[0])

        self.gamma = gamma
        self.tau = tau
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.update_step = 0
        self.delay_step = delay_step

        # The six networks: actor, two critics, and one target copy of each.
        self.actor = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)

        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic1_target = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        # tau=1 copies the online weights into the targets.
        update_net(self.actor, self.actor_target, tau=1.)
        update_net(self.critic1, self.critic1_target, tau=1.)
        update_net(self.critic2, self.critic2_target, tau=1.)

        # One optimizer per trainable network.
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic1_optimizer = torch.optim.Adam(self.critic1.parameters(), lr=critic_lr)
        self.critic2_optimizer = torch.optim.Adam(self.critic2.parameters(), lr=critic_lr)

        # Critic regression loss.
        self.loss_fn = torch.nn.MSELoss()

        # Replay memory.
        self.replay_buffer = BasicBuffer(buffer_maxlen)

        # Latest scalar values ('critic_loss', 'actor_loss') for logging.
        self.summaries = {}

    def _to_tensor(self, values):
        """Convert `values` (array or list of arrays/scalars) to a float32 tensor on `self.device`.

        The data is stacked with numpy first: building a tensor directly from
        a Python list of ndarrays converts element by element and is slow.
        """
        return torch.as_tensor(np.asarray(values, dtype=np.float32), device=self.device)

    def get_action(self, obs):
        """Return the actor's action for a single observation as a numpy array.

        The actor already outputs values inside the action range, so no
        rescaling is applied. No exploration noise is added here.
        """
        state = self._to_tensor(obs).unsqueeze(0)
        with torch.no_grad():  # inference only; no graph needed
            action = self.actor(state)
        return action.squeeze(0).cpu().numpy()

    def update(self, batch_size):
        """Run one TD3 training step on a minibatch from the replay buffer.

        Both critics are always updated. The actor and the three target
        networks are updated only when `update_step % delay_step == 0`.
        Writes 'critic_loss' (critic 1 only) and, on delayed steps,
        'actor_loss' into `self.summaries`.
        """
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        state_batch = self._to_tensor(states)
        action_batch = self._to_tensor(actions)
        # Column shape (batch, 1) so reward/done broadcast against the Q-values.
        reward_batch = self._to_tensor(rewards).view(-1, 1)
        next_state_batch = self._to_tensor(next_states)
        done_batch = self._to_tensor(dones).view(-1, 1)

        # The target is a constant w.r.t. the critic loss, so build it
        # without tracking gradients.
        with torch.no_grad():
            # Target policy smoothing: the target actor is evaluated at the
            # NEXT state, perturbed with clipped noise, then clamped back
            # into the valid action range.
            action_noise = self.generate_noise(action_batch)
            actions_hat = (self.actor_target(next_state_batch) + action_noise).clamp(-self.max_action, self.max_action)
            next_q1 = self.critic1_target(next_state_batch, actions_hat)
            next_q2 = self.critic2_target(next_state_batch, actions_hat)
            # Clipped double-Q: take the smaller of the two target estimates.
            min_next_q = torch.min(next_q1, next_q2)
            y = reward_batch + (1. - done_batch) * self.gamma * min_next_q

        curr_q1 = self.critic1(state_batch, action_batch)
        curr_q2 = self.critic2(state_batch, action_batch)

        loss_critic1 = self.loss_fn(curr_q1, y)
        loss_critic2 = self.loss_fn(curr_q2, y)
        self.summaries['critic_loss'] = loss_critic1.detach().item()

        # Update both critics.
        self.critic1_optimizer.zero_grad()
        self.critic2_optimizer.zero_grad()

        loss_critic1.backward()
        loss_critic2.backward()

        self.critic1_optimizer.step()
        self.critic2_optimizer.step()

        # Delayed update of the policy and of all target networks.
        if self.update_step % self.delay_step == 0:
            actor_loss = -self.critic1(state_batch, self.actor(state_batch)).mean()
            self.summaries['actor_loss'] = actor_loss.detach().item()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Soft-update the targets towards the online networks.
            update_net(self.actor, self.actor_target, tau=self.tau)
            update_net(self.critic1, self.critic1_target, tau=self.tau)
            update_net(self.critic2, self.critic2_target, tau=self.tau)

        self.update_step += 1

    def generate_noise(self, action_batch):
        """Return clipped Gaussian noise with the same shape as `action_batch`.

        Samples N(0, noise_std) and clamps to [-noise_bound, noise_bound].
        The tensor is created on `self.device` directly instead of on the
        CPU followed by a copy.
        """
        # torch.normal accepts a tensor mean together with a scalar std.
        zero_mean = torch.zeros(action_batch.size(), device=self.device)
        noise = torch.normal(mean=zero_mean, std=self.noise_std)
        return torch.clamp(noise, -self.noise_bound, self.noise_bound)

In [8]:
def train(env, agent, max_episodes, max_steps, batch_size, std_train=0.1, render=False, begin_steps=25e3):
    """Train `agent` episode by episode.

    Each episode runs for at most `max_steps` environment steps.
    Pro: the per-episode step cap is chosen by the caller, which helps with
    environments that never signal the end of an episode themselves.
    Con: the cap limits how long an episode can get, so the agent cannot
    improve past it.

    Args:
        env: environment with the 4-tuple `step` API (obs, reward, done, info).
        agent: object exposing `get_action`, `update`, `max_action`,
            `action_dim` and a `replay_buffer` with `push` and `buffer`.
        max_episodes: number of episodes to run.
        max_steps: maximum number of steps per episode.
        batch_size: minibatch size passed to `agent.update`.
        std_train: exploration-noise std as a fraction of `agent.max_action`.
        render: call `env.render()` before every step when true.
        begin_steps: number of initial steps that use uniformly sampled
            actions; updates start once more than this many steps were taken.
    """
    total_steps = 0
    noise_scale = agent.max_action * std_train

    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0

        for step in range(max_steps):
            if render:
                env.render()

            if total_steps < begin_steps:
                # Warm-up: fill the buffer with random actions.
                action = env.action_space.sample()
            else:
                noisy = agent.get_action(state) + np.random.normal(0, noise_scale, size=agent.action_dim)
                action = np.clip(noisy.astype(np.float32), -agent.max_action, agent.max_action)

            next_state, reward, done, info = env.step(action)
            # gym's TimeLimit wrapper flags episodes cut off by the step limit
            # in info['TimeLimit.truncated']. Such a cut-off is not a real
            # terminal state, so it must not zero the bootstrap term in the
            # critic target; store `done` only for genuine terminations.
            truncated = isinstance(info, dict) and bool(info.get('TimeLimit.truncated', False))
            terminal = bool(done) and not truncated
            agent.replay_buffer.push(state, action, reward, next_state, terminal)
            episode_reward += reward
            total_steps += 1

            # Sampling more transitions than are stored would raise, so wait
            # until the buffer holds at least one full batch.
            if total_steps > begin_steps and len(agent.replay_buffer.buffer) >= batch_size:
                agent.update(batch_size)

            if done or step == max_steps - 1:
                print('totle_step {}, episode_reward {}'.format(total_steps, episode_reward))
                break
            state = next_state

In [5]:
def train_step(env, agent, max_steps, begin_steps, batch_size, std_train=0.1, render=False, writer=None):
    """Train `agent` for a fixed total number of environment steps.

    Pro: an episode has no cap of its own and runs until the environment
    reports `done`.
    Con: if the environment never reports `done`, no episode ever finishes,
    so no episode statistics are printed or logged.

    Args:
        env: environment with the 4-tuple `step` API (obs, reward, done, info).
        agent: object exposing `get_action`, `update`, `summaries`,
            `max_action`, `action_dim` and a `replay_buffer` with `push`
            and `buffer`.
        max_steps: total number of environment steps to run.
        begin_steps: number of initial steps that use uniformly sampled
            actions; updates start at step index `begin_steps`.
        batch_size: minibatch size passed to `agent.update`.
        std_train: exploration-noise std as a fraction of `agent.max_action`.
        render: call `env.render()` before every step when true.
        writer: object with `add_scalar(tag, value, step)`; defaults to the
            notebook-level `scalar_writer`.
    """
    if writer is None:
        writer = scalar_writer

    state, done = env.reset(), False
    episode_steps = 0
    episode_reward = 0
    episode_num = 0
    noise_scale = agent.max_action * std_train

    for t in range(max_steps):
        episode_steps += 1
        if render:
            env.render()
        if t < begin_steps:
            # Warm-up: fill the buffer with random actions.
            action = env.action_space.sample()
        else:
            noisy = agent.get_action(state) + np.random.normal(0, noise_scale, size=agent.action_dim)
            action = np.clip(noisy.astype(np.float32), -agent.max_action, agent.max_action)

        next_state, reward, done, info = env.step(action)
        # gym's TimeLimit wrapper flags episodes cut off by the step limit in
        # info['TimeLimit.truncated']. Such a cut-off is not a real terminal
        # state, so it must not zero the bootstrap term in the critic target;
        # store `done` only for genuine terminations.
        truncated = isinstance(info, dict) and bool(info.get('TimeLimit.truncated', False))
        terminal = bool(done) and not truncated
        agent.replay_buffer.push(state, action, reward, next_state, terminal)
        episode_reward += reward
        state = next_state

        # Sampling more transitions than are stored would raise, so wait
        # until the buffer holds at least one full batch.
        if t >= begin_steps and len(agent.replay_buffer.buffer) >= batch_size:
            agent.update(batch_size)
            # Log whichever losses the agent has published so far.
            for key in ('actor_loss', 'critic_loss'):
                if key in agent.summaries:
                    writer.add_scalar('loss/' + key, agent.summaries[key], t)

        if done:
            print('totle_step {},episode_num {}, episode_steps {}, episode_reward {}'.format(t+1, episode_num, episode_steps, episode_reward))
            writer.add_scalar('main/episode_reward', episode_reward, episode_num)
            writer.add_scalar('main/episode_steps', episode_steps, episode_num)
            # Start the next episode with fresh counters.
            state, done = env.reset(), False
            episode_reward = 0
            episode_steps = 0
            episode_num += 1

In [9]:
# Seed every random source the notebook draws from, so that a fresh
# Restart & Run All is repeatable as far as the libraries allow.
SEED = 0
random.seed(SEED)        # replay-buffer sampling (random.sample)
np.random.seed(SEED)     # exploration noise (np.random.normal)
torch.manual_seed(SEED)  # network initialisation and target-policy noise

env = gym.make("Hopper-v2")
env.seed(SEED)               # environment dynamics / initial states
env.action_space.seed(SEED)  # warm-up actions from action_space.sample()

# TD3 hyper-parameters.
gamma = 0.99             # discount factor
tau = 0.005              # soft target-update rate
noise_std = 0.2          # std of the target-policy smoothing noise
bound = 0.5              # clip applied to that noise
delay_step = 2           # critic updates per actor/target update
buffer_maxlen = int(1e6)
critic_lr = 3e-4
actor_lr = 3e-4

agent = TD3(env, gamma, tau, buffer_maxlen, delay_step, noise_std, bound, critic_lr, actor_lr)

In [10]:
# Train by episode.
max_episodes, max_steps, batch_size = 10000, 1000, 64
train(env, agent, max_episodes=max_episodes, max_steps=max_steps, batch_size=batch_size, render=False)

totle_step 17, episode_reward 10.717824730481036
totle_step 31, episode_reward 11.344015635365462
totle_step 64, episode_reward 40.75517596545666
totle_step 82, episode_reward 18.36156958220546
totle_step 143, episode_reward 73.06968081993784
totle_step 160, episode_reward 12.40240024720213
totle_step 177, episode_reward 8.723110780020267
totle_step 197, episode_reward 18.249571573892382
totle_step 207, episode_reward 5.4109378392866265
totle_step 285, episode_reward 101.72718351145971
totle_step 304, episode_reward 20.754635557080007
totle_step 335, episode_reward 21.08986536115908
totle_step 384, episode_reward 27.651652978279536
totle_step 412, episode_reward 20.780951370759013
totle_step 429, episode_reward 14.796973607377751
totle_step 464, episode_reward 35.76993833013621
totle_step 482, episode_reward 15.371338310349472
totle_step 508, episode_reward 8.01095083627878
totle_step 566, episode_reward 89.52976054305077
totle_step 589, episode_reward 25.43695745318132
totle_step 614,



totle_step 25054, episode_reward 34.7006494213666
totle_step 25072, episode_reward 18.464743604778686
totle_step 25096, episode_reward 40.97272720869112
totle_step 25119, episode_reward 40.03550458448585
totle_step 25142, episode_reward 40.21042801678184
totle_step 25165, episode_reward 40.17971129307552
totle_step 25188, episode_reward 39.97773961286758
totle_step 25210, episode_reward 38.90719980747382
totle_step 25232, episode_reward 38.18721312719577
totle_step 25254, episode_reward 38.49999754948775
totle_step 25276, episode_reward 38.82398140326138
totle_step 25299, episode_reward 40.40620238282032
totle_step 25322, episode_reward 39.95800587550117
totle_step 25343, episode_reward 36.987579105344395
totle_step 25366, episode_reward 39.987799358527354
totle_step 25389, episode_reward 40.635504311724056
totle_step 25411, episode_reward 38.568338981535724
totle_step 25434, episode_reward 40.49818895631366
totle_step 25457, episode_reward 40.35714439806014
totle_step 25478, episode_r

In [12]:
# Train by total number of environment steps.
# NOTE(review): `agent` is the same object (networks and replay buffer) that
# the episode-based cell above already trained; rebuild it first if an
# independent run is intended.
max_steps, begin_steps, batch_size = int(1e6), 25e3, 64
train_step(
    env,
    agent,
    max_steps=max_steps,
    begin_steps=begin_steps,
    batch_size=batch_size,
    std_train=0.1,
    render=False,
)

totle_step 16,episode_num 0, episode_steps 16, episode_reward 13.051606350139052
totle_step 33,episode_num 1, episode_steps 17, episode_reward 16.636183833335636
totle_step 49,episode_num 2, episode_steps 16, episode_reward 12.168321507349239
totle_step 71,episode_num 3, episode_steps 22, episode_reward 13.900112193727306
totle_step 89,episode_num 4, episode_steps 18, episode_reward 10.830965765091417
totle_step 116,episode_num 5, episode_steps 27, episode_reward 15.059264236577649
totle_step 125,episode_num 6, episode_steps 9, episode_reward 6.994315459174535
totle_step 153,episode_num 7, episode_steps 28, episode_reward 11.070444725465832
totle_step 176,episode_num 8, episode_steps 23, episode_reward 18.401103984593114
totle_step 187,episode_num 9, episode_steps 11, episode_reward 5.192222372805845
totle_step 198,episode_num 10, episode_steps 11, episode_reward 6.7343072718911525
totle_step 228,episode_num 11, episode_steps 30, episode_reward 9.606231187044429
totle_step 265,episode_

totle_step 2122,episode_num 100, episode_steps 18, episode_reward 8.078435417663055
totle_step 2136,episode_num 101, episode_steps 14, episode_reward 7.067407453808446
totle_step 2152,episode_num 102, episode_steps 16, episode_reward 14.184375495466506
totle_step 2168,episode_num 103, episode_steps 16, episode_reward 10.075977615287774
totle_step 2195,episode_num 104, episode_steps 27, episode_reward 25.83298460321734
totle_step 2228,episode_num 105, episode_steps 33, episode_reward 23.52194306889087
totle_step 2246,episode_num 106, episode_steps 18, episode_reward 14.54750037555279
totle_step 2259,episode_num 107, episode_steps 13, episode_reward 9.897913490684738
totle_step 2271,episode_num 108, episode_steps 12, episode_reward 9.002116643158422
totle_step 2294,episode_num 109, episode_steps 23, episode_reward 19.326141695776663
totle_step 2307,episode_num 110, episode_steps 13, episode_reward 8.607682468686198
totle_step 2319,episode_num 111, episode_steps 12, episode_reward 8.45029

totle_step 4833,episode_num 228, episode_steps 17, episode_reward 8.696087906500868
totle_step 4890,episode_num 229, episode_steps 57, episode_reward 92.66462605430664
totle_step 4917,episode_num 230, episode_steps 27, episode_reward 21.747015336339242
totle_step 4932,episode_num 231, episode_steps 15, episode_reward 10.10612366456146
totle_step 4961,episode_num 232, episode_steps 29, episode_reward 13.487838678566193
totle_step 4982,episode_num 233, episode_steps 21, episode_reward 20.048546287564857
totle_step 4994,episode_num 234, episode_steps 12, episode_reward 9.076476260515985
totle_step 5017,episode_num 235, episode_steps 23, episode_reward 24.18058930617222
totle_step 5072,episode_num 236, episode_steps 55, episode_reward 54.46133876164747
totle_step 5119,episode_num 237, episode_steps 47, episode_reward 39.500671765167525
totle_step 5137,episode_num 238, episode_steps 18, episode_reward 15.674481024215966
totle_step 5149,episode_num 239, episode_steps 12, episode_reward 9.578

totle_step 7638,episode_num 351, episode_steps 41, episode_reward 53.270877252179744
totle_step 7667,episode_num 352, episode_steps 29, episode_reward 28.978451049532378
totle_step 7694,episode_num 353, episode_steps 27, episode_reward 34.36187043258531
totle_step 7708,episode_num 354, episode_steps 14, episode_reward 7.648084497651656
totle_step 7725,episode_num 355, episode_steps 17, episode_reward 9.350608471856912
totle_step 7737,episode_num 356, episode_steps 12, episode_reward 7.354455332772049
totle_step 7762,episode_num 357, episode_steps 25, episode_reward 27.332996186292053
totle_step 7776,episode_num 358, episode_steps 14, episode_reward 11.170451094765221
totle_step 7799,episode_num 359, episode_steps 23, episode_reward 20.057902798308334
totle_step 7816,episode_num 360, episode_steps 17, episode_reward 11.76701104730113
totle_step 7839,episode_num 361, episode_steps 23, episode_reward 21.201198866639892
totle_step 7853,episode_num 362, episode_steps 14, episode_reward 9.15

totle_step 10384,episode_num 480, episode_steps 12, episode_reward 8.802708833937059
totle_step 10397,episode_num 481, episode_steps 13, episode_reward 9.728792535943583
totle_step 10409,episode_num 482, episode_steps 12, episode_reward 7.987085085356742
totle_step 10425,episode_num 483, episode_steps 16, episode_reward 14.673312976069282
totle_step 10438,episode_num 484, episode_steps 13, episode_reward 8.511775423434836
totle_step 10482,episode_num 485, episode_steps 44, episode_reward 63.020465461869996
totle_step 10520,episode_num 486, episode_steps 38, episode_reward 23.979943974846606
totle_step 10530,episode_num 487, episode_steps 10, episode_reward 6.51323900223084
totle_step 10547,episode_num 488, episode_steps 17, episode_reward 13.613978684598885
totle_step 10602,episode_num 489, episode_steps 55, episode_reward 29.40632675062729
totle_step 10617,episode_num 490, episode_steps 15, episode_reward 13.370202475170139
totle_step 10633,episode_num 491, episode_steps 16, episode_r

totle_step 13163,episode_num 606, episode_steps 18, episode_reward 10.419776798429915
totle_step 13174,episode_num 607, episode_steps 11, episode_reward 7.819755431729623
totle_step 13202,episode_num 608, episode_steps 28, episode_reward 20.852143023217124
totle_step 13229,episode_num 609, episode_steps 27, episode_reward 10.539745690860794
totle_step 13253,episode_num 610, episode_steps 24, episode_reward 12.022223146554717
totle_step 13277,episode_num 611, episode_steps 24, episode_reward 27.124670988621673
totle_step 13306,episode_num 612, episode_steps 29, episode_reward 16.64531078825514
totle_step 13325,episode_num 613, episode_steps 19, episode_reward 14.847051161126313
totle_step 13363,episode_num 614, episode_steps 38, episode_reward 35.26180351745071
totle_step 13391,episode_num 615, episode_steps 28, episode_reward 23.094833738742356
totle_step 13401,episode_num 616, episode_steps 10, episode_reward 7.273263749636328
totle_step 13414,episode_num 617, episode_steps 13, episod

totle_step 15958,episode_num 724, episode_steps 50, episode_reward 53.14953988531154
totle_step 15977,episode_num 725, episode_steps 19, episode_reward 16.96440789601024
totle_step 15993,episode_num 726, episode_steps 16, episode_reward 16.358152578101812
totle_step 16006,episode_num 727, episode_steps 13, episode_reward 10.389688085613905
totle_step 16025,episode_num 728, episode_steps 19, episode_reward 13.889987887220995
totle_step 16039,episode_num 729, episode_steps 14, episode_reward 12.155909775108956
totle_step 16065,episode_num 730, episode_steps 26, episode_reward 28.607523011793244
totle_step 16091,episode_num 731, episode_steps 26, episode_reward 17.12331724739605
totle_step 16112,episode_num 732, episode_steps 21, episode_reward 18.692987663918462
totle_step 16173,episode_num 733, episode_steps 61, episode_reward 48.74946254887901
totle_step 16191,episode_num 734, episode_steps 18, episode_reward 10.36245625084623
totle_step 16202,episode_num 735, episode_steps 11, episode

totle_step 18673,episode_num 838, episode_steps 28, episode_reward 8.341717218227629
totle_step 18707,episode_num 839, episode_steps 34, episode_reward 36.67881445454872
totle_step 18749,episode_num 840, episode_steps 42, episode_reward 39.29309579464404
totle_step 18758,episode_num 841, episode_steps 9, episode_reward 6.61135427098504
totle_step 18776,episode_num 842, episode_steps 18, episode_reward 15.428212098254463
totle_step 18785,episode_num 843, episode_steps 9, episode_reward 6.944856740059125
totle_step 18801,episode_num 844, episode_steps 16, episode_reward 11.255411720144961
totle_step 18811,episode_num 845, episode_steps 10, episode_reward 7.213861321667081
totle_step 18833,episode_num 846, episode_steps 22, episode_reward 15.67737999044703
totle_step 18846,episode_num 847, episode_steps 13, episode_reward 8.574408531132208
totle_step 18868,episode_num 848, episode_steps 22, episode_reward 9.018296254033231
totle_step 18885,episode_num 849, episode_steps 17, episode_reward

totle_step 20836,episode_num 935, episode_steps 13, episode_reward 11.718405703610598
totle_step 20853,episode_num 936, episode_steps 17, episode_reward 15.330360758650876
totle_step 20871,episode_num 937, episode_steps 18, episode_reward 16.80898352938036
totle_step 20890,episode_num 938, episode_steps 19, episode_reward 11.947558112978363
totle_step 20908,episode_num 939, episode_steps 18, episode_reward 17.96102089509862
totle_step 20929,episode_num 940, episode_steps 21, episode_reward 22.51319314589038
totle_step 20950,episode_num 941, episode_steps 21, episode_reward 11.94897046693112
totle_step 20966,episode_num 942, episode_steps 16, episode_reward 9.146890118212767
totle_step 20998,episode_num 943, episode_steps 32, episode_reward 13.860788580207267
totle_step 21013,episode_num 944, episode_steps 15, episode_reward 9.635812124152144
totle_step 21034,episode_num 945, episode_steps 21, episode_reward 12.30345189229417
totle_step 21052,episode_num 946, episode_steps 18, episode_r

totle_step 22887,episode_num 1033, episode_steps 11, episode_reward 8.267353011723344
totle_step 22904,episode_num 1034, episode_steps 17, episode_reward 14.371029303305345
totle_step 22925,episode_num 1035, episode_steps 21, episode_reward 21.14450704603981
totle_step 22942,episode_num 1036, episode_steps 17, episode_reward 10.947279259172843
totle_step 22953,episode_num 1037, episode_steps 11, episode_reward 8.315326264735377
totle_step 22977,episode_num 1038, episode_steps 24, episode_reward 20.66531640251515
totle_step 23002,episode_num 1039, episode_steps 25, episode_reward 24.273564997370187
totle_step 23034,episode_num 1040, episode_steps 32, episode_reward 32.21264054141656
totle_step 23049,episode_num 1041, episode_steps 15, episode_reward 11.891142748945844
totle_step 23071,episode_num 1042, episode_steps 22, episode_reward 19.151729152364467
totle_step 23100,episode_num 1043, episode_steps 29, episode_reward 31.14524195827727
totle_step 23133,episode_num 1044, episode_steps 

totle_step 25091,episode_num 1129, episode_steps 18, episode_reward -1.4450148108073249
totle_step 25109,episode_num 1130, episode_steps 18, episode_reward -1.4846667205394941
totle_step 25127,episode_num 1131, episode_steps 18, episode_reward -1.2057271605068918
totle_step 25145,episode_num 1132, episode_steps 18, episode_reward -1.0095288567716478
totle_step 25162,episode_num 1133, episode_steps 17, episode_reward -0.9144925369506242
totle_step 25180,episode_num 1134, episode_steps 18, episode_reward -1.2472486543603232
totle_step 25198,episode_num 1135, episode_steps 18, episode_reward -1.0311800511442204
totle_step 25216,episode_num 1136, episode_steps 18, episode_reward -0.35399328289294407
totle_step 25234,episode_num 1137, episode_steps 18, episode_reward -1.2963681517275036
totle_step 25251,episode_num 1138, episode_steps 17, episode_reward -0.8048798983390264
totle_step 25270,episode_num 1139, episode_steps 19, episode_reward -1.187135614603913
totle_step 25288,episode_num 114

KeyboardInterrupt: 

In [12]:
# Watch the current policy: 100 rendered episodes of at most 1000 steps each,
# acting without exploration noise and printing every chosen action.
with torch.no_grad():
    for i in range(100):
        state = env.reset()
        for j in range(1000):
            env.render()
            action = agent.get_action(np.array(state))
            print(action)
            next_state, reward, done, _ = env.step(action)
            if not done:
                state = next_state
                continue
            # Episode finished: move on to the next one.
            break

Creating window glfw
[ 0.9899661  -0.14495094  0.9332513 ]
[0.99459136 0.3495038  0.20310341]
[0.9473573  0.08097157 0.18381123]
[0.9274553  0.03887747 0.27504626]
[0.9345606  0.09861897 0.34394315]
[0.9318877  0.15547793 0.5584015 ]
[0.9554483  0.2897263  0.62621194]
[0.9818423  0.58079964 0.5417278 ]
[ 0.9829051  0.325816  -0.1919474]
[ 0.64135474 -0.85198873 -0.87710196]
[ 0.6205839  -0.42334497 -0.16587439]
[0.8439528  0.05466815 0.5410438 ]
[0.5892363  0.05052556 0.41719785]
[0.35191306 0.13786589 0.62987715]
[-0.00234661  0.11559023  0.78804463]
[-0.31720573 -0.5477168   0.8707153 ]
[-0.9412783  -0.9638999   0.97850466]
[-0.8577042 -0.7939994  0.9282733]
[ 0.9935231   0.99781317 -0.79827744]
[0.9561082  0.31398773 0.6670681 ]
[ 0.7442811  -0.24355039  0.62405336]
[ 0.12883195  0.8534302  -0.89522   ]
[ 0.07959979 -0.28315622 -0.17102793]
[ 0.6535372   0.41315874 -0.7786634 ]
[-0.75203615  0.5457347  -0.9173735 ]
[0.45114046 0.48445803 0.04518583]
[-0.8440358  0.555085   0.693939 

[ 0.16243544 -0.02100173 -0.8942793 ]
[ 0.07914218 -0.06550665 -0.8881821 ]
[-0.28110108  0.7118585  -0.92766315]
[-0.2713953 -0.7005748 -0.977583 ]
[ 0.480889  -0.7068603 -0.9999679]
[ 0.42581332 -0.99172354 -1.        ]
[ 0.88353163 -0.9985934  -1.        ]
[ 0.30630586 -0.9821078  -1.        ]
[ 0.06760309 -0.86186534 -1.        ]
[ 0.27496612 -0.7823469  -1.        ]
[ 0.06049647 -0.10947889 -1.        ]
[ 0.2574385   0.42407382 -1.        ]
[ 0.380596  -0.0232765 -1.       ]
[ 0.462103   -0.18511006 -1.        ]
[ 0.22267018  0.07469459 -1.        ]
[-0.2979545  0.5986664 -0.9999952]
[-0.42756954  0.86397564 -0.9982948 ]
[-0.29791543  0.9245457  -0.54045856]
[0.6634567  0.9220532  0.99461836]
[0.99987435 0.11792442 0.9999726 ]
[0.99664557 0.9924521  0.85191596]
[ 0.9609363   0.99541444 -0.8919026 ]
[ 0.9570158  0.9941655 -0.8988815]
[ 0.8760384   0.9861432  -0.85877603]
[ 0.5184937  0.9712567 -0.8954794]
[-0.5142108   0.7409102  -0.49048525]
[-0.5631159   0.43610853  0.4243001 ]
[

[ 0.30969617 -0.12789705  0.05980837]
[ 0.41556048  0.60562956 -0.7784822 ]
[-0.16578707 -0.01162759 -0.74420124]
[ 0.23029912  0.46412048 -0.33932573]
[ 0.39526537  0.38281807 -0.14393802]
[0.8758961  0.39752716 0.30299476]
[ 0.897558    0.1840768  -0.24501486]
[0.7439791  0.27352166 0.8576778 ]
[0.1073231 0.6073583 0.9593493]
[-0.56949437  0.729177    0.9696718 ]
[-0.24811198  0.78439444  0.9082147 ]
[-0.00599113  0.41926283  0.9000216 ]
[-0.22856916  0.17861535  0.9604033 ]
[-0.057677   -0.04554025  0.94982976]
[-0.07592941 -0.2450145   0.93175733]
[-0.051658   -0.13411352  0.87489814]
[-0.05340079 -0.04135867  0.8226163 ]
[-0.06158979 -0.00973548  0.78719175]
[-0.01030842 -0.05166904  0.7752992 ]
[ 0.03649998 -0.0224153   0.8036744 ]
[0.06493551 0.07795768 0.7844123 ]
[0.09210487 0.08523491 0.79504395]
[0.1379483  0.07695375 0.8177431 ]
[0.11373544 0.02775656 0.8384055 ]
[0.11559176 0.00843816 0.86104566]
[ 0.09721688 -0.00542466  0.8847084 ]
[0.1334925  0.03137814 0.9108145 ]
[0.1

[ 0.3473112   0.9868327  -0.98171324]
[ 0.06759241  0.9602369  -0.9939208 ]
[-0.6472823   0.88153464  0.49636772]
[0.0782948  0.93290734 0.9506799 ]
[-0.35692966  0.7835199   0.9853169 ]
[0.11028793 0.7434854  0.9888699 ]
[0.04650659 0.66880065 0.9875318 ]
[0.09288672 0.662301   0.98397905]
[0.06523298 0.6143562  0.9807122 ]
[-0.00974149  0.5957327   0.97816294]
[0.01001678 0.60930705 0.9758923 ]
[0.00475713 0.6276864  0.9724997 ]
[0.00303074 0.6434715  0.97008264]
[-0.02060504  0.6742153   0.96705455]
[-0.00956051  0.6983206   0.9655654 ]
[0.0059951  0.69531715 0.9645973 ]
[-0.00584394  0.68281245  0.9626408 ]
[0.01103035 0.692967   0.9568874 ]
[0.0035007 0.7109414 0.951526 ]
[0.00974652 0.72375494 0.9460841 ]
[0.01243031 0.73647165 0.9402071 ]
[0.01425225 0.74943453 0.9326675 ]
[0.01035058 0.7627295  0.9162645 ]
[0.00513797 0.776598   0.8960478 ]
[0.01286803 0.80929834 0.8765523 ]
[0.01895602 0.84213823 0.85286385]
[0.03871215 0.86862123 0.82961977]
[0.06032045 0.88948536 0.8123047 ]

[ 0.02282993 -0.04620621  0.82484406]
[-0.00279692 -0.03765415  0.82279783]
[0.00997971 0.04140148 0.8044106 ]
[0.0231387  0.11028549 0.7879626 ]
[0.01027174 0.15093778 0.7941519 ]
[0.09783807 0.2526764  0.8224278 ]
[0.23381032 0.2749034  0.8460034 ]
[0.3060881  0.03764879 0.9281324 ]
[ 0.36201355 -0.26191458  0.9768137 ]
[ 0.2639721  -0.40066242  0.981676  ]
[ 0.03547498 -0.15867779  0.9822872 ]
[-0.05502898  0.06273274  0.97981054]
[0.05142549 0.1840374  0.980273  ]
[0.14724727 0.08923349 0.9871507 ]
[0.08673579 0.01838779 0.9923314 ]
[0.0387056  0.07717563 0.9910893 ]
[-0.03271155  0.15993002  0.98812073]
[-0.06291738  0.26786822  0.977479  ]
[-0.08648191  0.31925693  0.9227518 ]
[-0.23362067  0.29816267  0.60622096]
[-0.307845    0.55241704 -0.29336578]
[-0.16132769  0.2587617  -0.83917534]
[ 0.15383938  0.05414972 -0.7610927 ]
[-0.2028813   0.3909345  -0.25431776]
[-0.44818276  0.38846028 -0.6041432 ]
[-0.31014523  0.5341825  -0.66218877]
[-0.15199448  0.1366645  -0.31843624]
[-0.

[-0.01552948  0.53535223  0.9081813 ]
[-0.02534274  0.53473985  0.90611166]
[-0.04022458  0.5582676   0.9042373 ]
[-0.04650496  0.5844555   0.90132457]
[-0.02445014  0.60639757  0.8990315 ]
[-0.02612362  0.6187051   0.8871234 ]
[0.00265316 0.6270939  0.87231183]
[0.00126257 0.62710375 0.85893184]
[-0.02180758  0.6384815   0.8454613 ]
[-0.02381822  0.67367244  0.83294535]
[-0.01688787  0.69777286  0.82340723]
[-0.01505495  0.72108287  0.79359436]
[0.000947   0.77202535 0.7515735 ]
[-0.01074441  0.8238818   0.71053934]
[-0.00902209  0.86695844  0.65740484]
[-1.20174605e-04  9.03572083e-01  5.78564763e-01]
[0.02309828 0.9306844  0.4961246 ]
[0.0539863  0.94890386 0.43283555]
[0.12052159 0.9627612  0.1334044 ]
[ 0.11573666  0.97240406 -0.19757636]
[-9.7034517e-04  9.7695005e-01 -4.8735830e-01]
[ 0.02127811  0.98002183 -0.6607096 ]
[ 0.15223931  0.98698336 -0.8216002 ]
[ 0.44435358  0.9948245  -0.7563218 ]
[ 0.44051704  0.9941725  -0.3453561 ]
[-0.17551304  0.98658377 -0.63358134]
[-0.29554

[0.05451005 0.01327647 0.8366046 ]
[-0.01790044 -0.01532936  0.8389199 ]
[-0.00639029  0.04392721  0.8255492 ]
[0.02008623 0.10125437 0.81580985]
[0.08078721 0.2149494  0.83641154]
[0.16240524 0.29343048 0.85483426]
[0.30198273 0.169036   0.9193979 ]
[ 0.4664354  -0.23065172  0.97875345]
[ 0.3774409 -0.5337087  0.9886541]
[-0.01617027 -0.29342154  0.9849771 ]
[-0.1307839   0.1101684   0.97879034]
[0.04248726 0.33453727 0.97921854]
[0.15989989 0.1544788  0.99045396]
[0.10005668 0.00447098 0.9945415 ]
[-0.02541154  0.07373721  0.99215347]
[-0.06492379  0.27457738  0.9880125 ]
[-0.05391117  0.32763097  0.96285284]
[-0.16953434  0.2653057   0.7962147 ]
[-0.28874478  0.42631647  0.18383041]
[-0.42110354  0.37077636 -0.7417237 ]
[ 0.07102532  0.18724325 -0.8935524 ]
[ 0.0378612   0.29276443 -0.5232505 ]
[-0.32795608  0.42390507 -0.26471037]
[-0.45752275  0.44952312 -0.70407444]
[-0.09957737  0.29143804 -0.5955717 ]
[-0.12029427 -0.26923987 -0.18027115]
[-0.18698148 -0.578556   -0.7357601 ]
[

[0.04972748 0.8133277  0.9828953 ]
[0.06677118 0.73590994 0.9841303 ]
[0.07277429 0.68873066 0.9822019 ]
[0.05721903 0.635463   0.9788388 ]
[-0.01174698  0.60280365  0.9763815 ]
[0.00203174 0.61195624 0.9736625 ]
[0.00174628 0.63341993 0.97023904]
[-5.520996e-04  6.470662e-01  9.676694e-01]
[-0.00438189  0.6590049   0.96566737]
[-0.02490655  0.68434566  0.9634339 ]
[0.00434478 0.6894908  0.962629  ]
[-0.00255167  0.6781763   0.9615343 ]
[0.00434179 0.68196356 0.9574031 ]
[0.00612579 0.69931394 0.9511001 ]
[0.0095741  0.71256584 0.94552904]
[0.00908289 0.7255746  0.93970686]
[0.00688763 0.7383721  0.93356514]
[0.00691541 0.7525602  0.918297  ]
[0.00556046 0.7664915  0.89846253]
[0.00503398 0.7935335  0.87723225]
[0.01769928 0.8289203  0.85344666]
[0.0311717  0.8581392  0.82812977]
[0.06289367 0.88117146 0.80919707]
[0.03628444 0.90012753 0.79566866]
[0.02198802 0.91333586 0.7698434 ]
[0.01872859 0.9266232  0.7266834 ]
[0.0613374  0.94423395 0.58917284]
[0.11718199 0.96064705 0.35294548]

KeyboardInterrupt: 