In [1]:
import torch
import torch.nn as nn
from torch.distributions import Normal
import random
import numpy as np
from collections import deque  # 双向队列
import torch.optim as optim
import gym

In [2]:
from torch.utils.tensorboard import SummaryWriter

In [3]:
# TensorBoard writers: one per network graph, plus one for the scalar curves.
writer_policy_graph = SummaryWriter('sac_2018_actions/policy_net')
writer_softq_graph = SummaryWriter('sac_2018_actions/SoftQ_net')
writer_value_graph = SummaryWriter('sac_2018_actions/value_net')
writer_scale = SummaryWriter('sac_2018_actions')  # receives the add_scalar calls made in train()

In [4]:
class ValueNetwork(nn.Module):
    """State-value estimator V(s): an MLP with two 256-unit ReLU hidden layers.

    Args:
        input_dim: size of one state vector.
        output_dim: size of the network output.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        width = 256
        self.l1 = nn.Linear(input_dim, width)
        self.l2 = nn.Linear(width, width)
        self.l3 = nn.Linear(width, output_dim)

    def forward(self, state):
        """Return the value estimate for `state`; the output layer is linear."""
        hidden = self.l1(state).relu()
        hidden = self.l2(hidden).relu()
        return self.l3(hidden)

class SoftQNetwork(nn.Module):
    """Critic Q(s, a): takes a state and an action and returns one scalar per sample.

    Args:
        num_inputs: size of one state vector.
        num_actions: size of one action vector.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, num_inputs, num_actions, hidden_size=256):
        super().__init__()
        joint_dim = num_inputs + num_actions
        self.l1 = nn.Linear(joint_dim, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, 1)

    def forward(self, state, action):
        """Concatenate state and action along dim 1, then run the MLP."""
        joint = torch.cat((state, action), dim=1)
        features = self.l2(self.l1(joint).relu()).relu()
        return self.l3(features)
    
class PolicyNetwork(nn.Module):
    """Actor: a two-headed MLP producing the mean and log-std of a Gaussian.

    Args:
        num_inputs: size of one state vector.
        num_actions: size of one action vector.
        hidden_size: width of both hidden layers.
        log_std_min: lower clamp applied to the predicted log-std.
        log_std_max: upper clamp applied to the predicted log-std.
    """

    def __init__(self, num_inputs, num_actions, hidden_size=256, log_std_min=-20, log_std_max=2):
        super().__init__()
        # Bounds used by forward() to clamp the log-std head.
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max

        # Shared trunk.
        self.l1 = nn.Linear(num_inputs, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)

        # Separate output heads.
        self.l_mean = nn.Linear(hidden_size, num_actions)
        self.l_logstd = nn.Linear(hidden_size, num_actions)

    def forward(self, state):
        """Return `(mean, log_std)` for `state`, with log_std clamped to the configured bounds."""
        trunk = self.l2(self.l1(state).relu()).relu()
        mean = self.l_mean(trunk)
        log_std = self.l_logstd(trunk).clamp(min=self.log_std_min, max=self.log_std_max)
        return mean, log_std

    def sample(self, state, epsilon=1e-6):
        """Draw a reparameterised, tanh-squashed action and its log-probability.

        Returns:
            action: tanh of the Gaussian sample, so each component lies in [-1, 1].
            log_pi: log-probability summed over dim 1 (kept as a size-1 dim),
                including the tanh change-of-variables correction.
        """
        mean, log_std = self.forward(state)
        gaussian = Normal(mean, torch.exp(log_std))

        # rsample keeps the draw differentiable with respect to mean and std.
        pre_tanh = gaussian.rsample()
        action = pre_tanh.tanh()

        # epsilon keeps the log argument positive when |action| is close to 1.
        squash_correction = torch.log(1 - action.pow(2) + epsilon)
        log_pi = (gaussian.log_prob(pre_tanh) - squash_correction).sum(dim=1, keepdim=True)

        return action, log_pi

In [5]:
class BasicBuffer:
    """Bounded FIFO replay buffer of (state, action, reward, next_state, done) tuples."""

    def __init__(self, max_size):
        self.max_size = max_size
        # A deque with maxlen discards the oldest entry once it is full.
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        """Store one transition; the reward is wrapped in a length-1 array."""
        self.buffer.append((state, action, np.array([reward]), next_state, done))

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of lists: (states, actions, rewards, next_states, dones),
            each of length `batch_size` and aligned by position.
        """
        picked = random.sample(self.buffer, batch_size)
        # Build one list per field so five lists come back even for an empty batch.
        return tuple([transition[field] for transition in picked] for field in range(5))

In [6]:
def update_target(model, target_model, tau):
    """Polyak-average `model`'s parameters into `target_model`, in place.

    Each target parameter becomes `(1 - tau) * target + tau * source`, so
    `tau=1.` is a hard copy and a small `tau` makes the target track slowly.

    Args:
        model: source network whose parameters are read.
        target_model: network with matching parameters, updated in place.
        tau: interpolation weight given to the source parameters.
    """
    # The blend is bookkeeping, not part of the loss: run it without autograd
    # so no graph is built and no legacy `.data` access is needed.
    with torch.no_grad():
        for target_pam, pam in zip(target_model.parameters(), model.parameters()):
            target_pam.copy_((1. - tau) * target_pam + tau * pam)

In [7]:
class SAC_Agent:
    """Soft Actor-Critic agent in the 2018 form with a separate value network.

    Owns a Gaussian policy, two soft-Q critics, a value network with a slowly
    tracking target copy, one Adam optimizer per trained network, and a replay
    buffer. The policy and the target value network are refreshed only every
    `delay_step` calls to `update`.
    """

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        """Build the networks, optimizers and replay buffer for `env`.

        Args:
            env: environment exposing `action_space.low/high/shape` and
                `observation_space.shape`.
            gamma: discount factor used in the Q target.
            tau: soft-update weight for the target value network.
            v_lr: learning rate of the value network.
            q_lr: learning rate of both Q networks.
            policy_lr: learning rate of the policy network.
            buffer_maxlen: capacity of the replay buffer.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.env = env
        self.action_range = [self.env.action_space.low, self.env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2  # policy / target-V are updated every `delay_step` updates

        # Networks
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)

        # Start the target network as an exact copy of the value network.
        update_target(self.value_net, self.target_value_net, tau=1.)

        # Optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)

        # Both the Q networks and the V network are trained with MSE.
        self.loss_fn = nn.MSELoss()

        # Replay buffer
        self.replay_buffer = BasicBuffer(buffer_maxlen)

        # Latest scalar losses, read by the training loop for logging.
        # (The attribute name is kept as-is because callers use it.)
        self.summuries = {}

    def _to_tensor(self, data):
        """Convert an array-like (or list of arrays) to a float32 tensor on `self.device`."""
        # Stacking with numpy first avoids the slow element-by-element
        # conversion of a Python list of ndarrays.
        return torch.as_tensor(np.asarray(data), dtype=torch.float32, device=self.device)

    def _normalize_action(self, action):
        """Inverse of `rescale_action` for tensors: map env-range actions back to [-1, 1]."""
        low = torch.as_tensor(self.action_range[0], dtype=torch.float32, device=action.device)
        high = torch.as_tensor(self.action_range[1], dtype=torch.float32, device=action.device)
        return (action - (high + low) / 2.0) / ((high - low) / 2.0)

    def get_action(self, state):
        """Sample a stochastic action for one state, rescaled to the env's action range."""
        state = self._to_tensor(state).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():  # acting needs no gradients
            mean, log_std = self.policy_net(state)
            u = Normal(mean, log_std.exp()).sample()
            action = torch.tanh(u)

        action = action.squeeze(0).cpu().numpy()
        return self.rescale_action(action)

    def test_get_action(self, state):
        """Deterministic action for evaluation: tanh of the policy mean, rescaled."""
        state = self._to_tensor(state).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():
            mean, _ = self.policy_net(state)
            action = torch.tanh(mean)

        action = action.squeeze(0).cpu().numpy()
        return self.rescale_action(action)

    def rescale_action(self, action):
        """Map a tanh output in [-1, 1] linearly onto [action_space.low, action_space.high]."""
        action = action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
            (self.action_range[1] + self.action_range[0]) / 2.0
        return action

    def update(self, batch_size):
        """Run one gradient step on V and both Qs, and a delayed step on the policy.

        Samples `batch_size` transitions from the replay buffer. The q1/q2 losses
        are stored in `self.summuries` on every call; the policy loss only on the
        calls where the policy is actually updated.
        """
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = self._to_tensor(states)
        # The buffer holds actions in env units, but the policy emits actions in
        # [-1, 1]. Bring replayed actions into that same space so the Q networks
        # always see one consistent action scale.
        actions = self._normalize_action(self._to_tensor(actions))
        rewards = self._to_tensor(rewards)
        next_states = self._to_tensor(next_states)
        dones = self._to_tensor(dones)
        dones = dones.view(dones.size(0), -1)  # (batch_size, 1)

        # Quantities for the value target, using freshly sampled policy actions.
        v_actions, v_log_pi = self.policy_net.sample(states)
        v_q1 = self.q_net1(states, v_actions)
        v_q2 = self.q_net2(states, v_actions)
        next_v = self.target_value_net(next_states)

        # Value loss: V(s) regresses onto min(Q1, Q2) - log pi.
        v_target = torch.min(v_q1, v_q2) - v_log_pi
        curr_v = self.value_net(states)
        v_loss = self.loss_fn(curr_v, v_target.detach())  # targets carry no gradient

        # Q losses: Q(s, a) regresses onto r + gamma * (1 - done) * V_target(s').
        curr_q1 = self.q_net1(states, actions)
        curr_q2 = self.q_net2(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = self.loss_fn(curr_q1, expected_q.detach())
        q2_loss = self.loss_fn(curr_q2, expected_q.detach())
        self.summuries['q1_loss'] = q1_loss.detach().item()
        self.summuries['q2_loss'] = q2_loss.detach().item()

        self.value_optimizer.zero_grad()
        self.q1_optimizer.zero_grad()
        self.q2_optimizer.zero_grad()
        v_loss.backward()
        q1_loss.backward()
        q2_loss.backward()
        self.value_optimizer.step()
        self.q1_optimizer.step()
        self.q2_optimizer.step()

        # Delayed update of the policy and of the target value network.
        if self.update_step % self.delay_step == 0:
            # Sample only when the result is used.
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1(states, new_actions),
                              self.q_net2(states, new_actions))

            policy_loss = (log_pi - min_q).mean()
            self.summuries['policy_loss'] = policy_loss.detach().item()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # Soft-update the target value network.
            update_target(self.value_net, self.target_value_net, tau=self.tau)

        self.update_step += 1


In [8]:
def train(env, agent, max_episode, max_steps, batch_size, render=True):
    """Run the interaction/training loop and log progress to `writer_scale`.

    Args:
        env: environment with the classic gym API (`reset`, 4-tuple `step`, `render`).
        agent: object exposing `get_action`, `update`, `replay_buffer` and `summuries`.
        max_episode: number of episodes to run.
        max_steps: step cap per episode.
        batch_size: minibatch size; updates start once the buffer holds more
            transitions than this.
        render: whether to call `env.render()` before every step.

    Returns:
        List with the total reward of every finished episode, in order.
    """
    global_step = 0
    episode_rewards = []

    for episode in range(max_episode):
        state = env.reset()
        episode_reward = 0
        episode_step = 0

        for step in range(max_steps):
            if render:
                env.render()
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            episode_reward += reward  # return accumulated in this episode
            global_step += 1          # steps taken across all episodes
            episode_step += 1         # steps taken in this episode
            writer_scale.add_scalar('Main/every_step_reward', reward, global_step)

            if len(agent.replay_buffer.buffer) > batch_size:
                agent.update(batch_size)
                # Log each loss under its own key; a loss the agent has not
                # produced yet is skipped rather than raising KeyError.
                for loss_name in ('q1_loss', 'q2_loss', 'policy_loss'):
                    if loss_name in agent.summuries:
                        writer_scale.add_scalar('Loss/' + loss_name,
                                                agent.summuries[loss_name], global_step)

            if done or step == max_steps - 1:
                writer_scale.add_scalar('Episode/episode_steps', episode_step, episode)
                writer_scale.add_scalar('Episode/episode_rewards', episode_reward, episode)
                print('Episode is {}, episod_reward is {}'.format(episode, episode_reward))
                episode_rewards.append(episode_reward)
                break
            state = next_state

    return episode_rewards

In [9]:
# Environment used for training and evaluation; the Ant variant is kept as an alternative.
# env = gym.make('Ant-v2')
env = gym.make('Hopper-v2')

In [10]:
# SAC 2018 hyper-parameters
tau = 0.005              # soft-update weight for the target value network
gamma = 0.99             # discount factor in the Q target
value_lr = 3e-4          # Adam learning rate of the value network
q_lr = 3e-4              # Adam learning rate of both Q networks
policy_lr = 3e-4         # Adam learning rate of the policy network
buffer_maxlen = 1000000  # replay buffer capacity (transitions)

# SAC 2018 agent
agent = SAC_Agent(env, gamma, tau, value_lr, q_lr, policy_lr, buffer_maxlen)

In [11]:
# Build one example state/action pair (batch size 1, on the agent's device)
# so that each network's graph can be written to TensorBoard.
test_state_raw = env.reset()
test_state = torch.FloatTensor(test_state_raw).unsqueeze(0).to(agent.device)
test_action_raw = agent.get_action(test_state_raw)
test_action = torch.FloatTensor(test_action_raw).unsqueeze(0).to(agent.device)

# The Q network takes two inputs, so they are passed as a list.
writer_policy_graph.add_graph(agent.policy_net, test_state)
writer_softq_graph.add_graph(agent.q_net1, [test_state, test_action])
writer_value_graph.add_graph(agent.value_net, test_state)

In [12]:
# Train: up to 10000 episodes, at most 1000 steps each, minibatch size 64, rendering off.
episode_rewards = train(env, agent, 10000, 1000, 64, render=False)

Episode is 0, episod_reward is 15.972597718270409
Episode is 1, episod_reward is 9.31604829715257
Episode is 2, episod_reward is 8.997190342042845
Episode is 3, episod_reward is 13.209068597647383
Episode is 4, episod_reward is 9.333430011332478
Episode is 5, episod_reward is 15.11782372748069
Episode is 6, episod_reward is 23.455744412772095
Episode is 7, episod_reward is 12.963882502185637
Episode is 8, episod_reward is 28.94919425077684
Episode is 9, episod_reward is 28.643390633177546
Episode is 10, episod_reward is 6.147201728951146
Episode is 11, episod_reward is 15.942191397993835
Episode is 12, episod_reward is 8.396573496684072
Episode is 13, episod_reward is 15.928785080465712
Episode is 14, episod_reward is 11.450712678279528
Episode is 15, episod_reward is 7.691701262515537
Episode is 16, episod_reward is 9.013170857243237
Episode is 17, episod_reward is 5.729856852665965
Episode is 18, episod_reward is 9.004171638780738
Episode is 19, episod_reward is 33.52709828259998
Epi

KeyboardInterrupt: 

In [20]:
# Inspect the lower action bound; rescale_action maps tanh outputs onto [low, high].
env.action_space.low

array([-1., -1., -1.], dtype=float32)

In [13]:
# Persist the trained actor. The whole module object is pickled (not just its
# state_dict), so PolicyNetwork must be defined again before the file is loaded.
torch.save(agent.policy_net,'policy_2018.pt')

# 测试

In [9]:
# Restore the trained actor directly onto the agent's device. Without map_location
# the tensors are restored to the device they were saved from, which fails on a
# CPU-only machine for a GPU checkpoint and may not match the device the agent
# moves its inputs to.
# NOTE(review): this file stores a whole pickled module; recent PyTorch releases
# may additionally require weights_only=False to load it. Confirm against the
# installed version.
agent.policy_net = torch.load('policy_2018.pt', map_location=agent.device)

In [10]:
# Roll out the deterministic policy (tanh of the mean) for up to 100 rendered episodes.
with torch.no_grad():  # evaluation only, no gradients are needed
    for i in range(100):
        obs = env.reset()
        for j in range(20000):  # hard cap on steps per episode
            env.render()
            action = agent.test_get_action(obs)
            next_obs, reward, done, _ = env.step(action)
            if done:
                break
            obs = next_obs

Creating window glfw


KeyboardInterrupt: 