In [1]:
import torch
import torch.nn as nn
from torch.distributions import Categorical
import numpy as np
import gym
import os

# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [2]:
class RolloutBuffer:
    """Plain container holding one batch of rollout data as parallel lists.

    The five lists (``actions``, ``states``, ``logprobs``, ``rewards``,
    ``is_terminals``) are appended to by the code that collects experience.
    """

    # Names of the per-step lists, in the order they are created.
    _FIELDS = ('actions', 'states', 'logprobs', 'rewards', 'is_terminals')

    def __init__(self):
        for field_name in self._FIELDS:
            setattr(self, field_name, [])

    def clear(self):
        """Empty every list in place (the list objects themselves are kept)."""
        for field_name in self._FIELDS:
            getattr(self, field_name).clear()

In [3]:
class Actor(nn.Module):
    """Policy network: maps a state to a probability for each discrete action.

    Three fully connected layers (state_dim -> 64 -> 64 -> action_dim) with
    tanh activations on the hidden layers and a softmax over the last axis.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.linear1 = nn.Linear(state_dim, 64)
        self.linear2 = nn.Linear(64, 64)
        self.linear3 = nn.Linear(64, action_dim)

    def forward(self, state):
        """Return the action probabilities for ``state`` (they sum to 1 along the last axis)."""
        hidden = self.linear1(state).tanh()
        hidden = self.linear2(hidden).tanh()
        logits = self.linear3(hidden)
        return logits.softmax(dim=-1)

In [4]:
class Critic(nn.Module):
    """Value network: maps a state to a single scalar value estimate.

    Three fully connected layers (state_dim -> 64 -> 64 -> 1) with tanh
    activations on the hidden layers and no activation on the output.
    """

    def __init__(self, state_dim):
        super().__init__()
        self.linear1 = nn.Linear(state_dim, 64)
        self.linear2 = nn.Linear(64, 64)
        self.linear3 = nn.Linear(64, 1)

    def forward(self, state):
        """Return the value estimate for ``state``; the last axis has size 1."""
        hidden = self.linear1(state).tanh()
        hidden = self.linear2(hidden).tanh()
        return self.linear3(hidden)

In [5]:
class ActorCritic(nn.Module):
    """Bundles an Actor and a Critic so both can be evaluated through one module."""

    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.action_dim = action_dim
        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim)

    def act(self, state):
        """Sample an action for ``state`` from the actor's categorical distribution.

        Returns:
            (action, log_prob): the sampled action and its log-probability,
            both detached from the autograd graph.
        """
        policy_dist = Categorical(probs=self.actor(state))
        sampled_action = policy_dist.sample()  # drawn at random according to the probabilities
        sampled_logprob = policy_dist.log_prob(sampled_action)
        return sampled_action.detach(), sampled_logprob.detach()

    def evaluate(self, state, action):
        """Score given actions under the current actor and critic.

        ``action`` does not have to have been sampled from this distribution;
        ``log_prob`` accepts any valid action index.

        Returns:
            (log_probs, state_values, entropy): log-probability of ``action``
            under the current policy, the critic's output for ``state``, and
            the entropy of the action distribution.
        """
        policy_dist = Categorical(probs=self.actor(state))
        log_probs = policy_dist.log_prob(action)
        values = self.critic(state)
        entropy = policy_dist.entropy()
        return log_probs, values, entropy

In [6]:
class PPO:
    """Clipped-surrogate PPO agent for an environment with discrete actions.

    Two copies of the actor-critic are kept. ``policy`` is the network being
    optimised. ``policy_old`` is the snapshot used to collect rollouts; the
    log-probabilities it produced form the denominator of the importance
    ratio. ``policy_old`` is synchronised with ``policy`` at construction
    time and at the end of every :meth:`update`.
    """

    def __init__(self, env, lr_actor, lr_critic, gamma, K_epochs, eps_clip):
        """Build the networks, the optimiser and an empty rollout buffer.

        Args:
            env: environment; ``observation_space.shape[0]`` and
                ``action_space.n`` give the network input/output sizes.
            lr_actor: Adam learning rate for the actor parameters.
            lr_critic: Adam learning rate for the critic parameters.
            gamma: discount factor used when computing returns.
            K_epochs: number of gradient steps taken per call to ``update``.
            eps_clip: the ratio is clamped to ``[1 - eps_clip, 1 + eps_clip]``.
        """
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.K_epochs = K_epochs
        self.eps_clip = eps_clip
        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(self.state_dim, self.action_dim).to(device)
        self.policy_old = ActorCritic(self.state_dim, self.action_dim).to(device)
        # Both networks are randomly initialised; copy the weights so that the
        # first rollout is collected by the same policy that is then optimised.
        self.policy_old.load_state_dict(self.policy.state_dict())

        # One optimiser with two parameter groups so that the actor and the
        # critic can use different learning rates.
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': lr_actor},
            {'params': self.policy.critic.parameters(), 'lr': lr_critic}
        ])

        self.loss_fn = nn.MSELoss()

    def select_action(self, state):
        """Sample an action from ``policy_old`` (used during training).

        Sampling means low-probability actions can still be chosen. The state
        tensor, the sampled action and its log-probability are appended to the
        rollout buffer as a side effect.

        Returns:
            The sampled action converted to a NumPy array.
        """
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            action, action_log_prob = self.policy_old.act(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_log_prob)

        return action.detach().cpu().numpy()

    def test_select_action(self, state):
        """Return the most probable action under ``policy_old`` (used for evaluation).

        Nothing is written to the rollout buffer.
        """
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            # .cpu() is required before .numpy() when the model lives on a GPU.
            action_prob = self.policy_old.actor(state).detach().cpu().numpy()
            action = np.argmax(action_prob)
        return action

    def update(self):
        """Run ``K_epochs`` PPO gradient steps on the buffered rollout, then reset.

        Steps: compute discounted returns (restarting at every terminal flag),
        normalise them, optimise the clipped surrogate loss plus a value loss
        and an entropy bonus on the whole buffer, copy the new weights into
        ``policy_old`` and clear the buffer.
        """
        # Discounted returns, accumulated from the last step backwards.
        # Appending and reversing once avoids the quadratic cost of insert(0, ...).
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        # Normalise the returns.
        rewards = torch.FloatTensor(returns).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)  # [N]

        # The buffer entries are already tensors, so stack them along a new
        # batch axis. Explicit reshapes are used instead of a bare squeeze(),
        # which would also drop the feature axis when state_dim == 1 or the
        # batch axis when the buffer holds a single transition.
        old_states = torch.stack(self.buffer.states, dim=0).detach().to(device).reshape(-1, self.state_dim)  # [N, state_dim]
        old_actions = torch.stack(self.buffer.actions, dim=0).detach().to(device).reshape(-1)                 # [N]
        old_logprobs = torch.stack(self.buffer.logprobs, dim=0).detach().to(device).reshape(-1)               # [N]

        for _ in range(self.K_epochs):
            # Evaluate the stored states/actions with the current actor and critic.
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # [N, 1] -> [N] so the values line up with the returns.
            state_values = state_values.reshape(-1)

            # Importance ratio pi_new(a|s) / pi_old(a|s).
            ratios = torch.exp(logprobs - old_logprobs)

            # The advantage must not back-propagate into the critic.
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

            # Clipped surrogate - entropy bonus + value loss (the MSE term is a
            # scalar that broadcasts over the per-sample terms).
            loss = -torch.min(surr1, surr2) - 0.01 * dist_entropy + 0.5 * self.loss_fn(state_values, rewards)
            loss = loss.mean()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # Hard update: the rollout policy becomes a copy of the optimised one.
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.buffer.clear()

    def save(self, checkpoint_path):
        """Write the parameters of ``policy_old`` (state dict only) to ``checkpoint_path``."""
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        """Load a state dict from ``checkpoint_path`` into both ``policy_old`` and ``policy``."""
        # Read the file once; map_location lets a checkpoint saved on a GPU be
        # restored on a CPU-only machine (and vice versa).
        state_dict = torch.load(checkpoint_path, map_location=device)
        self.policy_old.load_state_dict(state_dict)
        self.policy.load_state_dict(state_dict)

In [7]:
# ---- Environment --------------------------------------------------------
env_name = "CartPole-v0"

# ---- Schedule (all values are counted in environment steps) -------------
max_ep_len = 1000                       # cap on the number of steps in one episode
max_train_timesteps = 3 * 10 ** 6       # total step budget for training

print_freq = 10 * max_ep_len            # print the average reward every this many steps
save_model_freq = 10 ** 5               # save a checkpoint every this many steps
update_timestep = 4 * max_ep_len        # run a PPO update every 4000 collected steps

# ---- PPO hyperparameters ------------------------------------------------
K_epochs = 80                           # gradient steps per PPO update
eps_clip = 0.2
gamma = 0.99
lr_actor = 0.0003
lr_critic = 0.001

# ---- Environment and agent ----------------------------------------------
env = gym.make(env_name)

ppo_agent = PPO(
    env,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
    gamma=gamma,
    K_epochs=K_epochs,
    eps_clip=eps_clip,
)

# ---- Checkpoint location ------------------------------------------------
directory = '/'.join(['PPO_weigths', env_name, ''])
if not os.path.exists(directory):
    os.makedirs(directory)
# Every save overwrites this one file, so a full run leaves a single checkpoint.
checkpoint_path = directory + 'ppo_{}.pth'.format(env_name)

In [8]:
time_step = 0
i_episod = 0

# Running totals for the periodic average-reward printout.
print_running_reward = 0
print_running_episodes = 0

# Training stops once the total number of environment steps exceeds the budget.
while time_step <= max_train_timesteps:
    state = env.reset()
    current_ep_reward = 0
    ep_step = 0
    for t in range(max_ep_len):
        # select_action also stores the state, action and log-prob in the buffer.
        action = ppo_agent.select_action(state)
        # The new observation overwrites `state`; no separate next_state is kept.
        state, reward, done, _ = env.step(action)

        # An episode cut off by max_ep_len is followed by env.reset() just like
        # a finished one, so it has to be flagged as a boundary as well.
        # Otherwise update() would carry the next episode's rewards into this
        # episode's discounted return.
        episode_over = bool(done) or t == max_ep_len - 1

        ppo_agent.buffer.rewards.append(reward)
        ppo_agent.buffer.is_terminals.append(episode_over)

        time_step += 1
        current_ep_reward += reward
        ep_step += 1

        if time_step % update_timestep == 0:
            ppo_agent.update()

        # Print the mean episode reward over the episodes finished since the
        # last printout. Skipped when no episode finished in the window, which
        # would otherwise divide by zero.
        if time_step % print_freq == 0 and print_running_episodes > 0:
            print_avg_reward = print_running_reward / print_running_episodes
            print_avg_reward = round(print_avg_reward, 2)

            print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episod, time_step, print_avg_reward))

            print_running_reward = 0
            print_running_episodes = 0

        if time_step % save_model_freq == 0:
            ppo_agent.save(checkpoint_path)

        if done:
            break

    print_running_reward += current_ep_reward
    print_running_episodes += 1
    i_episod += 1

Episode : 366 		 Timestep : 10000 		 Average Reward : 27.19
Episode : 488 		 Timestep : 20000 		 Average Reward : 81.73
Episode : 546 		 Timestep : 30000 		 Average Reward : 171.71
Episode : 600 		 Timestep : 40000 		 Average Reward : 183.87
Episode : 653 		 Timestep : 50000 		 Average Reward : 190.94
Episode : 703 		 Timestep : 60000 		 Average Reward : 198.5
Episode : 754 		 Timestep : 70000 		 Average Reward : 195.53
Episode : 806 		 Timestep : 80000 		 Average Reward : 195.4
Episode : 858 		 Timestep : 90000 		 Average Reward : 191.12
Episode : 909 		 Timestep : 100000 		 Average Reward : 196.39
Episode : 959 		 Timestep : 110000 		 Average Reward : 200.0
Episode : 1009 		 Timestep : 120000 		 Average Reward : 200.0
Episode : 1059 		 Timestep : 130000 		 Average Reward : 198.48
Episode : 1111 		 Timestep : 140000 		 Average Reward : 194.25
Episode : 1162 		 Timestep : 150000 		 Average Reward : 193.94
Episode : 1215 		 Timestep : 160000 		 Average Reward : 190.7
Episode : 1266 		 T

KeyboardInterrupt: 

# Testing: load the saved checkpoint and run the greedy policy

In [9]:
# A second, freshly initialised agent built with the same hyperparameters.
ppo_agent_new = PPO(
    env,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
    gamma=gamma,
    K_epochs=K_epochs,
    eps_clip=eps_clip,
)

In [10]:
# Restore the saved parameters from the checkpoint file into the new agent.
ppo_agent_new.load(checkpoint_path=checkpoint_path)

In [11]:

# Render 100 evaluation episodes of at most 200 steps each, always taking the
# most probable action (test_select_action) instead of sampling.
for episode_idx in range(100):
    state = env.reset()
    done = False
    steps_taken = 0
    while not done and steps_taken < 200:
        env.render()
        action = ppo_agent_new.test_select_action(state)
        state, reward, done, _ = env.step(action)
        steps_taken += 1



KeyboardInterrupt: 