In [1]:
import os
import numpy as np

import parl
from parl import layers
from paddle import fluid
from parl.utils import logger
from parl.utils import action_mapping # 将神经网络输出映射到对应的 实际动作取值范围 内
from parl.utils import ReplayMemory # 经验回放

from rlschool import make_env  # 使用 RLSchool 创建飞行器环境
from copy import deepcopy

  return f(*args, **kwds)


### Model

In [2]:
class ActorModel(parl.Model):
    """Policy (actor) network: three ReLU hidden layers and a tanh output layer.

    Original author's note: obs_dim=16 -> act_dim=4 (the input width is not
    fixed in this class; it is determined by whatever tensor is fed in).
    """

    def __init__(self, act_dim):
        """Create the fully connected layers.

        Args:
            act_dim: width of the output layer.
        """
        hidden_width = 380

        self.act_dim = act_dim
        # Hidden stack, created in the same order it is applied in `policy`.
        self.fc1 = layers.fc(size=hidden_width, act='relu')
        self.fc2 = layers.fc(size=hidden_width, act='relu')
        self.fc3 = layers.fc(size=hidden_width, act='relu')
        # tanh bounds every output component to [-1, 1].
        self.fc4 = layers.fc(size=act_dim, act='tanh')

    def policy(self, obs):
        """Run `obs` through fc1 -> fc2 -> fc3 -> fc4 and return the result."""
        hidden = obs
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            hidden = hidden_layer(hidden)
        return self.fc4(hidden)


class CriticModel(parl.Model):
    """Q-value (critic) network: three ReLU hidden layers and a linear scalar head.

    Original author's note: [action_dim=4, obs_dim=19] -> input_dim=23,
    output_dim=1 (not verified here; the input width is whatever the
    concatenation of `obs` and `act` produces).
    """

    def __init__(self):
        """Create the fully connected layers."""
        hidden_width = 460

        self.fc1 = layers.fc(size=hidden_width, act='relu')
        self.fc2 = layers.fc(size=hidden_width, act='relu')
        self.fc3 = layers.fc(size=hidden_width, act='relu')
        # Linear head with a single output unit (no activation).
        self.fc4 = layers.fc(size=1, act=None)

    def value(self, obs, act):
        """Return Q(obs, act) with the trailing size-1 axis squeezed away.

        Args:
            obs: observation tensor.
            act: action tensor; concatenated with `obs` along axis 1.
        """
        hidden = layers.concat([obs, act], axis=1)
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            hidden = hidden_layer(hidden)
        q_value = self.fc4(hidden)
        return layers.squeeze(q_value, axes=[1])



class QuadrotorModel(parl.Model):
    """Bundles an `ActorModel` and a `CriticModel` behind one model object."""

    def __init__(self, act_dim):
        """Build the actor (with `act_dim` outputs) and then the critic."""
        actor = ActorModel(act_dim)
        critic = CriticModel()
        self.actor_model = actor
        self.critic_model = critic

    def policy(self, obs):
        """Delegate to the actor's `policy`."""
        action = self.actor_model.policy(obs)
        return action

    def value(self, obs, act):
        """Delegate to the critic's `value`."""
        q_value = self.critic_model.value(obs, act)
        return q_value

    def get_actor_params(self):
        """Return the actor sub-model's parameters."""
        actor_params = self.actor_model.parameters()
        return actor_params

### Algorithm

In [3]:
from parl.algorithms import DDPG # 也可以直接从parl库中快速引入DDPG算法，无需自己重新写算法

### Agent

In [4]:
class QuadrotorAgent(parl.Agent):
    """Agent that builds the predict/learn programs and feeds numpy data to them."""

    def __init__(self, algorithm, obs_dim, act_dim=4):
        """Store the data dimensions and initialise the base agent.

        Args:
            algorithm: algorithm object exposing `predict`, `learn` and
                `sync_target` (a DDPG instance in this notebook).
            obs_dim: length of one observation vector (must be an int).
            act_dim: length of one action vector (must be an int).
        """
        assert isinstance(obs_dim, int)
        assert isinstance(act_dim, int)
        # The dimensions must be set before the base constructor runs because
        # `build_program` reads them.
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(QuadrotorAgent, self).__init__(algorithm)

        # At start-up, fully copy the model parameters into the target model.
        self.alg.sync_target(decay=0)

    def build_program(self):
        """Define the prediction program and the learning program."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):  # prediction graph
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.pred_act = self.alg.predict(obs)

        with fluid.program_guard(self.learn_program):  # learning graph
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(
                name='act', shape=[self.act_dim], dtype='float32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs', shape=[self.obs_dim], dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            # Only the second value returned by the algorithm (the critic
            # cost) is kept for fetching.
            _, self.critic_cost = self.alg.learn(obs, act, reward, next_obs,
                                                 terminal)

    def predict(self, obs):
        """Run the prediction program on `obs` and return the fetched action.

        Args:
            obs: either a single observation (1-D, length `obs_dim`) or an
                already batched array. A batch axis is added only for 1-D
                input, so callers that batch the observation themselves no
                longer end up feeding a doubly nested array.

        Returns:
            The first fetched result of the prediction program
            (presumably shaped (batch, act_dim) -- TODO confirm).
        """
        obs = np.asarray(obs)
        if obs.ndim == 1:
            obs = np.expand_dims(obs, axis=0)
        act = self.fluid_executor.run(
            self.pred_program, feed={'obs': obs},
            fetch_list=[self.pred_act])[0]
        return act

    def learn(self, obs, act, reward, next_obs, terminal):
        """Run one learning step on a batch, then soft-update the target model.

        Args:
            obs, act, reward, next_obs, terminal: batched transition arrays,
                fed to the learning program under the same names.

        Returns:
            The critic cost fetched from the learning program.
        """
        feed = {
            'obs': obs,
            'act': act,
            'reward': reward,
            'next_obs': next_obs,
            'terminal': terminal
        }
        critic_cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.critic_cost])[0]
        self.alg.sync_target()
        return critic_cost

In [12]:
def run_episode(env, agent, rpm, noise_std=0.5):
    """Play one training episode, storing transitions and training the agent.

    Args:
        env: environment exposing `reset`, `step` and `action_space.low/high`.
        agent: object exposing `predict(obs)` and `learn(...)`.
        rpm: replay memory exposing `append`, `size` and `sample_batch`.
        noise_std: standard deviation of the Gaussian exploration noise added
            to the policy output (defaults to the previously hard-coded 0.5).

    Returns:
        (total_reward, steps): the unscaled sum of environment rewards and the
        number of environment steps taken in this episode.
    """
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        policy_input = np.expand_dims(obs, axis=0).astype('float32')
        action = np.squeeze(agent.predict(policy_input))

        # Add exploration noise, then keep the action inside [-1.0, 1.0].
        action = np.clip(np.random.normal(action, noise_std), -1.0, 1.0)

        # Map the normalised action onto the environment's real action range
        # (original author's note: 0.1~15.0). The replay memory stores the
        # un-mapped action, the environment receives the mapped one.
        env_action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])

        next_obs, reward, done, _ = env.step(env_action)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)

        # Start learning only once the memory holds more than the warm-up size.
        if rpm.size() > MEMORY_WARMUP_SIZE:
            sample_obs, sample_act, sample_reward, sample_next_obs, \
                sample_terminal = rpm.sample_batch(BATCH_SIZE)
            agent.learn(sample_obs, sample_act, sample_reward,
                        sample_next_obs, sample_terminal)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward, steps


def evaluate(env, agent, episodes=1):
    """Run the policy without exploration noise and return the mean episode reward.

    Args:
        env: environment exposing `reset`, `step` and `action_space.low/high`.
        agent: object exposing `predict(obs)`.
        episodes: number of evaluation episodes to average over (defaults to
            the previously hard-coded 1).

    Returns:
        The mean of the per-episode reward sums.

    Raises:
        ValueError: if `episodes` is smaller than 1.
    """
    if episodes < 1:
        raise ValueError("episodes must be >= 1, got {}".format(episodes))

    eval_reward = []
    for _ in range(episodes):
        obs = env.reset()
        total_reward = 0
        done = False
        while not done:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            # Keep the policy output inside [-1.0, 1.0] before mapping it onto
            # the environment's real action range.
            action = np.clip(np.squeeze(action), -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])

            obs, reward, done, _ = env.step(action)
            total_reward += reward
        eval_reward.append(total_reward)
    print(">>>{}>>>".format(eval_reward))
    return np.mean(eval_reward)

In [13]:
# Hyperparameters

# --- optimisation ---
ACTOR_LR = 1e-6     # learning rate used to update the actor network
CRITIC_LR = 1e-7    # learning rate used to update the critic network
GAMMA = 0.99        # reward discount factor (author's note: usually 0.9 to 0.999)
TAU = 0.001         # soft-update coefficient for syncing target_model with model
REWARD_SCALE = 0.01 # factor applied to rewards before they are stored

# --- replay memory ---
MEMORY_SIZE = 3e4         # replay memory capacity; larger values use more RAM
MEMORY_WARMUP_SIZE = 1e4  # transitions to collect before learning starts
BATCH_SIZE = 128          # transitions sampled from replay memory per learn call

# --- schedule ---
TRAIN_TOTAL_STEPS = 1e6   # total number of training steps
TEST_EVERY_STEPS = 5e3    # evaluate the policy roughly every N training steps

In [None]:
# Create the Quadrotor environment and read the vector sizes from its spaces.
env = make_env("Quadrotor", task="velocity_control", seed=0)
env.reset()
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]

# Model -> algorithm -> agent.
model = QuadrotorModel(act_dim=act_dim)
algorithm = DDPG(
    model,
    gamma=GAMMA,
    tau=TAU,
    actor_lr=ACTOR_LR,
    critic_lr=CRITIC_LR,
)
agent = QuadrotorAgent(algorithm, obs_dim, act_dim)

# Replay memory shipped with PARL (imported from parl.utils); the capacity
# constant is a float, so it is cast to int here.
rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

In [None]:
# Start training.
# Author's note of another checkpoint name: run4_steps_801000_reward_-19.98
ckpt = './run1_steps_110000_reward_-19.84.ckpt'
# Resume from the checkpoint only when it is actually present, so that a fresh
# "Restart & Run All" does not fail on a missing file.
if os.path.exists(ckpt):
    agent.restore(ckpt)
else:
    logger.info('Checkpoint {} not found, training from scratch'.format(ckpt))

test_flag = 0
total_steps = 0

Rplot = []  # rounded evaluation rewards, one entry per evaluation
while total_steps < TRAIN_TOTAL_STEPS:
    train_reward, steps = run_episode(env, agent, rpm)
    total_steps += steps
    logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))  # training reward

    # Evaluate whenever the step counter has crossed the next multiple of
    # TEST_EVERY_STEPS.
    if total_steps // TEST_EVERY_STEPS >= test_flag:
        # Move the threshold just past the current multiple.
        test_flag = int(total_steps // TEST_EVERY_STEPS) + 1

        evaluate_reward = evaluate(env, agent)
        logger.info(' 😀Steps {}, Test reward: {}😀'.format(
            total_steps, np.round(evaluate_reward, 5)))  # evaluation reward

        Rplot.append(np.round(evaluate_reward, 2))

        # Save the model after every evaluation. The file name carries the
        # training step count and uses the same `run1_steps_<N>_reward_<R>`
        # pattern as the checkpoint paths restored in this notebook.
        ckpt = './run1_steps_{}_reward_{}.ckpt'.format(
            total_steps, np.round(evaluate_reward, 2))
        agent.save(ckpt)

### Test

In [23]:
# Load a saved checkpoint and play one rendered episode with the bare policy
# (no exploration noise), accumulating the episode reward in `r`.
ckpt = './run1_steps_110000_reward_-19.84.ckpt'
agent.restore(ckpt)
r = 0

env = make_env("Quadrotor", task="velocity_control", seed=0)
obs = env.reset()

done = False
while not done:
    env.render()
    batch_obs = np.expand_dims(obs, axis=0)
    action = np.squeeze(agent.predict(batch_obs.astype('float32')))
    # Clip to [-1.0, 1.0], then map onto the environment's real action range.
    action = action_mapping(np.clip(action, -1.0, 1.0),
                            env.action_space.low[0],
                            env.action_space.high[0])

    next_obs, reward, done, info = env.step(action)
    obs = next_obs
    r += reward

# The episode has ended: release the environment before reporting.
env.close()

print(r)



-19.18377667987661
