# 大作业：使用DDPG解决四轴飞行器悬浮任务

# Step1 安装依赖

In [None]:
!pip uninstall -y parl  # Note: the parl version preinstalled on AIStudio is too old and tends to conflict with other libraries, so uninstall it first
!pip uninstall -y pandas scikit-learn # Tip: on AIStudio, uninstalling these two libraries before importing parl avoids warning messages; leaving them installed does not affect parl

!pip install paddlepaddle-gpu==1.6.3.post107
!pip install parl==1.3.1
!pip install rlschool==0.3.1

# Note: the two red ERROR lines about paddlehub and visualdl in the install log are unrelated to parl and can be ignored

In [None]:
# Check that the installed dependency versions are correct.
# NOTE(review): the recorded output of this cell lists paddlepaddle-gpu 1.6.2.post97, which
# differs from the 1.6.3.post107 pinned by the install commands — confirm which build is in use.
!pip list | grep paddlepaddle
!pip list | grep parl
!pip list | grep rlschool

paddlepaddle-gpu     1.6.2.post97   
parl                 1.3.1          
rlschool             0.3.1          


# Step2 导入依赖

In [None]:
import os
import numpy as np

import parl
from parl import layers
from paddle import fluid
from parl.utils import logger
from parl.utils import action_mapping # 将神经网络输出映射到对应的 实际动作取值范围 内
from parl.utils import ReplayMemory # 经验回放

from rlschool import make_env  # 使用 RLSchool 创建飞行器环境

# Step3 设置超参数

In [None]:
# ----------------------------------------------------------------------
# Hyperparameters for the DDPG run.
# Task 1: choose the learning rates; try raising and lowering them and
# compare the results.
# ----------------------------------------------------------------------
ACTOR_LR = 5e-4     # learning rate for Actor network updates
d_alr = 1e-6        # decrement the training cells subtract from the actor learning rate
CRITIC_LR = 5e-3    # learning rate for Critic network updates
d_clr = 1e-5        # decrement the training cells subtract from the critic learning rate

GAMMA = 0.98                 # reward discount factor, usually somewhere between 0.9 and 0.999
TAU = 1e-2                   # soft-update coefficient used when syncing target_model with model
MEMORY_SIZE = 1e6            # replay memory capacity; larger values use more RAM
MEMORY_WARMUP_SIZE = 1e4     # transitions to collect before batches are sampled for learning
REWARD_SCALE = 1e-2          # scaling factor applied to the reward
BATCH_SIZE = 256             # number of transitions sampled from replay memory per learn step
TRAIN_TOTAL_STEPS = 1e6      # total number of training steps
TEST_EVERY_STEPS = 1e4       # evaluate every N steps; each evaluation averages the reward of 5 episodes

# Step4 搭建Model、Algorithm、Agent架构
* `Agent`把产生的数据传给`algorithm`，`algorithm`根据`model`的模型结构计算出`Loss`，并使用`SGD`或者其他优化器不断地优化。`PARL`的这种架构可以很方便地应用在各类深度强化学习问题中。

## （1）Model
* 分别搭建`Actor`、`Critic`的`Model`结构，构建`QuadrotorModel`。

In [None]:
class ActorModel(parl.Model):
    """Two-branch policy network.

    * Offset branch: four 64-unit ReLU layers followed by an ``act_dim``-wide
      tanh layer whose output is scaled by 0.2.
    * Main-power branch: four 64-unit ReLU layers followed by a single-unit
      tanh layer.

    The two branch outputs are combined with ``elementwise_add`` and passed
    through one more ``act_dim``-wide tanh layer, so the result lies in [-1, 1].
    """

    def __init__(self, act_dim):
        # The layers are created in the same order as before; presumably
        # auto-generated parameter names (and therefore saved checkpoints)
        # depend on that order — verify before reordering.

        # Offset branch
        self.fc1 = layers.fc(size=64, act='relu')
        self.fc2 = layers.fc(size=64, act='relu')
        self.fc3 = layers.fc(size=64, act='relu')
        self.fc4 = layers.fc(size=64, act='relu')
        self.fc5 = layers.fc(size=act_dim, act='tanh')

        # Main-power branch
        self.pfc1 = layers.fc(size=64, act='relu')
        self.pfc2 = layers.fc(size=64, act='relu')
        self.pfc3 = layers.fc(size=64, act='relu')
        self.pfc4 = layers.fc(size=64, act='relu')
        self.pfc5 = layers.fc(size=1, act='tanh')

        # Output layer applied to the sum of both branches
        self.final = layers.fc(size=act_dim, act='tanh')

    def policy(self, obs):
        """Return the action output for ``obs`` (final activation is tanh)."""
        # Offset branch, damped to 20% of its tanh range.
        offset = obs
        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            offset = hidden_layer(offset)
        offset = self.fc5(offset) * 0.2

        # Main-power branch producing a single value per sample.
        power = obs
        for hidden_layer in (self.pfc1, self.pfc2, self.pfc3, self.pfc4):
            power = hidden_layer(power)
        power = self.pfc5(power)

        # The single-unit power output is presumably broadcast across the
        # act_dim offset columns by elementwise_add — TODO confirm.
        combined = fluid.layers.elementwise_add(offset, power)
        return self.final(combined)


In [None]:
class CriticModel(parl.Model):
    """Q-network: four 64-unit ReLU layers followed by a linear single-unit output."""

    def __init__(self):
        self.fc1 = layers.fc(size=64, act='relu')
        self.fc2 = layers.fc(size=64, act='relu')
        self.fc3 = layers.fc(size=64, act='relu')
        self.fc4 = layers.fc(size=64, act='relu')
        self.fc5 = layers.fc(size=1, act=None)

    def value(self, obs, act):
        """Take a state and an action and return the corresponding Q(s, a).

        ``obs`` and ``act`` are concatenated along axis 1 before entering the
        network; axis 1 of the final single-unit output is squeezed away.
        """
        hidden = layers.concat([obs, act], axis=1)
        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = hidden_layer(hidden)
        q_value = self.fc5(hidden)
        return layers.squeeze(q_value, axes=[1])

In [None]:
class QuadrotorModel(parl.Model):
    """Bundles an ``ActorModel`` and a ``CriticModel`` behind a single model object."""

    def __init__(self, act_dim):
        # Actor first, then critic — same construction order as before.
        self.actor_model = ActorModel(act_dim)
        self.critic_model = CriticModel()

    def policy(self, obs):
        """Delegate to the actor: return its action output for ``obs``."""
        actor = self.actor_model
        return actor.policy(obs)

    def value(self, obs, act):
        """Delegate to the critic: return Q(obs, act)."""
        critic = self.critic_model
        return critic.value(obs, act)

    def get_actor_params(self):
        """Return the parameters of the actor sub-model only."""
        actor = self.actor_model
        return actor.parameters()

## （2）Algorithm
* 可以采用下面的方式从`parl`库中快速引入`DDPG`算法，无需自己重新写算法

In [None]:
from parl.algorithms import DDPG

## （3）Agent

In [None]:
class QuadrotorAgent(parl.Agent):
    """Agent that builds the prediction and learning programs and runs them."""

    def __init__(self, algorithm, obs_dim, act_dim=4):
        for dim in (obs_dim, act_dim):
            assert isinstance(dim, int)
        self.obs_dim, self.act_dim = obs_dim, act_dim
        super(QuadrotorAgent, self).__init__(algorithm)

        # At the very beginning, fully synchronize the parameters of
        # target_model with those of model.
        self.alg.sync_target(decay=0)

    def build_program(self):
        """Create ``pred_program`` and ``learn_program`` with their input layers."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        def float_input(name, shape):
            # Declare a float32 input layer in whichever program is active.
            return layers.data(name=name, shape=shape, dtype='float32')

        with fluid.program_guard(self.pred_program):
            obs = float_input('obs', [self.obs_dim])
            self.pred_act = self.alg.predict(obs)

        with fluid.program_guard(self.learn_program):
            obs = float_input('obs', [self.obs_dim])
            act = float_input('act', [self.act_dim])
            reward = float_input('reward', [])
            next_obs = float_input('next_obs', [self.obs_dim])
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            # Only the second value returned by alg.learn is kept, as the critic cost.
            _, self.critic_cost = self.alg.learn(obs, act, reward, next_obs,
                                                 terminal)

    def predict(self, obs):
        """Add a leading axis to ``obs``, run ``pred_program`` and return the fetched action."""
        batched_obs = np.expand_dims(obs, axis=0)
        fetched = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batched_obs},
            fetch_list=[self.pred_act])
        return fetched[0]

    def learn(self, obs, act, reward, next_obs, terminal):
        """Run one learn step on a batch, soft-sync the target model, return the critic cost."""
        feed = dict(
            obs=obs,
            act=act,
            reward=reward,
            next_obs=next_obs,
            terminal=terminal)
        fetched = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.critic_cost])
        critic_cost = fetched[0]
        self.alg.sync_target()
        return critic_cost


# Step4 Training && Test（训练&&测试）

In [None]:
def run_episode(env, agent, rpm):
    """Run one training episode with exploration noise, storing transitions and learning.

    Args:
        env: environment providing ``reset()``, ``step(action)`` and
            ``action_space.low`` / ``action_space.high``.
        agent: object providing ``predict(obs)`` and
            ``learn(obs, act, reward, next_obs, terminal)``.
        rpm: replay memory providing ``append(...)``, ``size()`` and
            ``sample_batch(batch_size)``.

    Returns:
        tuple: ``(total_reward, steps)`` — the unscaled episode reward and the
        number of environment steps taken.

    Note:
        The replay memory stores the *normalized* action in [-1, 1], not the
        env-scale action produced by ``action_mapping``. The actor's output is
        tanh-bounded, and PARL's DDPG presumably evaluates the critic on those
        policy outputs, so the stored actions must be on the same scale for
        the critic to be trained and queried consistently. Critics restored
        from checkpoints trained on env-scale actions will need to re-adapt.
    """
    obs = env.reset()
    total_reward, steps = 0, 0
    # The action bounds are read once instead of on every step.
    act_low = env.action_space.low[0]
    act_high = env.action_space.high[0]

    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)

        # Add exploration noise and keep the result inside [-1.0, 1.0].
        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
        # Map to the environment's real action range only for env.step;
        # action_mapping is imported from parl.utils.
        env_action = action_mapping(action, act_low, act_high)

        next_obs, reward, done, info = env.step(env_action)
        # Store the normalized action (see Note in the docstring).
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)

        # Learn only once the replay memory holds enough warm-up data.
        if rpm.size() > MEMORY_WARMUP_SIZE:
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_terminal) = rpm.sample_batch(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward,
                        batch_next_obs, batch_terminal)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward, steps

# Evaluate the agent: run several episodes without exploration noise and
# average the total reward.
def evaluate(env, agent, eval_episodes=5):
    """Run evaluation episodes and return the mean total reward.

    Args:
        env: environment providing ``reset()``, ``step(action)`` and
            ``action_space.low`` / ``action_space.high``.
        agent: object providing ``predict(obs)``.
        eval_episodes: number of episodes to average over (default 5, the
            previously hard-coded value). Should be positive; ``np.mean`` of
            an empty list is NaN.

    Returns:
        The mean (``np.mean``) of the per-episode total rewards.
    """
    act_low = env.action_space.low[0]
    act_high = env.action_space.high[0]

    eval_reward = []
    for _ in range(eval_episodes):
        obs = env.reset()
        total_reward = 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            try:
                env_action = action_mapping(action, act_low, act_high)
            except AssertionError:
                # Keep the diagnostic print, but never step the environment
                # with the raw, unmapped network output: replace NaN/inf,
                # clip into [-1, 1], then map to the real action range.
                print(action)
                safe_action = np.clip(np.nan_to_num(action), -1.0, 1.0)
                env_action = action_mapping(safe_action, act_low, act_high)

            next_obs, reward, done, info = env.step(env_action)

            obs = next_obs
            total_reward += reward

            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)

# Step 5 创建环境和Agent，创建经验池，启动训练，定期保存模型

In [None]:
# Create the quadrotor environment and read the observation / action sizes.
env = make_env("Quadrotor", task="hovering_control")
env.reset()
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]

# ----------------------------------------------------------------------
# Task 6: build the agent with the parl framework by nesting
# QuadrotorModel -> DDPG -> QuadrotorAgent.
# ----------------------------------------------------------------------
model = QuadrotorModel(act_dim)
algorithm = DDPG(
    model,
    gamma=GAMMA,
    tau=TAU,
    actor_lr=ACTOR_LR,
    critic_lr=CRITIC_LR)
agent = QuadrotorAgent(algorithm, obs_dim=obs_dim, act_dim=act_dim)

# parl also ships a ReplayMemory for DDPG; it is imported from parl.utils.
rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

[32m[06-22 19:00:50 MainThread @machine_info.py:86][0m nvidia-smi -L found gpu count: 1
[32m[06-22 19:00:51 MainThread @machine_info.py:86][0m nvidia-smi -L found gpu count: 1
[32m[06-22 19:00:53 MainThread @machine_info.py:86][0m nvidia-smi -L found gpu count: 1


In [None]:
# Something went wrong and the previous run was interrupted; continue from the
# earlier model.
# NOTE(review): checkpoints in this cell are saved under 'model_dir-256/', but
# this restore path has no directory prefix — confirm the file location.
ckpt = 'steps_20099.ckpt'
agent.restore(ckpt)

# Show the hyperparameters currently stored on the algorithm object.
print(agent.alg.actor_lr)
print(agent.alg.critic_lr)
print(agent.alg.tau)

# NOTE(review): every learning-rate adjustment below only rewrites the Python
# attributes agent.alg.actor_lr / agent.alg.critic_lr. Presumably the
# optimizers were created from the initial values when the agent built its
# learn program, in which case these adjustments never reach the optimizers —
# verify against PARL's DDPG implementation.

# Start training
test_flag = 0
total_steps = 0
best_reward = 4000
# One-shot flags: each learning-rate milestone fires at most once.
F1 = True
F2 = True
F3 = True
F4 = True
while total_steps < TRAIN_TOTAL_STEPS:
    # Step the learning rates down at 1/8, 1/4, 1/2 and 3/4 of the training budget.
    if F1 and total_steps >= int(TRAIN_TOTAL_STEPS/8):
        agent.alg.actor_lr /= 2.5
        agent.alg.critic_lr /= 2.5
        print(agent.alg.actor_lr)
        print(agent.alg.critic_lr)
        F1 = False
    if F2 and total_steps >= int(TRAIN_TOTAL_STEPS/4):
        agent.alg.actor_lr /= 2
        agent.alg.critic_lr /= 2
        print(agent.alg.actor_lr)
        print(agent.alg.critic_lr)
        F2 = False
    if F3 and total_steps >= int(TRAIN_TOTAL_STEPS/2):
        agent.alg.actor_lr /= 2
        agent.alg.critic_lr /= 2
        print(agent.alg.actor_lr)
        print(agent.alg.critic_lr)
        F3 = False
    if F4 and total_steps >= int(TRAIN_TOTAL_STEPS*3/4):
        agent.alg.actor_lr /= 2
        agent.alg.critic_lr /= 2
        print(agent.alg.actor_lr)
        print(agent.alg.critic_lr)
        F4 = False

    train_reward, steps = run_episode(env, agent, rpm)
    total_steps += steps
    logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) # log the training reward
    # A new best training reward also nudges the learning rates down.
    if best_reward < train_reward:
        agent.alg.actor_lr -= d_alr
        agent.alg.critic_lr -= d_clr
        best_reward = train_reward
        print(agent.alg.actor_lr)
        print(agent.alg.critic_lr)
    if total_steps // TEST_EVERY_STEPS >= test_flag: # evaluate the model every TEST_EVERY_STEPS steps
        # Advance the marker past the current interval in one assignment
        # (same end value as incrementing it in a loop).
        test_flag = int(total_steps // TEST_EVERY_STEPS) + 1

        evaluate_reward = evaluate(env, agent)
        logger.info('Steps {}, Test reward: {}'.format(
            total_steps, evaluate_reward)) # log the evaluation reward

        # Save the model after every evaluation, named by the number of training steps.
        ckpt = 'model_dir-256/steps_{}.ckpt'.format(total_steps)
        agent.save(ckpt)
        # Reduce the learning rates. The higher threshold is tested first;
        # in the previous order the >= 14000 branch could never run because
        # >= 8000 always matched before it.
        if evaluate_reward >= 14000:
            agent.alg.actor_lr -= 2*d_alr
            agent.alg.critic_lr -= 2*d_clr
            print(agent.alg.actor_lr)
            print(agent.alg.critic_lr)
        elif evaluate_reward >= 8000:
            agent.alg.actor_lr -= d_alr
            agent.alg.critic_lr -= d_clr
            print(agent.alg.actor_lr)
            print(agent.alg.critic_lr)
        if best_reward < evaluate_reward:
            agent.alg.actor_lr -= d_alr
            agent.alg.critic_lr -= d_clr
            best_reward = evaluate_reward
            print(agent.alg.actor_lr)
            print(agent.alg.critic_lr)

0.0005
0.005
0.01
[32m[06-22 01:30:22 MainThread @<ipython-input-13-c5cd17859351>:48][0m Steps: 479 Reward: -3082.3572891572935
[32m[06-22 01:30:39 MainThread @<ipython-input-13-c5cd17859351>:61][0m Steps 479, Test reward: -792.4926482843719
[32m[06-22 01:30:41 MainThread @<ipython-input-13-c5cd17859351>:48][0m Steps: 614 Reward: -732.0957787522317
[32m[06-22 01:30:44 MainThread @<ipython-input-13-c5cd17859351>:48][0m Steps: 801 Reward: -428.6034888177062
[32m[06-22 01:30:49 MainThread @<ipython-input-13-c5cd17859351>:48][0m Steps: 1143 Reward: -102.45102452087123
[32m[06-22 01:30:52 MainThread @<ipython-input-13-c5cd17859351>:48][0m Steps: 1353 Reward: -178.11484636336766
[32m[06-22 01:30:58 MainThread @<ipython-input-13-c5cd17859351>:48][0m Steps: 1834 Reward: -2884.5686448912948
[32m[06-22 01:31:03 MainThread @<ipython-input-13-c5cd17859351>:48][0m Steps: 2129 Reward: -1139.8364072585905
[32m[06-22 01:31:12 MainThread @<ipython-input-13-c5cd17859351>:48][0m Steps: 

In [13]:
# Something went wrong again and the run was interrupted; continue from the
# earlier model.
# NOTE(review): checkpoints in this cell are saved under 'model_dir-256/', but
# this restore path has no directory prefix — confirm the file location.
ckpt = 'steps_251389.ckpt'
agent.restore(ckpt)

# Manually put back the learning rates reached before the interruption.
agent.alg.actor_lr = 9.450000000000002e-05
agent.alg.critic_lr = 0.0009449999999999999
print(agent.alg.actor_lr)
print(agent.alg.critic_lr)
print(agent.alg.tau)

# NOTE(review): the assignments above and every adjustment below only rewrite
# the Python attributes agent.alg.actor_lr / agent.alg.critic_lr. Presumably
# the optimizers were created from the initial values when the agent built its
# learn program, in which case these values never reach the optimizers —
# verify against PARL's DDPG implementation.

# Start training, resuming the step counter and best reward of the interrupted run.
test_flag = 0
total_steps = 251389
best_reward = 8907
# One-shot flags: the 1/8 and 1/4 milestones are marked as already applied.
F1 = False
F2 = False
F3 = True
F4 = True
while total_steps < TRAIN_TOTAL_STEPS:
    # Step the learning rates down at 1/8, 1/4, 1/2 and 3/4 of the training budget.
    if F1 and total_steps >= int(TRAIN_TOTAL_STEPS/8):
        agent.alg.actor_lr /= 2.5
        agent.alg.critic_lr /= 2.5
        print(agent.alg.actor_lr)
        print(agent.alg.critic_lr)
        F1 = False
    if F2 and total_steps >= int(TRAIN_TOTAL_STEPS/4):
        agent.alg.actor_lr /= 2
        agent.alg.critic_lr /= 2
        print(agent.alg.actor_lr)
        print(agent.alg.critic_lr)
        F2 = False
    if F3 and total_steps >= int(TRAIN_TOTAL_STEPS/2):
        agent.alg.actor_lr /= 2
        agent.alg.critic_lr /= 2
        print(agent.alg.actor_lr)
        print(agent.alg.critic_lr)
        F3 = False
    if F4 and total_steps >= int(TRAIN_TOTAL_STEPS*3/4):
        agent.alg.actor_lr /= 2
        agent.alg.critic_lr /= 2
        print(agent.alg.actor_lr)
        print(agent.alg.critic_lr)
        F4 = False

    train_reward, steps = run_episode(env, agent, rpm)
    total_steps += steps
    logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) # log the training reward
    # A new best training reward also nudges the learning rates down.
    if best_reward < train_reward:
        agent.alg.actor_lr -= d_alr
        agent.alg.critic_lr -= d_clr
        best_reward = train_reward
        print(agent.alg.actor_lr)
        print(agent.alg.critic_lr)
    if total_steps // TEST_EVERY_STEPS >= test_flag: # evaluate the model every TEST_EVERY_STEPS steps
        # Advance the marker past the current interval in one assignment
        # (same end value as incrementing it in a loop).
        test_flag = int(total_steps // TEST_EVERY_STEPS) + 1

        evaluate_reward = evaluate(env, agent)
        logger.info('Steps {}, Test reward: {}'.format(
            total_steps, evaluate_reward)) # log the evaluation reward

        # Save the model after every evaluation, named by the number of training steps.
        ckpt = 'model_dir-256/steps_{}.ckpt'.format(total_steps)
        agent.save(ckpt)
        # Reduce the learning rates. The higher threshold is tested first;
        # in the previous order the >= 14000 branch could never run because
        # >= 8000 always matched before it.
        if evaluate_reward >= 14000:
            agent.alg.actor_lr -= 2*d_alr
            agent.alg.critic_lr -= 2*d_clr
            print(agent.alg.actor_lr)
            print(agent.alg.critic_lr)
        elif evaluate_reward >= 8000:
            agent.alg.actor_lr -= d_alr
            agent.alg.critic_lr -= d_clr
            print(agent.alg.actor_lr)
            print(agent.alg.critic_lr)
        if best_reward < evaluate_reward:
            agent.alg.actor_lr -= d_alr
            agent.alg.critic_lr -= d_clr
            best_reward = evaluate_reward
            print(agent.alg.actor_lr)
            print(agent.alg.critic_lr)

9.450000000000002e-05
0.0009449999999999999
0.01
[32m[06-22 19:33:54 MainThread @<ipython-input-13-8839a3c1c5fe>:48][0m Steps: 252224 Reward: 3599.1211132883773
[32m[06-22 19:34:50 MainThread @<ipython-input-13-8839a3c1c5fe>:61][0m Steps 252224, Test reward: 5797.995847665563
[32m[06-22 19:35:36 MainThread @<ipython-input-13-8839a3c1c5fe>:48][0m Steps: 253053 Reward: 4112.612039677296
[32m[06-22 19:36:32 MainThread @<ipython-input-13-8839a3c1c5fe>:48][0m Steps: 254053 Reward: 8190.797798864596
[32m[06-22 19:37:27 MainThread @<ipython-input-13-8839a3c1c5fe>:48][0m Steps: 255053 Reward: 2367.30257052315
[32m[06-22 19:38:21 MainThread @<ipython-input-13-8839a3c1c5fe>:48][0m Steps: 256053 Reward: 6139.15499807943
[32m[06-22 19:39:17 MainThread @<ipython-input-13-8839a3c1c5fe>:48][0m Steps: 257053 Reward: -2022.8482708747588
[32m[06-22 19:40:13 MainThread @<ipython-input-13-8839a3c1c5fe>:48][0m Steps: 258053 Reward: -2641.1988101147144
[32m[06-22 19:40:45 MainThread @<ipyth

KeyboardInterrupt: 

# 验收测评

In [20]:
######################################################################
######################################################################
#
# 7. Choose the best model file from your training runs and evaluate it
#
######################################################################
######################################################################

# The message printed below is the author's note (in Chinese): an earlier, better run on
# Colab was lost together with its saved model, the AIStudio rerun only reached a bit over
# 300k steps without fully converging, and this checkpoint is the best model available.
print("之前较好那版是在colab上跑的 结果大概是因为runtime超了 就蹦了 存的model也没了 紧急用aistudio又跑了一下 只到三十多万 时间不够了没完全收敛 这是目前最好的model了")

ckpt = 'steps_251389.ckpt'  # Set ckpt to the model file saved at the best evaluation during training

agent.restore(ckpt)
evaluate_reward = evaluate(env, agent)
logger.info('Evaluate reward: {}'.format(evaluate_reward)) # Log the evaluation reward


之前较好那版是在colab上跑的 结果大概是因为runtime超了 就蹦了 存的model也没了 紧急用aistudio又跑了一下 只到三十多万 时间不够了没完全收敛 这是目前最好的model了
[32m[06-22 21:54:31 MainThread @<ipython-input-20-42922b0ec664>:15][0m Evaluate reward: 8934.268521856224
