In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Hyper Parameters
BATCH_SIZE = 32
LR = 0.01  # 学习率
EPSILON = 0.9  # 贪心策略
GAMMA = 0.9  # 奖励衰减
TARGET_REPLACE_ITER = 100  # 目标更新频率
MEMORY_CAPACITY = 1000
env = gym.make('MountainCarContinuous-v0')
# env = gym.make('CartPole-v1')
env = env.unwrapped
N_ACTIONS = env.action_space
N_STATES = env.observation_space.shape[0]
# ENV_A_SHAPE = 0 if isinstance(env.action_space.sample()[0], int) else env.action_space.sample().shape  # 确认形状
print(N_STATES)
print(N_ACTIONS)

2
Box([-1.], [1.], (1,), float32)


In [3]:
a = torch.randn(4, 6)
a

tensor([[-1.9829, -1.1503,  0.3044,  0.2053,  1.0183, -2.4295],
        [-1.0728,  0.0179, -1.3310, -1.0436,  0.5158,  0.9143],
        [-0.3082,  0.7925, -0.3971, -0.2602, -1.8656, -2.8243],
        [-1.1412,  0.2394,  0.7934, -0.2238, -0.5671,  1.6657]])

In [4]:
torch.max(a, 1)[1]

tensor([4, 5, 1, 5])

In [5]:
class DQN(object):  # 强化神经网络
    def __init__(self):
        self.eval_net, self.target_net = Net(), Net()  # 定义两个网络：评估网络 & 目标网络
        self.learn_step_counter = 0  # 用于目标网络延迟更新
        self.memory_counter = 0  # 存储计数器
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))  # 初始化记忆全为 0
        # 记忆单元为 当前状态 行为 奖励 操作后状态
        # 单元数为 状态维度 * 2 + 行为维度 + 奖励维度(1)
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)  # todo Adam 优化器
        self.loss_func = nn.MSELoss()  # 均方误差损失函数

    def choose_action(self, x):  # 传入当前的 State，计算行为
        x = torch.unsqueeze(torch.FloatTensor(x), 0)  # 解包
        # 只输入一个样本
        if np.random.uniform() < EPSILON:  # 90% 概率使用评估网络的结果
            actions_value = self.eval_net.forward(x)  # 使用评估网络获取行为
            # print(actions_value)
            action = (torch.max(actions_value, 1)[1].data.numpy()[0] - 0.5)*2
            # print(torch.max(actions_value, 1))
            # print('{action}【',action,'】')
            # 返回argmax索引
        else:  # 随机
            action = np.random.randint(-1, 1)
        return action

    def store_transition(self, s, a, r, s_):
        # print('[state]:',s)
        # print('[action,reward]:',[a, r])
        # print('[stated]:',s_)
        transition = np.hstack((s, [a, r], s_))
        # 用新内存替换旧内存
        index = self.memory_counter % MEMORY_CAPACITY
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        # 目标参数更新
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # 样本批次转换
        sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :N_STATES])
        b_a = torch.FloatTensor(b_memory[:, N_STATES:N_STATES + 1].astype(float))
        b_r = torch.FloatTensor(b_memory[:, N_STATES + 1:N_STATES + 2])
        b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:])

        # q_eval w.r.t体验中的行动
        q_eval = self.eval_net(b_s)  # shape (batch, 1)
        q_next = self.target_net(b_s_).detach()  # detach from graph, don't backpropagate
        q_target = b_r + GAMMA * q_next.max(1)[0].view(BATCH_SIZE, 1)  # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [6]:
class Net(nn.Module):
    def __init__(self, ):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(N_STATES, 1)  # 传入网络 状态数 -> 50
        self.fc1.weight.data.normal_(0, 0.1)  # 随机初始化权重
        self.out = nn.Linear(1, 2)  # 输出网络 50 -> 输出Action数
        self.out.weight.data.normal_(0, 0.1)  # 随机初始化权重

    def forward(self, x):  # 前向传播
        x = self.fc1(x)  # 第一层映射
        x = F.relu(x)  # 激活函数 线性整流函数 f(x) = max(0,x)
        actions_value = self.out(x)  # 输出映射
        return actions_value


dqn = DQN()

In [7]:
# 横坐标 & 加速度
# def reward_cal(xx, aa):
#     xx += 0.5
#     if xx >= 0.0 and v <= 0.0:  # 右边下滑
#         if aa < 0.0:  # 向左加速
#             rw = xx
#         else:
#             rw = -xx
#     elif xx >= 0.0 and v > 0.0:  # 右边上冲
#         if aa > 0.0:  # 向右加速
#             rw = xx
#         else:
#             rw = -xx
#     elif xx < 0.0 and v >= 0.0:  # 左边下滑
#         if aa > 0.0:  # 向右加速
#             rw = - xx
#         else:
#             rw = xx
#     elif xx < 0.0 and v < 0.0:  # 左边上冲
#         if aa < 0.0:  # 向左加速
#             rw = - xx
#         else:
#             rw = xx
#     # if abs(aa) < 0.5:
#     #     rw = - (1 - abs(aa)) * 10
#     return rw
# 高 负数 高
def reward_cal(x, x_,t):
    if x_ >= 0.4:
        return 500/t
    if x < -0.5 and x_ < -0.5:
        return x - x_
    elif x > -0.5 and x_ > -0.5:
        return 2*(x_ - x)
    else:
        return -1
# def reward_cal(x, x_):
#     return abs(x_ - (-0.5))

In [8]:
print('\nCollecting experience...')
import time

target_pth = "target-2.pth"
eval_pth = "eval-2.pth"
# try:
#     print("读取模型中...")
#     dqn.target_net = torch.load(target_pth)
#     dqn.eval_net = torch.load(eval_pth)
#     print("读取模型成功!")
# except Exception as _:
#     print("读取模型失败!")
flag = True
for i_episode in range(400):
    if flag:
        s = env.reset()
        # print('[s]',s)
        ep_r = 0
        times = 0
        start = time.time()
        max_x = -1.2
        min_x = 0.6
        while True:
            try:
                env.render()
                a = dqn.choose_action(s)
                s_, rw, done, info = env.step([a])
                s_ = s_.reshape(2, )
                # print('[a]',a)
                x_, v_ = s_
                x, v = s
                r = reward_cal(x, x_,times)
                # r = rw
                # if x > max_x: max_x = x
                # if x < min_x: min_x = x
                ep_r += r
                if times % 100 == 0:
                    print(times)
                    print('[', times, ']{(r):(', round(ep_r, 2), ')}',
                               ',{range:[', round(min_x, 2), ',',
                              round(max_x, 2), ']}'
                              )
                times += 1
                # print(str(times) + ":" + str(r))
                # r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
                # r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
                # r = r1 + r2

                dqn.store_transition(s, a, r, s_)
                if dqn.memory_counter == MEMORY_CAPACITY:
                    print("开始学习")
                if dqn.memory_counter > MEMORY_CAPACITY:
                    dqn.learn()
                    if done:
                        t = time.time() - start
                        print('轮数: ', i_episode,
                              '| 次数: ', round(times, 2),
                              '| 奖励: ', round(ep_r, 2),
                              '| 结果: ', round(x, 4),
                              '| 速度: ', round(v, 4),
                              '| 时间: ', round(t, 2), 's'
                              )

                if done:
                    break
                s = s_
            except KeyboardInterrupt as e:
                flag = False
                break
    else:
        break

print("保存模型中...")
torch.save(dqn.target_net, target_pth)
torch.save(dqn.eval_net, eval_pth)
print("保存成功，程序已退出")


Collecting experience...
0
[ 0 ]{(r):( 0.0 )} ,{range:[ 0.6 , -1.2 ]}
100
[ 100 ]{(r):( -1.82 )} ,{range:[ 0.6 , -1.2 ]}
200
[ 200 ]{(r):( -1.55 )} ,{range:[ 0.6 , -1.2 ]}
300
[ 300 ]{(r):( -1.52 )} ,{range:[ 0.6 , -1.2 ]}
400
[ 400 ]{(r):( -3.73 )} ,{range:[ 0.6 , -1.2 ]}
500
[ 500 ]{(r):( -5.97 )} ,{range:[ 0.6 , -1.2 ]}
600
[ 600 ]{(r):( -5.84 )} ,{range:[ 0.6 , -1.2 ]}
700
[ 700 ]{(r):( -5.6 )} ,{range:[ 0.6 , -1.2 ]}
800
[ 800 ]{(r):( -5.54 )} ,{range:[ 0.6 , -1.2 ]}
900
[ 900 ]{(r):( -5.62 )} ,{range:[ 0.6 , -1.2 ]}
开始学习
1000
[ 1000 ]{(r):( -7.89 )} ,{range:[ 0.6 , -1.2 ]}


  return F.mse_loss(input, target, reduction=self.reduction)


1100
[ 1100 ]{(r):( -12.94 )} ,{range:[ 0.6 , -1.2 ]}
1200
[ 1200 ]{(r):( -15.84 )} ,{range:[ 0.6 , -1.2 ]}
1300
[ 1300 ]{(r):( -17.78 )} ,{range:[ 0.6 , -1.2 ]}
1400
[ 1400 ]{(r):( -17.83 )} ,{range:[ 0.6 , -1.2 ]}
1500
[ 1500 ]{(r):( -17.78 )} ,{range:[ 0.6 , -1.2 ]}
1600
[ 1600 ]{(r):( -17.76 )} ,{range:[ 0.6 , -1.2 ]}
1700
[ 1700 ]{(r):( -17.79 )} ,{range:[ 0.6 , -1.2 ]}
1800
[ 1800 ]{(r):( -17.86 )} ,{range:[ 0.6 , -1.2 ]}
1900
[ 1900 ]{(r):( -17.85 )} ,{range:[ 0.6 , -1.2 ]}
2000
[ 2000 ]{(r):( -17.85 )} ,{range:[ 0.6 , -1.2 ]}
2100
[ 2100 ]{(r):( -17.85 )} ,{range:[ 0.6 , -1.2 ]}
2200
[ 2200 ]{(r):( -17.82 )} ,{range:[ 0.6 , -1.2 ]}
2300
[ 2300 ]{(r):( -17.83 )} ,{range:[ 0.6 , -1.2 ]}
2400
[ 2400 ]{(r):( -17.83 )} ,{range:[ 0.6 , -1.2 ]}
2500
[ 2500 ]{(r):( -17.85 )} ,{range:[ 0.6 , -1.2 ]}
2600
[ 2600 ]{(r):( -17.84 )} ,{range:[ 0.6 , -1.2 ]}
2700
[ 2700 ]{(r):( -17.84 )} ,{range:[ 0.6 , -1.2 ]}
2800
[ 2800 ]{(r):( -17.85 )} ,{range:[ 0.6 , -1.2 ]}
2900
[ 2900 ]{(r):( -17.86 )

In [9]:
def test():
    s = env.reset()
    ep_r = 0
    times = 0
    start = time.time()
    max_x = -1.2
    min_x = 0.6
    while True:
        try:
            # env.render()
            a = dqn.choose_action(s)
            s_, r, done, info = env.step(a)
            x, v = s_
            # r = x
            if x > max_x: max_x = x
            if x < min_x: min_x = x
            # if times % 10 == 0:
            #     print('[', times, ']{(x,v):(', round(x, 2), ',', round(v, 2), ')},{range:[', round(min_x, 2), ',',
            #           round(max_x, 2), ']}')
            times += 1
            ep_r += r
            if done:
                t = time.time() - start
                print('次数: ', round(times, 2),
                      '| 结果: ', round(x, 4),
                      '| 速度: ', round(v, 4),
                      '| 时间: ', round(t, 2), 's'
                      )
                return times
            s = s_
        except KeyboardInterrupt as e:
            break

In [10]:
try:
    print("读取模型中...")
    dqn.target_net = torch.load(target_pth)
    dqn.eval_net = torch.load(eval_pth)
    print("读取模型成功!")
except Exception as _:
    print("读取模型失败!")
t = 0
for _ in range(0, 20):
    t += test()
print("累计次数: ", t)

读取模型中...
读取模型成功!


IndexError: invalid index to scalar variable.

In [None]:
s = env.reset()
x = torch.unsqueeze(torch.FloatTensor(s), 0)  # 解包
actions_value = dqn.eval_net.forward(x)  # 使用评估网络获取行为
print(actions_value)
action = torch.max(actions_value, 1)[0].data.numpy()  # 每行的最大值索引
action = np.random.randint(0, N_ACTIONS.shape[0] * 100) / 100.0
action = [action]
print(action)
s_, r, done, info = env.step(action)
x, v = s_
print("[action,reward]", [a[0][0], r])
print("[s_]", s_)
# print(r)
# print(done)