In [2]:
%matplotlib inline

import sys
import logging
import itertools
import copy

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(0)


# Route log records of level INFO and above to stdout, prefixed with a timestamp and the level name.
logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s',
                    datefmt='%Y/%m/%d %H:%M:%S',
                    stream=sys.stdout,
                    # Alternative: log to a timestamped file instead of stdout. This needs
                    # `import time` (not imported in this cell), and `stream` must be removed
                    # because basicConfig does not accept `stream` together with `filename`.
                    # filemode='w',
                    # filename='log_{}.log'.format(time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time()))),
                    level=logging.INFO)

In [3]:
# Create the MountainCar-v0 environment with a fixed seed, then log every
# attribute of the environment object and of its registration spec.
env = gym.make("MountainCar-v0")
env.seed(0)
for key, value in vars(env).items():
    logging.info("%s: %s", key, value)
print()
for key, value in vars(env.spec).items():
    logging.info("%s: %s", key, value)

2021/08/11 19:56:30 [INFO] env: <MountainCarEnv<MountainCar-v0>>
2021/08/11 19:56:30 [INFO] action_space: Discrete(3)
2021/08/11 19:56:30 [INFO] observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
2021/08/11 19:56:30 [INFO] reward_range: (-inf, inf)
2021/08/11 19:56:30 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30}
2021/08/11 19:56:30 [INFO] _max_episode_steps: 200
2021/08/11 19:56:30 [INFO] _elapsed_steps: None

2021/08/11 19:56:30 [INFO] id: MountainCar-v0
2021/08/11 19:56:30 [INFO] entry_point: gym.envs.classic_control:MountainCarEnv
2021/08/11 19:56:30 [INFO] reward_threshold: -110.0
2021/08/11 19:56:30 [INFO] nondeterministic: False
2021/08/11 19:56:30 [INFO] max_episode_steps: 200
2021/08/11 19:56:30 [INFO] _kwargs: {}
2021/08/11 19:56:30 [INFO] _env_name: MountainCar


In [None]:
# 6-1 Load the MountainCar environment

# Optional inspection of the unwrapped environment, kept disabled:
# env = gym.make('MountainCar-v0')
# env = env.unwrapped
# print('observation space = {}'.format(env.observation_space))
# print('action space = {}'.format(env.action_space))
# print('position range = {}'.format((env.min_position, env.max_position)))
# print('velocity range = {}'.format((-env.max_speed, env.max_speed)))
# print('goal position = {}'.format(env.goal_position))

# Reset the environment and print the shape of the initial observation.
observation = env.reset()
print(observation.shape)

In [None]:
# 6-2 Agent that always pushes to the right

positions = []
velocities = []
observation = env.reset()
for step_index in range(200):  # at most 200 environment steps
    # Record the state the action is taken from (the final state is not recorded).
    positions.append(observation[0])
    velocities.append(observation[1])
    next_observation, reward, done, _ = env.step(2)  # the same action (2) on every step
    if done:
        break
    observation = next_observation

# Success means the last observed position is beyond 0.5.
reached_goal = next_observation[0] > 0.5
print('成功到达' if reached_goal else '失败退出')

# Plot the position and velocity traces
fig, ax = plt.subplots()
for series, series_label in ((positions, 'position'), (velocities, 'velocity')):
    ax.plot(series, label=series_label)
ax.legend()

In [None]:
# 6-3 砖瓦编码的实现

class TileCoder:
    """Map continuous inputs (plus optional discrete ones) to a list of feature indices.

    Each call produces one feature index per layer. A codeword is built per
    layer from the layer number, the offset-and-truncated scaled floats, and
    the discrete values; the codeword is then mapped to an integer index.

    Args:
        layers: number of layers; also used as the scaling factor for floats.
        features: maximum number of distinct indices handed out by the
            codebook. Once the codebook is full, new codewords are hashed
            into the range ``[0, features)``.
    """

    def __init__(self, layers, features) -> None:
        self.layers = layers
        self.features = features
        self.codebook = {}  # codeword tuple -> feature index, in first-seen order

    def get_feature(self, codeword):
        """Return the feature index for ``codeword``.

        Known codewords return their stored index. Unknown codewords get the
        next free index while the codebook has room; after that they are
        hashed (and not stored), so distinct codewords may collide.
        """
        if codeword in self.codebook:
            return self.codebook[codeword]
        count = len(self.codebook)
        if count >= self.features:
            # Collision handling: modulo keeps the result inside [0, features).
            # (A bitwise AND here could return `features` itself, which is out
            # of range, and would only ever reach bit-subsets of `features`.)
            return hash(codeword) % self.features
        self.codebook[codeword] = count
        return count

    def __call__(self, floats=(), ints=()):
        """Encode one input.

        Args:
            floats: sequence of continuous values.
            ints: sequence of discrete values appended verbatim to every codeword.

        Returns:
            A list with ``self.layers`` feature indices, one per layer.
        """
        dim = len(floats)
        scaled_floats = tuple(f * self.layers * self.layers for f in floats)
        discrete_part = tuple(ints)  # accept lists as well as tuples
        features = []
        for layer in range(self.layers):
            # Each layer shifts dimension i by (1 + dim * i) * layer before truncating.
            tile_coords = tuple(
                int((f + (1 + dim * i) * layer) / self.layers)
                for i, f in enumerate(scaled_floats)
            )
            codeword = (layer,) + tile_coords + discrete_part
            features.append(self.get_feature(codeword))
        return features

In [None]:
# 6-4 函数近似SARSA算法智能体

In [None]:
# 6-5 函数近似SARSA(λ)智能体

In [4]:
# 6-6 经验回放的实现

class DQNReplayer:
    """Fixed-capacity circular buffer of transitions with uniform random sampling.

    Transitions are kept in a DataFrame with one row per slot and the columns
    ``observation``, ``action``, ``reward``, ``next_observation``, ``done``.
    Once ``capacity`` transitions have been stored, new ones overwrite the
    oldest slots.
    """

    def __init__(self, capacity) -> None:
        self.memory = pd.DataFrame(
            index=range(capacity), columns=["observation", "action", "reward", "next_observation", "done"]
        )  # one stored transition (s, a, r, s', done) per row
        self.i = 0  # slot that the next store() call writes to
        self.count = 0  # number of filled slots, never more than capacity
        self.capacity = capacity

    def store(self, *args):
        """Write one transition into the current slot and advance the write index.

        Args:
            *args: the field values, in the same order as the DataFrame columns.
        """
        # Build the row as an explicit 1-D object array. Letting pandas/NumPy
        # convert the raw tuple would try to make a regular array out of a mix
        # of arrays and scalars, which recent NumPy versions reject.
        row = np.empty(len(args), dtype=object)
        for position, value in enumerate(args):
            row[position] = value
        self.memory.loc[self.i] = row
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        """Draw ``size`` stored transitions uniformly at random, with replacement.

        Returns:
            A generator yielding one stacked NumPy array per column, in column order.

        Raises:
            ValueError: from ``np.random.choice`` when nothing has been stored yet.
        """
        indices = np.random.choice(self.count, size=size)
        return (np.stack(list(self.memory.loc[indices, field])) for field in self.memory.columns)


In [None]:
# 6-7 带目标网络的深度Q学习智能体

class DQNAgent:
    """Deep Q-learning agent with experience replay and a target network.

    The agent is driven through ``reset(mode)`` / ``step(observation, reward,
    done)`` / ``close()``. In ``"train"`` mode it stores transitions in a
    replay buffer and updates ``evaluate_net`` once the buffer is at least
    95% full. ``target_net`` is a copy of ``evaluate_net`` taken at every
    training ``reset``.
    """

    def __init__(self, env) -> None:
        self.action_n = env.action_space.n
        self.gamma = 0.99  # discount factor used in the TD target
        self.replayer = DQNReplayer(10000)
        self.evaluate_net = self.build_net(
            input_size=env.observation_space.shape[0], hidden_sizes=[64, 64], output_size=self.action_n
        )
        self.optimizer = optim.Adam(self.evaluate_net.parameters(), lr=0.001)
        self.loss = nn.MSELoss()

    def build_net(self, input_size, hidden_sizes, output_size):
        """Build (and print) a fully connected net with ReLU between the linear layers.

        Args:
            input_size: number of input features.
            hidden_sizes: sequence of hidden layer widths.
            output_size: number of outputs; no activation follows the last layer.
        """
        sizes = [input_size, *hidden_sizes, output_size]
        layers = []
        for in_features, out_features in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # drop the ReLU after the output layer
        model = nn.Sequential(*layers)
        print(model)
        return model

    def reset(self, mode=None):
        """Start an episode; in ``"train"`` mode clear the trajectory and refresh the target net."""
        self.mode = mode
        if self.mode == "train":
            self.trajectory = []
            self.target_net = copy.deepcopy(self.evaluate_net)

    def step(self, observation, reward, done):
        """Choose an action for ``observation``; in train mode also store and learn.

        Args:
            observation: current observation.
            reward: reward received for reaching ``observation``.
            done: whether ``observation`` ends the episode.

        Returns:
            The selected action as a Python int.
        """
        if self.mode == "train" and np.random.rand() < 0.001:
            # epsilon-greedy exploration, train mode only
            action = np.random.randint(self.action_n)
        else:
            # Greedy action. The batch dimension matches the sibling agents,
            # and no autograd graph is needed for action selection.
            state_tensor = torch.as_tensor(observation, dtype=torch.float).reshape(1, -1)
            with torch.no_grad():
                q_tensor = self.evaluate_net(state_tensor)
            action = torch.argmax(q_tensor).item()

        if self.mode == "train":
            self.trajectory += [observation, reward, done, action]
            if len(self.trajectory) >= 8:
                # The last two (observation, reward, done, action) groups form one transition.
                state, _, _, act, next_state, reward, done, _ = self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, done)

            # Skip learning until the buffer is nearly full (for speed).
            # `>=` matches the threshold test used by the sibling agents.
            if self.replayer.count >= self.replayer.capacity * 0.95:
                self.learn()

        return action

    def learn(self):
        """Run one optimisation step of ``evaluate_net`` on a sampled batch of 1024 transitions."""
        # replay
        states, actions, rewards, next_states, dones = self.replayer.sample(1024)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        done_tensor = torch.as_tensor(dones, dtype=torch.float)

        # The TD target is a constant for the update: build it without autograd
        # so no gradients are accumulated on the target network.
        with torch.no_grad():
            next_q_tensor = self.target_net(next_state_tensor)
            next_max_q_tensor, _ = next_q_tensor.max(dim=-1)  # Q-learning: max over next actions
            target_tensor = reward_tensor + self.gamma * (1.0 - done_tensor) * next_max_q_tensor

        # train
        pred_tensor = self.evaluate_net(state_tensor)
        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)
        loss_tensor = self.loss(q_tensor, target_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()

    def close(self):
        """Nothing to release."""
        pass


# Instantiate the DQN agent. Later cells rebind `agent`, so this instance is
# only the one used for training if those cells are not run afterwards.
agent = DQNAgent(env)

In [4]:
# DoubleDQN

class DoubleDQNAgent:
    """Double deep Q-learning agent with experience replay and a target network.

    The next action is chosen by ``evaluate_net`` and its value is read from
    ``target_net``. The agent is driven through ``reset(mode)`` /
    ``step(observation, reward, done)`` / ``close()``. In ``'train'`` mode it
    stores transitions and learns once the replay buffer is at least 95% full.
    """

    def __init__(self, env) -> None:
        self.action_n = env.action_space.n
        self.gamma = 0.99  # discount factor used in the TD target
        self.replayer = DQNReplayer(10000)
        self.evaluate_net = self.build_net(
            input_size=env.observation_space.shape[0], hidden_sizes=[64, 64], output_size=self.action_n
        )
        self.optimizer = optim.Adam(self.evaluate_net.parameters(), lr=0.001)
        self.loss = nn.MSELoss()

    def build_net(self, input_size, hidden_sizes, output_size):
        """Build (and print) a fully connected net with ReLU between the linear layers.

        Args:
            input_size: number of input features.
            hidden_sizes: sequence of hidden layer widths.
            output_size: number of outputs; no activation follows the last layer.
        """
        sizes = [input_size, *hidden_sizes, output_size]
        layers = []
        for in_features, out_features in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # drop the ReLU after the output layer
        model = nn.Sequential(*layers)
        print(model)
        return model

    def reset(self, mode=None):
        """Start an episode; in ``'train'`` mode clear the trajectory and refresh the target net."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.target_net = copy.deepcopy(self.evaluate_net)

    def step(self, observation, reward, done):
        """Choose an action for ``observation``; in train mode also store and learn.

        Args:
            observation: current observation.
            reward: reward received for reaching ``observation``.
            done: whether ``observation`` ends the episode.

        Returns:
            The selected action as a Python int.
        """
        if self.mode == 'train' and np.random.rand() < 0.001:
            # epsilon-greedy exploration, train mode only
            action = np.random.randint(self.action_n)
        else:
            state_tensor = torch.as_tensor(observation,
                    dtype=torch.float).reshape(1, -1)
            with torch.no_grad():  # action selection needs no autograd graph
                q_tensor = self.evaluate_net(state_tensor)
            action = torch.argmax(q_tensor).item()
        if self.mode == 'train':
            self.trajectory += [observation, reward, done, action]
            if len(self.trajectory) >= 8:
                # The last two (observation, reward, done, action) groups form one transition.
                state, _, _, act, next_state, reward, done, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, done)
            if self.replayer.count >= self.replayer.capacity * 0.95:
                # skip first few episodes for speed
                self.learn()
        return action

    def close(self):
        """Nothing to release."""
        pass

    def learn(self):
        """Run one optimisation step of ``evaluate_net`` on a sampled batch of 1024 transitions."""
        # replay
        states, actions, rewards, next_states, dones = \
                self.replayer.sample(1024)  # replay transitions
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        done_tensor = torch.as_tensor(dones, dtype=torch.float)

        # The TD target is a constant for the update: build it without autograd
        # so no gradients are accumulated on the target network.
        with torch.no_grad():
            # Select the next action with the evaluate net, value it with the target net.
            next_eval_q_tensor = self.evaluate_net(next_state_tensor)
            next_action_tensor = next_eval_q_tensor.argmax(dim=-1)
            next_q_tensor = self.target_net(next_state_tensor)
            next_max_q_tensor = torch.gather(next_q_tensor, 1,
                    next_action_tensor.unsqueeze(1)).squeeze(1)
            target_tensor = reward_tensor + self.gamma * (1. - done_tensor) * next_max_q_tensor

        # train
        pred_tensor = self.evaluate_net(state_tensor)
        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)
        loss_tensor = self.loss(q_tensor, target_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()

# Instantiate the Double DQN agent, rebinding `agent`. The constructor prints
# the network layout via build_net().
agent = DoubleDQNAgent(env)

Sequential(
  (0): Linear(in_features=2, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=3, bias=True)
)


In [5]:
# DuelDQN

class DuelNet(nn.Module):
    """Dueling Q-network: a shared trunk feeding a state-value head and an advantage head.

    The output is ``v + (adv - mean(adv))``, with the mean taken over
    dimension 1, so the input must have a batch dimension.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Shared feature extractor.
        self.common_net = nn.Sequential(nn.Linear(input_size, 64), nn.ReLU())
        # One advantage output per action.
        self.advantage_net = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_size),
        )
        # Single state-value output.
        self.v_net = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, s):
        """Return the Q-values for a batch of states ``s``."""
        hidden = self.common_net(s)
        raw_advantage = self.advantage_net(hidden)
        # Centre the advantages so they average to zero across dimension 1.
        centered_advantage = raw_advantage - raw_advantage.mean(1).unsqueeze(1)
        state_value = self.v_net(hidden)
        return state_value + centered_advantage

class DuelDQNAgent:
    """Q-learning agent that uses a ``DuelNet`` as its Q-network.

    Targets are built the same way as in the Double DQN agent: the next action
    is chosen by ``evaluate_net`` and valued by ``target_net``. The agent is
    driven through ``reset(mode)`` / ``step(observation, reward, done)`` /
    ``close()``. In ``'train'`` mode it stores transitions and learns once the
    replay buffer is at least 95% full.
    """

    def __init__(self, env):
        self.action_n = env.action_space.n
        self.gamma = 0.99  # discount factor used in the TD target
        self.replayer = DQNReplayer(10000)
        self.evaluate_net = DuelNet(input_size=env.observation_space.shape[0],
                output_size=self.action_n)
        self.optimizer = optim.Adam(self.evaluate_net.parameters(), lr=0.001)
        self.loss = nn.MSELoss()

    def reset(self, mode=None):
        """Start an episode; in ``'train'`` mode clear the trajectory and refresh the target net."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.target_net = copy.deepcopy(self.evaluate_net)

    def step(self, observation, reward, done):
        """Choose an action for ``observation``; in train mode also store and learn.

        Args:
            observation: current observation.
            reward: reward received for reaching ``observation``.
            done: whether ``observation`` ends the episode.

        Returns:
            The selected action as a Python int.
        """
        if self.mode == 'train' and np.random.rand() < 0.001:
            # epsilon-greedy exploration, train mode only
            action = np.random.randint(self.action_n)
        else:
            state_tensor = torch.as_tensor(observation,
                    dtype=torch.float).reshape(1, -1)
            with torch.no_grad():  # action selection needs no autograd graph
                q_tensor = self.evaluate_net(state_tensor)
            action = torch.argmax(q_tensor).item()
        if self.mode == 'train':
            self.trajectory += [observation, reward, done, action]
            if len(self.trajectory) >= 8:
                # The last two (observation, reward, done, action) groups form one transition.
                state, _, _, act, next_state, reward, done, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, done)
            if self.replayer.count >= self.replayer.capacity * 0.95:
                # skip first few episodes for speed
                self.learn()
        return action

    def close(self):
        """Nothing to release."""
        pass

    def learn(self):
        """Run one optimisation step of ``evaluate_net`` on a sampled batch of 1024 transitions."""
        # replay
        states, actions, rewards, next_states, dones = \
                self.replayer.sample(1024)  # replay transitions
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        done_tensor = torch.as_tensor(dones, dtype=torch.float)

        # The TD target is a constant for the update: build it without autograd
        # so no gradients are accumulated on the target network.
        with torch.no_grad():
            # Select the next action with the evaluate net, value it with the target net.
            next_eval_q_tensor = self.evaluate_net(next_state_tensor)
            next_action_tensor = next_eval_q_tensor.argmax(dim=-1)
            next_q_tensor = self.target_net(next_state_tensor)
            next_max_q_tensor = torch.gather(next_q_tensor, 1,
                    next_action_tensor.unsqueeze(1)).squeeze(1)
            target_tensor = reward_tensor + self.gamma * (1. - done_tensor) * \
                    next_max_q_tensor

        # train
        pred_tensor = self.evaluate_net(state_tensor)
        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)
        loss_tensor = self.loss(q_tensor, target_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()


# Instantiate the dueling agent, rebinding `agent`. When the cells run top to
# bottom, this is the instance the training/test cell uses.
agent = DuelDQNAgent(env)

In [7]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env`` and report its outcome.

    The agent's ``step`` is called with each observation before the
    environment is stepped, including the terminal observation. When the
    episode is cut off by ``max_episode_steps``, the agent is not called again
    on the last observation.

    Args:
        env: environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(observation, reward, done, info)``.
        agent: object providing ``reset(mode=...)``, ``step(observation,
            reward, done)`` and ``close()``.
        max_episode_steps: optional cap on environment steps; a falsy value
            means no cap.
        mode: passed through to ``agent.reset``.
        render: when true, ``env.render()`` is called after every agent step.

    Returns:
        ``(episode_reward, elapsed_steps)``: the summed reward and the number
        of environment steps taken.
    """
    observation = env.reset()
    reward = 0.0
    done = False
    agent.reset(mode=mode)

    total_reward = 0.0
    step_count = 0
    finished = False
    while not finished:
        action = agent.step(observation, reward, done)
        if render:
            env.render()
        if done:
            finished = True
        else:
            observation, reward, done, _ = env.step(action)
            total_reward += reward
            step_count += 1
            finished = bool(max_episode_steps) and step_count >= max_episode_steps

    agent.close()
    return total_reward, step_count


# Train until the mean reward of the last (up to) 10 episodes exceeds -110.
# Episodes run on the unwrapped environment with an explicit step cap.
logging.info("==== train ====")
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(
        env.unwrapped,
        agent,
        max_episode_steps=env._max_episode_steps,
        mode="train",
        render=False,
    )
    episode_rewards.append(episode_reward)
    # Emitted at DEBUG level, so hidden unless the logging level is lowered.
    logging.debug("train episode %d: reward = %.2f, steps = %d", episode, episode_reward, elapsed_steps)
    recent_mean_reward = np.mean(episode_rewards[-10:])
    if recent_mean_reward > -110:
        break
plt.plot(episode_rewards)


# Evaluate the agent for 100 episodes in the default (non-train) mode and
# report the mean and standard deviation of the episode rewards.
logging.info("==== test ====")
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.debug("test episode %d: reward = %.2f, steps = %d", episode, episode_reward, elapsed_steps)
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
logging.info("average episode reward = %.2f ± %.2f", mean_reward, std_reward)

2021/08/11 20:14:29 [INFO] ==== train ====


KeyboardInterrupt: 