In [15]:
import numpy as np
import random
from collections import namedtuple, deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Compute device shared by the cells below: the first CUDA GPU when PyTorch
# reports CUDA as available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")

In [16]:
class Agent():
    """DQN agent: an online Q-network, a soft-updated target network and a replay buffer.

    The agent picks actions epsilon-greedily from the online network (`act`),
    stores whole trajectories in its replay buffer (`add_traj`) and performs
    one gradient step plus one soft target update per `learn` call.
    """

    def __init__(self, state_size,
                 action_size,
                 seed,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=1e-3):
        """Build the networks, the optimizer and the replay buffer.

        :param state_size: size of the state vector fed to the Q-network
        :param action_size: number of discrete actions
        :param seed: seed used for Python's `random` and passed on to the
            networks and the replay buffer
        :param buffer_size: replay buffer capacity
        :param batch_size: minibatch size used for training
        :param gamma: discount factor
        :param tau: interpolation factor of the soft target update
        :param lr: learning rate of the Adam optimizer
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and store the value itself.
        random.seed(seed)
        self.seed = seed

        self.BUFFER_SIZE = buffer_size  # replay buffer capacity
        self.BATCH_SIZE = batch_size  # minibatch size used for training
        self.GAMMA = gamma  # discount factor
        self.TAU = tau  # soft-update factor for the target network
        self.LR = lr  # optimizer learning rate

        # Q-networks
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        # Make the target an exact copy of the online network explicitly, so the
        # starting point does not depend on how QNetwork initializes its weights.
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.LR)

        # Replay buffer
        self.buffer = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, seed)
        self.t_step = 0

    def add_traj(self, traj):
        """Append every transition of one trajectory to the replay buffer.

        :param traj: iterable of transitions, stored in iteration order
        """
        self.buffer.memory.extend(traj)

    def act(self, state, eps=0):
        """Map a state to an action with an epsilon-greedy policy.

        :param state: numpy array holding a single state
        :param eps: (float) exploration rate; a uniformly random action is
            returned when the random draw is <= eps
        :return: index of the chosen action
        """
        # Draw first: the `random` stream is consumed exactly as before, but the
        # forward pass is skipped whenever the random action is taken anyway.
        if random.random() <= eps:
            return random.choice(np.arange(self.action_size))

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        return np.argmax(action_values.cpu().numpy())

    def learn(self):
        """Run one optimization step on a sampled minibatch, then soft-update the target.

        Nothing is trained until the buffer holds at least 10 * BATCH_SIZE
        transitions.

        :return: 0 when the step was skipped, otherwise the MSE loss of the
            step as a Python float
        """
        if len(self.buffer) < 10 * self.BATCH_SIZE:
            return 0
        states, actions, rewards, next_states, dones = self.buffer.sample()

        # Bootstrapped target y = r + gamma * max_a' Q_target(s', a') * (1 - done);
        # no gradient flows through the target network.
        with torch.no_grad():
            q_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
            y = rewards + (self.GAMMA * q_next * (1 - dones))

        # Q-values of the actions that were actually taken.
        q_taken = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(q_taken, y)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.TAU)
        return loss.item()

    def soft_update(self, local_models, target_model, tau):
        """Blend the online parameters into the target parameters in place.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_models: network whose parameters are read
        :param target_model: network whose parameters are overwritten
        :param tau: interpolation factor
        :return: None
        """
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_models.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)

In [17]:
class ReplayBuffer:
    """Fixed-capacity store for the transitions an agent collects from the environment."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): number of actions
            buffer_size (int): maximum number of stored transitions; once full,
                the oldest entries are dropped first
            batch_size (int): number of transitions returned by `sample`
            seed (int): seed for Python's `random` module
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.transition = namedtuple("Transition", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so seed the RNG and store the value itself.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Wrap one step in a Transition and append it to the buffer."""
        self.memory.append(self.transition(state, action, reward, next_state, done))

    def sample(self):
        """Draw `batch_size` stored transitions at random, without replacement.

        Returns a tuple (states, actions, rewards, next_states, dones) of
        tensors on the module-level `device`, each built by stacking one row
        per sampled transition; `actions` is int64, the others float32.

        Raises ValueError when fewer than `batch_size` transitions are stored.
        """
        if len(self.memory) < self.batch_size:
            raise ValueError(
                "cannot sample {} transitions from a buffer holding {}".format(
                    self.batch_size, len(self.memory)))

        # Filter once instead of once per field.
        experiences = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        # Booleans are turned into 0/1 before the float conversion.
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):  # lets callers use len() to get the number of stored transitions
        return len(self.memory)

In [18]:
class QNetwork(nn.Module):
    """Action-value network: a three-layer MLP producing one value per action."""

    def __init__(self, state_size, action_size, seed, hidden_size_1=256, hidden_size_2=128):
        """
        Build the MLP state_size -> hidden_size_1 -> hidden_size_2 -> action_size.

        :param state_size: number of input features
        :param action_size: number of outputs (one per action)
        :param seed: value passed to torch.manual_seed before the layers are created
        :param hidden_size_1: width of the first hidden layer
        :param hidden_size_2: width of the second hidden layer
        """
        super().__init__()
        # Seed torch's global RNG before any layer is created; the value returned
        # by torch.manual_seed is what gets stored on the module.
        self.seed = torch.manual_seed(seed)

        widths = (state_size, hidden_size_1, hidden_size_2, action_size)
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        layers.pop()  # the output layer is not followed by an activation
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Return the network output for `state`."""
        return self.net(state)

In [19]:
import  gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
import pickle

In [20]:
# Create the LunarLander-v2 environment and seed it with 0.
env = gym.make('LunarLander-v2')
env.seed(0)
# Print the observation-space shape and the number of discrete actions.
print('状态空间的形状: ', env.observation_space.shape)
print('动作空间的动作数量： ',env.action_space.n )

状态空间的形状:  (8,)
动作空间的动作数量：  4


In [21]:
# DQN agent for an 8-dimensional state and 4 discrete actions, with a
# 2,000,000-transition replay buffer and minibatches of 256.
agent = Agent(
    state_size=8,
    action_size=4,
    seed=0,
    buffer_size=int(20e5),
    batch_size=256,
)

RuntimeError: false INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\c10/cuda/CUDAGraphsC10Utils.h":74, please report a bug to PyTorch. Unknown CUDA graph CaptureStatus473639808

In [8]:
def train_agent(n_episodes=5000, max_t=2000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Train the module-level `agent` on the module-level `env` with epsilon-greedy DQN.

    Each episode is rolled out for at most `max_t` steps, stored in the agent's
    replay buffer as one trajectory, and followed by one `agent.learn()` call
    per four collected steps. Every 100th episode is rendered and ends with a
    save to 'checkpoint.pth'. Training stops early, saving 'checkpoint1.pth',
    once the mean score of the last 100 episodes reaches 200.

    :param n_episodes: maximum number of episodes
    :param max_t: maximum number of steps per episode
    :param eps_start: initial exploration rate
    :param eps_end: lower bound of the exploration rate
    :param eps_decay: multiplicative decay applied to the exploration rate after each episode
    :return: list with the total reward of every episode that was run
    """
    window_len = 100
    history = []
    recent = deque(maxlen=window_len)
    epsilon = eps_start

    for episode_i in range(1, n_episodes + 1):
        is_milestone = episode_i % 100 == 0

        # Roll out one episode, collecting its transitions.
        episode = []
        total_reward = 0
        obs = env.reset()
        for _step in range(max_t):
            action = agent.act(obs, epsilon)
            next_obs, reward, done, _info = env.step(action)

            if is_milestone:
                env.render()

            episode.append(agent.buffer.transition(obs, action, reward, next_obs, done))
            obs = next_obs
            total_reward += reward
            if done:
                break
        agent.add_traj(episode)

        # One learning step per four environment steps of this episode.
        for _update in range(len(episode) // 4):
            agent.learn()

        recent.append(total_reward)
        history.append(total_reward)

        # Decay the exploration rate, never going below eps_end.
        epsilon = max(eps_end, eps_decay * epsilon)

        mean_recent = np.mean(recent)
        print('\rEpisode {}\t平均得分: {:.2f}'.format(episode_i, mean_recent), end="")
        if is_milestone:
            print('\rEpisode {}\t平均得分: {:.2f}'.format(episode_i, mean_recent))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')  # save the model
        if len(recent) >= 100 and mean_recent >= 200.0:
            print('\n任务已经成功完成！总共经过 {:d} 次任务的训练。!\t最近100次任务的平均得分为: {:.2f}'.format(episode_i - 100,
                                                                                 mean_recent))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint1.pth')  # save the model
            break
    return history

In [9]:
# Run training with the default hyperparameters and keep the list of
# per-episode scores it returns.
scores = train_agent()

Episode 27	平均得分: -148.86

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

In [1]:
import gym
# Create a LunarLander-v2 environment and bind it to `env`.
env = gym.make("LunarLander-v2")




In [4]:
# Show the shape of the environment's observation space.
env.observation_space.shape

(8,)

In [5]:
# Reset the environment and keep the value it returns as the initial state.
state = env.reset()

In [6]:
# Display the initial state returned by env.reset().
state

array([-0.00616913,  1.4158409 , -0.62487525,  0.21867698,  0.0071552 ,
        0.14154342,  0.        ,  0.        ], dtype=float32)