In [1]:
import tianshou
import gym
import numpy as np
import pandas as pd

## 月球车模型测试

In [3]:
import torch
import tianshou as ts

#### 建立月球车env环境

In [4]:
def _make_rendering_env():
    """Create one LunarLander-v2 environment with render_mode="human"."""
    return gym.make('LunarLander-v2', render_mode="human")


# Vectorised test environment holding a single rendering LunarLander instance.
test_envs = ts.env.SubprocVectorEnv([_make_rendering_env])

# A second environment, created without a render mode, is used only to read
# the observation and action space descriptions.
env = gym.make('LunarLander-v2')
obs_space, act_space = env.observation_space, env.action_space
# Fall back to `.n` whenever a space's `.shape` is empty/falsy.
state_shape = obs_space.shape or obs_space.n
action_shape = act_space.shape or act_space.n

#### 建立深度网络模型

In [5]:
# Pick the GPU when one is available and fall back to the CPU otherwise, so
# this cell also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hidden layer sizes for the two heads passed as `dueling_param`.
Q_param = {"hidden_sizes": [128, 128]}
V_param = {"hidden_sizes": [128, 128]}

# Network built from the state/action shapes read from the environment; the
# same device is given to the constructor and to `.to(...)`.
net = ts.utils.net.common.Net(
        state_shape,
        action_shape,
        hidden_sizes=[128, 128],
        device=device,
        dueling_param=(Q_param, V_param)
    ).to(device)

# Adam optimizer over the network parameters.
optim = torch.optim.Adam(net.parameters(), lr=0.013)

#### policy

In [6]:
# DQN policy built from the network and optimizer defined above; every
# argument is passed by keyword.
policy = ts.policy.DQNPolicy(
    model=net,
    optim=optim,
    target_update_freq=500,
    estimation_step=3,  # n_step
    discount_factor=0.99,  # gamma
)

In [7]:
# Load the saved model weights.
# The checkpoint is deserialized onto the CPU first so that loading also works
# on machines without the device it was saved from; load_state_dict then copies
# the values into the policy's existing parameters.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
state_dict = torch.load('./policy.pth', map_location='cpu')
policy.load_state_dict(state_dict)

<All keys matched successfully>

#### 建立collector

In [8]:
from tianshou.data import Collector

# Collector that runs the loaded policy in the test environments.
eval_collector = Collector(
    policy=policy,
    env=test_envs,
    exploration_noise=True,
)
# Reset the collector before the first collect() call.
eval_collector.reset()

#### 初始化policy（设置为eval模式）

In [9]:
# Put the policy into evaluation mode before collecting test episodes; as the
# last expression of the cell, the returned object is displayed as the output.
policy.eval()

DQNPolicy(
  (model): Net(
    (model): MLP(
      (model): Sequential(
        (0): Linear(in_features=8, out_features=128, bias=True)
        (1): ReLU()
        (2): Linear(in_features=128, out_features=128, bias=True)
        (3): ReLU()
      )
    )
    (Q): MLP(
      (model): Sequential(
        (0): Linear(in_features=128, out_features=128, bias=True)
        (1): ReLU()
        (2): Linear(in_features=128, out_features=128, bias=True)
        (3): ReLU()
        (4): Linear(in_features=128, out_features=4, bias=True)
      )
    )
    (V): MLP(
      (model): Sequential(
        (0): Linear(in_features=128, out_features=128, bias=True)
        (1): ReLU()
        (2): Linear(in_features=128, out_features=128, bias=True)
        (3): ReLU()
        (4): Linear(in_features=128, out_features=1, bias=True)
      )
    )
  )
  (model_old): Net(
    (model): MLP(
      (model): Sequential(
        (0): Linear(in_features=8, out_features=128, bias=True)
        (1): ReLU()
        (

In [10]:
# Set the policy's epsilon to 0.01 (presumably the epsilon-greedy exploration
# rate used while testing — confirm against the tianshou DQNPolicy docs).
policy.set_eps(0.01)

#### 开始测试

In [11]:
# Collect 10 episodes with the evaluation collector and keep the returned
# statistics. NOTE(review): `render=0.` presumably means "render with no extra
# delay between frames" — confirm against Collector.collect.
result = eval_collector.collect(n_episode=10, render=0.)

In [12]:
# Pull the reward and length entries out of the collect() result and report
# their means.
rews = result["rews"]
lens = result["lens"]
print(f"Final reward: {rews.mean()}, length: {lens.mean()}")

Final reward: 195.45919479232924, length: 382.2


## 测试Atari环境

In [13]:
from atari.atari_network import DQN
from atari.atari_wrapper import make_atari_env
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.policy import DQNPolicy
from tianshou.policy.modelbased.icm import ICMPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils import TensorboardLogger, WandbLogger
from tianshou.utils.net.discrete import IntrinsicCuriosityModule

#### 本环境需要修改tianshou中的make_atari_env，以确保能够可视化测试环境

#### 通过make_atari_env来创建环境，该环境缩小了输入图像，否则会和神经网络不一致

In [14]:
# Settings for the Pong environments, gathered in one place and passed to
# make_atari_env by keyword.
atari_env_kwargs = dict(
    task="PongNoFrameskip-v4",
    seed=0,
    training_num=1,
    test_num=10,
    scale=0,
    frame_stack=4,
)
# make_atari_env returns three objects: a single environment plus the training
# and test environment sets.
env, train_envs, test_envs = make_atari_env(**atari_env_kwargs)



#### 初始化DQN网络

In [15]:
import argparse

# A plain Namespace is the right attribute container here; ArgumentParser is a
# command-line parser and is not meant to hold arbitrary attributes.
args = argparse.Namespace()
# Fall back to `.n` whenever a space's `.shape` is empty/falsy.
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n

# Pick the GPU when one is available and fall back to the CPU otherwise, so
# this cell also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The state shape is unpacked into the leading positional arguments of DQN.
net = DQN(*args.state_shape, args.action_shape, device=device).to(device)

#### 初始化policy

In [16]:
# Adam optimizer over the Atari network parameters.
optim = torch.optim.Adam(params=net.parameters(), lr=0.0001)

# DQN policy for the Atari network; the discount factor and n-step value are
# spelled out by keyword, matching the LunarLander policy cell.
policy = DQNPolicy(
    model=net,
    optim=optim,
    discount_factor=0.99,
    estimation_step=4,
    target_update_freq=500,
)

In [17]:
# Load the saved Atari policy weights.
# The checkpoint is deserialized onto the CPU first so that loading also works
# on machines without the device it was saved from; load_state_dict then copies
# the values into the policy's existing parameters.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
atari_state_dict = torch.load('./atari_policy.pth', map_location='cpu')
policy.load_state_dict(atari_state_dict)
# Switch to evaluation mode; the returned object is displayed as the cell output.
policy.eval()

DQNPolicy(
  (model): DQN(
    (net): Sequential(
      (0): Sequential(
        (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
        (1): ReLU(inplace=True)
        (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
        (3): ReLU(inplace=True)
        (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
        (5): ReLU(inplace=True)
        (6): Flatten(start_dim=1, end_dim=-1)
      )
      (1): Linear(in_features=3136, out_features=512, bias=True)
      (2): ReLU(inplace=True)
      (3): Linear(in_features=512, out_features=6, bias=True)
    )
  )
  (model_old): DQN(
    (net): Sequential(
      (0): Sequential(
        (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
        (1): ReLU(inplace=True)
        (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
        (3): ReLU(inplace=True)
        (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
        (5): ReLU(inplace=True)
        (6): Flatten(start_dim=1, end_dim=-1)
      )
      (1): Linear(

#### 建立测试环境

In [18]:
# Call make_atari_env again with test_num=1 and show=True, and keep only the
# third element of the returned tuple as the test environment set (replacing
# the earlier `test_envs`).
# NOTE(review): `show=True` presumably relies on the locally modified
# make_atari_env mentioned in this notebook's markdown — confirm that it is
# what enables on-screen rendering.
test_envs = make_atari_env(
        task= "PongNoFrameskip-v4",
        seed=0,
        training_num=1,
        test_num=1,
        scale=0,
        frame_stack=4,
        show=True,
    )[2]

#### 建立collector

In [19]:
from tianshou.data import Collector

# Collector that runs the loaded Atari policy in the test environments.
eval_collector = Collector(
    policy=policy,
    env=test_envs,
    exploration_noise=True,
)
# Reset the collector before the first collect() call.
eval_collector.reset()

In [20]:
# Set the policy's epsilon to 0.01 (presumably the epsilon-greedy exploration
# rate used while testing — confirm against the tianshou DQNPolicy docs).
policy.set_eps(0.01)

#### 开始预览

In [21]:
# Collect 10 episodes with the evaluation collector and keep the returned
# statistics. NOTE(review): `render=0.` presumably means "render with no extra
# delay between frames" — confirm against Collector.collect.
result = eval_collector.collect(n_episode=10, render=0.)

In [25]:
# Pull the reward and length entries out of the collect() result and report
# their means.
rews = result["rews"]
lens = result["lens"]
print(f"Final reward: {rews.mean()}, length: {lens.mean()}")

Final reward: 19.9, length: 1770.7
