In [None]:
import gym
import tianshou as ts
import torch
# Not referenced by name anywhere below; presumably imported for its side
# effect of registering the "Platoon-v0" environment with gym — TODO confirm
# before removing it as "unused".
import platoonenv

In [None]:
# Gym environment id used for every environment created in this notebook.
envName = "Platoon-v0"
# Standalone reference instance; later cells read its spaces and spec.
env = gym.make(envName)

In [None]:
def _make_env():
    """Build one fresh instance of the configured environment."""
    return gym.make(envName)


# 10 environments for collecting training data, 100 for evaluation.
train_envs = ts.env.DummyVectorEnv([_make_env for _ in range(10)])
test_envs = ts.env.DummyVectorEnv([_make_env for _ in range(100)])

In [None]:
from tianshou.utils.net.common import Net

# Use the space's `shape` when it is non-empty, otherwise fall back to `n`.
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n

# Network handed to the DQN policy: `hidden_sizes` holds four entries of 128.
net = Net(
    state_shape,
    action_shape,
    hidden_sizes=(128,) * 4,
)
optim = torch.optim.Adam(net.parameters(), lr=1e-3)

In [None]:
# Sanity check: show the state and action dimensions derived from the env.
shapes = (state_shape, action_shape)
print(*shapes)

In [None]:
# DQN agent built on the network and optimizer defined earlier.
policy = ts.policy.DQNPolicy(
    net,
    optim,
    discount_factor=0.9,
    estimation_step=3,
    target_update_freq=320,
)

In [None]:
# The replay buffer needs one sub-buffer per training environment, so the
# count is taken from the vector env instead of repeating a literal 10.
train_buffer = ts.data.VectorReplayBuffer(20000, len(train_envs))
train_collector = ts.data.Collector(policy, train_envs, train_buffer, exploration_noise=True)

# Evaluation must run on its own environments (test_envs). Building this
# collector on train_envs would make test rollouts reset and step the very
# environments the training collector is in the middle of using, and would
# leave the 100 test environments unused.
test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)

In [None]:
# Display the registration spec of the reference environment.
env.spec

In [None]:
# Reward level at which training is considered solved: the environment's
# registered reward threshold, falling back to 9 only when none is registered.
# An explicit `is None` test is used because `x or 9` would also discard a
# legitimate registered threshold of 0.
registered_threshold = env.spec.reward_threshold
threshold = 9 if registered_threshold is None else registered_threshold
print(threshold)

In [None]:
def _set_train_eps(epoch, env_step):
    """Fix the policy's epsilon at 0.1 for training; both arguments are ignored."""
    return policy.set_eps(0.1)


def _set_test_eps(epoch, env_step):
    """Fix the policy's epsilon at 0.05 for testing; both arguments are ignored."""
    return policy.set_eps(0.05)


def _reached_threshold(mean_rewards):
    """Return True once `mean_rewards` is at least `threshold`."""
    return mean_rewards >= threshold


# Off-policy training run; hyperparameters are listed one per line.
result = ts.trainer.offpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    max_epoch=20,
    step_per_epoch=10000,
    step_per_collect=10,
    update_per_step=0.1,
    episode_per_test=100,
    batch_size=64,
    train_fn=_set_train_eps,
    test_fn=_set_test_eps,
    stop_fn=_reached_threshold,
)
print(f"Finished training! Use {result['duration']}")

In [None]:
# Display the full result object returned by the trainer.
result

In [None]:
# Watch the trained agent: 10 rendered episodes in a human-visible window.
policy.eval()
# NOTE(review): with exploration_noise=False below, the collector presumably
# never applies the policy's epsilon-greedy noise, so this eps value likely has
# no effect on the rollout — confirm against the Collector documentation.
policy.set_eps(0.05)
collector = ts.data.Collector(policy, gym.make(envName, render_mode="human"), exploration_noise=False)
try:
    eval_stats = collector.collect(n_episode=10, render=1 / 60)
finally:
    # Close the rendering environment (through the collector's wrapped env) so
    # the viewer window and its resources are released even if collection fails.
    collector.env.close()
# Keep the collect statistics as the cell's displayed output.
eval_stats