In [1]:
import ptan
import numpy as np

q_vals = np.array([[5,2,3],[-1,-1,0]])
q_vals

  from torch.distributed.optim import ZeroRedundancyOptimizer


array([[ 5,  2,  3],
       [-1, -1,  0]])

In [2]:
selector = ptan.actions.ArgmaxActionSelector()
selector(q_vals)

array([0, 2])

In [6]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
selector(q_vals)

array([1, 2])

In [8]:
selector = ptan.actions.ProbabilityActionSelector()
print("Actions sampled from three prob distributions:")
for _ in range(10):
    acts = selector(np.array([
        [0.1, 0.8, 0.1],
        [0.0, 0.0, 1.0],
        [0.5, 0.5, 0.0]
    ]))
    print(acts)

Actions sampled from three prob distributions:
[1 2 0]
[1 2 1]
[0 2 0]
[1 2 1]
[0 2 1]
[1 2 0]
[2 2 1]
[1 2 0]
[1 2 1]
[1 2 1]


## DQN agent

In [15]:
import torch
import torch.nn as nn
class DQNNet(nn.Module):
    def __init__(self, actions:int):
        super(DQNNet, self).__init__()
        self.actions = actions

    def forward(self, x):
        return torch.eye(x.size()[0],self.actions)
    
net = DQNNet(actions=3)
net(torch.zeros(2,10))

tensor([[1., 0., 0.],
        [0., 1., 0.]])

In [18]:
selector = ptan.actions.ArgmaxActionSelector()
agent = ptan.agent.DQNAgent(model = net, action_selector=selector)
agent(torch.zeros(2,5))

(array([0, 1]), [None, None])

In [23]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
agent = ptan.agent.DQNAgent(model = net,action_selector=selector)
selector.epsilon = 0.1
agent(torch.zeros(10,5))[0]

array([0, 1, 2, 0, 0, 0, 1, 0, 0, 0])

## Policy Agent

In [24]:
import torch
import torch.nn as nn

class PolicyNet(nn.Module):
    def __init__(self, actions: int):
        super(PolicyNet, self).__init__()
        self.actions = actions

    def forward(self, x):
        shape = (x.size()[0], self.actions)
        res = torch.zeros(shape, dtype=torch.float32)
        res[:,0] = 1
        res[:,1] = 1
        return res
    
net = PolicyNet(actions=5)
net(torch.zeros(6,10))

tensor([[1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.]])

In [30]:
selector = ptan.actions.ProbabilityActionSelector()
agent = ptan.agent.PolicyAgent(model = net, action_selector=selector, apply_softmax=True)
agent(torch.zeros(6,5))[0]

array([3, 2, 1, 2, 1, 0])

## Experience Source

In [34]:
import gym

class ToyEnv(gym.Env):
    def __init__(self):
        super(ToyEnv,self).__init__()
        self.observation_space = gym.spaces.Discrete(n=5)
        self.action_space = gym.spaces.Discrete(n=3)
        self.step_index = 0

    def reset(self):
        self.step_index = 0
        return self.step_index
    
    def step(self, action):
        is_done = self.step_index == 10
        if is_done:
            return self.step_index % self.observation_space.n, 0.0, is_done, {}
        self.step_index += 1
        return self.step_index % self.observation_space.n, float(action), self.step_index == 10, {}
    
#不管是什么都产生相同动作的智能体
class DullAgent(ptan.agent.BaseAgent):
    def __init__(self, action: int):
        self.action = action

    def __call__(self, observations,state):
        return [self.action for _ in observations],state

In [None]:
env = ToyEnv()
s = env.reset()
print("env.reset() -> %s" % s)
s = env.step(1)
print("env.step(1) -> %s" % str(s))
s = env.step(2)
print("env.step(2) -> %s" % str(s))

# for _ in range(10):
#     r = env.step(0)
#     print(r)

agent = DullAgent(action=1)
print("agent:", agent([1, 2],1)[0])

env = ToyEnv()
agent = DullAgent(action=1)
exp_source = ptan.experience.ExperienceSource(env=env, agent=agent, steps_count=2)
next(iter(exp_source))
# for idx, exp in enumerate(exp_source):
#     if idx > 15:
#         break
#     print(exp)

env.reset() -> 0
env.step(1) -> (1, 1.0, False, {})
env.step(2) -> (2, 2.0, False, {})
agent: [1, 1]


TypeError: cannot unpack non-iterable int object