In [1]:
# Python requires `from __future__` statements at the beginning of a module, so
# it goes first; this keeps the cell valid when the notebook is exported as a script.
from __future__ import annotations

import sys

# Prepend the parent directory to the module search path.
# NOTE(review): presumably so the project-local modules (buffers, agents, models)
# resolve when the notebook is run from its own folder — confirm the layout.
sys.path.insert(0, '..')

from dataclasses import dataclass, field
from typing import List, Dict, Union, get_type_hints, TypeVar

import numpy as np

import torch

# NOTE(review): wildcard import. MemoryBuffer, Observation and Action are used in
# later cells without any other import, so they presumably come from here;
# consider importing them by name.
from buffers import *

In [2]:
# Instantiate the buffer with no constructor arguments; the timed loop further
# down fills it through memory.append(...).
memory = MemoryBuffer()

In [4]:
from agents import ConstantAgent, CAgent

from models import MLPModel

In [5]:
# Build the model from an empty config dict; the settings it actually ends up
# with are displayed via model.config in the next cell.
model = MLPModel({})

In [6]:
# Display the configuration held by the model that was built from `{}`.
model.config

{'input_size': 94,
 'num_actions': 2,
 'activation': 'leaky_relu',
 'hidden_sizes': [64, 64],
 'separate_value': False,
 'sigma0': 0.3,
 'initializer': 'kaiming_uniform'}

In [7]:
# Wrap the model in a CAgent; the agent is used below via agent.act / agent.evaluate.
agent = CAgent(model)

In [12]:
# Batch of 5 random float32 observation vectors of width 94, matching the
# 'input_size' shown in the model.config output above.
obs = Observation(vector=np.random.randn(5, 94).astype(np.float32))

In [14]:
# agent.act returns three values; only the first one is kept, the other two are discarded.
action, _, _ = agent.act(obs)

In [15]:
# Evaluate the action obtained above on the same observations; the displayed
# result is a tuple of three tensors, each with 5 entries (one per batch row).
agent.evaluate(obs, action)

(tensor([-1.3707,  0.1812, -0.3733, -0.7954, -0.6431], grad_fn=<SumBackward1>),
 tensor([-0.7899, -1.3452,  0.2807, -1.2094, -2.2453], grad_fn=<SumBackward1>),
 tensor([0.4299, 0.4299, 0.4299, 0.4299, 0.4299], grad_fn=<SumBackward1>))

In [3]:
%%time

agents = ["Agent1", "Agent2", "Agent3"]

for _ in range(10000):
    obs = {agent_id: Observation(vector=np.random.randn(81).astype(np.float32)) for agent_id in agents}
    action = {agent_id: Action(continuous=np.random.randn(2).astype(np.float32)) for agent_id in agents}
    reward = {agent_id: np.random.randn(1).astype(np.float32) for agent_id in agents}
    value = {agent_id: np.random.randn(1).astype(np.float32) for agent_id in agents}
    done = {agent_id: False for agent_id in agents}

    memory.append(obs, action, reward, value, done)

CPU times: user 402 ms, sys: 12.2 ms, total: 414 ms
Wall time: 414 ms


In [8]:
%%time
# Convert the buffer contents to tensors; the result is indexed by agent id in
# the following cell (data["Agent1"]).
data = memory.tensorify()

CPU times: user 432 ms, sys: 72.6 ms, total: 504 ms
Wall time: 502 ms


In [10]:
# Show the record stored for a single agent (displayed as a MemoryRecord of tensors).
data["Agent1"]

MemoryRecord(obs=Observation(vector=tensor([[-1.4880,  1.2864, -1.2055,  ..., -0.6864,  0.1039,  0.1918],
        [-1.2485, -0.7422, -1.5813,  ...,  2.5104, -0.6631, -0.5201],
        [ 0.6763, -2.1487,  1.0342,  ..., -0.0298, -0.0790, -0.7663],
        ...,
        [ 1.0044, -0.1749,  1.1084,  ...,  0.8365, -1.2717, -1.3375],
        [ 1.7029, -1.3934, -0.8514,  ..., -1.0454, -1.4887, -1.1994],
        [-0.2679,  0.0317, -1.6959,  ..., -0.8685, -0.0649, -0.1062]]), rays=None, buffer=None, image=None), action=Action(continuous=tensor([[ 0.4727,  1.5711],
        [ 1.1814, -0.0202],
        [ 0.0876,  0.3682],
        ...,
        [-0.0092,  0.4263],
        [-0.0558,  0.7332],
        [-0.1934,  0.2680]]), discrete=None), reward=tensor([[-1.6922],
        [ 1.9348],
        [ 0.8651],
        ...,
        [ 0.1332],
        [ 1.4387],
        [ 0.2664]]), value=tensor([[-0.3821],
        [-0.2239],
        [ 1.0835],
        ...,
        [ 0.8901],
        [-0.0552],
        [ 0.1826]]

In [7]:
%%time
# NOTE(review): rebinds `data`, which earlier held the per-agent mapping from
# memory.tensorify(); this result is accessed by attribute (data.obs) instead,
# so it appears to be a different kind of object — consider a distinct name.
data = memory.crowd_tensorify()

CPU times: user 386 ms, sys: 61.8 ms, total: 448 ms
Wall time: 427 ms


In [6]:
# Batch size of the combined observations; the displayed 30000 equals
# 3 agents x 10000 appended steps.
data.obs.batch_size

30000