In [1]:
%matplotlib notebook
from config import Player, EnvConfig
from environments.flipit_utils import Action, ActionTargetPair, PlayerTargetPair
from environments.poachers_renderer import Renderer
from environments.poachers import PoachersEnv, PoachersMap
import torch
from torchrl.envs.utils import check_env_specs
from torchrl.envs import Compose, TransformedEnv, RewardSum, Stack

NUM_NODES = 20
NUM_STEPS = 40
SEED = 41

config = EnvConfig(
    num_nodes=NUM_NODES,
    num_steps=NUM_STEPS,
    seed=SEED,
    env_name="poachers",
)

In [2]:
env_map = PoachersMap(config, "cpu")

In [3]:
env = TransformedEnv(
    PoachersEnv(config, env_map, "cpu"),
    Compose(
        RewardSum(),
    )
)
check_env_specs(env)
out = env.reset()

ValueError: Invalid action detected: tensor([0, 3], dtype=torch.int32)

In [8]:
renderer = Renderer(env, SEED)
renderer.render()

<IPython.core.display.Javascript object>

In [12]:
from algorithms.generic_policy import MultiAgentPolicy, CombinedPolicy
from algorithms.generator import AgentGenerator
from algorithms.keys_processors import CombinedExtractor
from config import AgentNNConfig, BackboneConfig, HeadConfig, EnvConfig
import yaml

from config import TrainingConfig, LossConfig
from algorithms.simple_nn import TrainableNNAgentPolicy

with open("configs/run/test_single_training_poachers.yaml", "r") as file:
    config = yaml.safe_load(file)

env_config = EnvConfig.from_dict(config)
training_config_attacker = TrainingConfig.from_dict(config, suffix=f"_attacker")
loss_config_attacker = LossConfig.from_dict(config, suffix=f"_attacker")
agent_config = AgentNNConfig.from_dict(config)
backbone_config = BackboneConfig.from_dict(config, suffix=f"_backbone")
head_config = HeadConfig.from_dict(config, suffix=f"_head")


# attacker_agent = MultiAgentPolicy(
#     action_size=env.base_env.action_size,
#     player_type=1,
#     device="cpu",
#     embedding_size=32,
#     run_name="test",
#     policy_generator=AgentGenerator(
#         TrainableNNAgentPolicy,
#         {
#             "action_size": env.base_env.action_size,
#             "total_steps": env.base_env.num_steps,
#             "player_type": 1,
#             "embedding_size": 32,
#             "device": "cpu",
#             "loss_config": loss_config_attacker,
#             "training_config": training_config_attacker,
#             "run_name": "test",
#             "use_transformer": True,
#         }
#     ),
# )
attacker_extractor = CombinedExtractor(player_type=1, env=env, actions=backbone_config.extractors)
attacker_agent = TrainableNNAgentPolicy(
    player_type=1,
    max_sequence_size=NUM_STEPS + 1,
    extractor=attacker_extractor,
    action_size=env.action_size,
    env_type=env_config.env_pair,
    device="cpu",
    loss_config=loss_config_attacker,
    training_config=training_config_attacker,
    run_name="test",
    backbone_config=backbone_config,
    head_config=head_config,
    agent_config=agent_config,
)


attacker_agent.eval()
attacker_agent.load("saved_models/2025-07-07_23:54:35-attacker-/attacker/agent_0.pth")

TypeError: CombinedExtractor.__init__() got an unexpected keyword argument 'actions'

In [14]:
from algorithms.generic_policy import RandomAgent, GreedyOraclePoacherAgent


greedy_agent = GreedyOraclePoacherAgent(
    action_size=env.action_size,
    player_type=1,
    device="cpu",
    run_name="test",
    total_steps=NUM_STEPS,
    embedding_size=agent_config.embedding_size,
    env_map=env_map,
)

random_agent = RandomAgent(
    action_size=env.action_size,
    player_type=1,
    device="cpu",
    run_name="test",
    embedding_size=agent_config.embedding_size,
)

In [31]:
greedy_agent(out)["action"]

tensor([5], dtype=torch.int32)

In [46]:
random_agent(out)["action"]

tensor(2)

In [6]:
a = attacker_agent(out)
a["action"]

tensor(0)

In [7]:
a["logits"]

tensor([ 9.1918e-01, -1.5391e+00, -3.6143e+00, -2.1009e+00, -7.5466e-01,
        -1.0000e+08, -1.0000e+08], grad_fn=<IndexPutBackward0>)

In [32]:
out.update({
    "action": torch.tensor([4, 5]),
})
out2 = env.step(out)

In [33]:
renderer.render()
out = out2["next"]
out["game_id"] = out2["game_id"]

In [8]:
out["done"]

tensor([True])

In [34]:
out2["next"]["available_moves"]

tensor([[[-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [ 5,  7, -1, -1],
         [ 4,  6, -1, -1],
         [ 3,  5, -1, -1],
         [ 2,  4, -1, -1],
         [ 1,  3, -1, -1],
         [ 0,  2, -1, -1]],

        [[-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [-1, -1, -1, -1],
         [ 7,  9, -1, -1],

In [35]:
out2["next"]["node_reward_info"]

tensor([[[-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [ 0,  0],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1]],

        [[-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [-1, -1],
         [ 0,  0],
         [ 1,  0],
         [ 1,  1]]], dtype=torch.int32)

In [36]:
renderer.render()

In [37]:
out = out2["next"]

In [16]:
env.

tensor([False])

In [7]:
import yaml
from algorithms.simple_nn import TrainableNNAgentPolicy, NNAgentPolicy
from algorithms.generic_policy import CombinedPolicy, MultiAgentPolicy, ExplorerAgent
from algorithms.keys_processors import CombinedExtractor
from algorithms.generator import AgentGenerator
from config import TrainingConfig, LossConfig, EnvConfig, Player, AgentNNConfig, BackboneConfig, HeadConfig

device = torch.device("cpu")

with open("configs/run/test_single_training_poachers.yaml", "r") as file:
    config = yaml.safe_load(file)

run_name = "test"

env_config_ = EnvConfig.from_dict(config)
training_config = TrainingConfig.from_dict(config, suffix=f"_attacker")
loss_config = LossConfig.from_dict(config, suffix=f"_attacker")
agent_config = AgentNNConfig.from_dict(config)
backbone_config = BackboneConfig.from_dict(config, suffix=f"_backbone")
head_config = HeadConfig.from_dict(config, suffix=f"_head")

env_map, env = env_config_.create(device)

defender_extractor = CombinedExtractor(player_type=0, env=env, actions_map=backbone_config.extractors)
attacker_extractor = CombinedExtractor(player_type=1, env=env, actions_map=backbone_config.extractors)

defender_agent = NNAgentPolicy(
    player_type=0,
    max_sequence_size=env_config_.num_steps + 1,
    extractor=defender_extractor,
    action_size=env.action_size,
    backbone_config=backbone_config,
    head_config=head_config,
    agent_config=agent_config,
    device=device,
    run_name=run_name,
)
attacker_agent = TrainableNNAgentPolicy(
    player_type=1,
    max_sequence_size=env_config_.num_steps + 1,
    extractor=attacker_extractor,
    action_size=env.action_size,
    env_type=env_config_.env_pair,
    device=device,
    loss_config=loss_config,
    training_config=training_config,
    run_name=run_name,
    backbone_config=backbone_config,
    head_config=head_config,
    agent_config=agent_config,
    #scheduler_steps=training_config.total_steps_per_turn // training_config.steps_per_batch + 5,
)

exploration_defender = ExplorerAgent(
    action_size=env.action_size,
    player_type=0,
    device=device,
    run_name=run_name,
    total_steps=env.num_steps,
    embedding_size=agent_config.embedding_size,
)
exploration_attacker = ExplorerAgent(
    action_size=env.action_size,
    player_type=1,
    device=device,
    run_name=run_name,
    total_steps=env.num_steps,
    embedding_size=agent_config.embedding_size,
)
combined_policy = CombinedPolicy(
    defender_agent,
    attacker_agent,
    #exploration_defender=exploration_defender,
    #exploration_attacker=exploration_attacker,
)

In [8]:
training_config.steps_per_batch

10000

In [20]:
from torchrl.collectors import SyncDataCollector, MultiaSyncDataCollector, MultiSyncDataCollector
torch.multiprocessing.set_start_method('spawn', force=True)
torch.multiprocessing.set_sharing_strategy('file_system')

collector = MultiaSyncDataCollector(
    [env.create_from_self for _ in range(4)],
    policy=combined_policy,
    frames_per_batch=10000,
    total_frames=10000,
    split_trajs=False,
    device=env.device,
)

In [21]:
import time
start = time.time()
tensordict = next(collector.iterator())
print(time.time() - start)

77.66230392456055


In [8]:
tensordict["next"]["reward"][:, 1].mean()

tensor(0.0133)

In [9]:
tensordict["next"]["reward"][:, 1].max()

tensor(57.0261)

In [3]:
from torch_geometric.loader import DataLoader

loader = DataLoader([env_map, env_map], batch_size=2)

batch = next(iter(loader))

In [4]:
batch

PoachersMapBatch(x=[20, 2], edge_index=[2, 80], entry_nodes=[20], reward_nodes=[20], move_cost=[2], preparation_reward=[2], device=[2], batch=[20], ptr=[3])

In [7]:
batch.edge_index.shape

torch.Size([2, 80])

In [4]:
from unittest.mock import patch
from torch import nn

In [5]:
class _Linear(nn.Linear):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        bias: bool = True,
        weight_initializer: str | None = None,
        bias_initializer: str | None = None,
    ):
        super().__init__(
            in_features=in_channels,
            out_features=out_channels,
            bias=bias,
        )

In [7]:
with patch("torch_geometric.nn.conv.gcn_conv.Linear", side_effect=_Linear):
    from torch_geometric.nn.conv import GCNConv
    convs = nn.ModuleList([
        GCNConv(2, 5, aggr="mean"),
        GCNConv(5, 5, aggr="mean"),
        GCNConv(5, 5, aggr="mean"),
    ])

In [8]:
x = batch.x
for conv_layer in convs:
    x = conv_layer(x, batch.edge_index)
    x = torch.relu(x)
x

tensor([[0.0009, 0.0000, 0.0007, 0.0000, 0.0000],
        [0.0012, 0.0000, 0.0009, 0.0000, 0.0000],
        [0.0014, 0.0000, 0.0011, 0.0000, 0.0000],
        [0.0015, 0.0000, 0.0011, 0.0000, 0.0000],
        [0.0014, 0.0000, 0.0011, 0.0000, 0.0000],
        [0.0012, 0.0000, 0.0009, 0.0000, 0.0000],
        [0.0009, 0.0000, 0.0007, 0.0000, 0.0000],
        [0.0007, 0.0000, 0.0005, 0.0000, 0.0000],
        [0.0006, 0.0000, 0.0004, 0.0000, 0.0000],
        [0.0007, 0.0000, 0.0005, 0.0000, 0.0000],
        [0.0009, 0.0000, 0.0007, 0.0000, 0.0000],
        [0.0012, 0.0000, 0.0009, 0.0000, 0.0000],
        [0.0014, 0.0000, 0.0011, 0.0000, 0.0000],
        [0.0015, 0.0000, 0.0011, 0.0000, 0.0000],
        [0.0014, 0.0000, 0.0011, 0.0000, 0.0000],
        [0.0012, 0.0000, 0.0009, 0.0000, 0.0000],
        [0.0009, 0.0000, 0.0007, 0.0000, 0.0000],
        [0.0007, 0.0000, 0.0005, 0.0000, 0.0000],
        [0.0006, 0.0000, 0.0004, 0.0000, 0.0000],
        [0.0007, 0.0000, 0.0005, 0.0000, 0.0000]],

In [5]:
from algorithms.generic_policy import MultiAgentPolicy, CombinedPolicy
from algorithms.generator import AgentGenerator
from algorithms.keys_processors import CombinedExtractor
from config import AgentNNConfig, BackboneConfig, HeadConfig, EnvConfig
import yaml

from config import TrainingConfig, LossConfig
from algorithms.simple_nn import TrainableNNAgentPolicy

with open("configs/run/test_single_training_poachers.yaml", "r") as file:
    config = yaml.safe_load(file)

env_config = EnvConfig.from_dict(config)
training_config_attacker = TrainingConfig.from_dict(config, suffix=f"_attacker")
loss_config_attacker = LossConfig.from_dict(config, suffix=f"_attacker")
agent_config = AgentNNConfig.from_dict(config)
backbone_config = BackboneConfig.from_dict(config, suffix=f"_backbone")
head_config = HeadConfig.from_dict(config, suffix=f"_head")


# attacker_agent = MultiAgentPolicy(
#     action_size=env.base_env.action_size,
#     player_type=1,
#     device="cpu",
#     embedding_size=32,
#     run_name="test",
#     policy_generator=AgentGenerator(
#         TrainableNNAgentPolicy,
#         {
#             "action_size": env.base_env.action_size,
#             "total_steps": env.base_env.num_steps,
#             "player_type": 1,
#             "embedding_size": 32,
#             "device": "cpu",
#             "loss_config": loss_config_attacker,
#             "training_config": training_config_attacker,
#             "run_name": "test",
#             "use_transformer": True,
#         }
#     ),
# )
attacker_extractor = CombinedExtractor(player_type=1, env=env, actions_map=backbone_config.extractors)
attacker_agent = TrainableNNAgentPolicy(
    player_type=1,
    max_sequence_size=NUM_STEPS + 1,
    extractor=attacker_extractor,
    action_size=env.action_size,
    env_type=env_config.env_pair,
    device="cpu",
    loss_config=loss_config_attacker,
    training_config=training_config_attacker,
    run_name="test",
    backbone_config=backbone_config,
    head_config=head_config,
    agent_config=agent_config,
)

In [10]:
from utils import create_replay_buffer

In [6]:
replay_buffer = create_replay_buffer(training_config_attacker)

In [6]:
env_map.move_cost * (NUM_STEPS - 2)

tensor(-1.6043)

In [11]:
env_map.x[0].sum()

tensor(0.)

In [12]:
env_map.x

tensor([[ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.6343, -0.0050],
        [ 0.0000,  0.0000],
        [ 0.7104, -0.1165],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000]])

In [13]:
env_map.preparation_reward

tensor(1.1447)