In [1]:
%load_ext autoreload 
%autoreload 2

In [2]:
import sys
sys.tracebacklimit = 0

import numpy as np
import networkx as nx 
import matplotlib.pyplot as plt

In [3]:
from pettingzoo.test import parallel_api_test
from solution.custom_gym import CustomGymEnviornment
from solution.trainer import *
from solution.policy_net import *

In [4]:
from core.agent import *
from core.world import * 
from core.render import * 
from core.skill import * 
from core.models import *
from core.message import *

In [5]:
DEVICE = "cuda"

In [6]:
from sar.sar_agent import *
from sar.sar_world import *
from sar.sar_env_params import *
from sar.sar_traits_sampler import *

belief_initializer = SARBeliefInitializer(BELIEF_DIMS)
trait_sampler = SARTraitSampler()

def initialize_swarm(world : BaseWorld):
    swarm = trait_sampler.generate(SWARM_SIZE, DEVICE)
    for agent in swarm:
        agent.set_utility(SARUtilityFunction())
        world.add_agent(agent)
    swarm = initialize_positions_randomly(world, swarm)
    swarm = belief_initializer.initialize_beliefs(swarm)

In [7]:

from sar.urban_gen import * 
from sar.victims import * 
from sar.sar_comm import * 

terrain_generator = UrbanTerrainMapGenerator(padding = MAX_VISIBILITY)
victim_generator = VictimGenerator(padding = MAX_VISIBILITY)
def initialize_terrain(world : BaseWorld):
    terrain_map, population_map = terrain_generator.generate(world._dims)
    map_collection : BaseMapCollection = BaseMapCollection()
    map_collection.add_map("Terrain", terrain_map)
    map_collection.add_map("Population", population_map)

    victim_generator.set_density_map(population_map)
    victim_map = victim_generator.generate(world._dims)

    map_collection.add_map("Victims", victim_map)
    return map_collection


In [8]:
from sar.energy import EnergyModel
from sar.victims import VictimModel
from solution.sar_action_interpreter import *
from solution.encoder_net import *
from solution.decoder_net import *
from models.complex_model import * 

world = SARWorld(dims = WORLD_DIMS,
              swarm_initializer= initialize_swarm,
              generation_pipeline=initialize_terrain
              )
world.add_model("energy_model", EnergyModel())
world.add_model("victim_model", VictimModel())
world.reset()

In [9]:
policy_net= PolicyNet(1, 7, 4)
target_net=  PolicyNet(1, 7, 4)
encoder_model = Encoder()
decoder_model = Decoder()

complex_model = ComplexModel(
    policy_net= policy_net, 
    encoder_net = encoder_model, 
    decoder_net = decoder_model
)

comms_protocol = SARCommunicationProtocol(encoder_model, decoder_model)
action_interpreter = SARActionInterpreter(BELIEF_DIMS)

custom_gym : CustomGymEnviornment = CustomGymEnviornment(world, action_interpreter, comms_protocol)

complex_model.to(DEVICE)
custom_gym.to(DEVICE)

In [10]:
custom_gym.reset(42)

({1: {'Belief': tensor([0., 0., 0., 0., 0.], device='cuda:0'),
   'Vision': array([[1., 1., 1., 0., 1., 0., 1.],
          [1., 0., 0., 0., 1., 1., 1.],
          [1., 0., 0., 1., 0., 0., 1.],
          [0., 1., 0., 0., 0., 1., 0.],
          [1., 0., 0., 1., 0., 0., 1.],
          [0., 1., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 1., 0., 1.]]),
   'State': tensor([90.3885,  0.0000], device='cuda:0')},
  2: {'Belief': tensor([0., 0., 0., 0., 0.], device='cuda:0'),
   'Vision': array([[1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 0., 1.],
          [1., 1., 1., 1., 0., 1., 1.],
          [1., 0., 1., 0., 0., 1., 0.],
          [1., 1., 1., 1., 0., 1., 0.],
          [1., 0., 0., 0., 0., 0., 1.],
          [1., 1., 0., 1., 1., 0., 0.]]),
   'State': tensor([107.1394,   1.0000], device='cuda:0')},
  3: {'Belief': tensor([0., 0., 0., 0., 0.], device='cuda:0'),
   'Vision': array([[0., 0., 0., 0., 1., 0., 0.],
          [0., 1., 0., 0., 0., 0., 0.],
          [1., 1., 

### Testing

In [11]:
parallel_api_test(custom_gym, num_cycles=1_000)
custom_gym.reset()

Passed Parallel API test


({1: {'Belief': tensor([0., 0., 0., 0., 0.], device='cuda:0'),
   'Vision': array([[1., 1., 1., 0., 1., 0., 1.],
          [1., 0., 0., 0., 1., 1., 1.],
          [1., 0., 0., 1., 0., 0., 1.],
          [0., 1., 0., 0., 0., 1., 0.],
          [1., 0., 0., 1., 0., 0., 1.],
          [0., 1., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 1., 0., 1.]]),
   'State': tensor([90.3885,  0.0000], device='cuda:0')},
  2: {'Belief': tensor([0., 0., 0., 0., 0.], device='cuda:0'),
   'Vision': array([[1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 0., 1.],
          [1., 1., 1., 1., 0., 1., 1.],
          [1., 0., 1., 0., 0., 1., 0.],
          [1., 1., 1., 1., 0., 1., 0.],
          [1., 0., 0., 0., 0., 0., 1.],
          [1., 1., 0., 1., 1., 0., 0.]]),
   'State': tensor([107.1394,   1.0000], device='cuda:0')},
  3: {'Belief': tensor([0., 0., 0., 0., 0.], device='cuda:0'),
   'Vision': array([[0., 0., 0., 0., 1., 0., 0.],
          [0., 1., 0., 0., 0., 0., 0.],
          [1., 1., 

# Training

In [12]:

from models.base import * 
from models.idqn import * 
from solution.policy_net import PolicyNet
import matplotlib.pyplot as plt

In [13]:
model = IDQN(env = custom_gym,
             feature_extractor= feature_extractor,
             target_net= target_net,
             model= complex_model,
             batch_size=1024,
             device = DEVICE
             )

In [14]:
rewards = train_loop(custom_gym, model, games=10, optimization_passes = 1, seed=42)

Training on thesis.


  belief_tensor = torch.tensor(belief, dtype=torch.float32)


Average loss 48.939928407669065
Model has been saved.

Starting evaluation on thesis (num_games=1)


Training Progress:  10%|█         | 1/10 [00:14<02:13, 14.87s/it]

Avg reward: 781.46  std: 720.4795128246188  coeff : 0.9219659519676232
Avg reward per agent, per game:  {1: 991.0, 2: 198.0, 3: 1243.0, 4: 196.0, 5: 99.0, 6: 2607.0, 7: 2326.0, 8: 752.0, 9: 1277.0, 10: 98.0, 11: 99.0, 12: 2101.0, 13: 393.0, 14: 489.0, 15: 292.0, 16: 393.0, 17: 749.0, 18: 3026.0, 19: 99.0, 20: 381.0, 21: 1968.0, 22: 381.0, 23: 893.0, 24: 548.0, 25: 1141.0, 26: 791.0, 27: 292.0, 28: 98.0, 29: 1048.0, 30: 99.0, 31: 99.0, 32: 1647.0, 33: 1681.0, 34: 0.0, 35: 1699.0, 36: 98.0, 37: 1314.0, 38: 0.0, 39: 1074.0, 40: 737.0, 41: 0.0, 42: 467.0, 43: 197.0, 44: 0.0, 45: 99.0, 46: 2539.0, 47: 99.0, 48: 2308.0, 49: 378.0, 50: 1315.0, 51: 842.0, 52: 99.0, 53: 486.0, 54: 1707.0, 55: 0.0, 56: 292.0, 57: 2057.0, 58: 1679.0, 59: 936.0, 60: 1009.0, 61: 1844.0, 62: 0.0, 63: 0.0, 64: 2082.0, 65: 475.0, 66: 294.0, 67: 385.0, 68: 99.0, 69: 522.0, 70: 296.0, 71: 1081.0, 72: 0.0, 73: 1004.0, 74: 473.0, 75: 874.0, 76: 1767.0, 77: 1228.0, 78: 392.0, 79: 557.0, 80: 1070.0, 81: 287.0, 82: 637.0, 83

Training Progress:  20%|██        | 2/10 [00:38<02:41, 20.13s/it]

Avg reward: 398.72  std: 341.0349565660389  coeff : 0.8553244295897845
Avg reward per agent, per game:  {1: 940.0, 2: 295.0, 3: 675.0, 4: 489.0, 5: 99.0, 6: 197.0, 7: 99.0, 8: 1281.0, 9: 189.0, 10: 197.0, 11: 296.0, 12: 374.0, 13: 99.0, 14: 1072.0, 15: 194.0, 16: 198.0, 17: 565.0, 18: 825.0, 19: 198.0, 20: 663.0, 21: 198.0, 22: 460.0, 23: 666.0, 24: 546.0, 25: 386.0, 26: 296.0, 27: 99.0, 28: 1428.0, 29: 571.0, 30: 99.0, 31: 1024.0, 32: 569.0, 33: 888.0, 34: 467.0, 35: 295.0, 36: 979.0, 37: 99.0, 38: 0.0, 39: 863.0, 40: 1214.0, 41: 0.0, 42: 99.0, 43: 198.0, 44: 0.0, 45: 386.0, 46: 99.0, 47: 391.0, 48: 639.0, 49: 0.0, 50: 389.0, 51: 295.0, 52: 486.0, 53: 295.0, 54: 96.0, 55: 0.0, 56: 474.0, 57: 198.0, 58: 291.0, 59: 198.0, 60: 99.0, 61: 386.0, 62: 386.0, 63: 485.0, 64: 295.0, 65: 99.0, 66: 198.0, 67: 715.0, 68: 198.0, 69: 538.0, 70: 296.0, 71: 198.0, 72: 0.0, 73: 278.0, 74: 97.0, 75: 190.0, 76: 1511.0, 77: 485.0, 78: 722.0, 79: 99.0, 80: 291.0, 81: 1282.0, 82: 96.0, 83: 195.0, 84: 565.0,

Training Progress:  20%|██        | 2/10 [01:00<04:03, 30.40s/it]


KeyboardInterrupt: 

In [None]:
plt.plot(rewards)

# Optimizations

In [15]:
import cProfile

In [16]:
def stress_test():
    train_loop(custom_gym, model, games=1, optimization_passes = 1, seed=42)

In [17]:

cProfile.run('stress_test()', sort = 'time')

Training on thesis.


Training Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Average loss 45.659642934799194
Model has been saved.

Starting evaluation on thesis (num_games=1)


Training Progress: 100%|██████████| 1/1 [00:31<00:00, 31.71s/it]

Avg reward: 749.53  std: 711.035757961581  coeff : 0.948642159702188
Avg reward per agent, per game:  {1: 1278.0, 2: 99.0, 3: 854.0, 4: 954.0, 5: 1960.0, 6: 0.0, 7: 99.0, 8: 196.0, 9: 373.0, 10: 294.0, 11: 385.0, 12: 392.0, 13: 99.0, 14: 565.0, 15: 552.0, 16: 1383.0, 17: 620.0, 18: 650.0, 19: 947.0, 20: 485.0, 21: 198.0, 22: 645.0, 23: 666.0, 24: 0.0, 25: 388.0, 26: 99.0, 27: 834.0, 28: 1196.0, 29: 1333.0, 30: 99.0, 31: 2382.0, 32: 1016.0, 33: 286.0, 34: 868.0, 35: 99.0, 36: 1285.0, 37: 99.0, 38: 0.0, 39: 584.0, 40: 760.0, 41: 1396.0, 42: 1083.0, 43: 580.0, 44: 383.0, 45: 654.0, 46: 99.0, 47: 1161.0, 48: 283.0, 49: 439.0, 50: 294.0, 51: 856.0, 52: 1007.0, 53: 923.0, 54: 96.0, 55: 98.0, 56: 922.0, 57: 2423.0, 58: 469.0, 59: 196.0, 60: 2689.0, 61: 296.0, 62: 0.0, 63: 485.0, 64: 667.0, 65: 99.0, 66: 2711.0, 67: 1937.0, 68: 198.0, 69: 554.0, 70: 296.0, 71: 1267.0, 72: 0.0, 73: 457.0, 74: 1457.0, 75: 1065.0, 76: 2171.0, 77: 482.0, 78: 1416.0, 79: 99.0, 80: 2291.0, 81: 1509.0, 82: 2371.0, 83


