In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from use_case.baseline import * 
from tests.eval import *

import torch

# Seed every random number generator the notebook relies on. NumPy drives the
# payoff matrices below; torch is seeded as well because a torch-backed model
# is built and trained later, and its results would otherwise differ between
# "Restart & Run All" executions.
SEED = 1337
np.random.seed(SEED)
torch.manual_seed(SEED)

# Two random N x N payoff tables (presumably one per player -- confirm against
# BaselineEnvironment), each entry drawn uniformly from [-10, 10).
N = 10
payoff_i = np.random.uniform(-10, 10, (N, N))
payoff_j = np.random.uniform(-10, 10, (N, N))

# Initialize environment
# The action count is read back from the payoff table so the two cannot drift
# apart if N is edited.
N_ACTIONS = payoff_i.shape[0]
N_AGENTS = 1000
env = BaselineEnvironment(N_AGENTS, payoff_i, payoff_j, total_games=1)

# Actual Run

In [3]:
from models.model import *
from models.trainer import *


In [4]:
# Network configuration: sizes are taken from the environment set up earlier
# so the model always matches the game being played.
parameters = ParameterSettings(
    n_agents=N_AGENTS,
    d_action=N_ACTIONS,
    d_obs=env.obs_size,
    d_traits=1,
    d_beliefs=1,
)

# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    parameters.device = "cuda"
else:
    parameters.device = "cpu"

model = Model(parameters)

In [5]:
# Print every equilibrium reported for the two payoff tables, together with
# the mean of the first two entries of its second component (presumably the
# two players' payoffs at that equilibrium -- confirm against
# find_pure_equilibria).
pure_equilibria = find_pure_equilibria(payoff_i, payoff_j)

for profile, payoffs in pure_equilibria:
    mean_payoff = (payoffs[0] + payoffs[1]) / 2
    print(profile, mean_payoff)

(18, 10) 9.42934343544164


In [None]:
# Evaluate the untrained model to get a reference point before training.
# NOTE(review): the third positional argument looks like the number of
# evaluation games (reported total return is 10x the average) -- confirm
# against evaluate_policy's signature.
EVAL_COUNT = 10
evaluate_policy(model, env, EVAL_COUNT)


    Average Return: 0.0656039389081976
    Total returns: 0.656039389081976
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[103  69  83  45  43  65  40  49  66  38  58  41  39  43  33  49  35  37
  31  33]


: 

In [None]:
# Training configuration. The actor, critic and hypernetwork all use the same
# learning rate, so it is defined once here.
LEARNING_RATE = 2.5e-4

training_parameters = TrainingParameters(
    outer_loops=200,
    hypernet_training_loops=5,
    actor_training_loops=100,
    experience_buffer_size=3,
    actor_learning_rate=LEARNING_RATE,
    critic_learning_rate=LEARNING_RATE,
    hypernet_learning_rate=LEARNING_RATE,
)

# Long-running cell: trains the model against the environment with the
# settings above, printing progress as it goes.
train_model(model, env, training_parameters)
        

Epoch 0


Hypernet Loop: 100%|██████████| 30/30 [00:07<00:00,  4.20it/s]



    Average Policy Loss 0.009680504910647869
    Average Entropy Loss: 36.09934616088867
    Average JSD Loss: 0.001468414906412363
    

    Average Return: 0.11158655686338168
    Total returns: 1.1158655686338168
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[132  91  74  79  67  54  28  48  43  44  38  35  52  37  33  24  28  37
  19  37]


Actor Training: 100%|██████████| 100/100 [00:44<00:00,  2.26it/s]



    Average Policy Loss: 0.03668736208463088
    Average Value Loss: 24.688346424102782
    Average Entropy Loss: 11.961390228271485 
    

    Average Return: 0.029856960693596103
    Total returns: 0.298569606935961
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[144  88  51  67  43  53  38  58  47  38  49  36  30  38  46  32  36  36
  33  37]
Epoch 1


Hypernet Loop: 100%|██████████| 30/30 [00:18<00:00,  1.63it/s]



    Average Policy Loss 0.00835129152983427
    Average Entropy Loss: 36.1063346862793
    Average JSD Loss: 0.001276160473935306
    

    Average Return: -0.023058099561075594
    Total returns: -0.23058099561075596
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[143  89  77  69  68  69  36  40  42  39  42  32  48  43  25  28  29  25
  24  32]


Actor Training: 100%|██████████| 100/100 [01:02<00:00,  1.59it/s]



    Average Policy Loss: 0.0331169086007867
    Average Value Loss: 24.65362641811371
    Average Entropy Loss: 11.96431843996048 
    

    Average Return: -0.03998883326398603
    Total returns: -0.3998883326398603
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[163 106  87  63  62  44  38  54  47  40  32  34  50  27  23  27  34  24
  23  22]
Epoch 2


Hypernet Loop: 100%|██████████| 30/30 [00:19<00:00,  1.52it/s]



    Average Policy Loss 0.006959795951843262
    Average Entropy Loss: 36.106109619140625
    Average JSD Loss: 0.0011318159522488713
    

    Average Return: 0.07991809645855877
    Total returns: 0.7991809645855876
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[164  98  83  64  62  54  34  41  43  33  55  42  42  28  28  31  27  22
  23  26]


Actor Training: 100%|██████████| 100/100 [00:36<00:00,  2.75it/s]



    Average Policy Loss: 0.030431812509777955
    Average Value Loss: 24.692739057540894
    Average Entropy Loss: 11.966240315437316 
    

    Average Return: 0.047158621738396
    Total returns: 0.47158621738396
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[152  97  77  72  56  47  31  54  43  39  46  43  41  36  40  34  32  22
  20  18]
Epoch 3


Hypernet Loop: 100%|██████████| 30/30 [00:19<00:00,  1.50it/s]



    Average Policy Loss 0.007150881923735142
    Average Entropy Loss: 36.103145599365234
    Average JSD Loss: 0.0009984005009755492
    

    Average Return: 0.07892089151193413
    Total returns: 0.7892089151193413
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[125  77  75  66  60  53  23  61  52  59  36  34  60  26  47  38  31  22
  29  26]


Actor Training: 100%|██████████| 100/100 [01:03<00:00,  1.57it/s]



    Average Policy Loss: 0.027382379941846013
    Average Value Loss: 24.656230545043947
    Average Entropy Loss: 11.967782049179077 
    

    Average Return: -0.037502076922429683
    Total returns: -0.37502076922429683
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[137  86  78  70  59  51  37  62  41  42  34  45  58  30  28  41  27  34
  25  15]
Epoch 4


Hypernet Loop: 100%|██████████| 30/30 [00:19<00:00,  1.55it/s]



    Average Policy Loss 0.006244340445846319
    Average Entropy Loss: 36.1143798828125
    Average JSD Loss: 0.0009325787541456521
    

    Average Return: 0.2712102925515206
    Total returns: 2.712102925515206
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[137  84  93  56  51  60  29  80  46  44  43  37  44  33  35  31  27  36
  22  12]


Actor Training: 100%|██████████| 100/100 [01:01<00:00,  1.61it/s]



    Average Policy Loss: 0.027071181191713548
    Average Value Loss: 24.62744049549103
    Average Entropy Loss: 11.969071815013885 
    

    Average Return: 0.15998985423989948
    Total returns: 1.5998985423989946
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[193 100  77  71  71  57  34  40  36  42  43  27  42  23  27  32  22  23
  24  16]
Epoch 5


Hypernet Loop: 100%|██████████| 30/30 [00:13<00:00,  2.19it/s]



    Average Policy Loss 0.006097221747040749
    Average Entropy Loss: 36.11821365356445
    Average JSD Loss: 0.000888121488969773
    

    Average Return: 0.00563343050589254
    Total returns: 0.056334305058925394
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[188  93  72  68  71  36  28  46  30  51  49  32  38  34  39  47  25  20
  19  14]


Actor Training: 100%|██████████| 100/100 [00:44<00:00,  2.23it/s]



    Average Policy Loss: 0.02111187434196836
    Average Value Loss: 24.581277174949648
    Average Entropy Loss: 11.97009890794754 
    

    Average Return: 0.07863347442424173
    Total returns: 0.7863347442424173
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[150  84  76  65  82  46  28  51  59  39  55  27  50  45  38  30  23  17
  19  16]
Epoch 6


Hypernet Loop: 100%|██████████| 30/30 [00:20<00:00,  1.48it/s]



    Average Policy Loss 0.004827076569199562
    Average Entropy Loss: 36.12289047241211
    Average JSD Loss: 0.0008067660965025425
    

    Average Return: 0.1633977935313673
    Total returns: 1.633977935313673
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[199 103  90  82  89  72  29  37  37  45  41  32  28  25  17  26  13  14
  13   8]


Actor Training: 100%|██████████| 100/100 [01:03<00:00,  1.58it/s]



    Average Policy Loss: 0.020643400607514194
    Average Value Loss: 24.590921120643614
    Average Entropy Loss: 11.971037621498107 
    

    Average Return: 0.030386613309970223
    Total returns: 0.30386613309970223
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[181 102  96  64  86  43  27  52  29  30  45  34  28  39  38  35  18  30
  16   7]
Epoch 7


Hypernet Loop: 100%|██████████| 30/30 [00:18<00:00,  1.66it/s]



    Average Policy Loss 0.005737268831580877
    Average Entropy Loss: 36.11208724975586
    Average JSD Loss: 0.000741052208468318
    

    Average Return: 0.022475401370138547
    Total returns: 0.22475401370138548
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[167 112  86  76  69  68  24  57  31  36  42  29  40  34  25  36  16  21
  18  13]


Actor Training: 100%|██████████| 100/100 [00:37<00:00,  2.65it/s]



    Average Policy Loss: 0.019926054645766272
    Average Value Loss: 24.528240098953248
    Average Entropy Loss: 11.971771671772004 
    

    Average Return: 0.038529306033813084
    Total returns: 0.3852930603381308
    
Action Distribution
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[171  94  73  87  62  61  27  49  39  39  55  30  46  37  22  32  22  22
  19  13]
Epoch 8


Hypernet Loop:  43%|████▎     | 13/30 [00:02<00:03,  4.62it/s]