In [None]:
import os

# CUDA debugging aids, set in the first cell so they are in the process
# environment before the project modules are imported in the next cell.
# NOTE(review): PyTorch describes TORCH_USE_CUDA_DSA as a build-time option;
# it may have no effect as a runtime environment variable -- confirm.
os.environ['TORCH_USE_CUDA_DSA'] = "TRUE"
# CUDA documents CUDA_LAUNCH_BLOCKING as taking 0 or 1, so use "1" rather
# than the undocumented value "TRUE".
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
from use_case.tests import * 
from models.eval import *

# One constant feeds both the torch RNG and the environment, so the two
# seeds cannot drift apart when the value is changed.
SEED = 1337

torch.manual_seed(SEED)
# Initialize environment
env = initialize_baseline(seed = SEED)

In [None]:
# Dump the two payoff matrices side by side as text: one printed row per
# first index, one "(payoff_i, payoff_j)" cell per second index.
payoff_i = env.payoff_i
payoff_j = env.payoff_j
for row in range(env.n_actions):
    cells = [
        f"({payoff_i[row, col]:.2f}, {payoff_j[row, col]:.2f})"
        for col in range(env.n_actions)
    ]
    # Each cell is followed by a tab and every row ends with a blank line.
    print("".join(cell + "\t" for cell in cells), end="\n\n")

In [None]:
import os
# NOTE(review): presumably works around a duplicate OpenMP runtime error when
# matplotlib is loaded next to torch -- confirm it is still required.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import matplotlib.pyplot as plt

# Mean of the two payoff matrices for each action pair, then flattened so
# every action pair contributes one sample to the histogram.
average_rewards = (payoff_i + payoff_j) / 2
all_rewards = average_rewards.ravel()

# Draw the histogram through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(all_rewards, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
ax.set(
    title='Distribution of Average Rewards for All Action Pairs',
    xlabel='Average Reward',
    ylabel='Frequency',
)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:

# Element-wise maximum of the two payoff matrices: for every action pair,
# the larger of the two individual rewards.
max_rewards = np.max([payoff_i, payoff_j], axis = 0)

# Flatten the matrix into a list of all possible rewards
all_rewards = max_rewards.ravel()

# Plot the histogram. The title and x-axis label name the plotted quantity
# (the maximum reward), not the average.
plt.figure(figsize=(10, 6))
plt.hist(all_rewards, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Distribution of Maximum Rewards for All Action Pairs')
plt.xlabel('Maximum Reward')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Actual Run

In [None]:

from models.model import *
from models.trainer import *

In [None]:
# Network configuration. Every size is read from the environment except the
# heterogeneity-latent and message widths, which are fixed here.
parameters = ParameterSettings(
    n_agents=env.n_agents,
    d_action=env.n_actions,
    d_obs=env.obs_size,
    d_traits=env.d_traits,
    d_het_latent=4,
    d_beliefs=env.d_beliefs,
    d_relation=env.d_relation,
    d_message=4,
    d_comm_state=env.d_comm_state,
)

# Run on the GPU when one is available, otherwise on the CPU
# (assign "cpu" unconditionally here to force CPU execution).
if torch.cuda.is_available():
    parameters.device = "cuda"
else:
    parameters.device = "cpu"

model = PPOModel(parameters)

In [None]:
# Print every entry returned by find_pure_equilibria together with the mean
# of the first two values of its second component.
# NOTE(review): presumably each entry is (action profile, payoff pair) --
# confirm against find_pure_equilibria.
equilibriua = find_pure_equilibria(payoff_i, payoff_j)

for profile, payoffs in equilibriua:
    first, second = payoffs[0], payoffs[1]
    print(profile, (first + second) / 2)

In [None]:
# Evaluate the model built above; in cell order this runs before train_model.
# NOTE(review): the meaning of the positional argument 10 is not visible in
# this notebook -- confirm against evaluate_policy's signature.
evaluate_policy(model, env, 10, temperature=2.0)

In [None]:
# Training configuration for the baseline run, followed by the run itself.
training_parameters = TrainingParameters(
    outer_loops=8_000,

    # Optimiser step sizes.
    actor_learning_rate=1e-4,
    critic_learning_rate=1e-3,
    hypernet_learning_rate=5e-4,

    # Hypernetwork settings.
    hypernet_jsd_threshold=1.0,
    hypernet_samples=3000,
    hypernet_jsd_weight=1.0,
    hypernet_entropy_weight=0.01,
    hypernet_diversity_weight=1.0,

    # Experience collection.
    sampled_agents_proportion=0.5,
    experience_sampling_steps=10,
    experience_buffer_size=10,

    # Loss weighting.
    entropy_coeff=0.2,
    value_loss_coeff=1.0,

    # Epsilon schedule.
    epsilon_period=200,
    epsilon_end=0.05,

    entropy_target=0.5,

    eval_temp=-1.0,

    # `verbose = False` can be passed here; it is currently left unset.
    device=parameters.device,
    steps_per_epoch=16,
)
train_model(model, env, training_parameters)
        

# Heterogeneous Baseline

In [6]:
%load_ext autoreload
%autoreload 2
from use_case.tests import * 
from models.eval import *
from models.model import *
from models.trainer import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Initialize Environment
# Seed torch's global RNG before the environment is constructed.
torch.manual_seed(1337)

# NOTE(review): no seed is passed here, whereas initialize_baseline receives
# an explicit seed earlier in this notebook -- confirm whether
# initialize_sar_env accepts one or relies on the global RNG seeded above.
env = initialize_sar_env()

In [None]:
# Configure the network here. Every size is read from the environment except
# the heterogeneity-latent and message widths, which are fixed in this cell.
parameters = ParameterSettings(
    n_agents = env.n_agents,
    d_action = env.n_actions, 
    d_obs = env.obs_size, 
    d_traits = env.d_traits,
    d_het_latent = 4, 
    d_beliefs = env.d_beliefs,
    d_relation = env.d_relation, 
    d_message = 4,
    d_comm_state= env.d_comm_state,
    # Fall back to the CPU when CUDA is unavailable instead of hard-coding
    # "cuda", which fails on machines without a GPU.
    device = "cuda" if torch.cuda.is_available() else "cpu",
)
model = PPOModel(parameters)

# Training configuration for the heterogeneous (SAR) run.
training_parameters = TrainingParameters(
    outer_loops=8_000,

    # Optimiser step sizes: the same value for every component.
    actor_learning_rate=1e-4,
    critic_learning_rate=1e-4,
    hypernet_learning_rate=1e-4,
    decoder_learning_rate=1e-4,
    filter_learning_rate=1e-4,

    # Hypernetwork settings.
    hypernet_jsd_threshold=2.0,
    hypernet_samples=3000,
    hypernet_jsd_weight=1.0,
    hypernet_entropy_weight=0.01,
    hypernet_diversity_weight=1.0,

    # Experience collection.
    sampled_agents_proportion=0.2,
    experience_sampling_steps=20,
    experience_buffer_size=10,

    # Loss weighting.
    entropy_coeff=1.0,
    value_loss_coeff=0.3,

    # Epsilon schedule.
    epsilon_period=200,
    epsilon_end=0.05,

    entropy_target=0.5,

    # Evaluation settings; `eval_k = env.n_types` is the alternative to the
    # fixed value used here.
    eval_temp=0.9,
    eval_k=4,

    # `verbose = False` can be passed here; it is currently left unset.
    device=parameters.device,
    steps_per_epoch=4,
)


In [9]:
# Evaluate the model before training; train_model is called in the next cell.
# As the last expression of the cell, the returned value is displayed.
# NOTE(review): temperature = -1.0 presumably selects a special evaluation
# mode -- confirm against evaluate_policy.
evaluate_policy(model, env, temperature = -1.0)

np.float32(-3.0933657)

In [10]:
# Train the model with the configuration defined above.
# The recorded output shows this run interrupted by hand after 2 of 8000
# progress-bar iterations, at about 133 s per iteration (roughly 296 hours
# estimated for the full run).
train_model(model, env, training_parameters)

  0%|          | 2/8000 [04:26<296:24:04, 133.41s/it]


KeyboardInterrupt: 