# Naming game experiments in a simplified probabilistic environment

In this notebook we simulate the alignment results we would obtain with the naming game for different probabilities of solving different types of goals.

The `scores` matrix contains the probability of solving a pair of goals from the point of view of one agent. Entry $p_{i,j}$ should be read as $P(\text{solving my goal } i \text{, when the other agent selected goal } j) = p_{i,j}$.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations, product
from scipy.special import softmax
import torch.nn

In [None]:
# Global matplotlib look: serif font, and "small" text for ticks, legend and
# axis labels. Groups are applied in the order listed.
for rc_group, rc_options in (
    ("font", {"family": "serif"}),
    ("xtick", {"labelsize": "small"}),
    ("ytick", {"labelsize": "small"}),
    ("legend", {"fontsize": "small"}),
    ("axes", {"labelsize": "small"}),
):
    plt.rc(rc_group, **rc_options)

# General settings

In [None]:
# Side of the square probability matrix built from these settings
NUMBER_OF_LANDMARKS = 6

# Number of updates to the matrix
N_STEPS = 10**4

# Update step
DELTA = 1

# Exploration probability
# (0 is greedy but still explores, since there are negative updates for bad associations)
EGREEDY = 0.05

# Only cooperative goals

`scores0` is the probability matrix used by agent 0, `scores1` is the probability matrix for agent 1

In [None]:
# A basic probability matrix: aligned goals (the diagonal) are solved with
# probability 1, any pair of different goals with probability P_DIFFERENT_GOALS.
P_DIFFERENT_GOALS = 0.3

_aligned = np.eye(NUMBER_OF_LANDMARKS)
scores = _aligned + P_DIFFERENT_GOALS * (1 - _aligned)
n_goals = len(scores)

# Both agents use the very same array object here.
scores0 = scores
scores1 = scores

# Alternatively you can create the matrix entry by entry, and specify different matrices for each agent

# scores1 = np.array([
#     [0.8, 0, 0.1],
#     [0, 1, 0.2],
#     [0.7, 0.8, 0.9]
# ])

# scores0 = np.array([
#     [0.8, 0, 0.1],
#     [0.1, 1, 0],
#     [0.2, 0, 0.9]
# ])

$$15a + (1 - a) = 15b$$


$$14a = 15b - 1$$

$$a = \frac{15b - 1}{14}$$

In [None]:
def f(x, n=15):
    """Solve ``n*a + (1 - a) = n*x`` for ``a``.

    Rearranging gives ``(n - 1)*a = n*x - 1``, hence ``a = (n*x - 1) / (n - 1)``.
    With the default ``n=15`` this is exactly ``(x*15 - 1) / 14``.

    Parameters
    ----------
    x : float or array-like
        Right-hand-side value (the ``b`` of the relation).
    n : int or float, default 15
        Coefficient of the relation. Must differ from 1, otherwise the
        relation no longer depends on ``a``.

    Returns
    -------
    Same kind as ``x``: the value of ``a``.

    Raises
    ------
    ValueError
        If ``n == 1``.
    """
    if n == 1:
        raise ValueError("n must be different from 1")
    return (x * n - 1) / (n - 1)

# Evaluate f at a few sample values, one result per line
for sample_value in (0.5, 0.25, 0.75):
    print(f(sample_value))

## One naming matrix per agent

The same matrix is used whether the agent acts as leader or as follower.

In [None]:
# Alignment for all seeds (one entry per seed)
a0 = []
a1 = []

for s in range(10):
    np.random.seed(s)
    agent_0 = np.zeros_like(scores0)
    agent_1 = np.zeros_like(scores1)
    for i in range(N_STEPS):
        leader_goal_index = np.random.randint(0, n_goals)

        # Coin flip for the roles. The matrices are only modified in place
        # below, so these aliases keep pointing at agent_0 / agent_1.
        if np.random.random() < 0.5:
            # Agent 0 leader
            leader, follower = agent_0, agent_1
            leader_scores, follower_scores = scores0, scores1
        else:
            # Agent 1 leader
            leader, follower = agent_1, agent_0
            leader_scores, follower_scores = scores1, scores0

        # Leader: random message with probability EGREEDY, otherwise the
        # highest-valued message in the row of its goal.
        leader_msg_index = (
            np.random.choice(range(scores.shape[0]))
            if np.random.random() < EGREEDY
            else np.argmax(leader[leader_goal_index])
        )
        # Follower: random goal with probability EGREEDY, otherwise the
        # highest-valued goal in the column of the received message.
        follower_goal_index = (
            np.random.choice(range(scores.shape[0]))
            if np.random.random() < EGREEDY
            else np.argmax(follower[:, leader_msg_index])
        )

        # One Bernoulli draw per agent, leader first.
        leader_reward = (
            np.random.random() < leader_scores[leader_goal_index, follower_goal_index]
        )
        follower_reward = (
            np.random.random() < follower_scores[follower_goal_index, leader_goal_index]
        )

        # Success: the whole row loses DELTA and the used entry gets 2 * DELTA
        # back (net +DELTA). Failure: only the used entry loses DELTA.
        if leader_reward:
            leader[leader_goal_index, :] -= DELTA
            leader[leader_goal_index, leader_msg_index] += (leader_reward + 1) * DELTA
        else:
            leader[leader_goal_index, leader_msg_index] -= DELTA

        # Same rule for the follower, applied along the message column.
        if follower_reward:
            follower[:, leader_msg_index] -= DELTA
            follower[follower_goal_index, leader_msg_index] += (
                follower_reward + 1
            ) * DELTA
        else:
            follower[follower_goal_index, leader_msg_index] -= DELTA

    # A goal is aligned when the other agent's greedy decoding of the leader's
    # greedy message gives back the same goal.
    alignment_0_leader = [
        goal == np.argmax(agent_1[:, np.argmax(agent_0[goal])])
        for goal in range(0, n_goals)
    ]
    alignment_1_leader = [
        goal == np.argmax(agent_0[:, np.argmax(agent_1[goal])])
        for goal in range(0, n_goals)
    ]

    a0.append(sum(alignment_0_leader) / len(alignment_0_leader))
    a1.append(sum(alignment_1_leader) / len(alignment_1_leader))

In [None]:
# Average the per-seed alignment of each agent and report it
mean_alignment_0 = np.mean(a0)
mean_alignment_1 = np.mean(a1)
print(f"Alignment for agent 0: {mean_alignment_0}")
print(f"Alignment for agent 1: {mean_alignment_1}")

## Minimum competence for getting alignment

Here we analyse the minimum probability needed to reach alignment. The probability of solving aligned goals is $p$, and the probability of solving different goals is $(1 - t)\,p$, which is 0 only when $t = 1$.


In [None]:
# Settings for the alignment-vs-p experiment in the next cell
NUMBER_OF_LANDMARKS = 15
n_goals = NUMBER_OF_LANDMARKS
N_STEPS = 10_000
# Exploration probability
EGREEDY = 0.05
# Weight of the identity term when the scores matrix is built
t = 0.5
# Multiplier of the penalty applied after an unrewarded interaction
discount = 1.0
# Multiplier of the spread-out decrement applied after a rewarded interaction
discount_b = 1 / n_goals

In [None]:
# Alignment for all seeds, one list per probability value
a0_p = []
a1_p = []


probas = np.arange(0.1, 1.1, 0.1)
for p in probas:
    a0 = []
    a1 = []
    # Diagonal entries equal p, off-diagonal entries equal (1 - t) * p
    scores = (
        np.eye(NUMBER_OF_LANDMARKS) * t
        + np.ones((NUMBER_OF_LANDMARKS, NUMBER_OF_LANDMARKS)) * (1 - t)
    ) * p

    for s in range(3):
        np.random.seed(s)
        agent_0 = np.zeros_like(scores)
        agent_1 = np.zeros_like(scores)
        for i in range(N_STEPS):
            leader_goal_index = np.random.randint(0, scores.shape[0])

            # Coin flip for the roles. Matrices are only modified in place
            # below, so the aliases keep pointing at agent_0 / agent_1.
            if np.random.random() < 0.5:
                # Agent 0 leader
                leader, follower = agent_0, agent_1
            else:
                # Agent 1 leader
                leader, follower = agent_1, agent_0

            # Epsilon-greedy message (leader), then epsilon-greedy goal (follower)
            leader_msg_index = (
                np.random.choice(range(scores.shape[0]))
                if np.random.random() < EGREEDY
                else np.argmax(leader[leader_goal_index])
            )
            follower_goal_index = (
                np.random.choice(range(scores.shape[0]))
                if np.random.random() < EGREEDY
                else np.argmax(follower[:, leader_msg_index])
            )

            # One Bernoulli draw per agent, leader first
            leader_reward = (
                np.random.random() < scores[leader_goal_index, follower_goal_index]
            )
            follower_reward = (
                np.random.random() < scores[follower_goal_index, leader_goal_index]
            )

            # Success: the row loses DELTA*discount_b, the used entry then
            # gains (1 + discount_b) * DELTA. Failure: the used entry loses
            # DELTA*discount.
            if leader_reward:
                leader[leader_goal_index, :] -= DELTA*discount_b
                leader[leader_goal_index, leader_msg_index] += (
                    leader_reward + discount_b
                ) * DELTA
            else:
                leader[leader_goal_index, leader_msg_index] -= DELTA*discount

            # Same rule for the follower, along the message column
            if follower_reward:
                follower[:, leader_msg_index] -= DELTA*discount_b
                follower[follower_goal_index, leader_msg_index] += (
                    follower_reward + discount_b
                ) * DELTA
            else:
                follower[follower_goal_index, leader_msg_index] -= DELTA*discount

        # Fraction of goals recovered by greedy encode / greedy decode
        alignment_0_leader = [
            goal == np.argmax(agent_1[:, np.argmax(agent_0[goal])])
            for goal in range(0, n_goals)
        ]
        alignment_1_leader = [
            goal == np.argmax(agent_0[:, np.argmax(agent_1[goal])])
            for goal in range(0, n_goals)
        ]

        a0.append(sum(alignment_0_leader) / len(alignment_0_leader))
        a1.append(sum(alignment_1_leader) / len(alignment_1_leader))
    print(a0,a1)
    a0_p.append(a0)
    a1_p.append(a1)

In [None]:
# Mean alignment over seeds (band: +/- one standard deviation) against p,
# one curve per agent.
plt.figure(figsize=[3, 3])
for agent_label, alignment_runs in (("Agent 0", a0_p), ("Agent 1", a1_p)):
    mean_alignment = np.mean(alignment_runs, 1)
    std_alignment = np.std(alignment_runs, 1)
    plt.plot(probas, mean_alignment, label=agent_label)
    plt.fill_between(
        probas,
        mean_alignment + std_alignment,
        mean_alignment - std_alignment,
        alpha=0.4,
    )

plt.legend()
plt.xlabel("Probability of solving aligned goals")
plt.ylabel("Alignment")
plt.grid()

plt.savefig("alignment_vs_p.png", dpi=300, bbox_inches="tight", transparent=True)

In [None]:
# Settings for the batched-update experiment in the next cell
NUMBER_OF_LANDMARKS = 15
n_goals = NUMBER_OF_LANDMARKS
N_STEPS = 100
# Exploration probability
EGREEDY = 0.1
# Weight of the identity term when the scores matrix is built
t = 1.0
# Mixing weight of the new estimate in the moving-average update
alpha = 0.9

# Carried over from the previous experiment's settings
discount = 1.0
discount_b = 1 / n_goals

In [None]:
# Alignment for all seeds, one list per probability value.
#
# Each outer step plays a batch of interactions with frozen agent matrices,
# counts how often every (goal, message) entry was used and rewarded, and then
# moves ONE agent's matrix towards its empirical success rates.
a0_p = []
a1_p = []


probas = np.arange(0.1, 1.1, 0.1)

for p in probas:
    a0 = []
    a1 = []
    # Diagonal entries equal p, off-diagonal entries equal (1 - t) * p
    scores = (np.eye(NUMBER_OF_LANDMARKS)*t+np.ones((NUMBER_OF_LANDMARKS,NUMBER_OF_LANDMARKS))*(1-t)) * p

    for s in range(3):
        np.random.seed(s)
        agent_0 = np.zeros_like(scores)
        agent_1 = np.zeros_like(scores)
        # Selects which agent is updated. It is flipped when i % 4 == 0, so it
        # becomes False at i == 0: agent_1 is updated for steps 0-3, agent_0
        # for steps 4-7, and so on.
        change = True
        for i in range(N_STEPS):
            # Per-batch counters: rewarded uses and total uses of each entry
            update_0 = np.zeros_like(scores)
            update_1 = np.zeros_like(scores)
            normalization_0 = np.zeros_like(scores)
            normalization_1 = np.zeros_like(scores)

            for _ in range(600):
                leader_goal_index = np.random.randint(0, scores.shape[0])

                # Coin flip for the roles; the counters are only modified in
                # place, so the aliases keep pointing at the right arrays.
                if np.random.random() < 0.5:
                    # Agent 0 leader
                    leader, follower = agent_0, agent_1
                    leader_hits, leader_uses = update_0, normalization_0
                    follower_hits, follower_uses = update_1, normalization_1
                else:
                    # Agent 1 leader
                    leader, follower = agent_1, agent_0
                    leader_hits, leader_uses = update_1, normalization_1
                    follower_hits, follower_uses = update_0, normalization_0

                # Epsilon-greedy message (leader), then epsilon-greedy goal (follower)
                if np.random.random() < EGREEDY:
                    leader_msg_index = np.random.choice(range(scores.shape[0]))
                else:
                    leader_msg_index = np.argmax(leader[leader_goal_index])
                if np.random.random() < EGREEDY:
                    follower_goal_index = np.random.choice(range(scores.shape[0]))
                else:
                    follower_goal_index = np.argmax(follower[:, leader_msg_index])

                # One Bernoulli draw per agent, leader first
                leader_reward = (
                    np.random.random() < scores[leader_goal_index, follower_goal_index]
                )
                follower_reward = (
                    np.random.random() < scores[follower_goal_index, leader_goal_index]
                )

                if leader_reward:
                    leader_hits[leader_goal_index, leader_msg_index] += 1
                leader_uses[leader_goal_index, leader_msg_index] += 1
                if follower_reward:
                    follower_hits[follower_goal_index, leader_msg_index] += 1
                follower_uses[follower_goal_index, leader_msg_index] += 1

            if i % 4 == 0:
                change = not change

            # Exponential moving average towards the batch success rates; the
            # 1e-10 keeps never-used entries at 0/1e-10 = 0 instead of 0/0.
            if change:
                agent_0 = (1-alpha)*agent_0+alpha*update_0/(normalization_0+1e-10)
            else:
                agent_1 = (1-alpha)*agent_1+alpha*update_1/(normalization_1+1e-10)

        # Fraction of goals recovered by greedy encode / greedy decode
        alignment_0_leader = []
        for i in range(0, n_goals):
            msg = np.argmax(agent_0[i])
            j = np.argmax(agent_1[:, msg])
            alignment_0_leader.append(i == j)

        alignment_1_leader = []
        for i in range(0, n_goals):
            msg = np.argmax(agent_1[i])
            j = np.argmax(agent_0[:, msg])
            alignment_1_leader.append(i == j)

        a0.append(sum(alignment_0_leader) / len(alignment_0_leader))
        a1.append(sum(alignment_1_leader) / len(alignment_1_leader))
    print(a0,a1)
    a0_p.append(a0)
    a1_p.append(a1)

In [None]:
# Mean alignment over seeds (band: +/- one standard deviation) against p,
# one curve per agent.
plt.figure(figsize=[3, 3])
for agent_label, alignment_runs in (("Agent 0", a0_p), ("Agent 1", a1_p)):
    mean_alignment = np.mean(alignment_runs, 1)
    std_alignment = np.std(alignment_runs, 1)
    plt.plot(probas, mean_alignment, label=agent_label)
    plt.fill_between(
        probas,
        mean_alignment + std_alignment,
        mean_alignment - std_alignment,
        alpha=0.4,
    )

plt.legend()
plt.xlabel("Probability of solving aligned goals")
plt.ylabel("Alignment")
plt.grid()

plt.savefig("alignment_vs_p.png", dpi=300, bbox_inches="tight", transparent=True)

In [None]:
# Settings for the softmax-sampling experiment below
NUMBER_OF_LANDMARKS = 15
n_goals = NUMBER_OF_LANDMARKS
N_STEPS = 100
EGREEDY = 0.1
# Weight of the identity term when the scores matrix is built
t = 0.75
# Mixing weight of the new estimate in the moving-average update
alpha = 0.05

# Carried over from the earlier experiments' settings
discount = 1.0
discount_b = 1 / n_goals

In [None]:
def softmax(x, temperature=30):
    """Compute softmax values for each sets of scores in x.

    The scores are first multiplied by ``temperature`` (so a larger value
    gives a more peaked distribution), then shifted by their global maximum
    for numerical stability before exponentiating.

    Note: this definition replaces the ``softmax`` imported from
    ``scipy.special`` at the top of the notebook.

    Parameters
    ----------
    x : numpy.ndarray
        Scores. Normalisation is done along axis 0, so a 1-D input yields a
        single probability vector.
    temperature : float, default 30
        Multiplier applied to ``x`` before the exponential.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` whose entries sum to 1 along axis 0.
    """
    x = x * temperature
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [None]:
# Alignment for all seeds, one list per probability value.
#
# Messages and goals are sampled from a softmax over the agent matrices.
# Every agent keeps separate per-batch statistics for its leader role
# (update_X / normalization_X) and its follower role (update_X_b /
# normalization_X_b); both agents are updated after every batch with the
# average of their two empirical success rates.
a0_p = []
a1_p = []


probas = np.arange(0.1, 1.1, 0.1)

for p in probas:
    a0 = []
    a1 = []
    # Diagonal entries equal p, off-diagonal entries equal (1 - t) * p
    scores = (np.eye(NUMBER_OF_LANDMARKS)*t+np.ones((NUMBER_OF_LANDMARKS,NUMBER_OF_LANDMARKS))*(1-t)) * p

    for s in range(3):
        np.random.seed(s)
        agent_0 = np.zeros_like(scores)
        agent_1 = np.zeros_like(scores)
        for i in range(N_STEPS):
            # Leader-role counters (rewarded uses / total uses)
            update_0 = np.zeros_like(scores)
            update_1 = np.zeros_like(scores)
            normalization_0 = np.zeros_like(scores)
            normalization_1 = np.zeros_like(scores)
            # Follower-role counters
            update_0_b = np.zeros_like(scores)
            update_1_b = np.zeros_like(scores)
            normalization_0_b = np.zeros_like(scores)
            normalization_1_b = np.zeros_like(scores)

            for _ in range(600):
                leader_goal_index = np.random.randint(0, scores.shape[0])

                # Coin flip for the roles; the counters are only modified in
                # place, so the aliases keep pointing at the right arrays.
                if np.random.random() < 0.5:
                    # Agent 0 leader
                    leader, follower = agent_0, agent_1
                    leader_hits, leader_uses = update_0, normalization_0
                    follower_hits, follower_uses = update_1_b, normalization_1_b
                else:
                    # Agent 1 leader
                    leader, follower = agent_1, agent_0
                    leader_hits, leader_uses = update_1, normalization_1
                    follower_hits, follower_uses = update_0_b, normalization_0_b

                # Leader samples a message from its goal row, follower samples
                # a goal from its column for that message.
                leader_msg_index = np.random.choice(range(scores.shape[0]),p=softmax(leader[leader_goal_index]))
                follower_goal_index = np.random.choice(range(scores.shape[0]),p=softmax(follower[:, leader_msg_index]))

                # One Bernoulli draw per agent, leader first
                leader_reward = (
                    np.random.random() < scores[leader_goal_index, follower_goal_index]
                )
                follower_reward = (
                    np.random.random() < scores[follower_goal_index, leader_goal_index]
                )

                if leader_reward:
                    leader_hits[leader_goal_index, leader_msg_index] += 1
                leader_uses[leader_goal_index, leader_msg_index] += 1
                if follower_reward:
                    follower_hits[follower_goal_index, leader_msg_index] += 1
                follower_uses[follower_goal_index, leader_msg_index] += 1

            # Moving average towards the mean of the leader-role and
            # follower-role success rates; the 1e-10 keeps unused entries at 0.
            agent_0 = (1-alpha)*agent_0+alpha*(update_0/(normalization_0+1e-10)+update_0_b/(normalization_0_b+1e-10))/2
            agent_1 = (1-alpha)*agent_1+alpha*(update_1/(normalization_1+1e-10)+update_1_b/(normalization_1_b+1e-10))/2

        # Fraction of goals recovered by greedy encode / greedy decode
        alignment_0_leader = []
        for i in range(0, n_goals):
            msg = np.argmax(agent_0[i])
            j = np.argmax(agent_1[:, msg])
            alignment_0_leader.append(i == j)

        alignment_1_leader = []
        for i in range(0, n_goals):
            msg = np.argmax(agent_1[i])
            j = np.argmax(agent_0[:, msg])
            alignment_1_leader.append(i == j)

        a0.append(sum(alignment_0_leader) / len(alignment_0_leader))
        a1.append(sum(alignment_1_leader) / len(alignment_1_leader))

        # Per-seed report: marker line, p, both final matrices (shown once
        # each; they do not change after the loop above) and the alignments.
        print("aaaaaa")
        print(p)
        plt.imshow(agent_0)
        plt.show()
        plt.imshow(agent_1)
        plt.show()
        print(a0[-1], a1[-1])

    print(a0,a1)
    a0_p.append(a0)
    a1_p.append(a1)

In [None]:
# Mean alignment over seeds (band: +/- one standard deviation) against p,
# one curve per agent.
plt.figure(figsize=[3, 3])
for agent_label, alignment_runs in (("Agent 0", a0_p), ("Agent 1", a1_p)):
    mean_alignment = np.mean(alignment_runs, 1)
    std_alignment = np.std(alignment_runs, 1)
    plt.plot(probas, mean_alignment, label=agent_label)
    plt.fill_between(
        probas,
        mean_alignment + std_alignment,
        mean_alignment - std_alignment,
        alpha=0.4,
    )

plt.legend()
plt.xlabel("Probability of solving aligned goals")
plt.ylabel("Alignment")
plt.grid()

plt.savefig("alignment_vs_p.png", dpi=300, bbox_inches="tight", transparent=True)

In [None]:
# Settings for the temperature sweep in the next cell
NUMBER_OF_LANDMARKS = 15
n_goals = NUMBER_OF_LANDMARKS
N_STEPS = 250
EGREEDY = 0.1
# Weight of the identity term when the scores matrix is built
t = 0.75
# Initial value only: the sweep's own loop variable rebinds `temperature`
temperature = 30
# Mixing weight of the new estimate in the moving-average update
alpha = 0.1

# Carried over from the earlier experiments' settings
discount = 1.0
discount_b = 1 / n_goals

In [None]:
# Sweep of the softmax temperature: for every temperature, run the batched
# experiment over all p values and draw (and save) one alignment-vs-p figure.
for temperature in np.linspace(1,41,5):
    print("aaaaaaaaa")
    print(temperature)
    # Alignment for all seeds, one list per probability value
    a0_p = []
    a1_p = []


    probas = np.arange(0.1, 1.1, 0.1)

    for p in probas:
        a0 = []
        a1 = []
        # Diagonal entries equal p, off-diagonal entries equal (1 - t) * p
        scores = (np.eye(NUMBER_OF_LANDMARKS)*t+np.ones((NUMBER_OF_LANDMARKS,NUMBER_OF_LANDMARKS))*(1-t)) * p

        for s in range(3):
            np.random.seed(s)
            agent_0 = np.zeros_like(scores)
            agent_1 = np.zeros_like(scores)
            # Selects which agent is updated. It is flipped when i % 4 == 0,
            # so it becomes False at i == 0: agent_1 is updated for steps 0-3,
            # agent_0 for steps 4-7, and so on.
            change = True
            for i in range(N_STEPS):
                # Per-batch counters: rewarded uses and total uses of each entry
                update_0 = np.zeros_like(scores)
                update_1 = np.zeros_like(scores)
                normalization_0 = np.zeros_like(scores)
                normalization_1 = np.zeros_like(scores)

                for _ in range(600):
                    leader_goal_index = np.random.randint(0, scores.shape[0])

                    # Coin flip for the roles; the counters are only modified
                    # in place, so the aliases keep pointing at the right arrays.
                    if np.random.random() < 0.5:
                        # Agent 0 leader
                        leader, follower = agent_0, agent_1
                        leader_hits, leader_uses = update_0, normalization_0
                        follower_hits, follower_uses = update_1, normalization_1
                    else:
                        # Agent 1 leader
                        leader, follower = agent_1, agent_0
                        leader_hits, leader_uses = update_1, normalization_1
                        follower_hits, follower_uses = update_0, normalization_0

                    # Leader samples a message from its goal row, follower
                    # samples a goal from its column for that message.
                    leader_msg_index = np.random.choice(range(scores.shape[0]),p=softmax(leader[leader_goal_index],temperature=temperature))
                    follower_goal_index = np.random.choice(range(scores.shape[0]),p=softmax(follower[:, leader_msg_index],temperature=temperature))

                    # One Bernoulli draw per agent, leader first
                    leader_reward = (
                        np.random.random() < scores[leader_goal_index, follower_goal_index]
                    )
                    follower_reward = (
                        np.random.random() < scores[follower_goal_index, leader_goal_index]
                    )

                    if leader_reward:
                        leader_hits[leader_goal_index, leader_msg_index] += 1
                    leader_uses[leader_goal_index, leader_msg_index] += 1
                    if follower_reward:
                        follower_hits[follower_goal_index, leader_msg_index] += 1
                    follower_uses[follower_goal_index, leader_msg_index] += 1

                if i % 4 == 0:
                    change = not change

                # Moving average towards the batch success rates; the 1e-10
                # keeps never-used entries at 0 instead of 0/0.
                if change:
                    agent_0 = (1-alpha)*agent_0+alpha*update_0/(normalization_0+1e-10)
                else:
                    agent_1 = (1-alpha)*agent_1+alpha*update_1/(normalization_1+1e-10)

            # Fraction of goals recovered by greedy encode / greedy decode
            alignment_0_leader = []
            for i in range(0, n_goals):
                msg = np.argmax(agent_0[i])
                j = np.argmax(agent_1[:, msg])
                alignment_0_leader.append(i == j)

            alignment_1_leader = []
            for i in range(0, n_goals):
                msg = np.argmax(agent_1[i])
                j = np.argmax(agent_0[:, msg])
                alignment_1_leader.append(i == j)

            a0.append(sum(alignment_0_leader) / len(alignment_0_leader))
            a1.append(sum(alignment_1_leader) / len(alignment_1_leader))
        print(a0,a1)
        a0_p.append(a0)
        a1_p.append(a1)

    # One figure per temperature: mean alignment +/- one standard deviation
    plt.figure(figsize=[3, 3])
    for agent_label, alignment_runs in (("Agent 0", a0_p), ("Agent 1", a1_p)):
        mean_alignment = np.mean(alignment_runs, 1)
        std_alignment = np.std(alignment_runs, 1)
        plt.plot(probas, mean_alignment, label=agent_label)
        plt.fill_between(
            probas,
            mean_alignment + std_alignment,
            mean_alignment - std_alignment,
            alpha=0.4,
        )
    plt.legend()
    plt.xlabel("Probability of solving aligned goals")
    plt.ylabel("Alignment")
    plt.grid()

    # The temperature is part of the file name; a fixed name would be
    # overwritten on every iteration and only the last figure would survive.
    plt.savefig(
        f"alignment_vs_p_temperature_{temperature:g}.png",
        dpi=300,
        bbox_inches="tight",
        transparent=True,
    )
    plt.show()

## Two matrices per agent

One matrix is used when the agent is leader and a different one is used when the agent is a follower

In [None]:
# Alignment for all seeds (one entry per seed).
#
# Each agent has a leader matrix (rows: goals, columns: messages) and a
# separate follower matrix (rows: messages, columns: goals).
#
# Everything is sized from scores0 / scores1, the matrices the rewards are
# actually read from. The globals `scores` and `n_goals` are rebound by the
# experiments in between and may no longer match their shape, which would
# index scores0 / scores1 out of bounds.
if scores0.shape != scores1.shape or scores0.shape[0] != scores0.shape[1]:
    raise ValueError("scores0 and scores1 must be square matrices of the same shape")
num_goals = scores0.shape[0]

a0 = []
a1 = []

for s in range(10):
    np.random.seed(s)
    agent_0 = np.zeros_like(scores0)
    agent_1 = np.zeros_like(scores1)
    agent_0f = np.zeros_like(scores0)
    agent_1f = np.zeros_like(scores1)

    for i in range(N_STEPS):
        leader_goal_index = np.random.randint(0, num_goals)

        # Coin flip for the roles. The matrices are only modified in place
        # below, so these aliases keep pointing at the right arrays.
        if np.random.random() < 0.5:
            # Agent 0 leader, agent 1 follows with its follower matrix
            leader, follower = agent_0, agent_1f
            leader_scores, follower_scores = scores0, scores1
        else:
            # Agent 1 leader, agent 0 follows with its follower matrix
            leader, follower = agent_1, agent_0f
            leader_scores, follower_scores = scores1, scores0

        # Leader: random message with probability EGREEDY, otherwise the
        # highest-valued message in the row of its goal.
        if np.random.random() < EGREEDY:
            leader_msg_index = np.random.choice(range(num_goals))
        else:
            leader_msg_index = np.argmax(leader[leader_goal_index])
        # Follower: random goal with probability EGREEDY, otherwise the
        # highest-valued goal in the row of the received message.
        if np.random.random() < EGREEDY:
            follower_goal_index = np.random.choice(range(num_goals))
        else:
            follower_goal_index = np.argmax(follower[leader_msg_index])

        # One Bernoulli draw per agent, leader first
        leader_reward = (
            np.random.random() < leader_scores[leader_goal_index, follower_goal_index]
        )
        follower_reward = (
            np.random.random() < follower_scores[follower_goal_index, leader_goal_index]
        )

        # Success: the whole row loses DELTA and the used entry gets 2 * DELTA
        # back (net +DELTA). Failure: only the used entry loses DELTA.
        if leader_reward:
            leader[leader_goal_index, :] -= DELTA
            leader[leader_goal_index, leader_msg_index] += (
                leader_reward + 1
            ) * DELTA
        else:
            leader[leader_goal_index, leader_msg_index] -= DELTA

        if follower_reward:
            follower[leader_msg_index] -= DELTA
            follower[leader_msg_index, follower_goal_index] += (
                follower_reward + 1
            ) * DELTA
        else:
            follower[leader_msg_index, follower_goal_index] -= DELTA

    # A goal is aligned when the other agent's follower matrix greedily
    # decodes the leader's greedy message back to the same goal.
    alignment_0_leader = []
    for i in range(0, num_goals):
        msg = np.argmax(agent_0[i])
        j = np.argmax(agent_1f[msg])
        alignment_0_leader.append(i == j)

    alignment_1_leader = []
    for i in range(0, num_goals):
        msg = np.argmax(agent_1[i])
        j = np.argmax(agent_0f[msg])
        alignment_1_leader.append(i == j)

    a0.append(sum(alignment_0_leader) / len(alignment_0_leader))
    a1.append(sum(alignment_1_leader) / len(alignment_1_leader))

In [None]:
# Average the per-seed alignment of each agent and report it
mean_alignment_0 = np.mean(a0)
mean_alignment_1 = np.mean(a1)
print(f"Alignment for agent 0: {mean_alignment_0}")
print(f"Alignment for agent 1: {mean_alignment_1}")

# separate follower and leader update

In [None]:
# Settings for the separate leader/follower update experiment below
NUMBER_OF_LANDMARKS = 15
n_goals = NUMBER_OF_LANDMARKS
N_STEPS = 1000
EGREEDY = 0.1
# Weight of the identity term when the scores matrix is built
t = 0.5
# Mixing weight of the new estimate in the moving-average update
alpha = 0.1

# Carried over from the earlier experiments' settings
discount = 1.0
discount_b = 1 / n_goals

In [None]:
# Naming game in which every agent keeps one naming matrix (rows = goals, columns = messages)
# that is updated from TWO separate success-rate estimates per step: one gathered while the
# agent was leader, one gathered while it was follower.
#
# Alignment for all seeds: a0_p[k] / a1_p[k] hold the per-seed alignments obtained for probas[k]
a0_p = []
a1_p = []

# Single probability for this run; the full sweep would be np.arange(0.1, 1.1, 0.1)
probas = [0.7]

for p in probas:
    a0 = []
    a1 = []
    # Success probability: p when both agents pursue the same goal, (1 - t) * p otherwise
    scores = (np.eye(NUMBER_OF_LANDMARKS)*t+np.ones((NUMBER_OF_LANDMARKS,NUMBER_OF_LANDMARKS))*(1-t)) * p
    n_rows, n_cols = scores.shape

    for s in range(3):
        np.random.seed(s)
        agent_0 = np.zeros_like(scores)
        agent_1 = np.zeros_like(scores)

        # One RGB frame per step, agent 0 on the left and agent 1 on the right (5 px gap):
        #   top band    / channel 0: naming matrices before the update
        #   middle band / channel 1: success rates observed as leader
        #   bottom band / channel 2: success rates observed as follower
        vid_agents = np.zeros((N_STEPS, n_rows*3+10, 2*n_cols+5, 3))

        for step in range(N_STEPS):
            # The matrices are frozen while the batch of games is played
            tables = (agent_0, agent_1)
            lead_hits = [np.zeros_like(scores), np.zeros_like(scores)]
            lead_tries = [np.zeros_like(scores), np.zeros_like(scores)]
            follow_hits = [np.zeros_like(scores), np.zeros_like(scores)]
            follow_tries = [np.zeros_like(scores), np.zeros_like(scores)]

            vid_agents[step, :n_rows, :n_cols, 0] = agent_0
            vid_agents[step, :n_rows, n_cols+5:, 0] = agent_1

            for _ in range(600):
                # The order of the random draws is kept fixed so results stay reproducible per seed
                leader_goal_index = np.random.randint(0, n_rows)
                lead = 0 if np.random.random() < 0.5 else 1
                follow = 1 - lead

                # Leader samples a message for its goal; follower samples a goal for that message.
                # On a fresh top-to-bottom run `softmax` is still scipy.special.softmax here; the
                # temperature-scaled redefinition only appears later in the notebook.
                leader_msg_index = np.random.choice(range(n_rows), p=softmax(tables[lead][leader_goal_index]))
                follower_goal_index = np.random.choice(range(n_rows), p=softmax(tables[follow][:, leader_msg_index]))

                leader_reward = np.random.random() < scores[leader_goal_index, follower_goal_index]
                follower_reward = np.random.random() < scores[follower_goal_index, leader_goal_index]

                # Booleans add as 0/1: count successes and attempts per (goal, message) pair
                lead_hits[lead][leader_goal_index, leader_msg_index] += leader_reward
                lead_tries[lead][leader_goal_index, leader_msg_index] += 1
                follow_hits[follow][follower_goal_index, leader_msg_index] += follower_reward
                follow_tries[follow][follower_goal_index, leader_msg_index] += 1

            # Empirical success rates; the 1e-10 keeps untried pairs at 0 instead of dividing by zero
            lead_rate = [lead_hits[k] / (lead_tries[k] + 1e-10) for k in (0, 1)]
            follow_rate = [follow_hits[k] / (follow_tries[k] + 1e-10) for k in (0, 1)]

            # Exponential moving average towards the mean of the two estimates
            agent_0 = (1-alpha)*agent_0 + alpha*(lead_rate[0] + follow_rate[0])/2
            agent_1 = (1-alpha)*agent_1 + alpha*(lead_rate[1] + follow_rate[1])/2

            vid_agents[step, n_rows+5:n_rows*2+5, :n_cols, 1] = lead_rate[0]
            vid_agents[step, n_rows+5:n_rows*2+5, n_cols+5:, 1] = lead_rate[1]
            vid_agents[step, n_rows*2+10:, :n_cols, 2] = follow_rate[0]
            vid_agents[step, n_rows*2+10:, n_cols+5:, 2] = follow_rate[1]

        # Greedy alignment: goal g is aligned when the follower's best goal for the
        # leader's best message for g is g itself
        alignment_0_leader = [g == np.argmax(agent_1[:, np.argmax(agent_0[g])]) for g in range(n_goals)]
        alignment_1_leader = [g == np.argmax(agent_0[:, np.argmax(agent_1[g])]) for g in range(n_goals)]
        a0.append(sum(alignment_0_leader) / len(alignment_0_leader))
        a1.append(sum(alignment_1_leader) / len(alignment_1_leader))

        print(p)
        plt.imshow(agent_0)
        plt.show()
        plt.imshow(agent_1)
        plt.show()

        # VideoWriter is defined in the "video" section further down; on a fresh top-to-bottom
        # run it does not exist yet, so skip the video instead of failing after the simulation.
        if "VideoWriter" in globals():
            with VideoWriter("out.mp4", 20) as vid:
                for frame in vid_agents:
                    # Upscale 4x frame by frame rather than materialising the whole enlarged buffer
                    vid.add(np.repeat(np.repeat(frame, 4, axis=0), 4, axis=1))
                vid.show()
        else:
            print("VideoWriter is not defined yet (run the 'video' section first); skipping out.mp4")
        print(sum(alignment_0_leader) / len(alignment_0_leader),sum(alignment_1_leader) / len(alignment_1_leader))

    print(a0,a1)
    a0_p.append(a0)
    a1_p.append(a1)

# video

In [None]:
!pip install moviepy

In [None]:
import os
os.environ['FFMPEG_BINARY'] = 'ffmpeg'
import moviepy.editor as mvp
from moviepy.video.io.ffmpeg_writer import FFMPEG_VideoWriter
from IPython.display import HTML, display, clear_output

class VideoWriter:
  """Context manager that appends frames to a video file through moviepy's FFMPEG_VideoWriter.

  The underlying writer is created lazily on the first `add` call, because the
  frame size is only known once a frame has been seen.
  """

  def __init__(self, filename, fps=30.0, **kw):
    # Everything here is forwarded to FFMPEG_VideoWriter when the first frame arrives.
    self.params = dict(filename=filename, fps=fps, **kw)
    self.writer = None

  def __enter__(self):
    return self

  def __exit__(self, *kw):
    self.close()

  def add(self, img):
    """Write one frame.

    float32/float64 frames are clipped to [0, 1] and converted to uint8;
    2-D (grayscale) frames are replicated into three channels.
    """
    frame = np.asarray(img)
    if self.writer is None:
      height, width = frame.shape[:2]
      self.writer = FFMPEG_VideoWriter(size=(width, height), **self.params)
    if frame.dtype in [np.float32, np.float64]:
      frame = np.uint8(frame.clip(0, 1)*255)
    if len(frame.shape) == 2:
      frame = np.repeat(frame[..., None], 3, -1)
    self.writer.write_frame(frame)

  def close(self):
    """Close the underlying writer if one was ever created."""
    if not self.writer:
      return
    self.writer.close()

  def show(self, **kw):
    """Close the writer and display the written file inline; `kw` goes to `mvp.ipython_display`."""
    self.close()
    video_path = self.params['filename']
    display(mvp.ipython_display(video_path, **kw))

In [None]:
# Settings for the shared-table experiment with a temperature-scaled softmax.
# These assignments overwrite the notebook-level constants of the same name.

# Size of the problem: one goal per landmark
NUMBER_OF_LANDMARKS = 15
n_goals = NUMBER_OF_LANDMARKS

# Number of updates applied to the naming matrices
N_STEPS = 400

# Multiplier applied to the scores before the softmax (higher = greedier sampling)
temperature = 30

# Mixing weight between the identity and the all-ones matrix when building `scores`
t = 0.7

# Step size of the naming-matrix update
alpha = 0.1

EGREEDY = 0.01
discount = 1.0
discount_b = 1 / n_goals

In [None]:
def softmax(x,temperature=30):
    """Compute the softmax of `x * temperature` along axis 0.

    From this cell onwards this definition replaces the `softmax` imported from
    `scipy.special` at the top of the notebook.

    Parameters
    ----------
    x : array-like
        Scores to turn into probabilities.
    temperature : float
        Multiplier applied to the scores first; larger values concentrate the
        probability mass on the largest score.

    Returns
    -------
    numpy.ndarray
        Non-negative values that sum to 1 along axis 0.
    """
    # np.asarray so that a plain list is scaled element-wise instead of being repeated
    x = np.asarray(x) * temperature
    # Subtracting the maximum avoids overflow in exp without changing the result
    e_x = np.exp(x - np.max(x))

    return e_x / e_x.sum(axis=0)

    
    

In [None]:
# Sanity check: how strongly each temperature concentrates the softmax on the largest entry
a = np.zeros(15)
a[:2] = [0.4, 0.2]
for demo_temperature in (30, 10):
    print(softmax(a, demo_temperature))

In [None]:
# Naming game in which every agent keeps ONE naming matrix (rows = goals, columns = messages)
# that is updated from the games it played as leader and as follower alike.
# The penalty for mismatched goals is ramped up over the run.
#
# Alignment for all seeds: a0_p[k] / a1_p[k] hold the per-seed alignments obtained for probas[k]
a0_p = []
a1_p = []

# Single probability for this run; the full sweep would be np.arange(0.1, 1.1, 0.1)
probas = [0.6]

n_messages = 20       # size of the message vocabulary (columns of the naming matrices)
warmup_steps = 200    # number of initial steps that use the larger step size
warmup_alpha = 0.6    # step size during warm-up
alpha_b = 0.1         # step size afterwards
# Steps over which the mismatch success probability decays from p to 0
# (max(...) guards against a zero denominator for N_STEPS == 100)
ramp_steps = max(N_STEPS - 100, 1)

for p in probas:
    a0 = []
    a1 = []

    for s in range(3):
        np.random.seed(s)
        agent_0 = np.zeros((NUMBER_OF_LANDMARKS, n_messages))
        agent_1 = np.zeros((NUMBER_OF_LANDMARKS, n_messages))

        # One grayscale frame per step: agent 0 on the left, agent 1 on the right (5 px gap)
        vid_agents = np.zeros((N_STEPS, agent_0.shape[0], 2*agent_0.shape[1]+5))

        alignement0_list = []
        alignement1_list = []
        for step in range(N_STEPS):
            # Local schedule variables: the notebook-level `t` and `alpha` are left untouched
            mix = np.clip(step / ramp_steps, 0, 1)
            step_alpha = warmup_alpha if step < warmup_steps else alpha_b

            # Matching goals always succeed with probability p; mismatched goals with (1 - mix) * p
            scores = (np.eye(NUMBER_OF_LANDMARKS)*mix+np.ones((NUMBER_OF_LANDMARKS,NUMBER_OF_LANDMARKS))*(1-mix)) * p

            # The matrices are frozen while the batch of games is played
            tables = (agent_0, agent_1)
            successes = [np.zeros_like(agent_0), np.zeros_like(agent_1)]
            attempts = [np.zeros_like(agent_0), np.zeros_like(agent_1)]

            vid_agents[step, :, :n_messages] = agent_0
            vid_agents[step, :, n_messages+5:] = agent_1

            for _ in range(600):
                # The order of the random draws is kept fixed so results stay reproducible per seed
                leader_goal_index = np.random.randint(0, scores.shape[0])
                lead = 0 if np.random.random() < 0.5 else 1
                follow = 1 - lead

                # Leader samples a message for its goal; follower samples a goal for that message
                leader_msg_index = np.random.choice(range(n_messages), p=softmax(tables[lead][leader_goal_index], temperature))
                follower_goal_index = np.random.choice(range(scores.shape[0]), p=softmax(tables[follow][:, leader_msg_index], temperature))

                leader_reward = np.random.random() < scores[leader_goal_index, follower_goal_index]
                follower_reward = np.random.random() < scores[follower_goal_index, leader_goal_index]

                # Booleans add as 0/1: count successes and attempts per (goal, message) pair
                successes[lead][leader_goal_index, leader_msg_index] += leader_reward
                attempts[lead][leader_goal_index, leader_msg_index] += 1
                successes[follow][follower_goal_index, leader_msg_index] += follower_reward
                attempts[follow][follower_goal_index, leader_msg_index] += 1

            # Exponential moving average towards the empirical success rates of this batch;
            # the 1e-10 keeps untried pairs at 0 instead of dividing by zero
            agent_0 = (1-step_alpha)*agent_0 + step_alpha*(successes[0]/(attempts[0]+1e-10))
            agent_1 = (1-step_alpha)*agent_1 + step_alpha*(successes[1]/(attempts[1]+1e-10))

            # Greedy alignment: goal g is aligned when the follower's best goal for the
            # leader's best message for g is g itself
            alignment_0_leader = [g == np.argmax(agent_1[:, np.argmax(agent_0[g])]) for g in range(n_goals)]
            alignment_1_leader = [g == np.argmax(agent_0[:, np.argmax(agent_1[g])]) for g in range(n_goals)]
            alignement0_list.append(sum(alignment_0_leader) / len(alignment_0_leader))
            alignement1_list.append(sum(alignment_1_leader) / len(alignment_1_leader))

        plt.plot(alignement0_list)
        plt.title("Alignment per step, agent 0 leader")
        plt.show()
        plt.plot(alignement1_list)
        plt.title("Alignment per step, agent 1 leader")
        plt.show()

        # The final alignment is the one measured after the last update
        a0.append(alignement0_list[-1])
        a1.append(alignement1_list[-1])
        print(p)
        plt.imshow(agent_0)
        plt.show()
        plt.imshow(agent_1)
        plt.show()

        with VideoWriter("out.mp4", 20) as vid:
            for frame in vid_agents:
                # Upscale 10x frame by frame rather than materialising the whole enlarged buffer
                vid.add(np.repeat(np.repeat(frame, 10, axis=0), 10, axis=1))
            vid.show()
        print(alignement0_list[-1],alignement1_list[-1])

    print(a0,a1)
    a0_p.append(a0)
    a1_p.append(a1)

In [None]:
# same with base params
#
# with high prob

# Probability of selecting a goal

In [None]:
def softmax(x,temperature=30,axis=0):
    """Compute the softmax of `x * temperature` along `axis`.

    Parameters
    ----------
    x : array-like
        Scores to turn into probabilities.
    temperature : float
        Multiplier applied to the scores first; larger values concentrate the
        probability mass on the largest score.
    axis : int
        Axis along which the probabilities are normalised.

    Returns
    -------
    numpy.ndarray
        Array with the shape of `x` whose values sum to 1 along `axis`.
    """
    # np.asarray so that a plain list is scaled element-wise instead of being repeated
    x = np.asarray(x) * temperature
    # Subtracting the per-slice maximum avoids overflow in exp without changing the result
    e_x = np.exp(x - np.max(x,axis=axis,keepdims=True))

    return e_x / e_x.sum(axis=axis,keepdims=True)
def entropy_softmax(a,temperature):
    """Return, for every row of `a`, the entropy (natural log) of that row's tempered softmax.

    The 1e-10 inside the logarithm keeps zero probabilities from producing -inf.
    """
    row_probs = softmax(a, temperature, axis=1)
    log_probs = np.log(row_probs + 1e-10)
    return -(row_probs * log_probs).sum(axis=1)

In [None]:
# Toy naming matrix: only goal 0 has a preferred message, every other row is uniform.
# (It is built from scratch, so this cell does not depend on an earlier simulation.)
agent=np.zeros((15,15))
agent[0,0]=0.5
plt.imshow(agent)
plt.show()
plt.imshow(softmax(agent,30,1))
plt.show()
print(softmax(agent,10,1)[0])

# Per-goal uncertainty of the message distribution
ent=entropy_softmax(agent,temperature)
print(ent)

# Goal-sampling distribution: goals with a more uncertain name get more probability
temperature2=2
p=softmax(ent,temperature2)
print(p)

In [None]:
# Settings for the entropy-driven goal-sampling experiment.
# These assignments overwrite the notebook-level constants of the same name.

# Size of the problem: one goal per landmark
NUMBER_OF_LANDMARKS = 15
n_goals = NUMBER_OF_LANDMARKS

# Number of updates applied to the naming matrices
N_STEPS = 500

# Multiplier applied to the scores before the softmax (higher = greedier sampling)
temperature = 20

# Mixing weight used to build `scores`: matching goals succeed with probability p,
# mismatched goals with probability (1 - t) * p
t = 0.7

# Weight of the newest batch estimate in the running average of the naming matrices
alpha = 0.15

# NOTE(review): the three values below are not read by the simulation cell that follows;
# confirm whether they are still needed.
EGREEDY = 0.1
discount = 1.0
discount_b = 1 / n_goals

In [None]:
# Naming game with one naming matrix per agent (rows = goals, columns = messages) in which
# the leader picks its goal with a probability that grows with the entropy of that goal's
# message distribution, i.e. it practises the goals whose name is still uncertain.
#
# Alignment for all seeds: a0_p[k] / a1_p[k] hold the per-seed alignments obtained for probas[k]
a0_p = []
a1_p = []

# Single probability for this run; the full sweep would be np.arange(0.1, 1.1, 0.1)
probas = [0.7]

for p in probas:
    a0 = []
    a1 = []
    # Success probability: p when both agents pursue the same goal, (1 - t) * p otherwise
    scores = (np.eye(NUMBER_OF_LANDMARKS)*t+np.ones((NUMBER_OF_LANDMARKS,NUMBER_OF_LANDMARKS))*(1-t)) * p

    for s in range(3):
        np.random.seed(s)
        agent_0 = np.zeros_like(scores)+1e-10
        agent_1 = np.zeros_like(scores)+1e-10

        # One grayscale frame per step: agent 0 on the left, agent 1 on the right (5 px gap)
        vid_agents = np.zeros((N_STEPS, scores.shape[0], 2*scores.shape[1]+5))

        for step in range(N_STEPS):
            # The matrices are frozen while the batch of games is played, so the goal-sampling
            # distributions are computed once per step instead of once per game.
            tables = (agent_0, agent_1)
            goal_probs = [softmax(entropy_softmax(table, temperature), 2) for table in tables]
            successes = [np.zeros_like(scores), np.zeros_like(scores)]
            attempts = [np.zeros_like(scores), np.zeros_like(scores)]

            vid_agents[step, :, :scores.shape[1]] = agent_0
            vid_agents[step, :, scores.shape[1]+5:] = agent_1

            for _ in range(600):
                # The order of the random draws is kept fixed so results stay reproducible per seed
                lead = 0 if np.random.random() < 0.5 else 1
                follow = 1 - lead

                # Sample the leader's goal, favouring goals with an uncertain name
                leader_goal_index = np.random.choice(range(scores.shape[0]), p=goal_probs[lead])

                # Leader samples a message for its goal; follower samples a goal for that message
                leader_msg_index = np.random.choice(range(scores.shape[0]), p=softmax(tables[lead][leader_goal_index], temperature))
                follower_goal_index = np.random.choice(range(scores.shape[0]), p=softmax(tables[follow][:, leader_msg_index], temperature))

                leader_reward = np.random.random() < scores[leader_goal_index, follower_goal_index]
                follower_reward = np.random.random() < scores[follower_goal_index, leader_goal_index]

                # Booleans add as 0/1: count successes and attempts per (goal, message) pair
                successes[lead][leader_goal_index, leader_msg_index] += leader_reward
                attempts[lead][leader_goal_index, leader_msg_index] += 1
                successes[follow][follower_goal_index, leader_msg_index] += follower_reward
                attempts[follow][follower_goal_index, leader_msg_index] += 1

            # Exponential moving average towards the empirical success rates of this batch;
            # the 1e-10 keeps untried pairs at 0 instead of dividing by zero
            agent_0 = (1-alpha)*agent_0 + alpha*(successes[0]/(attempts[0]+1e-10))
            agent_1 = (1-alpha)*agent_1 + alpha*(successes[1]/(attempts[1]+1e-10))

        # Greedy alignment: goal g is aligned when the follower's best goal for the
        # leader's best message for g is g itself
        alignment_0_leader = [g == np.argmax(agent_1[:, np.argmax(agent_0[g])]) for g in range(n_goals)]
        alignment_1_leader = [g == np.argmax(agent_0[:, np.argmax(agent_1[g])]) for g in range(n_goals)]
        a0.append(sum(alignment_0_leader) / len(alignment_0_leader))
        a1.append(sum(alignment_1_leader) / len(alignment_1_leader))

        print(p)
        plt.imshow(agent_0)
        plt.show()
        plt.imshow(agent_1)
        plt.show()

        with VideoWriter("out.mp4", 20) as vid:
            for frame in vid_agents:
                # Upscale 10x frame by frame rather than materialising the whole enlarged buffer
                vid.add(np.repeat(np.repeat(frame, 10, axis=0), 10, axis=1))
            vid.show()
        print(sum(alignment_0_leader) / len(alignment_0_leader),sum(alignment_1_leader) / len(alignment_1_leader))
        # Stop at the first poorly aligned seed, leaving its matrices in agent_0 / agent_1
        # NOTE(review): presumably so the next cells can inspect the failure; confirm
        if(sum(alignment_0_leader) / len(alignment_0_leader)<0.95):
            break

    print(a0,a1)
    a0_p.append(a0)
    a1_p.append(a1)

In [None]:
def softmax(x,temperature=30,axis=0):
    """Compute the softmax of `x * temperature` along `axis`.

    Re-definition of the temperature-scaled `softmax` that already appears
    earlier in the notebook.

    Parameters
    ----------
    x : array-like
        Scores to turn into probabilities.
    temperature : float
        Multiplier applied to the scores first; larger values concentrate the
        probability mass on the largest score.
    axis : int
        Axis along which the probabilities are normalised.

    Returns
    -------
    numpy.ndarray
        Array with the shape of `x` whose values sum to 1 along `axis`.
    """
    # np.asarray so that a plain list is scaled element-wise instead of being repeated
    x = np.asarray(x) * temperature
    # Subtracting the per-slice maximum avoids overflow in exp without changing the result
    e_x = np.exp(x - np.max(x,axis=axis,keepdims=True))

    return e_x / e_x.sum(axis=axis,keepdims=True)
def entropy_softmax(a,temperature):
    """Entropy (natural log) of the tempered softmax of each row of `a`, one value per row.

    A small constant (1e-10) is added inside the logarithm so that zero
    probabilities do not produce -inf.
    """
    probs = softmax(a, temperature, axis=1)
    surprisal = -np.log(probs + 1e-10)
    return (probs * surprisal).sum(axis=1)

In [None]:
# Inspect the naming matrix agent 0 ended up with in the last simulation (working on a copy)
agent = np.array(agent_0, dtype=float)

# Raw matrix, then the per-goal message distribution at temperature 30
for image in (agent, softmax(agent, 30, 1)):
    plt.imshow(image)
    plt.show()

# Per-goal uncertainty of the message distribution
ent = entropy_softmax(agent, temperature)
print(ent)

# Resulting goal-sampling distribution: uncertain goals are picked more often
temperature2 = 2
p = softmax(ent, temperature2)
print(p)

# All goals

Now we consider all goals, both individual and cooperative. We define the matrix for the different types of goals by setting the following constants.

We consider a variable reward multiplier for cooperative goals; this is needed to incentivize the follower to pick cooperative goals.

In [None]:
# Success probabilities used to fill the `scores` matrix, by type of goal pairing.

# An individual goal: succeeds with this probability whatever the other agent does
P_INDIVIDUAL = 1.0

# A collective goal while the other agent follows the same collective goal
P_COLLECTIVE_SAME = 1.0

# A collective goal while the other agent solves a compatible individual goal (e.g. 101 and 001)
P_COLLECTIVE_INDIVIDUAL_COMPAT = 0.0

# A collective goal while the other agent solves another "compatible" collective goal (e.g. 101 and 011)
P_COLLECTIVE_COLLECTIVE_COMPAT = 0.0

In [None]:
# Goals are 0/1 vectors over the landmarks.
# Individual goals have exactly one landmark set.
individual_goals = np.eye(NUMBER_OF_LANDMARKS, dtype=np.uint8).tolist()
# Collective goals are the element-wise sum of every pair of distinct individual goals,
# i.e. exactly two landmarks set. Built in pure Python so that fewer than two landmarks
# simply yields no collective goal instead of raising.
collective_goals = [
    [first_bit + second_bit for first_bit, second_bit in zip(first, second)]
    for first, second in combinations(individual_goals, 2)
]
# Individual goals come first: indices below n_individual are individual, the rest collective
goal_space = individual_goals + collective_goals
n_goals = len(goal_space)
n_individual = len(individual_goals)

In [None]:
# Number of (own goal, other goal) pairs, i.e. entries of the scores matrix
n_goals ** 2

In [None]:
# scores[own_goal, other_goal] = probability of reward for the agent pursuing own_goal
goal_bits = np.array(goal_space)
# Number of distinct landmarks covered by each (own goal, other goal) pair
union_size = (goal_bits[:, None, :] | goal_bits[None, :, :]).sum(axis=-1)
own_collective = (np.arange(n_goals) >= n_individual)[:, None]
other_collective = (np.arange(n_goals) >= n_individual)[None, :]

scores = np.zeros((n_goals, n_goals))
# Individual goals do not depend on what the other agent does
scores[:n_individual, :] = P_INDIVIDUAL
# Two collective goals sharing one landmark cover 3 landmarks in total
scores[own_collective & other_collective & (union_size == 3)] = P_COLLECTIVE_COLLECTIVE_COMPAT
# A collective goal that contains the other agent's individual goal covers 2 landmarks
scores[own_collective & ~other_collective & (union_size == 2)] = P_COLLECTIVE_INDIVIDUAL_COMPAT
# Both agents pursue the same collective goal
collective_idx = np.arange(n_individual, n_goals)
scores[collective_idx, collective_idx] = P_COLLECTIVE_SAME

In [None]:
# Display the reward-probability matrix built in the previous cell
scores

In [None]:
# Naming game over all goals (individual + cooperative) with separate leader and follower
# matrices per agent. The reward of cooperative goals is multiplied by `b`, and the
# alignment on cooperative goals is recorded for every value of `b` and every seed.
#
# agent_k  : leader matrix of agent k,   indexed [goal, message]
# agent_kf : follower matrix of agent k, indexed [message, goal]
#
# NOTE(review): N_STEPS and DELTA are read from the kernel state here, i.e. from whichever
# settings cell ran last; confirm these are the intended values for this experiment.
a0_all = []
a1_all = []
cooperative_factors = range(1, 100, 20)

# Exploration probability used by this experiment
EGREEDY = 0.05


def _pick_index(preferences, n_options):
    """Choose an index from a vector of preference scores.

    With EGREEDY > 0: uniformly random with probability EGREEDY, greedy argmax otherwise.
    With EGREEDY == 0: sampled from the softmax of the preferences.
    """
    if EGREEDY:
        if np.random.random() < EGREEDY:
            return np.random.choice(range(n_options))
        return np.argmax(preferences)
    # dim=0 is what the implicit (deprecated) default resolves to for a 1-D tensor
    probs = torch.nn.functional.softmax(torch.Tensor(preferences), dim=0).numpy()
    probs /= sum(probs)
    return np.random.choice(range(n_options), p=probs)


def _reinforce(table, row, col, reward):
    """Update table[row] in place after choosing `col` and receiving `reward`.

    On success the chosen entry gains reward * DELTA and every other entry of the row
    loses DELTA; on failure only the chosen entry loses DELTA.
    """
    if reward:
        table[row, :] -= DELTA
        table[row, col] += (reward + 1) * DELTA
    else:
        table[row, col] -= DELTA


for b in cooperative_factors:
    a0 = []
    a1 = []
    for s in range(5):
        np.random.seed(s)
        agent_0 = np.zeros_like(scores)
        agent_1 = np.zeros_like(scores)

        agent_0f = np.zeros_like(scores)
        agent_1f = np.zeros_like(scores)

        # Indexable views; the matrices are updated in place, so the names above stay current
        leader_tables = (agent_0, agent_1)
        follower_tables = (agent_0f, agent_1f)

        rewards_0 = []
        rewards_1 = []
        rewards = (rewards_0, rewards_1)
        alignment = []

        for _ in range(N_STEPS):
            # The order of the random draws is kept fixed so results stay reproducible per seed
            leader_goal_index = np.random.randint(0, n_goals)
            lead = 0 if np.random.random() < 0.5 else 1
            follow = 1 - lead

            leader_msg_index = _pick_index(leader_tables[lead][leader_goal_index], scores.shape[0])
            follower_goal_index = _pick_index(follower_tables[follow][leader_msg_index], scores.shape[0])

            leader_reward = np.random.random() < scores[leader_goal_index, follower_goal_index]
            follower_reward = np.random.random() < scores[follower_goal_index, leader_goal_index]

            # Collective reward adjustment: goals at index n_individual and above are collective
            if leader_goal_index >= n_individual:
                leader_reward = leader_reward * b
            if follower_goal_index >= n_individual:
                follower_reward = follower_reward * b
            rewards[lead].append(leader_reward)
            rewards[follow].append(follower_reward)

            _reinforce(leader_tables[lead], leader_goal_index, leader_msg_index, leader_reward)
            _reinforce(follower_tables[follow], leader_msg_index, follower_goal_index, follower_reward)

            if leader_goal_index >= n_individual:
                alignment.append(leader_goal_index == follower_goal_index)

        # Greedy alignment on collective goals only: goal g is aligned when the follower's
        # best goal for the leader's best message for g is g itself
        alignment_0_leader = [
            g == np.argmax(agent_1f[np.argmax(agent_0[g])]) for g in range(n_individual, n_goals)
        ]
        alignment_1_leader = [
            g == np.argmax(agent_0f[np.argmax(agent_1[g])]) for g in range(n_individual, n_goals)
        ]

        a0.append(sum(alignment_0_leader) / len(alignment_0_leader))
        a1.append(sum(alignment_1_leader) / len(alignment_1_leader))
    a0_all.append(a0)
    a1_all.append(a1)

In [None]:
# Mean alignment (+/- one standard deviation over seeds) as a function of the reward multiplier
fig, ax = plt.subplots(figsize=[3, 3])
for curve_label, runs in (("Agent 0 leader", a0_all), ("Agent 1 leader", a1_all)):
    mean_alignment = np.mean(runs, axis=1)
    std_alignment = np.std(runs, axis=1)
    ax.plot(cooperative_factors, mean_alignment, label=curve_label)
    ax.fill_between(
        cooperative_factors,
        mean_alignment - std_alignment,
        mean_alignment + std_alignment,
        alpha=0.3,
    )
ax.legend()
# Switch the grid on explicitly, exactly once: a bare grid() call toggles it,
# so calling it twice switches the grid back off.
ax.grid(True)
ax.set_xlabel("Cooperative reward multiplier")
ax.set_ylabel("Alignment for leader's cooperative goal")

fig.savefig(
    "alignment_vs_coop_reward.png", dpi=300, bbox_inches="tight", transparent=True
)