# Switch Riddle

Architecture, methodology and notations inspired from the paper [Learning to Communicate to Solve Riddles with Deep Distributed Recurrent Q-Networks](https://www.semanticscholar.org/paper/Learning-to-Communicate-to-Solve-Riddles-with-Deep-Foerster-Assael/52cb696af18aad0383770071d150137c39404edf).

# Creating the model function

Action encodings:
  - 1, 0 = None;
  - 0, 1 = Tell.
  
Message encodings:
  - 1, 0 = Turn **Off** the bulb;
  - 0, 1 = Turn __On__ the bulb.

Observation encodings:
  - 1, 0 = Not in the room;
  - 0, 1 = In the room.


In [1]:
def get_max_num_episodes(number_of_agents):
    return number_of_agents * 4 - 6

number_of_agents = 3
max_num_episodes = get_max_num_episodes(number_of_agents)

rnn_size = 128

In [2]:
from keras.layers import Input, Add, Embedding, LSTM, Dense
from keras.layers import Reshape, Concatenate
from keras.models import Model
from keras.optimizers import Nadam
from keras.initializers import RandomNormal
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Activation
from keras.layers import GRU, Flatten

def generate_model(number_of_agents, rnn_size = 128):
    def get_max_num_episodes(number_of_agents):
        return number_of_agents * 4 - 6
    max_num_episodes = get_max_num_episodes(number_of_agents)
    
    bias_initializer = RandomNormal(mean=0.0, stddev=0.05, seed=None)
    activation = 'tanh'#Activation('tanh')#LeakyReLU()
    
    #o^k_t #in or out of the room;
    #0-out; 1-in.
    #could be 0 1 or 1 0... 
    participation_inputs      = [Input(shape = (1,), dtype='float32', name='participation_input_' + str(i))\
                              for i in range(max_num_episodes)]

    #m_{t-1} #Received message;
    # 0 1 - On; 1 0 - Off, 0 0 - Null;
    message_inputs            = [Input(shape = (2,), dtype='float32', name='message_input_' + str(i))\
                              for i in range(max_num_episodes)]
    #Can add MLPs here with batch normalization, like in the article

    #u^a_{t-1}  #Tell or None action #Better idea: #Turn On or Tun Off;
    # 0 1 - On; 1 0 - Off, 0 0 - Null;
    last_action_inputs        = [Input(shape = (2,), dtype='float32', name='last_action_input_' + str(i))\
                              for i in range(max_num_episodes)]

    #a. One hot encoding of the agent's index.
    # index_inputs              = [Input(shape = (number_of_agents,), dtype='float32', name='index_input_' + str(i))\
    #                           for i in range(max_num_episodes)]

    index_input              = Input(shape = (number_of_agents,), dtype='float32', name='index_input')
    
    
    #o^k_t #in or out of the room;
    #0-out; 1-in.
    #could be 0 1 or 1 0... 
    participation_embeddings      = [Dense(128, activation=activation, dtype='float32')(o) for o in participation_inputs]

    #m_{t-1} #Received message;
    # 0 1 - On; 1 0 - Off, 0 0 - Null;
    message_embeddings           = [Dense(128, activation=activation, dtype='float32')(Activation('tanh')(m))\
                                    for m in message_inputs]#Activation('sigmoid')(m)
    #Can add MLPs here with batch normalization, like in the article

    #u^a_{t-1}  #Tell or None action #Better idea: #Turn On or Tun Off;
    # 0 1 - On; 1 0 - Off, 0 0 - Null;
    last_action_embeddings       = [Dense(128, activation=activation, dtype='float32')(u) for u in last_action_inputs]

    #a. One hot encoding of the agent's index.
    # index_embeddings             = [Dense(128, dtype='float32')(a) for a in index_inputs]

    index_embedding             = Dense(128, activation=activation, dtype='float32')(index_input)
    
    #z^a_t
    recurrent_inputs = [Add()([o, m, u, index_embedding]) for \
                       (((o, m), u)) in zip(zip(participation_embeddings,\
                                                       message_embeddings),\
                                                   last_action_embeddings)]


    recurrent_input = Concatenate(axis=1)(recurrent_inputs)
    
    #h^a_{1, t}
    recurrent_output_1 = (GRU(rnn_size, return_sequences=True)(Reshape((-1, 128))(recurrent_input)))
    recurrent_input_2  = recurrent_output_1
    
    #h^a_{2, t}
    recurrent_output_2 = GRU(rnn_size)(Reshape((-1, 128))(recurrent_input_2))#recurrent_input))#
    recurrent_output_2

    dense_1 = Dense(rnn_size, activation=activation)(recurrent_output_2)
    dense_2 = Dense(rnn_size,  activation=activation)(dense_1)
    #Q values for messages and for actions
    output  = Dense(4,  activation=activation)(dense_2)

    
    
    model = Model(inputs=participation_inputs + message_inputs + last_action_inputs + [index_input],\
              outputs=[output])
    
    
    rmsprop = Nadam(lr=5 * 10 ** (-4), beta_1=0.95, beta_2=0.999, schedule_decay=0.004)
    model.compile(loss='mean_squared_error', optimizer=rmsprop, )

    return model

model = generate_model(number_of_agents)

Using TensorFlow backend.


In [3]:
generate_model(4).summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
message_input_0 (InputLayer)    (None, 2)            0                                            
__________________________________________________________________________________________________
message_input_1 (InputLayer)    (None, 2)            0                                            
__________________________________________________________________________________________________
message_input_2 (InputLayer)    (None, 2)            0                                            
__________________________________________________________________________________________________
message_input_3 (InputLayer)    (None, 2)            0                                            
__________________________________________________________________________________________________
message_in

Find if there is an available GPU:

In [11]:
import tensorflow as tf

tf.test.gpu_device_name()

'/gpu:0'

# Data set creation

In [5]:
import numpy as np

In [6]:
def draw_episode(number_of_agents):
    '''return an array, with indexes of the sampled prisoners, of length max_num_episodes'''
    max_num_episodes = get_max_num_episodes(number_of_agents)
    return [np.random.randint(1, number_of_agents + 1) for i in range(max_num_episodes)]

draw_episode(3)

def draw_episodes_batch(number_of_agents, batch_size):
    '''return an array, of length batch_size, of arrays, with 1's and 2's, of length number_of_agents'''
    max_num_episodes = get_max_num_episodes(number_of_agents)
    return np.random.randint(1, number_of_agents + 1, size=(batch_size, max_num_episodes))

draw_episodes_batch(number_of_agents=3, batch_size=5)

array([[2, 2, 2, 2, 3, 3],
       [3, 3, 3, 1, 1, 2],
       [1, 3, 3, 2, 1, 3],
       [2, 3, 3, 1, 2, 1],
       [1, 1, 1, 2, 1, 1]])

In [7]:
def generate_agent_first_input(number_of_agents, agent_idx):
    '''agent_idx starts from 1 and goes to number_of_agents inclusively'''
    max_num_episodes = get_max_num_episodes(number_of_agents)
    input_prefixes = ["participation_input_", "message_input_", "last_action_input_"]
    input_dims     = [1, 2, 2]
    pref_dim_pairs = {p : d for p, d in zip(input_prefixes, input_dims)}
    
    
    from functools import reduce
    input_names = reduce(lambda x,y: x + y, [list(map(lambda x: x + str(i), input_prefixes))\
                                             for i in range(max_num_episodes)])
    
    
    
    inputs                 = {name: np.zeros((1, pref_dim_pairs[name[:-1]])) for name in input_names}
    inputs["index_input"]  = np.reshape(np.eye(number_of_agents)[agent_idx-1], (-1, number_of_agents))#((1,number_of_agents))[:-1]
    return inputs

mock_input= generate_agent_first_input(3, 2)
# mock_input = generate_empty_inputs(2)
# model.predict(mock_input)
generate_agent_first_input(2, 1)

{'index_input': array([[ 1.,  0.]]),
 'last_action_input_0': array([[ 0.,  0.]]),
 'last_action_input_1': array([[ 0.,  0.]]),
 'message_input_0': array([[ 0.,  0.]]),
 'message_input_1': array([[ 0.,  0.]]),
 'participation_input_0': array([[ 0.]]),
 'participation_input_1': array([[ 0.]])}

In [8]:
#method to be tested and to be added edge cases

def generate_agent_input(time_step, previous_input, received_message, last_action):
    '''
    time_step starts from 0 and goes until max_num_episodes-1 inclusively.
    episode:             the agent order in the room;
    agent_idx:        the agent for which the input is generated (indexed from 1);
    received_message: list with two elements indicating the postion of the bulb;
    last_action:      list with two elements indicating the message sent at time_step - 1.

    '''
    #useless parameters: episode, agent_idx, 
    
    new_input = previous_input
    
    new_input["participation_input_" + str(time_step)][0] = 1
    new_input['message_input_' + str(time_step)][0] = received_message
    #last_action = previous_input["last_action_input_" + str(time_step - 1)][0] if time_step > 0 else [0, 0] 
    
    new_input["last_action_input_" + str(time_step)][0] = last_action
    
    return new_input

episode    = draw_episode(2)
mock_input = generate_agent_first_input(3, 2)
# generate_agent_input(time_step=5,\
#                      previous_input=mock_input,\
#                      received_message=[0, 1],\
#                      last_action=[1, 0])

In [9]:

def generate_batch_input(time_step, previous_inputs, received_messages, last_actions):
    '''
    time_step:         starts from 0 and goes until max_num_episodes-1 inclusively;
    received_messages: list of lists with two elements indicating the postion of the bulb;
    previous_inputs:   list of previous_input;
    last_actions:      list of lists with two elements indicating the message sent at time_step - 1.

    
    calls generate_agent_input() for each pair from zip(previous_inputs, received_messages, last_actions)
    '''
    from functools import reduce
    new_inputs_list = [generate_agent_input(time_step, p, r, l) for ((p, r), l) in zip(zip(previous_inputs,\
                                                                                      received_messages),\
                                                                                 last_actions)]
    new_inputs = {}
    for name in new_inputs_list[0]:
        new_inputs[name] = reduce(lambda x, y: np.vstack((x, y)), [inp[name] for inp in new_inputs_list]) 
    return new_inputs, new_inputs_list

episode    = draw_episodes_batch(3, 2)

mock_batch = generate_batch_input(1, [mock_input, mock_input], [[1,0], [0,1]], [[1,0], [1,0]])


In [10]:
model.predict(mock_batch[0])

array([[ 0.09314222, -0.04448696,  0.05284136,  0.05662699],
       [ 0.09314222, -0.04448696,  0.05284136,  0.05662699]], dtype=float32)

### Flow of data through the model

In [11]:
def action_selector(predictions, epsilon=1, test_mode=False):
    '''predictions: np array with 4 elements
    returns pair action, message
    
    e.g. [.2, .9, .89, .01]
    First two are for indicating Tell or Not;
    Last two are for turning On or Off the Switch 
    '''
    action  = np.zeros(2)#[0, 0]
    message = np.zeros(2)#[0, 0]
    
    if test_mode == True:    
        action[predictions[:2].argmax()]   = 1
        message[predictions[2:4].argmax()] = 1    
        return action, message
    
    action  = np.array([p for p in predictions[0:2]])
    message = np.array([p for p in predictions[2:4]])
    if np.random.rand() < epsilon:
        choice = np.random.randint(2)
        action[choice]     = 1
        action[choice - 1] = predictions[:2][choice - 1]
    if np.random.rand() < epsilon:
        choice = np.random.randint(2)
        message[choice]     = 1
        message[choice - 1] = predictions[2:4][choice - 1]
    return action, message
    
action_selector(np.arange(4) / 4, .1)

(array([ 0.  ,  0.25]), array([ 0.5 ,  0.75]))

In [13]:
def generate_episode_description(number_of_agents, batch_size):
    #Intialization of one episode batch:
    episode_batch    = draw_episodes_batch(number_of_agents, batch_size)
    #who visited the room in each episode of the batch
    visits   = [[0] * number_of_agents for i in range(batch_size)]
    #which episode from the batch ended
    ended    = [0] * batch_size
    #the position of the bulb in each episode #this are the received messages
    #states   = [[0, 0] for i in range(batch_size)] #can be deduced from per_step_prediction
    reward  = [0] * batch_size
    
    return episode_batch, visits, ended, reward

In [14]:
def generate_episode_input_holder(number_of_agents, batch_size):
    #generate first inputs for all agents in all the episodes
    all_prev_inputs = [[generate_agent_first_input(number_of_agents, agent_idx=a + 1)\
                        for a in range(number_of_agents)] \
                       for ep in range(batch_size)]
    
    #rember all inputs for the backward pass
    per_step_inputs = [[] for i in range(batch_size)]
    #remeber all predictions. They will have a one on one corespondence with "per_step_inputs"
    per_step_predictions = [[] for i in range(batch_size)]
    
    return all_prev_inputs, per_step_inputs, per_step_predictions

In [15]:
number_of_agents = 4
max_num_episodes = get_max_num_episodes(number_of_agents)
model = generate_model(number_of_agents)
batch_size   = 32
num_episodes = 20000
epsilon = .2
gamma = 1
winning_percentages = []
# average_saved_agents = [number_of_agents / 2] * 20

In [30]:
import sys
import copy
import time
from functools import reduce

eta_per_eps = []
for episode in range(num_episodes):
    #################
    #Initializations:
    ep_start_time = time.time()
    episode_batch, visits, ended, reward = generate_episode_description(number_of_agents, batch_size)
    all_prev_inputs, per_step_inputs, per_step_predictions = generate_episode_input_holder(number_of_agents, batch_size)
    
    ############
    #1.Prepare inputs;
    #2.Forward pass;
    #3.Validate predictions;
    #4.Backward pass.
    for step in range(max_num_episodes):
        if sum(ended) == batch_size:
            break
        ##################
        #1.Prepare inputs:
        #which agents are in the room at time "step"
        
        active_episodes     = [i for i in range(batch_size) if ended[i] == 0]
        #which agents are in the room at time "step"
        selected_agents     = [episode_batch[ep][step] for ep in active_episodes]        
        #take previous inputs of the selected agents. !Remember that the agents are idexed starting with 1!
        previous_inputs   = [all_prev_inputs[a][s - 1] \
                             for (a,s) in zip(active_episodes, selected_agents)]

        #if the same agent entered the room the previous step:
        #then the last action is what he predicted (On or Off), else [0, 0]. 
        last_actions      = [action_selector(np.hstack(per_step_predictions[i][-1]), test_mode=True)[1]\
                             if step > 0 and episode_batch[i][step-1] == episode_batch[i][step]\
                             else [0, 0]\
                             for i in range(batch_size)\
                             if i in active_episodes]
        #what is the position of the bulb. At step==0, bulb is Off([1,0])
        received_messages = [[1, 0] if step == 0 \
                             else action_selector(np.hstack(per_step_predictions[i][-1]), test_mode=True)[1]\
                             for i in range(batch_size)\
                             if i in active_episodes]
        #generate inputs for the DQN 
        inputs_batch, inputs_list = generate_batch_input(time_step=step,\
                                      previous_inputs=previous_inputs,\
                                      received_messages=received_messages,\
                                      last_actions=last_actions)
        ###########
        #2.Forward:
        #predict actions
        outputs = model.predict(inputs_batch, batch_size=len(active_episodes))

        #update previous_inputs 
        for ep in range(len(active_episodes)):
            
            idx_in_batch  = active_episodes[ep]
            idx_in_active = ep
        
            #add the new visitor in visits
            visits[idx_in_batch][selected_agents[ep] - 1] = 1
            #update all previous inputs
            all_prev_inputs[idx_in_batch][selected_agents[ep] - 1] = copy.deepcopy(inputs_list[ep])
            #remeber inputs and predictions respectively
            per_step_inputs[idx_in_batch].append(copy.deepcopy(inputs_list[ep]))
            prediction = action_selector(outputs[ep], epsilon)
            per_step_predictions[idx_in_batch].append(copy.deepcopy(prediction))            
        
            ##############
            #3.Validation:
            #end episodes which just used "Tell"
            #validate the just ended episodes
            if prediction[0].argmax() == 1:
                #then the episode has ended
                ended[ep] = 1
                if sum(visits[ep]) == number_of_agents:
                    reward[ep] = 1
                else:
                    reward[ep] = -1
    
    ##################
    #4. Backward pass:
    for step in np.arange(max_num_episodes - 1, -1, -1):
        targets = []
        inputs  = {}
        input_list = []
        target_list = []
        
        active_episodes = [ep for ep in np.arange(batch_size)\
                           if len(per_step_inputs[ep]) > step]
        if len(active_episodes) == 0:
            continue
        input_list  = [copy.deepcopy(per_step_inputs[ep][step]) for ep in active_episodes]
        target_list = [copy.deepcopy(per_step_predictions[ep][step]) for ep in active_episodes]
        
        for ep in active_episodes:
            idx_in_active = active_episodes.index(ep)
            
            action  = target_list[idx_in_active][0]
            message = target_list[idx_in_active][1]
                        
            if len(per_step_inputs[ep]) == (step + 1):
                action[action.argmax()]   = reward[ep]
                message[message.argmax()] = reward[ep]
            else:
                action[action.argmax()]   = np.max(per_step_predictions[ep][step][0])
                message[message.argmax()] = np.max(per_step_predictions[ep][step][1])
            
            targets.append(np.hstack([action, message]))
        
        for name in inputs_list[0]:
            inputs[name] = reduce(lambda x, y: np.vstack((x, y)), [inp[name] for inp in input_list])
        
        targets = np.array(targets)
        model.fit(inputs, targets, batch_size=len(active_episodes), verbose=0)
    
    winning_percentages.append(len([r for r in reward if r == 1])/batch_size)
    eta_per_eps.append(time.time() - ep_start_time)
    if(episode % 20 == 19):
        winning_episode_percentage = sum(winning_percentages[-20:]) / 20
        training_percentage = (episode + 1) / num_episodes * 100
        
        sys.stdout.write("\r                         Percentage of winning episodes, after {0:d}, out of {1:d} ({2:.2f}%), episodes is {3:.2f}."\
              .format(episode + 1, num_episodes, training_percentage, winning_episode_percentage))
        
        eta_seconds = int(((sum(eta_per_eps[-30:]) / 30) * (num_episodes - episode)))
        
        eta_hours   = eta_seconds // 3600
        eta_minutes = (eta_seconds % 3600) // 60
        eta_seconds = eta_seconds % 60
        sys.stdout.write("\r ETA: {0:0d}:{1:0d}:{2:0d}.".format(eta_hours, eta_minutes, eta_seconds))
        
        
    if(episode % 100 == 99):
        epsilon = max(.01, epsilon - .01)

 ETA: 0:0:0.             Percentage of winning episodes, after 20000, out of 20000 (100.00%), episodes is 0.75.

In [None]:
np.savetxt("results/statistics/switch_riddle/switch_{0:d}_winning_percentages".format(number_of_agents)\
           , winning_percentages)

Plot preliminary results

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(1)

def plot_performace_with_confidence(number_of_agents, winning_percentages):
    df = pd.DataFrame(winning_percentages)

    ma = df.rolling(1000).mean()
    mstd = 2 * df.rolling(1000).std()
    figure = plt.figure(figsize=(5, 3), linewidth=.5, dpi= 120, facecolor = 'white')

    plt.title(str(number_of_agents) + " prisoners.")
    plt.ylim(.0, 1)
    # plt.yticks(np.arange(.5, 1.01, .1))
    plt.grid(color='k', linewidth=.5, linestyle=':')
    # plt.
    # plt.xticks(np.arange(20000, 100001, 20000), [str(i) + 'k' for i in range(20, 101, 20)])
    ax = plt.plot(ma, linewidth=2)
    plt.fill_between(mstd.index, (ma-mstd)[0], (ma+mstd)[0], color='b', alpha=0.05)

In [None]:
plot_performace_with_confidence(number_of_agents, winning_percentages)