Implementation of the Hats problem in Keras using an MXNet backend:

In [1]:
# Test mxnet GPU usage:

import mxnet as mx 
def gpu_device(gpu_number=0):
    """Return the MXNet GPU context with index `gpu_number`, or None if unusable.

    Usability is probed by trying to create a small NDArray on that GPU; an
    `mx.MXNetError` raised by the attempt is treated as "no such device".
    """
    try:
        # Probe allocation only; the resulting array itself is not needed.
        mx.nd.array([1, 2, 3], ctx=mx.gpu(gpu_number))
    except mx.MXNetError:
        context = None
    else:
        context = mx.gpu(gpu_number)
    return context

# Report whether GPU 0 can be used by MXNet.
if gpu_device(0):
    print('GPU {0:s} in use!'.format(str(mx.gpu(0))))
else:
    print('No GPU device found!')

GPU gpu(0) in use!


In [2]:
# import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
from keras.models import Sequential
from keras.layers import Dense, Activation

Using MXNet backend


In [3]:
# Make lists of NN components for each prisoner (this part can definitely be improved).
# Only `models` is filled in this cell; the other lists are created empty.
inputs1, W, Qout = [], [], []
predict, nextQ, loss = [], [], []
trainer, updateModel, models = [], [], []

number_of_agents = 2
for ag in range(number_of_agents):

    nb_inputs = 5

    # One network per agent: the input is the hats of the other agents
    # (number_of_agents - 1 values), the output is one score per hat colour.
    model = Sequential([
        Dense(20, activation='relu', input_dim=number_of_agents - 1),
        Dense(20, activation='relu'),
        Dense(number_of_agents, activation='sigmoid'),
    ])
    model.compile(
        optimizer='sgd',
        loss='mean_squared_error',
        metrics=['accuracy'],
    )

    models.append(model)


In [5]:
import sys
import copy
import time
from functools import reduce

# Set learning parameters
e = 0.1                       # probability of taking a random action (epsilon-greedy)
num_episodes = 3 * (10 ** 4)
REPORT_EVERY = 200            # episodes between progress reports / epsilon updates
ETA_WINDOW = 30               # number of most recent episodes averaged for the ETA

# rList: win rate of each reporting batch, rAll: wins in the current batch,
# eta_per_eps: wall-clock duration of every episode
rList = []
rAll = 0
eta_per_eps = []

for ep in range(num_episodes):

    ep_start_time = time.time()

    # Initialise hats: one random colour per agent
    hats = [np.random.randint(number_of_agents) for i in range(number_of_agents)]

    # Transform the seen hats (all hats except the agent's own) into network
    # inputs. The normalised (1, number_of_agents - 1) array is built once per
    # agent and reused for both the prediction and the update below.
    states = [hats[:i] + hats[i+1:] for i in range(number_of_agents)]
    net_inputs = [
        np.reshape(np.array(seen) / number_of_agents, (-1, number_of_agents - 1))
        for seen in states
    ]

    # The Q-Network: epsilon-greedy action selection for every agent
    actions = []
    for i in range(number_of_agents):
        predictions = models[i].predict(net_inputs[i])
        action = predictions[0].argmax()

        if np.random.rand(1) < e:
            action = np.random.randint(number_of_agents)
        actions.append((action, predictions[0]))

    # Shared reward: 1 if at least one agent guessed its own hat, otherwise 0
    r = 1 if any(actions[x][0] == hats[x] for x in range(number_of_agents)) else 0

    # Update the Q networks: the target equals the prediction except for the
    # chosen action, whose value is replaced by the reward.
    for i in range(number_of_agents):
        action, q_values = actions[i]
        targetQ = np.array(q_values, copy=True)  # do not mutate the stored prediction
        targetQ[action] = r
        targetQ = np.reshape(targetQ, (-1, number_of_agents))
        models[i].fit(net_inputs[i], targetQ, verbose=0)

    # Record rewards and the time this episode took
    rAll += r
    eta_per_eps.append(time.time() - ep_start_time)

    # Update displayed information and epsilon
    if (ep + 1) % REPORT_EVERY == 0:
        episodes_done = ep + 1
        winning_episode_percentage = rAll / float(REPORT_EVERY)
        training_percentage = episodes_done / num_episodes * 100

        # Reduce chance of random action as we train the model.
        e = 1./((ep/10) + 10)

        # Store the win rate of this batch and start a new batch
        rList.append(winning_episode_percentage)
        rAll = 0

        # Estimated time left: mean duration of the most recent episodes times
        # the number of episodes that still have to run.
        recent = eta_per_eps[-ETA_WINDOW:]
        eta_total = int(sum(recent) / len(recent) * (num_episodes - episodes_done))
        eta_hours, remainder = divmod(eta_total, 3600)
        eta_minutes, eta_seconds = divmod(remainder, 60)

        # Single write so the ETA can never overwrite the start of the message;
        # minutes and seconds are zero-padded to keep the line width stable.
        sys.stdout.write(
            "\r ETA: {0:d}:{1:02d}:{2:02d}. Percentage of winning episodes, "
            "after {3:d}, out of {4:d} ({5:.2f}%), episodes is {6:.2f}.".format(
                eta_hours, eta_minutes, eta_seconds,
                episodes_done, num_episodes, training_percentage,
                winning_episode_percentage))
        sys.stdout.flush()
print("")

 ETA: 0:0:0. Percentage of winning episodes, after 30000, out of 30000 (100.00%), episodes is 0.74.


Test the trained networks and plot the results obtained during training:

In [6]:
def test_accuracy(Q_nets):
    number_of_agents = len(Q_nets)
    Q = Q_nets
    reward = 0
    for ep in range(10000):
        hats = [np.random.randint(number_of_agents) for i in range(number_of_agents)]
        #trasform the seen hats into inputs for the network
        states = [hats[:i] + hats[i+1:] for i in range(number_of_agents)]
        
        actions = []
        for i in range(number_of_agents):
            state = np.array(states[i]) / number_of_agents #normalize the input
            predictions = Q[i].predict( np.reshape(state, (-1, number_of_agents - 1)))
            action      = predictions[0].argmax()
            actions.append((action, predictions[0]))
            
        #verify if the the agents solved the problem
        #(i.e. at least one agent correctly guessed the color of its own hat)
        validate = sum(actions[x][0] == hats[x] for x in range(number_of_agents))
        r = 0
        r += 1 if validate >= 1 else 0

        reward += r
    return reward / 10000.
# Evaluate the per-agent networks in `models`; the returned win rate is the cell output.
test_accuracy(models)

0.7523

In [None]:
# Plot the win rate recorded during training (rList has one entry per reporting batch).
batch_accuracy = rList

# The batches are equally sized, so spread the points evenly up to 100 % to
# make the x values match the "Training percentage" axis label.
n_batches = len(batch_accuracy)
training_percentage = [100.0 * (k + 1) / n_batches for k in range(n_batches)]

fig, ax = plt.subplots(figsize=(8, 4), dpi=120)
ax.set_title("Accuracy improvement for {0:d} prisoners using Q nets".format(number_of_agents))
ax.plot(training_percentage, batch_accuracy, label="Accuracy")
ax.set_yscale("linear")
ax.set_xlabel("Training percentage")
ax.legend()
ax.grid(linestyle='--')


print("For training", number_of_agents, "agents (using Q nets), for", num_episodes, "episodes,")
if batch_accuracy:
    print("the accuracy started at:", batch_accuracy[0], "and reached:", batch_accuracy[-1])
else:
    # Nothing was recorded (e.g. training stopped before the first report).
    print("no accuracy values were recorded.")