In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import networkx as nx
import sys
import random
sys.path.append('../src/')
from random_graph import RandGraph
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [27]:
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Lambda, multiply
from keras.initializers import RandomUniform

## Model

Adrien Lucas Ecoffet suggested in his [post](https://becominghuman.ai/lets-build-an-atari-ai-part-1-dqn-df57e8ff3b26) using a second input as a mask over the possible actions. This mask is multiplied element-wise by the network's output for all possible actions, and the filtered output is the Q-value. Thus the network computes the Q-values for all possible actions in a single forward pass, but the gradient is updated only according to the loss of the specific action(s) chosen.

In [20]:
# Network dimensions; both currently have 6 entries.
nb_actions = 6  # width of the 'mask' input and of the Q-value output layer
nb_values = 6   # width of the 'values' (state) input

In [21]:
# Two network inputs: the state vector and the action mask that is later
# multiplied with the per-action outputs.
values_input = Input(shape=(nb_values,), name='values')
action_input = Input(shape=(nb_actions,), name='mask')

In [54]:
# Q-network body: two 32-unit ReLU hidden layers on top of the state vector,
# then a Dense layer (no activation specified) with one output per action.
x = Dense(units=32, activation='relu')(values_input)
x = Dense(units=32, activation='relu')(x)
output = Dense(units=nb_actions)(x)

# Multiplying by the mask zeroes the outputs of every action whose mask entry
# is 0, so only the masked-in actions contribute to the loss.
filtered_output = multiply([output, action_input])

model = Model(
    inputs=[values_input, action_input],
    outputs=filtered_output,
)


In [55]:
# Mean-squared error between the masked Q-values and the targets, optimised
# with RMSprop; then print the layer overview.
model.compile(loss='mse', optimizer='rmsprop')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
values (InputLayer)             (None, 6)            0                                            
__________________________________________________________________________________________________
dense_28 (Dense)                (None, 32)           224         values[0][0]                     
__________________________________________________________________________________________________
dense_29 (Dense)                (None, 32)           1056        dense_28[0][0]                   
__________________________________________________________________________________________________
dense_30 (Dense)                (None, 6)            198         dense_29[0][0]                   
__________________________________________________________________________________________________
mask (Inpu

## Fit batch

In [56]:
# Hand-written batch containing a single transition (s, a, r, s').
start_states = np.array([[0.1, 0.0, 0.0, 0.0, 0.0, 0.0]])  # loading of the 6 core nodes
next_states = np.array([[0.2, 0.1, 0.0, 0.0, 0.0, 0.0]])   # node values after the action
# 0/1 mask of blocked nodes; two entries are set here, so this is a multi-hot
# mask rather than a strict one-hot vector.
actions = np.array([[0, 1, 0, 1, 0, 0]])
rewards = np.array([[0.0]])  # one reward per transition, kept as a column
gamma = 0.99                 # discount factor used in the Bellman target

In [57]:
# Q-values of every action in the next states: an all-ones mask lets all
# outputs through unfiltered.
next_Q_values = model.predict(
    [next_states, np.ones(actions.shape, dtype=actions.dtype)]
)

In [58]:
next_Q_values

array([[ 0.03522554,  0.06019057,  0.05209278, -0.03180493, -0.01657225,
        -0.0520423 ]], dtype=float32)

In [59]:
# Bellman target: r + gamma * max_a' Q(s', a').
# keepdims=True keeps the per-row max as a column of shape (batch, 1) so it
# lines up row-by-row with `rewards`. Without it the (batch,) max broadcasts
# against the (batch, 1) rewards into a (batch, batch) matrix as soon as the
# batch holds more than one transition.
Q_values = rewards + gamma * np.max(next_Q_values, axis=1, keepdims=True)

In [60]:
Q_values

array([[0.05958866]])

In [61]:
# manual check of the Bellman target: gamma (0.99) times the largest
# next-state Q-value displayed earlier (0.06019057); the reward is 0
0.99 * 0.06019057

0.0595886643

In [64]:
# Training targets: the Bellman value is placed at the positions set in the
# action mask and 0 everywhere else, matching the masked network output.
out = np.multiply(actions, Q_values)

In [65]:
out

array([[0.        , 0.05958866, 0.        , 0.05958866, 0.        ,
        0.        ]])

In [69]:
# One gradient step on the whole batch: the batch size equals the number of
# transitions, so all of them go through in a single update.
model.fit(x=[start_states, actions], y=out, batch_size=start_states.shape[0])

Epoch 1/1


<keras.callbacks.History at 0xe3116a0>

In [70]:
# Re-evaluate the unmasked Q-values of the next states after the update, for
# comparison with the prediction made before training.
model.predict([next_states, np.ones(actions.shape, dtype=actions.dtype)])

array([[ 0.03659564,  0.06620126,  0.05930661,  0.01066939, -0.02798991,
        -0.04506534]], dtype=float32)