In [6]:
# Feature columns that make up one observation (state vector).
state_names = [
    'age', 'sex', 'BMI', 'activity', 'alcohol',
    'hba', 'race_b', 'race_l', 'race_w',
]

# Treatment labels: the comma-separated spec is split into a list and then
# keyed by position, so index i maps to label action_tabs[i].
maps = '0,1,2'.split(",")
action_tabs = dict(enumerate(maps))

# Sizes derived from the two tables above.
state_size = len(state_names)
action_size = len(action_tabs)

In [27]:
# function assess
class TestCallback(Callback):
    """Score the model on ``testdia.csv`` at the end of every epoch.

    For each evaluation batch the model's arg-max prediction is taken as the
    chosen action; with probability ``epsilon`` it is replaced by a random
    action.  A row contributes its logged ``reward`` only when the chosen
    action equals the logged ``treatment``; otherwise it contributes 0.

    Attributes:
        epsilon: Current probability of replacing a prediction with a
            random action.
        test_data: DataFrame read from ``testdia.csv``; must contain the
            ``state_names`` columns plus ``treatment`` and ``reward``.
        rewards: Mean matched reward, one entry per epoch.
        vars: Standard deviation of the matched reward, one entry per epoch.
        num_epoch: Total number of epochs, used for the exploration cut-off.
        eval_batch: Number of rows sent to ``predict`` at once.
    """

    def __init__(self, model, num_epoch):
        """Load the evaluation data and remember the model to score.

        Args:
            model: Model whose ``predict`` output is evaluated.
            num_epoch: Total number of training epochs.
        """
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self.epsilon = 0.9  # for exploration
        self.test_data = pd.read_csv("testdia.csv")
        self.rewards = []
        self.vars = []
        # Registering through set_model() (rather than assigning the
        # attribute) is intended to work whether the base class exposes
        # `model` as a plain attribute or as a property.
        self.set_model(model)
        self.num_epoch = num_epoch
        self.eval_batch = 50

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on the test data, then decay ``epsilon``.

        Args:
            epoch: Index of the epoch that just finished.
            logs: Metrics dict supplied by the training loop (unused).
        """
        # Pull the columns out once instead of indexing the frame per row.
        features = self.test_data[state_names].to_numpy()
        logged_actions = self.test_data["treatment"].to_numpy()
        logged_rewards = self.test_data["reward"].to_numpy()

        result = []
        # Stepping by eval_batch also covers the final (possibly shorter)
        # batch, so every test row is scored.
        for start in range(0, features.shape[0], self.eval_batch):
            stop = start + self.eval_batch
            batch = features[start:stop]
            # The model takes (batch, time_steps=1, n_features).
            states = batch.reshape([batch.shape[0], 1, batch.shape[1]])
            scores = self.model.predict(states)
            pred_acts = scores.argmax(axis=-1)
            n_actions = scores.shape[-1]
            for idx in range(pred_acts.shape[0]):
                if random.random() < self.epsilon:
                    # Explore: draw from the action space (one id per model
                    # output unit), not from the batch positions.
                    pred_acts[idx] = random.randrange(n_actions)
            matched = pred_acts == logged_actions[start:stop]
            result.append(matched * logged_rewards[start:stop])

        # Decay exploration, and switch it off for the last ~20% of epochs.
        self.epsilon /= (epoch * 0.05 + 1)
        if epoch > int(self.num_epoch * 0.8):
            self.epsilon = 0

        if result:
            per_row = np.concatenate(result)
            mean_reward = np.mean(per_row)
            spread = np.std(per_row)
        else:
            # No test rows: report NaN instead of averaging an empty list.
            mean_reward = spread = float("nan")

        # NOTE: the value printed under "variance" is a standard deviation.
        print("epoch: ", epoch, "reward: ", mean_reward, "variance: ", spread)
        self.rewards.append(mean_reward)
        self.vars.append(spread)

In [28]:
class DQNAgent:
    """LSTM-based action classifier with a single-slot training buffer.

    The network maps an input of shape ``(time_steps, state_size)`` to a
    softmax over ``action_size`` outputs and is trained with sparse
    categorical cross-entropy on the memorized states and actions.
    """

    def __init__(self, state_size, action_size):
        """Store the hyper-parameters and build the network.

        Args:
            state_size: Number of features per time step.
            action_size: Number of output units (one per action).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = []
        # NOTE(review): epsilon*, learning_rate and batch_size are stored but
        # not read by any method of this class -- confirm whether learn()
        # was meant to use them.
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 1e-3
        self.batch_size = 256
        self.num_epochs = 50
        self.time_steps = 1
        self.n_hidden = 64
        self.n_output = action_size

        self.model = self._build_model()

    # build model
    def _build_model(self, dropout=0.2):
        """Return the (uncompiled) LSTM -> Dense -> Dropout -> softmax model.

        Args:
            dropout: Dropout rate applied before the output layer.
        """
        inputs = keras.Input(shape=(self.time_steps, self.state_size), name='input2')
        x21 = keras.layers.LSTM(self.n_hidden, return_sequences=False)(inputs)
        x22 = keras.layers.Dense(self.n_hidden)(x21)
        x = keras.layers.Dropout(dropout)(x22)
        output = keras.layers.Dense(self.n_output, activation='softmax')(x)
        model = keras.Model(inputs=inputs, outputs=[output])
        return model

    # buffer
    def memorize(self, state, action, reward, next_state, done):
        """Replace the buffer with a single ``(state, action, ...)`` tuple.

        The buffer is overwritten, not appended to; ``learn`` reads element 0
        as the training inputs and element 1 as the training labels.
        """
        self.memory = (state, action, reward, next_state, done)

    # predict
    def act(self, state):
        """Return the label from ``action_tabs`` for the arg-max action.

        Args:
            state: Feature values for one observation; anything that can be
                reshaped to ``(-1, time_steps, state_size)``.  Only the
                prediction for the first row is used.
        """
        batch = np.asarray(state, dtype="float32").reshape(
            -1, self.time_steps, self.state_size
        )
        best = int(self.model.predict(batch).argmax(axis=-1)[0])
        return action_tabs[best]

    # train
    def learn(self):
        """Compile and fit the model on the memorized data.

        Returns:
            The ``TestCallback`` used during fitting, which holds the
            per-epoch evaluation results.

        Raises:
            IndexError: If nothing has been memorized yet.
        """
        if len(self.memory) < 2:
            raise IndexError("memory is empty; call memorize() before learn()")
        self.model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
        metrics = TestCallback(self.model, self.num_epochs)
        self.model.fit(self.memory[0], self.memory[1], epochs=self.num_epochs,
                       shuffle=True, callbacks=[metrics])
        return metrics

    def load(self, name):
        """Load model weights from ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to ``name``."""
        self.model.save_weights(name)