In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, LSTM, Subtract
from tensorflow.keras.models import Model
from Environment import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from copy import deepcopy
import os 
import numpy as np

In [2]:
dense_1_user =  Dense(32, activation = 'relu')
dense_2_user =  Dense(32, activation = 'relu')
# dense_3_user =  Dense(32, activation = 'relu')

dense_1_assist =  Dense(32, activation = 'relu')
lstm_1_assist = LSTM(32, activation = 'tanh')
dense_2_assist = Dense(32, activation = 'relu')

advantage_layer_user = Dense(4)
value_layer_user = Dense(1)

advantage_layer_assist = Dense(4)
value_layer_assist = Dense(1)

# advantage_layer = Dense(4)
# value_layer = Dense(1)

# advantage_layer_user = advantage_layer
# advantage_layer_assist = advantage_layer

# value_layer_user = value_layer
# value_layer_assist = value_layer

In [3]:
class AI_Design:
    def __init__(self, steps = 4):        
        self.loss_fn = tf.keras.losses.mean_squared_error
        self.optimizer = tf.keras.optimizers.Adam(lr = 0.0001)
        self.batch_size = 128
        self.replay_buffer_size = 1024
        self.replay_buffer = Replay_Buffer(self.replay_buffer_size)
        self.epsilon = 1
        self.gamma = 0.9
        self.env = Environment()
        self.env.cells = np.array([[0.7, 0.1], [0.1, 0.1], [0.5, 0.7], [0.6, 0.2], [0.7, 0.4], [0.2, 0.9]])
#         self.env_cells = np.array([[0.7, 0.1]])
        self.env_cell_mapping = give_mapping(self.env.cells)
        self.env_cell_mapping = self.env_cell_mapping[np.newaxis, :, :, np.newaxis]
        #-------------------------------------------------------------------------------------------------
        input_A = Input(shape = (4,))
        input_B = Input(shape = (steps,6))
        input_C = Input(shape = (11, 11, 1)) #Location of every icon
                 
        action_user = Input(shape = 1, dtype = tf.int32)
        action_assist = Input(shape = 1, dtype = tf.int32)
        
        
        
        #User Network 
        
        
        x = Subtract()([input_A[:, 2:], input_A[:, :2]])
        x = dense_1_user(x)
        x = dense_2_user(x)
#         x = dense_3_user(x)
        adv_user = advantage_layer_user(x)
        val_user = value_layer_user(x)
        output_user = adv_user - tf.reduce_mean(adv_user, axis = 1, keepdims = True) + val_user
        
        self.user_model = Model(inputs = input_A, outputs = output_user)
        self.user_model.summary()
        
        self.target_user_model = tf.keras.models.clone_model(self.user_model)
        self.target_user_model.set_weights(self.user_model.get_weights())
        

        
        #Assistant Network
        z = tf.keras.layers.Conv2D(filters = 2, kernel_size = 3, activation = 'relu')(input_C)
        z = tf.keras.layers.MaxPooling2D()(z)
        z = tf.keras.layers.Flatten()(z)
        z = tf.keras.layers.Dense(32, activation = 'relu')(z)
        
        y = dense_1_assist(input_B)
        y = lstm_1_assist(y)
        y = tf.keras.layers.Concatenate()([y,z])
        y = dense_2_assist(y)
        adv_assist = advantage_layer_assist(y)
        val_assist = value_layer_assist(y)
        output_assist = adv_assist - tf.reduce_mean(adv_assist, axis = 1, keepdims = True) + val_assist
        
        self.assist_model = Model(inputs = [input_B, input_C], outputs = output_assist)
        self.assist_model.summary()
        
        self.target_assist_model = tf.keras.models.clone_model(self.assist_model)
        self.target_assist_model.set_weights(self.assist_model.get_weights())
        
        
        #Complete Network

        mask_user = tf.reduce_sum(tf.one_hot(action_user, 4), axis = 1)
        mask_assist = tf.reduce_sum(tf.one_hot(action_assist, 4), axis = 1)
        output_user = output_user*mask_user
        output_assist = output_assist*mask_assist
        
        out = tf.reduce_sum(output_user + output_assist, axis = 1, keepdims = True)
        
        self.model = Model(inputs = [input_A, input_B, input_C, action_user, action_assist], outputs = out)  
        self.model.summary() 
        #-------------------------------------------------------------------------------------------------
    
    def infer(self):
        ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
        reward_assist, next_ob_assist, done, importance, indices = self.sample_exp()
        
        ob_user = ob_user[:4]
        action_user = action_user[:4]
        reward_user = reward_user[:4]
        
        ob_assist = ob_assist[:4]
        action_assist = action_assist[:4]
        reward_assist = reward_assist[:4]
        
        print(action_user, action_assist)
        
        print(self.user_model(ob_user))
        print(self.assist_model([ob_assist, np.vstack(tuple([self.env_cell_mapping]*4))]))
        
        print(self.model([ob_user, ob_assist, np.vstack(tuple([self.env_cell_mapping]*4)), action_user, action_assist]))
    
    def exp_policy_user(self, state, next_action = False):
        if np.random.rand()<self.epsilon:
            return np.random.randint(4)
        else:
            state = np.array(state)[np.newaxis]
            if next_action == False:
                Q_values = self.user_model(state)
            else:
                Q_values = self.target_user_model(state)
            return np.argmax(Q_values[0])
    
    def exp_policy_assist(self, state):
        if np.random.rand()<self.epsilon:
            return np.random.randint(1,5)
        else:
            state = np.array(state)[np.newaxis]
            Q_values = self.assist_model([state, self.env_cell_mapping])
            return np.argmax(Q_values[0])+1
    
    def step(self, ob_user, prev_steps_assist):
        curr_loc = ob_user[:2]
        target_loc = ob_user[2:4]
        
        action_user = self.exp_policy_user(ob_user)
        action_user_one_hot = make_one_hot(action_user, 4)
        
        ob_assist = [action_user_one_hot + ob_user[:2]]
        ob_assist = prev_steps_assist + ob_assist 
        action_assist = self.exp_policy_assist(ob_assist)
        
        new_loc, reward_user, reward_assist, done = self.env.step(action_user, action_assist-1, target_loc, curr_loc)
        
        next_ob_user = new_loc[:]
        next_ob_user = next_ob_user + target_loc
        
        next_action_user = self.exp_policy_user(next_ob_user, next_action = True)
        next_action_user_one_hot = make_one_hot(next_action_user, 4)
        next_ob_assist = [next_action_user_one_hot + next_ob_user[:2]]
        next_ob_assist = ob_assist[1:] + next_ob_assist
        
        self.add_replay_buffer(ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                          action_assist-1, reward_assist, next_ob_assist, done)
        
        return next_ob_user, ob_assist[1:], reward_user, reward_assist, done 
        
        
    
    def add_replay_buffer(self, ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                         action_assist, reward_assist, next_ob_assist, done):
        
        self.replay_buffer.ob_user_history.append(ob_user)
        self.replay_buffer.action_user_history.append(action_user)
        self.replay_buffer.reward_user_history.append(reward_user)
        self.replay_buffer.next_ob_user_history.append(next_ob_user)
        self.replay_buffer.ob_assist_history.append(ob_assist)
        self.replay_buffer.action_assist_history.append(action_assist)
        self.replay_buffer.reward_assist_history.append(reward_assist)
        self.replay_buffer.next_ob_assist_history.append(next_ob_assist)
        self.replay_buffer.done_history.append(done)
        self.replay_buffer.priorities.append(self.replay_buffer.max_val)
    
    def sample_exp(self):
        sample_probs = self.replay_buffer.get_probabilities(priority_scale = 0.7)
        indices = np.random.choice(len(self.replay_buffer.done_history), size = self.batch_size, p = sample_probs)
        importance = self.replay_buffer.get_importance(sample_probs[indices])
        
        ob_user = np.array([self.replay_buffer.ob_user_history[i] for i in indices])
        action_user = np.array([self.replay_buffer.action_user_history[i] for i in indices])
        reward_user = np.array([self.replay_buffer.reward_user_history[i] for i in indices])
        next_ob_user = np.array([self.replay_buffer.next_ob_user_history[i] for i in indices])
        ob_assist = np.array([self.replay_buffer.ob_assist_history[i] for i in indices])
        action_assist = np.array([self.replay_buffer.action_assist_history[i] for i in indices])
        reward_assist = np.array([self.replay_buffer.reward_assist_history[i] for i in indices])
        next_ob_assist = np.array([self.replay_buffer.next_ob_assist_history[i] for i in indices])
        done = np.array([self.replay_buffer.done_history[i] for i in indices])
        
        return ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist, reward_assist, next_ob_assist, done,\
    importance, indices 
    
    def train(self):
        ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
        reward_assist, next_ob_assist, done, importance, indices = self.sample_exp()
        
        input_A = ob_user
        input_B = ob_assist
        input_C = np.vstack(tuple([self.env_cell_mapping]*128))
        
        rewards = reward_user + reward_assist
        
        next_Q_values_user, next_Q_values_assist = self.user_model(next_ob_user), self.assist_model([next_ob_assist, input_C])
        best_next_actions_user, best_next_actions_assist = tf.math.argmax(next_Q_values_user, axis = 1), tf.math.argmax(next_Q_values_assist, axis = 1)
        next_Q_values_user, next_Q_values_assist = self.target_user_model(next_ob_user), self.target_assist_model([next_ob_assist, input_C])
        
        best_next_Q_values_user = tf.reduce_sum(next_Q_values_user*tf.one_hot(best_next_actions_user, 4), axis = 1)
        best_next_Q_values_assist = tf.reduce_sum(next_Q_values_user*tf.one_hot(best_next_actions_assist, 4), axis = 1)
        best_next_Q_values = best_next_Q_values_user + best_next_Q_values_assist
        
        target_Q_values = rewards + (1-done)*self.gamma*best_next_Q_values
        
        with tf.GradientTape() as tape:
            Q_values = self.model([input_A, input_B, input_C, action_user, action_assist])
            error = tf.multiply(self.loss_fn(target_Q_values, Q_values), importance**(1-self.epsilon))
            loss = tf.reduce_mean(error)
        
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        self.replay_buffer.set_priorities(indices, error)
        
        
#         Q_values_assist = self.assist_model([input_B, input_C])
#         with tf.GradientTape() as tape:
#             Q_values_user = self.user_model(input_A)
#             error = tf.multiply(self.loss_fn(Q_values_assist, Q_values_user), importance**(1-self.epsilon))
#             loss = tf.reduce_mean(error)
            
#         grads = tape.gradient(loss, self.user_model.trainable_variables)
#         self.optimizer.apply_gradients(zip(grads, self.user_model.trainable_variables))
            
#         with tf.GradientTape() as tape:
#             Q_values_assist = self.assist_model([input_B, input_C])
#             error = tf.multiply(self.loss_fn(Q_values_assist, Q_values_user), importance**(1-self.epsilon))
#             loss = tf.reduce_mean(error)
            
#         grads = tape.gradient(loss, self.assist_model.trainable_variables)
#         self.optimizer.apply_gradients(zip(grads, self.assist_model.trainable_variables))                          

        

In [4]:
steps = 4
model = AI_Design(steps)
env = model.env

# if os.path.exists('user_model.h5'):
#     model.user_model = tf.keras.models.load_model('user_model.h5')
#     model.assist_model = tf.keras.models.load_model('assist_model.h5')

Icon Locations:
[[0.  0.8]
 [0.4 0.8]
 [0.9 0.2]
 [0.9 0.3]
 [0.3 0.6]
 [0.6 0.3]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
subtract (

In [5]:
def give_prev_steps(prev_steps_assist, steps):
    prev_steps_assist = [[0,0,0,0,-1,-1] for i in range(steps-1)]
    return prev_steps_assist

In [6]:
max_steps = 40
reached = 0
reached_history = []
max_reached = 0

running_reward = 0

for epoch in tqdm(range(100000)):
    done = False
    episode_reward = 0
    start, dest = env.give_start_dest()
    ob_user = [start[0], start[1], dest[0], dest[1]]
    prev_steps_assist = []
    prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
    step = 0
    
    while not done and step<max_steps:
        ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
        episode_reward+=reward_user
        step+=1
        if done:
            reached+=1
    
    if epoch:
        running_reward = 0.01 * episode_reward + (1 - 0.01) * running_reward
    else:
        running_reward = episode_reward
        
    if epoch>50:
        model.train()
        
        if epoch%100==0:
            model.target_user_model.set_weights(model.user_model.get_weights())
            model.target_assist_model.set_weights(model.assist_model.get_weights())
            reached_history.append(reached)
            rewards = []
            
            if reached>max_reached:
                print(reached)
                print('Saved Weights')
                max_reached = reached
                model.user_model.save('user_model.h5')
                model.assist_model.save('assist_model.h5')
                
            reached = 0
            
            if epoch%500==0:
                print(f'Running reward = {running_reward}')
                print(f'Successful runs = {np.mean(reached_history)}')
                reached_history = []
                
                if epoch%1000==0:
                    model.epsilon-=0.01
                    model.epsilon= max(model.epsilon, 0.1)
                    
                    if epoch%20000==0:
                        model.infer()

  0%|                                                                           | 101/100000 [00:08<4:05:58,  6.77it/s]

7
Saved Weights


  0%|▏                                                                          | 201/100000 [00:23<4:28:52,  6.19it/s]

16
Saved Weights


  1%|▍                                                                          | 502/100000 [01:05<3:57:23,  6.99it/s]

Running reward = -30.93637521773998
Successful runs = 12.0


  1%|▍                                                                          | 601/100000 [01:20<4:31:17,  6.11it/s]

21
Saved Weights


  1%|▋                                                                         | 1002/100000 [02:16<4:02:49,  6.80it/s]

Running reward = -29.361438974570753
Successful runs = 13.0


  2%|█                                                                         | 1502/100000 [03:31<4:07:46,  6.63it/s]

Running reward = -27.07107660014679
Successful runs = 18.6


  2%|█▍                                                                        | 2001/100000 [04:49<4:52:19,  5.59it/s]

Running reward = -28.67523611664215
Successful runs = 13.8


  2%|█▋                                                                        | 2201/100000 [05:27<5:43:22,  4.75it/s]

23
Saved Weights


  3%|█▊                                                                        | 2502/100000 [06:22<4:56:43,  5.48it/s]

Running reward = -28.827999326852513
Successful runs = 17.0


  3%|██▏                                                                       | 3001/100000 [07:55<5:03:22,  5.33it/s]

Running reward = -24.978178210777525
Successful runs = 16.6


  4%|██▌                                                                       | 3501/100000 [09:30<5:28:53,  4.89it/s]

Running reward = -27.695036803678924
Successful runs = 14.2


  4%|██▉                                                                       | 4002/100000 [11:06<5:16:34,  5.05it/s]

Running reward = -29.288126023338677
Successful runs = 15.2


  5%|███▎                                                                      | 4502/100000 [12:45<5:24:36,  4.90it/s]

Running reward = -27.478683202647204
Successful runs = 13.8


  5%|███▋                                                                      | 5002/100000 [14:23<5:11:01,  5.09it/s]

Running reward = -31.69103183839072
Successful runs = 14.8


  6%|████                                                                      | 5501/100000 [16:05<5:22:11,  4.89it/s]

Running reward = -31.717250357031624
Successful runs = 13.6


  6%|████▍                                                                     | 6001/100000 [17:47<5:11:48,  5.02it/s]

Running reward = -31.640814666567234
Successful runs = 10.6


  7%|████▊                                                                     | 6501/100000 [19:32<5:15:11,  4.94it/s]

Running reward = -27.35421696855135
Successful runs = 15.2


  7%|█████▏                                                                    | 7001/100000 [21:16<5:10:07,  5.00it/s]

Running reward = -28.017467843100906
Successful runs = 18.0


  7%|█████▎                                                                    | 7101/100000 [21:38<5:53:39,  4.38it/s]

24
Saved Weights


  8%|█████▌                                                                    | 7501/100000 [23:05<5:53:04,  4.37it/s]

Running reward = -25.543622473007826
Successful runs = 21.0


  8%|█████▉                                                                    | 8002/100000 [24:53<5:32:55,  4.61it/s]

Running reward = -28.2716462525397
Successful runs = 17.6


  9%|██████▎                                                                   | 8501/100000 [26:44<6:00:37,  4.23it/s]

26
Saved Weights
Running reward = -23.101870708368352
Successful runs = 21.2


  9%|██████▋                                                                   | 9001/100000 [28:34<5:34:02,  4.54it/s]

Running reward = -23.09390034172622
Successful runs = 20.0


 10%|███████                                                                   | 9501/100000 [30:28<5:58:00,  4.21it/s]

Running reward = -29.524912149943397
Successful runs = 16.4


 10%|███████▎                                                                 | 10001/100000 [32:22<5:58:18,  4.19it/s]

Running reward = -29.938182048528507
Successful runs = 15.0


 11%|███████▋                                                                 | 10501/100000 [34:18<5:50:02,  4.26it/s]

Running reward = -26.537643095516483
Successful runs = 17.6


 11%|████████                                                                 | 11002/100000 [36:16<5:27:11,  4.53it/s]

Running reward = -25.51904503124619
Successful runs = 18.8


 12%|████████▍                                                                | 11501/100000 [38:16<6:06:46,  4.02it/s]

Running reward = -25.68419810907637
Successful runs = 19.6


 12%|████████▍                                                                | 11601/100000 [38:41<6:14:13,  3.94it/s]

28
Saved Weights


 12%|████████▊                                                                | 12001/100000 [40:17<6:06:03,  4.01it/s]

Running reward = -28.34068368965187
Successful runs = 19.4


 12%|█████████                                                                | 12401/100000 [41:56<6:20:36,  3.84it/s]

29
Saved Weights


 13%|█████████▏                                                               | 12501/100000 [42:20<6:18:11,  3.86it/s]

Running reward = -24.56568342329741
Successful runs = 22.6


 13%|█████████▍                                                               | 13001/100000 [44:23<6:04:28,  3.98it/s]

Running reward = -26.26656647476318
Successful runs = 23.2


 14%|█████████▊                                                               | 13501/100000 [46:28<5:52:23,  4.09it/s]

Running reward = -22.620004347532525
Successful runs = 24.0


 14%|██████████▏                                                              | 14001/100000 [48:34<6:28:46,  3.69it/s]

Running reward = -22.229288656998367
Successful runs = 24.6


 15%|██████████▌                                                              | 14501/100000 [50:41<6:34:50,  3.61it/s]

30
Saved Weights
Running reward = -19.3019944967154
Successful runs = 25.4


 15%|██████████▉                                                              | 15001/100000 [52:49<6:16:59,  3.76it/s]

Running reward = -19.01000507369305
Successful runs = 28.4


 15%|███████████                                                              | 15101/100000 [53:14<6:26:30,  3.66it/s]

36
Saved Weights


 16%|███████████▎                                                             | 15501/100000 [54:59<6:31:57,  3.59it/s]

Running reward = -24.69282308931102
Successful runs = 27.0


 16%|███████████▋                                                             | 16001/100000 [57:10<6:38:00,  3.52it/s]

Running reward = -20.22298509929715
Successful runs = 28.8


 16%|███████████▉                                                             | 16401/100000 [58:56<6:51:25,  3.39it/s]

37
Saved Weights


 17%|████████████                                                             | 16501/100000 [59:23<6:32:47,  3.54it/s]

Running reward = -17.684953985996287
Successful runs = 31.6


 17%|████████████                                                             | 16601/100000 [59:48<6:59:07,  3.32it/s]

45
Saved Weights


 17%|████████████                                                           | 17001/100000 [1:01:35<6:13:04,  3.71it/s]

Running reward = -17.646933022241488
Successful runs = 35.2


 18%|████████████▍                                                          | 17501/100000 [1:03:50<6:04:32,  3.77it/s]

Running reward = -13.933064447827489
Successful runs = 34.8


 18%|████████████▍                                                          | 17601/100000 [1:04:16<6:46:51,  3.38it/s]

47
Saved Weights


 18%|████████████▊                                                          | 18001/100000 [1:06:05<6:31:13,  3.49it/s]

Running reward = -14.928443725343898
Successful runs = 34.4


 19%|█████████████▏                                                         | 18501/100000 [1:08:23<6:36:50,  3.42it/s]

Running reward = -16.193099355927956
Successful runs = 34.4


 19%|█████████████▍                                                         | 19001/100000 [1:10:41<6:49:11,  3.30it/s]

Running reward = -19.05820958159213
Successful runs = 31.4


 20%|█████████████▊                                                         | 19501/100000 [1:13:03<7:06:24,  3.15it/s]

Running reward = -17.455345730421197
Successful runs = 35.0


 20%|██████████████▏                                                        | 20000/100000 [1:15:25<6:27:17,  3.44it/s]

Running reward = -14.973835863525938
Successful runs = 31.6
[3 1 3 2] [1 2 1 3]
tf.Tensor(
[[-0.31179044 -0.2733512  -0.3175138  -0.33816212]
 [-0.3951024  -0.3846099  -0.4139261  -0.37656763]
 [-0.36227763 -0.36240944 -0.35491338 -0.3893795 ]
 [-0.24921702 -0.23261121 -0.27240154 -0.2383579 ]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-1.6752957 -1.666479  -1.6571398 -1.6590617]
 [-1.6729501 -1.6710753 -1.6673231 -1.6799183]
 [-1.6787312 -1.6904724 -1.7058456 -1.6677287]
 [-1.7195377 -1.7132518 -1.7086098 -1.7095699]], shape=(4, 4), dtype=float32)


 20%|██████████████▏                                                        | 20001/100000 [1:15:25<8:16:20,  2.69it/s]

tf.Tensor(
[[-2.004641 ]
 [-2.051933 ]
 [-2.0798519]
 [-1.9819715]], shape=(4, 1), dtype=float32)


 21%|██████████████▌                                                        | 20501/100000 [1:17:48<6:12:54,  3.55it/s]

Running reward = -13.849264685606043
Successful runs = 35.8


 21%|██████████████▊                                                        | 20801/100000 [1:19:12<6:57:42,  3.16it/s]

58
Saved Weights


 21%|██████████████▉                                                        | 21001/100000 [1:20:09<5:58:22,  3.67it/s]

Running reward = -12.187085597203335
Successful runs = 44.8


 22%|███████████████▎                                                       | 21501/100000 [1:22:32<6:47:09,  3.21it/s]

Running reward = -9.922716960814943
Successful runs = 44.8


 22%|███████████████▌                                                       | 22001/100000 [1:24:57<6:06:08,  3.55it/s]

Running reward = -13.718410566659577
Successful runs = 36.4


 23%|███████████████▉                                                       | 22501/100000 [1:27:23<6:26:26,  3.34it/s]

Running reward = -11.69119384176677
Successful runs = 45.2


 23%|████████████████▎                                                      | 23001/100000 [1:29:49<6:25:30,  3.33it/s]

Running reward = -9.550758016988548
Successful runs = 44.4


 24%|████████████████▋                                                      | 23501/100000 [1:32:18<6:56:03,  3.06it/s]

Running reward = -9.396550872944477
Successful runs = 39.4


 24%|█████████████████                                                      | 24001/100000 [1:34:50<6:34:31,  3.21it/s]

Running reward = -10.255139124984165
Successful runs = 37.8


 25%|█████████████████▍                                                     | 24501/100000 [1:37:24<6:23:36,  3.28it/s]

Running reward = -17.076302674468913
Successful runs = 34.4


 25%|█████████████████▊                                                     | 25001/100000 [1:39:56<6:56:15,  3.00it/s]

Running reward = -12.449740256322862
Successful runs = 36.4


 26%|██████████████████                                                     | 25501/100000 [1:42:29<6:23:41,  3.24it/s]

Running reward = -8.149276353785954
Successful runs = 42.2


 26%|██████████████████▍                                                    | 26001/100000 [1:45:04<7:09:43,  2.87it/s]

Running reward = -14.128490136153646
Successful runs = 41.8


 27%|██████████████████▊                                                    | 26501/100000 [1:47:42<5:51:22,  3.49it/s]

Running reward = -12.24818624429817
Successful runs = 39.2


 27%|███████████████████▏                                                   | 27001/100000 [1:50:19<6:00:20,  3.38it/s]

Running reward = -12.846097412838013
Successful runs = 38.6


 28%|███████████████████▌                                                   | 27501/100000 [1:53:01<7:07:00,  2.83it/s]

Running reward = -15.327872040698725
Successful runs = 35.2


 28%|███████████████████▉                                                   | 28001/100000 [1:55:36<5:15:53,  3.80it/s]

Running reward = 0.09061082395159643
Successful runs = 51.0


 28%|████████████████████▏                                                  | 28401/100000 [1:57:42<7:30:09,  2.65it/s]

62
Saved Weights


 29%|████████████████████▏                                                  | 28501/100000 [1:58:12<6:57:32,  2.85it/s]

Running reward = -1.6089189259246384
Successful runs = 52.8


 29%|████████████████████▌                                                  | 29001/100000 [2:00:49<6:19:18,  3.12it/s]

Running reward = -4.1652953408899736
Successful runs = 53.0


 30%|████████████████████▉                                                  | 29501/100000 [2:03:27<6:49:47,  2.87it/s]

Running reward = 1.4594102545114658
Successful runs = 55.4


 30%|█████████████████████▏                                                 | 29901/100000 [2:05:31<5:50:16,  3.34it/s]

65
Saved Weights


 30%|█████████████████████▎                                                 | 30001/100000 [2:06:04<6:28:34,  3.00it/s]

Running reward = 0.7878786115394083
Successful runs = 55.4


 31%|█████████████████████▋                                                 | 30501/100000 [2:08:41<5:43:49,  3.37it/s]

Running reward = 1.8137664789003713
Successful runs = 55.0


 31%|██████████████████████                                                 | 31001/100000 [2:11:20<5:34:08,  3.44it/s]

Running reward = -1.7164952536039049
Successful runs = 54.8


 31%|██████████████████████                                                 | 31101/100000 [2:11:51<6:23:33,  2.99it/s]

66
Saved Weights


 32%|██████████████████████▎                                                | 31501/100000 [2:14:01<7:06:47,  2.67it/s]

Running reward = 1.543892107137797
Successful runs = 62.4


 32%|██████████████████████▋                                                | 32001/100000 [2:16:41<6:17:15,  3.00it/s]

Running reward = 3.22170305099649
Successful runs = 61.8


 32%|███████████████████████                                                | 32401/100000 [2:18:53<6:14:32,  3.01it/s]

68
Saved Weights


 33%|███████████████████████                                                | 32501/100000 [2:19:25<5:53:41,  3.18it/s]

Running reward = 5.848697821907434
Successful runs = 60.4


 33%|███████████████████████▍                                               | 33001/100000 [2:22:07<6:15:45,  2.97it/s]

Running reward = 5.4291216236837005
Successful runs = 58.0


 34%|███████████████████████▊                                               | 33501/100000 [2:24:51<6:15:13,  2.95it/s]

73
Saved Weights
Running reward = 8.645261867142933
Successful runs = 61.2


 34%|████████████████████████▏                                              | 34001/100000 [2:27:34<5:23:12,  3.40it/s]

Running reward = 7.211393641645805
Successful runs = 63.8


 35%|████████████████████████▍                                              | 34501/100000 [2:30:17<5:18:46,  3.42it/s]

Running reward = 8.463158594297665
Successful runs = 65.4


 35%|████████████████████████▊                                              | 34901/100000 [2:32:27<5:54:54,  3.06it/s]

76
Saved Weights


 35%|████████████████████████▊                                              | 35001/100000 [2:32:58<6:32:04,  2.76it/s]

Running reward = 13.995100091903225
Successful runs = 65.2


 36%|█████████████████████████▏                                             | 35501/100000 [2:35:42<6:16:59,  2.85it/s]

Running reward = 8.68187248572421
Successful runs = 69.0


 36%|█████████████████████████▌                                             | 36001/100000 [2:38:26<6:09:15,  2.89it/s]

79
Saved Weights
Running reward = 12.519060308523418
Successful runs = 69.2


 37%|█████████████████████████▉                                             | 36501/100000 [2:41:10<6:02:35,  2.92it/s]

Running reward = 10.380105004907875
Successful runs = 66.4


 37%|██████████████████████████▎                                            | 37001/100000 [2:44:05<5:55:05,  2.96it/s]

Running reward = 0.30694910565241007
Successful runs = 55.6


 38%|██████████████████████████▋                                            | 37501/100000 [2:47:04<6:56:04,  2.50it/s]

Running reward = -2.440057187888355
Successful runs = 51.4


 38%|██████████████████████████▉                                            | 38001/100000 [2:49:56<5:22:00,  3.21it/s]

Running reward = 4.601523639058476
Successful runs = 61.0


 39%|███████████████████████████▎                                           | 38501/100000 [2:52:47<6:40:16,  2.56it/s]

Running reward = 4.077610193368068
Successful runs = 59.8


 39%|███████████████████████████▋                                           | 39001/100000 [2:55:39<4:51:46,  3.48it/s]

Running reward = 10.542438083389838
Successful runs = 63.8


 40%|████████████████████████████                                           | 39501/100000 [2:58:33<7:03:52,  2.38it/s]

Running reward = 9.319895482159533
Successful runs = 66.2


 40%|████████████████████████████▍                                          | 40001/100000 [3:01:26<6:37:46,  2.51it/s]

Running reward = 4.246700636227574
Successful runs = 66.8
[0 3 1 1] [1 2 1 0]
tf.Tensor(
[[0.19016941 0.16146566 0.2222745  0.14302088]
 [0.17084843 0.15718013 0.15621622 0.17309497]
 [0.15629217 0.15832515 0.1436447  0.16280055]
 [0.18569481 0.14448832 0.14897048 0.13638486]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[0.16377436 0.31190655 0.14402676 0.22618774]
 [0.16811773 0.32872295 0.16141167 0.23814388]
 [0.17033651 0.31113636 0.15659991 0.23215319]
 [0.18415868 0.3451572  0.16398871 0.24619429]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[0.50207597]
 [0.33450663]
 [0.4694615 ]
 [0.32864702]], shape=(4, 1), dtype=float32)


 41%|████████████████████████████▊                                          | 40501/100000 [3:04:17<5:42:10,  2.90it/s]

Running reward = 10.684546580219367
Successful runs = 70.2


 41%|█████████████████████████████                                          | 41001/100000 [3:07:13<6:42:04,  2.45it/s]

Running reward = 9.139731258641667
Successful runs = 67.4


 41%|█████████████████████████████▎                                         | 41301/100000 [3:08:57<5:34:13,  2.93it/s]

85
Saved Weights


 42%|█████████████████████████████▍                                         | 41501/100000 [3:10:07<6:50:46,  2.37it/s]

Running reward = 12.361117934743062
Successful runs = 70.8


 42%|█████████████████████████████▊                                         | 42001/100000 [3:12:59<6:02:50,  2.66it/s]

Running reward = 12.173448546836504
Successful runs = 70.4


 43%|██████████████████████████████▏                                        | 42501/100000 [3:15:55<5:34:14,  2.87it/s]

Running reward = 13.850451716770259
Successful runs = 71.6


 43%|██████████████████████████████▌                                        | 43001/100000 [3:18:48<6:35:32,  2.40it/s]

Running reward = 12.302287146991867
Successful runs = 73.2


 44%|██████████████████████████████▉                                        | 43501/100000 [3:21:43<6:21:41,  2.47it/s]

Running reward = 12.827037582918908
Successful runs = 73.2


 44%|███████████████████████████████▏                                       | 44001/100000 [3:24:36<5:37:17,  2.77it/s]

Running reward = 17.693037433996434
Successful runs = 74.2


 45%|███████████████████████████████▌                                       | 44501/100000 [3:27:37<5:40:09,  2.72it/s]

Running reward = 8.142568610980401
Successful runs = 69.4


 45%|███████████████████████████████▉                                       | 45001/100000 [3:30:32<5:43:22,  2.67it/s]

Running reward = 14.074105586008068
Successful runs = 74.0


 46%|████████████████████████████████▎                                      | 45501/100000 [3:33:29<5:39:07,  2.68it/s]

Running reward = 12.797808714064406
Successful runs = 75.2


 46%|████████████████████████████████▋                                      | 46001/100000 [3:36:23<5:01:25,  2.99it/s]

Running reward = 17.586917659628238
Successful runs = 77.0


 47%|█████████████████████████████████                                      | 46501/100000 [3:39:13<4:05:07,  3.64it/s]

Running reward = 16.98694718481898
Successful runs = 78.4


 47%|█████████████████████████████████▎                                     | 47001/100000 [3:42:06<5:08:53,  2.86it/s]

Running reward = 17.96779901517572
Successful runs = 78.0


 48%|█████████████████████████████████▋                                     | 47501/100000 [3:44:59<6:37:08,  2.20it/s]

Running reward = 14.530534408949821
Successful runs = 78.0


 48%|█████████████████████████████████▉                                     | 47801/100000 [3:46:43<6:09:30,  2.35it/s]

88
Saved Weights


 48%|██████████████████████████████████                                     | 48001/100000 [3:47:53<4:33:57,  3.16it/s]

Running reward = 19.76459766691691
Successful runs = 81.8


 49%|██████████████████████████████████▍                                    | 48501/100000 [3:50:53<5:33:27,  2.57it/s]

Running reward = 16.019264626112268
Successful runs = 75.8


 49%|██████████████████████████████████▊                                    | 49001/100000 [3:53:43<4:54:02,  2.89it/s]

Running reward = 21.781619296635043
Successful runs = 82.8


 49%|██████████████████████████████████▉                                    | 49201/100000 [3:54:53<5:14:33,  2.69it/s]

89
Saved Weights


 50%|███████████████████████████████████▏                                   | 49501/100000 [3:56:38<5:00:59,  2.80it/s]

Running reward = 19.64720875637838
Successful runs = 81.6


 50%|███████████████████████████████████▌                                   | 50001/100000 [3:59:34<4:44:17,  2.93it/s]

Running reward = 21.508852781262313
Successful runs = 79.2


 51%|███████████████████████████████████▊                                   | 50501/100000 [4:02:38<4:47:09,  2.87it/s]

Running reward = 18.952182121883595
Successful runs = 75.8


 51%|████████████████████████████████████▏                                  | 51001/100000 [4:05:38<3:30:54,  3.87it/s]

Running reward = 18.96288695721998
Successful runs = 76.0


 51%|████████████████████████████████████▎                                  | 51201/100000 [4:06:44<4:30:45,  3.00it/s]

95
Saved Weights


 52%|████████████████████████████████████▌                                  | 51501/100000 [4:08:27<4:52:18,  2.77it/s]

Running reward = 25.24323861077505
Successful runs = 88.2


 52%|████████████████████████████████████▉                                  | 52001/100000 [4:11:21<4:31:12,  2.95it/s]

Running reward = 23.73497401554296
Successful runs = 86.8


 53%|█████████████████████████████████████▎                                 | 52501/100000 [4:14:18<5:09:49,  2.56it/s]

Running reward = 22.34285451145577
Successful runs = 83.4


 53%|█████████████████████████████████████▋                                 | 53001/100000 [4:17:14<4:10:50,  3.12it/s]

Running reward = 26.256301885691595
Successful runs = 84.4


 54%|█████████████████████████████████████▉                                 | 53501/100000 [4:20:05<3:30:04,  3.69it/s]

Running reward = 23.693360066257597
Successful runs = 87.2


 54%|██████████████████████████████████████▏                                | 53701/100000 [4:21:13<4:28:23,  2.88it/s]

96
Saved Weights


 54%|██████████████████████████████████████▎                                | 54001/100000 [4:22:51<4:25:28,  2.89it/s]

Running reward = 31.604691780420517
Successful runs = 91.2


 55%|██████████████████████████████████████▋                                | 54501/100000 [4:25:41<4:36:02,  2.75it/s]

Running reward = 20.19111490337612
Successful runs = 88.4


 55%|███████████████████████████████████████                                | 55001/100000 [4:28:28<3:44:06,  3.35it/s]

Running reward = 32.47474340292974
Successful runs = 88.6


 55%|███████████████████████████████████████                                | 55101/100000 [4:28:59<4:12:48,  2.96it/s]

97
Saved Weights


 55%|███████████████████████████████████████▏                               | 55201/100000 [4:29:30<3:47:45,  3.28it/s]

100
Saved Weights


 56%|███████████████████████████████████████▍                               | 55501/100000 [4:31:06<3:31:33,  3.51it/s]

Running reward = 34.03402703663692
Successful runs = 97.4


 56%|███████████████████████████████████████▊                               | 56001/100000 [4:33:47<3:33:15,  3.44it/s]

Running reward = 34.68403274292869
Successful runs = 92.8


 57%|████████████████████████████████████████                               | 56501/100000 [4:36:25<5:05:49,  2.37it/s]

Running reward = 27.59271958998323
Successful runs = 93.8


 57%|████████████████████████████████████████▍                              | 57001/100000 [4:39:20<3:21:25,  3.56it/s]

Running reward = 29.07936386726787
Successful runs = 85.4


 58%|████████████████████████████████████████▊                              | 57501/100000 [4:42:15<3:40:01,  3.22it/s]

Running reward = 28.005469553784813
Successful runs = 89.8


 58%|█████████████████████████████████████████▏                             | 58001/100000 [4:44:55<3:38:38,  3.20it/s]

Running reward = 35.065602990297116
Successful runs = 95.2


 59%|█████████████████████████████████████████▌                             | 58501/100000 [4:47:48<3:37:47,  3.18it/s]

Running reward = 25.74563659437329
Successful runs = 90.4


 59%|█████████████████████████████████████████▉                             | 59001/100000 [4:50:35<4:51:19,  2.35it/s]

Running reward = 31.23913174791924
Successful runs = 92.4


 60%|██████████████████████████████████████████▏                            | 59501/100000 [4:53:32<4:29:49,  2.50it/s]

Running reward = 20.464152765086055
Successful runs = 87.8


 60%|██████████████████████████████████████████▌                            | 60001/100000 [4:56:26<4:22:32,  2.54it/s]

Running reward = 29.653254080596025
Successful runs = 91.6
[0 2 1 0] [1 1 1 3]
tf.Tensor(
[[0.7014769  0.6481849  0.6557882  0.66006714]
 [0.72440445 0.7516729  0.7598224  0.63356674]
 [0.68815404 0.73041385 0.68341345 0.7112649 ]
 [0.71967435 0.74203604 0.64176434 0.8106936 ]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[2.912241  3.0262358 2.8865206 2.883346 ]
 [3.018256  3.1429152 2.9861982 3.0186596]
 [2.9270604 3.0302603 2.9040632 2.9112747]
 [2.994915  3.068125  3.0318718 2.9996629]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[3.7277126]
 [3.9027376]
 [3.7606742]
 [3.7193372]], shape=(4, 1), dtype=float32)


 61%|██████████████████████████████████████████▉                            | 60501/100000 [4:59:22<5:08:13,  2.14it/s]

Running reward = 24.129912963782623
Successful runs = 87.4


 61%|███████████████████████████████████████████▎                           | 61001/100000 [5:02:13<3:38:31,  2.97it/s]

Running reward = 32.335031897346326
Successful runs = 90.2


 62%|███████████████████████████████████████████▋                           | 61501/100000 [5:05:08<2:37:50,  4.07it/s]

Running reward = 29.479424991967427
Successful runs = 87.8


 62%|████████████████████████████████████████████                           | 62001/100000 [5:08:01<3:12:58,  3.28it/s]

Running reward = 27.75870834052706
Successful runs = 88.4


 63%|████████████████████████████████████████████▍                          | 62501/100000 [5:10:39<2:35:57,  4.01it/s]

Running reward = 37.03836580081943
Successful runs = 93.8


 63%|████████████████████████████████████████████▋                          | 63001/100000 [5:13:20<5:04:09,  2.03it/s]

Running reward = 30.594361158219776
Successful runs = 95.8


 64%|█████████████████████████████████████████████                          | 63501/100000 [5:15:58<2:55:18,  3.47it/s]

Running reward = 35.50860751669092
Successful runs = 96.8


 64%|█████████████████████████████████████████████▍                         | 64001/100000 [5:18:26<3:23:54,  2.94it/s]

Running reward = 39.14700033148675
Successful runs = 99.6


 65%|█████████████████████████████████████████████▊                         | 64501/100000 [5:21:14<2:52:38,  3.43it/s]

Running reward = 31.38192925414566
Successful runs = 89.0


 65%|██████████████████████████████████████████████▏                        | 65001/100000 [5:23:51<2:56:09,  3.31it/s]

Running reward = 37.5524346053749
Successful runs = 95.8


 66%|██████████████████████████████████████████████▌                        | 65501/100000 [5:26:25<2:49:04,  3.40it/s]

Running reward = 38.72605940861788
Successful runs = 99.0


 66%|██████████████████████████████████████████████▊                        | 66001/100000 [5:28:52<2:31:27,  3.74it/s]

Running reward = 39.476432696854346
Successful runs = 99.0


 67%|███████████████████████████████████████████████▏                       | 66501/100000 [5:31:52<2:49:48,  3.29it/s]

Running reward = 20.97606187323722
Successful runs = 88.2


 67%|███████████████████████████████████████████████▌                       | 67001/100000 [5:35:08<4:19:31,  2.12it/s]

Running reward = 17.20985267807055
Successful runs = 78.4


 68%|███████████████████████████████████████████████▉                       | 67501/100000 [5:39:08<4:07:10,  2.19it/s]

Running reward = -2.519425741241594
Successful runs = 55.8


 68%|████████████████████████████████████████████████▎                      | 68001/100000 [5:43:30<4:45:17,  1.87it/s]

Running reward = -15.971943297794008
Successful runs = 38.8


 69%|████████████████████████████████████████████████▋                      | 68501/100000 [5:48:05<5:08:40,  1.70it/s]

Running reward = -14.391653826370264
Successful runs = 30.4


 69%|████████████████████████████████████████████████▉                      | 69001/100000 [5:52:54<5:33:19,  1.55it/s]

Running reward = -28.837172539179985
Successful runs = 21.8


 70%|█████████████████████████████████████████████████▎                     | 69501/100000 [5:57:47<4:50:05,  1.75it/s]

Running reward = -23.80170246719706
Successful runs = 20.2


 70%|█████████████████████████████████████████████████▋                     | 70001/100000 [6:02:34<5:08:05,  1.62it/s]

Running reward = -26.65445552017332
Successful runs = 23.4


 71%|██████████████████████████████████████████████████                     | 70501/100000 [6:07:40<5:18:59,  1.54it/s]

Running reward = -32.58922047608793
Successful runs = 14.0


 71%|██████████████████████████████████████████████████▍                    | 71001/100000 [6:12:42<5:03:18,  1.59it/s]

Running reward = -32.62370838894922
Successful runs = 15.0


 72%|██████████████████████████████████████████████████▊                    | 71501/100000 [6:17:54<4:28:20,  1.77it/s]

Running reward = -29.003491908426835
Successful runs = 9.6


 72%|███████████████████████████████████████████████████                    | 72001/100000 [6:22:54<4:59:58,  1.56it/s]

Running reward = -31.091299759266494
Successful runs = 16.0


 73%|███████████████████████████████████████████████████▍                   | 72501/100000 [6:28:05<4:23:50,  1.74it/s]

Running reward = -30.97311617488656
Successful runs = 11.6


 73%|███████████████████████████████████████████████████▊                   | 73001/100000 [6:33:19<4:48:35,  1.56it/s]

Running reward = -33.705168368686515
Successful runs = 8.0


 74%|████████████████████████████████████████████████████▏                  | 73501/100000 [6:38:34<4:24:21,  1.67it/s]

Running reward = -33.20994366908033
Successful runs = 8.2


 74%|████████████████████████████████████████████████████▌                  | 74001/100000 [6:43:54<4:35:35,  1.57it/s]

Running reward = -34.98962479549432
Successful runs = 5.6


 75%|████████████████████████████████████████████████████▉                  | 74501/100000 [6:49:18<4:52:07,  1.45it/s]

Running reward = -33.803893662489116
Successful runs = 6.0


 75%|█████████████████████████████████████████████████████▎                 | 75001/100000 [6:54:38<4:41:07,  1.48it/s]

Running reward = -34.420755238059336
Successful runs = 7.2


 76%|█████████████████████████████████████████████████████▌                 | 75501/100000 [7:00:06<4:41:09,  1.45it/s]

Running reward = -32.44093301148401
Successful runs = 5.6


 76%|█████████████████████████████████████████████████████▉                 | 76001/100000 [7:05:36<4:30:50,  1.48it/s]

Running reward = -35.0587725288503
Successful runs = 6.4


 77%|██████████████████████████████████████████████████████▎                | 76501/100000 [7:11:02<3:39:32,  1.78it/s]

Running reward = -23.131340192148905
Successful runs = 13.2


 77%|██████████████████████████████████████████████████████▋                | 77001/100000 [7:16:14<3:21:59,  1.90it/s]

Running reward = -29.15752856319918
Successful runs = 16.8


 78%|███████████████████████████████████████████████████████                | 77501/100000 [7:21:35<4:05:49,  1.53it/s]

Running reward = -28.90874257847119
Successful runs = 13.4


 78%|███████████████████████████████████████████████████████▍               | 78001/100000 [7:26:55<3:32:44,  1.72it/s]

Running reward = -32.16021929041203
Successful runs = 12.8


 79%|███████████████████████████████████████████████████████▋               | 78501/100000 [7:32:18<4:12:40,  1.42it/s]

Running reward = -30.85845740552718
Successful runs = 12.6


 79%|████████████████████████████████████████████████████████               | 79001/100000 [7:37:43<4:04:19,  1.43it/s]

Running reward = -31.105851645375367
Successful runs = 11.4


 80%|████████████████████████████████████████████████████████▍              | 79501/100000 [7:43:09<4:07:45,  1.38it/s]

Running reward = -32.023200019773164
Successful runs = 11.8


 80%|████████████████████████████████████████████████████████▊              | 80001/100000 [7:48:44<3:48:15,  1.46it/s]

Running reward = -33.43162662356081
Successful runs = 7.8
[3 0 1 3] [2 1 1 2]
tf.Tensor(
[[-2.7576947 -2.8199055 -2.991744  -2.6297495]
 [-1.9843416 -2.0339596 -2.1781437 -1.9892555]
 [-2.1353617 -2.1280568 -2.3961046 -2.1842575]
 [-2.7576947 -2.8199055 -2.991744  -2.6297495]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-2.2977145 -2.4104044 -2.0768876 -2.2706087]
 [-1.8164057 -1.9939046 -1.673021  -1.8010006]
 [-1.9262489 -2.0618067 -1.7809944 -1.9029275]
 [-2.2977145 -2.4104044 -2.0768876 -2.2706087]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-4.7066374]
 [-3.9782462]
 [-4.189863 ]
 [-4.7066374]], shape=(4, 1), dtype=float32)


 81%|█████████████████████████████████████████████████████████▏             | 80501/100000 [7:54:16<3:50:34,  1.41it/s]

Running reward = -29.72555167337929
Successful runs = 11.2


 81%|█████████████████████████████████████████████████████████▌             | 81001/100000 [7:59:44<3:28:16,  1.52it/s]

Running reward = -28.041965960666435
Successful runs = 12.6


 82%|█████████████████████████████████████████████████████████▊             | 81501/100000 [8:05:15<3:46:12,  1.36it/s]

Running reward = -27.01312420087762
Successful runs = 14.0


 82%|██████████████████████████████████████████████████████████▏            | 82001/100000 [8:10:44<2:26:24,  2.05it/s]

Running reward = -26.777883516359886
Successful runs = 14.2


 83%|██████████████████████████████████████████████████████████▌            | 82501/100000 [8:16:26<3:32:41,  1.37it/s]

Running reward = -34.11293823604026
Successful runs = 9.6


 83%|██████████████████████████████████████████████████████████▉            | 83001/100000 [8:22:00<3:10:59,  1.48it/s]

Running reward = -27.967681674315195
Successful runs = 12.0


 84%|███████████████████████████████████████████████████████████▎           | 83501/100000 [8:27:52<2:55:16,  1.57it/s]

Running reward = -34.05888175285293
Successful runs = 6.8


 84%|███████████████████████████████████████████████████████████▋           | 84001/100000 [8:33:38<3:14:00,  1.37it/s]

Running reward = -32.13186044657302
Successful runs = 9.2


 85%|███████████████████████████████████████████████████████████▉           | 84501/100000 [8:39:19<3:15:41,  1.32it/s]

Running reward = -34.070304191068715
Successful runs = 13.4


 85%|████████████████████████████████████████████████████████████▎          | 85001/100000 [8:45:04<2:23:25,  1.74it/s]

Running reward = -32.266667708732555
Successful runs = 11.6


 86%|████████████████████████████████████████████████████████████▋          | 85501/100000 [8:50:43<2:13:00,  1.82it/s]

Running reward = -26.188290763868068
Successful runs = 15.6


 86%|█████████████████████████████████████████████████████████████          | 85991/100000 [8:56:28<1:27:23,  2.67it/s]


KeyboardInterrupt: 

In [7]:
model.user_model = tf.keras.models.load_model('user_model.h5')
model.assist_model = tf.keras.models.load_model('assist_model.h5')



In [8]:
model.infer()

[0 0 0 1] [2 2 2 3]
tf.Tensor(
[[0.9516038  0.8917836  0.9332022  0.82326126]
 [0.9098215  0.9214255  0.91650915 0.79966193]
 [0.9098215  0.9214255  0.91650915 0.79966193]
 [0.8878898  0.92717445 0.88216174 0.7957046 ]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[4.2889824 4.79168   4.230976  4.238483 ]
 [4.3104734 4.805112  4.2488728 4.2613645]
 [4.252157  4.741939  4.1969833 4.202025 ]
 [4.2992435 4.8114977 4.2456346 4.2587614]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-2.9156542]
 [-2.922504 ]
 [-2.8914485]
 [-3.110583 ]], shape=(4, 1), dtype=float32)


In [9]:
model.epsilon = 0
done = False
episode_reward = 0
start, dest = env.give_start_dest()
ob_user = [start[0], start[1], dest[0], dest[1]]
prev_steps_assist = []
prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
step = 0

while not done and step<max_steps:
    ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
    episode_reward+=reward_user
    step+=1
    print(ob_user)
    
print(done)

[0.4, 0.4, 0.5, 0.7]
[0.4, 0.5, 0.5, 0.7]
[0.4, 0.6, 0.5, 0.7]
[0.4, 0.7, 0.5, 0.7]
[0.5, 0.7, 0.5, 0.7]
1


In [10]:
reached = 0
for i in range(1000):
    model.epsilon = 0
    done = False
    episode_reward = 0
    start, dest = env.give_start_dest()
    ob_user = [start[0], start[1], dest[0], dest[1]]
    prev_steps_assist = []
    prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
    step = 0

    while not done and step<max_steps:
        ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
        episode_reward+=reward_user
        step+=1
#         print(ob_user)
        if done:
            reached += 1
#     print(done)

print(reached)

1000


In [11]:
ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
reward_assist, next_ob_assist, done, importance, indices = model.sample_exp()
input_C = np.vstack(tuple([model.env_cell_mapping]*128))
ob_assist

array([[[ 0. ,  0. ,  1. ,  0. ,  0.2,  0.7],
        [ 0. ,  0. ,  1. ,  0. ,  0.2,  0.6],
        [ 1. ,  0. ,  0. ,  0. ,  0.2,  0.5],
        [ 0. ,  0. ,  1. ,  0. ,  0.1,  0.5]],

       [[ 0. ,  0. ,  1. ,  0. ,  0.7,  0.5],
        [ 0. ,  0. ,  1. ,  0. ,  0.7,  0.4],
        [ 0. ,  0. ,  1. ,  0. ,  0.7,  0.3],
        [ 0. ,  0. ,  1. ,  0. ,  0.7,  0.2]],

       [[ 0. ,  0. ,  1. ,  0. ,  0.2,  0.6],
        [ 1. ,  0. ,  0. ,  0. ,  0.2,  0.5],
        [ 0. ,  0. ,  1. ,  0. ,  0.1,  0.5],
        [ 0. ,  0. ,  1. ,  0. ,  0.1,  0.4]],

       ...,

       [[ 0. ,  0. ,  0. ,  0. , -1. , -1. ],
        [ 0. ,  0. ,  0. ,  1. ,  0.6,  0.4],
        [ 1. ,  0. ,  0. ,  0. ,  0.6,  0.5],
        [ 0. ,  0. ,  0. ,  1. ,  0.5,  0.5]],

       [[ 0. ,  0. ,  0. ,  0. , -1. , -1. ],
        [ 0. ,  0. ,  0. ,  0. , -1. , -1. ],
        [ 0. ,  0. ,  0. ,  0. , -1. , -1. ],
        [ 0. ,  0. ,  0. ,  1. ,  0.2,  0.4]],

       [[ 0. ,  0. ,  1. ,  0. ,  0.3,  0.4],
        [ 0

In [12]:
np.argmax(model.assist_model([ob_assist, input_C]), axis = 1) + 1

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)