In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, LSTM, Subtract
from tensorflow.keras.models import Model
from Environment import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from copy import deepcopy
import os 
import numpy as np

In [2]:
dense_1_user =  Dense(32, activation = 'relu')
dense_2_user =  Dense(32, activation = 'relu')
# dense_3_user =  Dense(32, activation = 'relu')

dense_1_assist =  Dense(32, activation = 'relu')
lstm_1_assist = LSTM(32, activation = 'tanh')
dense_2_assist = Dense(32, activation = 'relu')

advantage_layer_user = Dense(4)
value_layer_user = Dense(1)

advantage_layer_assist = Dense(4)
value_layer_assist = Dense(1)

# advantage_layer = Dense(4)
# value_layer = Dense(1)

# advantage_layer_user = advantage_layer
# advantage_layer_assist = advantage_layer

# value_layer_user = value_layer
# value_layer_assist = value_layer

In [3]:
class AI_Design:
    def __init__(self, steps = 4):        
        self.loss_fn = tf.keras.losses.mean_squared_error
        self.optimizer = tf.keras.optimizers.Adam(lr = 0.0001)
        self.batch_size = 128
        self.replay_buffer_size = 1024
        self.replay_buffer = Replay_Buffer(self.replay_buffer_size)
        self.epsilon = 1
        self.gamma = 0.9
        self.env = Environment()
        self.env.cells = np.array([[0.7, 0.1], [0.1, 0.1], [0.5, 0.7], [0.6, 0.2], [0.7, 0.4], [0.2, 0.9]])
        self.env_cell_mapping = give_mapping(self.env.cells)
        self.env_cell_mapping = self.env_cell_mapping[np.newaxis, :, :, np.newaxis]
        #-------------------------------------------------------------------------------------------------
        input_A = Input(shape = (4,))
        input_B = Input(shape = (steps,6))
        input_C = Input(shape = (11, 11, 1)) #Location of every icon
                 
        action_user = Input(shape = 1, dtype = tf.int32)
        action_assist = Input(shape = 1, dtype = tf.int32)
        
        
        
        #User Network 
        x = Subtract()([input_A[:, 2:], input_A[:, :2]])
        x = dense_1_user(x)
        x = dense_2_user(x)
#         x = dense_3_user(x)
        adv_user = advantage_layer_user(x)
        val_user = value_layer_user(x)
        output_user = adv_user - tf.reduce_mean(adv_user, axis = 1, keepdims = True) + val_user
        
        self.user_model = Model(inputs = input_A, outputs = output_user)
        self.user_model.summary()
        
        self.target_user_model = tf.keras.models.clone_model(self.user_model)
        self.target_user_model.set_weights(self.user_model.get_weights())
        

        
        #Assistant Network
        z = tf.keras.layers.Conv2D(filters = 2, kernel_size = 3, activation = 'relu')(input_C)
        z = tf.keras.layers.MaxPooling2D()(z)
        z = tf.keras.layers.Flatten()(z)
        z = tf.keras.layers.Dense(32, activation = 'relu')(z)
        
        y = dense_1_assist(input_B)
        y = lstm_1_assist(y)
        y = tf.keras.layers.Concatenate()([y,z])
        y = dense_2_assist(y)
        adv_assist = advantage_layer_assist(y)
        val_assist = value_layer_assist(y)
        output_assist = adv_assist - tf.reduce_mean(adv_assist, axis = 1, keepdims = True) + val_assist
        
        self.assist_model = Model(inputs = [input_B, input_C], outputs = output_assist)
        self.assist_model.summary()
        
        self.target_assist_model = tf.keras.models.clone_model(self.assist_model)
        self.target_assist_model.set_weights(self.assist_model.get_weights())
        
        
        #Complete Network
        mask_user = tf.reduce_sum(tf.one_hot(action_user, 4), axis = 1)
        mask_assist = tf.reduce_sum(tf.one_hot(action_assist, 4), axis = 1)
        output_user = output_user*mask_user
        output_assist = output_assist*mask_assist
        
        out = tf.reduce_sum(output_user + output_assist, axis = 1, keepdims = True)
        
        self.model = Model(inputs = [input_A, input_B, input_C, action_user, action_assist], outputs = out)  
        self.model.summary() 
        #-------------------------------------------------------------------------------------------------
    
    def infer(self):
        ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
        reward_assist, next_ob_assist, done, importance, indices = self.sample_exp()
        
        ob_user = ob_user[:4]
        action_user = action_user[:4]
        reward_user = reward_user[:4]
        
        ob_assist = ob_assist[:4]
        action_assist = action_assist[:4]
        reward_assist = reward_assist[:4]
        
        print(action_user, action_assist)
        
        print(self.user_model(ob_user))
        print(self.assist_model([ob_assist, np.vstack(tuple([self.env_cell_mapping]*4))]))
        
        print(self.model([ob_user, ob_assist, np.vstack(tuple([self.env_cell_mapping]*4)), action_user, action_assist]))
    
    def exp_policy_user(self, state, next_action = False):
        if np.random.rand()<self.epsilon:
            return np.random.randint(4)
        else:
            state = np.array(state)[np.newaxis]
            if next_action == False:
                Q_values = self.user_model(state)
            else:
                Q_values = self.target_user_model(state)
            return np.argmax(Q_values[0])
    
    def exp_policy_assist(self, state):
        if np.random.rand()<self.epsilon:
            return np.random.randint(1,5)
        else:
            state = np.array(state)[np.newaxis]
            Q_values = self.assist_model([state, self.env_cell_mapping])
            return np.argmax(Q_values[0])+1
    
    def step(self, ob_user, prev_steps_assist):
        curr_loc = ob_user[:2]
        target_loc = ob_user[2:4]
        
        action_user = self.exp_policy_user(ob_user)
        action_user_one_hot = make_one_hot(action_user, 4)
        
        ob_assist = [action_user_one_hot + ob_user[:2]]
        ob_assist = prev_steps_assist + ob_assist 
        action_assist = self.exp_policy_assist(ob_assist)
        
        new_loc, reward_user, reward_assist, done = self.env.step(action_user, action_assist, target_loc, curr_loc)
        
        next_ob_user = new_loc[:]
        next_ob_user = next_ob_user + target_loc
        
        next_action_user = self.exp_policy_user(next_ob_user, next_action = True)
        next_action_user_one_hot = make_one_hot(next_action_user, 4)
        next_ob_assist = [next_action_user_one_hot + next_ob_user[:2]]
        next_ob_assist = ob_assist[1:] + next_ob_assist
        
        self.add_replay_buffer(ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                          action_assist-1, reward_assist, next_ob_assist, done)
        
        return next_ob_user, ob_assist[1:], reward_user, reward_assist, done 
        
        
    
    def add_replay_buffer(self, ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                         action_assist, reward_assist, next_ob_assist, done):
        
        self.replay_buffer.ob_user_history.append(ob_user)
        self.replay_buffer.action_user_history.append(action_user)
        self.replay_buffer.reward_user_history.append(reward_user)
        self.replay_buffer.next_ob_user_history.append(next_ob_user)
        self.replay_buffer.ob_assist_history.append(ob_assist)
        self.replay_buffer.action_assist_history.append(action_assist)
        self.replay_buffer.reward_assist_history.append(reward_assist)
        self.replay_buffer.next_ob_assist_history.append(next_ob_assist)
        self.replay_buffer.done_history.append(done)
        self.replay_buffer.priorities.append(self.replay_buffer.max_val)
    
    def sample_exp(self):
        sample_probs = self.replay_buffer.get_probabilities(priority_scale = 0.7)
        indices = np.random.choice(len(self.replay_buffer.done_history), size = self.batch_size, p = sample_probs)
        importance = self.replay_buffer.get_importance(sample_probs[indices])
        
        ob_user = np.array([self.replay_buffer.ob_user_history[i] for i in indices])
        action_user = np.array([self.replay_buffer.action_user_history[i] for i in indices])
        reward_user = np.array([self.replay_buffer.reward_user_history[i] for i in indices])
        next_ob_user = np.array([self.replay_buffer.next_ob_user_history[i] for i in indices])
        ob_assist = np.array([self.replay_buffer.ob_assist_history[i] for i in indices])
        action_assist = np.array([self.replay_buffer.action_assist_history[i] for i in indices])
        reward_assist = np.array([self.replay_buffer.reward_assist_history[i] for i in indices])
        next_ob_assist = np.array([self.replay_buffer.next_ob_assist_history[i] for i in indices])
        done = np.array([self.replay_buffer.done_history[i] for i in indices])
        
        return ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist, reward_assist, next_ob_assist, done,\
    importance, indices 
    
    def train(self):
        ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
        reward_assist, next_ob_assist, done, importance, indices = self.sample_exp()
        
        input_A = ob_user
        input_B = ob_assist
        input_C = np.vstack(tuple([self.env_cell_mapping]*128))
        
        rewards = reward_user + reward_assist
        
        next_Q_values_user, next_Q_values_assist = self.user_model(next_ob_user), self.assist_model([next_ob_assist, input_C])
        best_next_actions_user, best_next_actions_assist = tf.math.argmax(next_Q_values_user, axis = 1), tf.math.argmax(next_Q_values_assist, axis = 1)
        next_Q_values_user, next_Q_values_assist = self.target_user_model(next_ob_user), self.target_assist_model([next_ob_assist, input_C])
        
        best_next_Q_values_user = tf.reduce_sum(next_Q_values_user*tf.one_hot(best_next_actions_user, 4), axis = 1)
        best_next_Q_values_assist = tf.reduce_sum(next_Q_values_user*tf.one_hot(best_next_actions_assist, 4), axis = 1)
        best_next_Q_values = best_next_Q_values_user + best_next_Q_values_assist
        
        target_Q_values = rewards + (1-done)*self.gamma*best_next_Q_values
        
        with tf.GradientTape() as tape:
            Q_values = self.model([input_A, input_B, input_C, action_user, action_assist])
            error = tf.multiply(self.loss_fn(target_Q_values, Q_values), importance**(1-self.epsilon))
            loss = tf.reduce_mean(error)
        
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        self.replay_buffer.set_priorities(indices, error)
        
        
#         Q_values_assist = self.assist_model([input_B, input_C])
#         with tf.GradientTape() as tape:
#             Q_values_user = self.user_model(input_A)
#             error = tf.multiply(self.loss_fn(Q_values_assist, Q_values_user), importance**(1-self.epsilon))
#             loss = tf.reduce_mean(error)
            
#         grads = tape.gradient(loss, self.user_model.trainable_variables)
#         self.optimizer.apply_gradients(zip(grads, self.user_model.trainable_variables))
            
#         with tf.GradientTape() as tape:
#             Q_values_assist = self.assist_model([input_B, input_C])
#             error = tf.multiply(self.loss_fn(Q_values_assist, Q_values_user), importance**(1-self.epsilon))
#             loss = tf.reduce_mean(error)
            
#         grads = tape.gradient(loss, self.assist_model.trainable_variables)
#         self.optimizer.apply_gradients(zip(grads, self.assist_model.trainable_variables))                          

        

In [4]:
steps = 4
model = AI_Design(steps)
env = model.env

# if os.path.exists('user_model.h5'):
#     model.user_model = tf.keras.models.load_model('user_model.h5')
#     model.assist_model = tf.keras.models.load_model('assist_model.h5')

Icon Locations:
[[0.8 0.2]
 [0.6 0. ]
 [0.  0.9]
 [0.4 0.9]
 [0.1 0.9]
 [0.1 0.3]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
subtract (

In [5]:
def give_prev_steps(prev_steps_assist, steps):
    prev_steps_assist = [[0,0,0,0,-1,-1] for i in range(steps-1)]
    return prev_steps_assist

In [6]:
max_steps = 40
reached = 0
reached_history = []
max_reached = 0

running_reward = 0

for epoch in tqdm(range(100000)):
    done = False
    episode_reward = 0
    start, dest = env.give_start_dest()
    ob_user = [start[0], start[1], dest[0], dest[1]]
    prev_steps_assist = []
    prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
    step = 0
    
    while not done and step<max_steps:
        ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
        episode_reward+=reward_user
        step+=1
        if done:
            reached+=1
    
    if epoch:
        running_reward = 0.01 * episode_reward + (1 - 0.01) * running_reward
    else:
        running_reward = episode_reward
        
    if epoch>50:
        model.train()
        
        if epoch%100==0:
            model.target_user_model.set_weights(model.user_model.get_weights())
            model.target_assist_model.set_weights(model.assist_model.get_weights())
            reached_history.append(reached)
            rewards = []
            
            if reached>max_reached:
                print(reached)
                print('Saved Weights')
                max_reached = reached
                model.user_model.save('user_model.h5')
                model.assist_model.save('assist_model.h5')
                
#                 if reached == 100:
#                     print('Environment solved..........\nStopping training...........')
#                     break
                
            reached = 0
            
            if epoch%500==0:
                print(f'Running reward = {running_reward}')
                print(f'Successful runs = {np.mean(reached_history)}')
                reached_history = []
                
                if epoch%1000==0:
                    model.epsilon-=0.01
                    model.epsilon= max(model.epsilon, 0.1)
                    
                    if epoch%20000==0:
                        model.infer()

  0%|                                                                           | 101/100000 [00:13<5:03:04,  5.49it/s]

15
Saved Weights


  0%|▏                                                                          | 301/100000 [00:46<5:31:52,  5.01it/s]

23
Saved Weights


  1%|▍                                                                          | 502/100000 [01:23<4:48:14,  5.75it/s]

Running reward = -34.17803085487519
Successful runs = 17.0


  1%|▋                                                                         | 1002/100000 [02:53<4:53:27,  5.62it/s]

Running reward = -34.42998513642788
Successful runs = 15.6


  2%|█                                                                         | 1501/100000 [04:24<4:51:47,  5.63it/s]

Running reward = -35.84987008915627
Successful runs = 15.8


  2%|█▎                                                                        | 1801/100000 [05:22<6:21:08,  4.29it/s]

25
Saved Weights


  2%|█▍                                                                        | 2002/100000 [06:02<5:24:08,  5.04it/s]

Running reward = -34.85277553237351
Successful runs = 19.4


  3%|█▊                                                                        | 2501/100000 [07:46<5:09:56,  5.24it/s]

Running reward = -35.933659223624254
Successful runs = 16.4


  3%|██▏                                                                       | 3001/100000 [09:18<5:14:48,  5.14it/s]

Running reward = -34.70447342304754
Successful runs = 19.8


  4%|██▌                                                                       | 3501/100000 [11:07<5:51:31,  4.58it/s]

Running reward = -34.69253496357203
Successful runs = 14.8


  4%|██▉                                                                       | 4002/100000 [12:48<5:02:30,  5.29it/s]

Running reward = -34.674211177916256
Successful runs = 16.2


  5%|███▎                                                                      | 4502/100000 [14:29<5:12:49,  5.09it/s]

Running reward = -35.41240881166094
Successful runs = 15.6


  5%|███▋                                                                      | 5001/100000 [16:09<5:05:56,  5.18it/s]

Running reward = -35.00296998722375
Successful runs = 14.4


  5%|███▊                                                                      | 5101/100000 [16:29<5:20:21,  4.94it/s]

28
Saved Weights


  6%|████                                                                      | 5501/100000 [18:02<5:20:28,  4.91it/s]

Running reward = -34.260124855201
Successful runs = 20.2


  6%|████▍                                                                     | 6001/100000 [19:53<5:53:30,  4.43it/s]

Running reward = -35.10398233815168
Successful runs = 17.2


  7%|████▊                                                                     | 6501/100000 [21:40<5:25:36,  4.79it/s]

Running reward = -34.07354821546504
Successful runs = 16.0


  7%|█████▏                                                                    | 7001/100000 [23:28<6:09:49,  4.19it/s]

Running reward = -33.13406150086733
Successful runs = 18.6


  8%|█████▌                                                                    | 7501/100000 [25:26<9:04:51,  2.83it/s]

Running reward = -35.0393304432977
Successful runs = 16.4


  8%|█████▉                                                                    | 8001/100000 [27:23<5:37:35,  4.54it/s]

Running reward = -33.13269545581268
Successful runs = 21.0


  9%|██████▎                                                                   | 8501/100000 [29:18<6:48:55,  3.73it/s]

Running reward = -34.36326052532864
Successful runs = 18.8


  9%|██████▋                                                                   | 9001/100000 [31:20<6:38:40,  3.80it/s]

Running reward = -33.325249860504826
Successful runs = 21.4


  9%|██████▋                                                                   | 9101/100000 [31:45<6:30:25,  3.88it/s]

30
Saved Weights


 10%|███████                                                                   | 9501/100000 [33:22<6:03:02,  4.15it/s]

Running reward = -34.7067528018826
Successful runs = 22.8


 10%|███████▎                                                                  | 9901/100000 [34:57<6:28:37,  3.86it/s]

32
Saved Weights


 10%|███████▎                                                                 | 10001/100000 [35:20<5:53:27,  4.24it/s]

Running reward = -33.16790588323962
Successful runs = 21.8


 11%|███████▋                                                                 | 10501/100000 [37:28<6:00:29,  4.14it/s]

Running reward = -32.51853710630422
Successful runs = 20.4


 11%|████████                                                                 | 11001/100000 [39:32<5:41:44,  4.34it/s]

Running reward = -33.04838935856911
Successful runs = 20.4


 12%|████████▍                                                                | 11501/100000 [41:36<5:50:34,  4.21it/s]

Running reward = -33.773565767364076
Successful runs = 20.2


 12%|████████▊                                                                | 12001/100000 [43:42<5:48:53,  4.20it/s]

Running reward = -32.165714328858016
Successful runs = 20.8


 13%|█████████▏                                                               | 12501/100000 [45:46<6:13:42,  3.90it/s]

Running reward = -34.60602560263235
Successful runs = 21.2


 13%|█████████▍                                                               | 13001/100000 [48:01<9:47:25,  2.47it/s]

Running reward = -34.400459277634894
Successful runs = 19.2


 14%|█████████▊                                                               | 13501/100000 [50:14<7:08:54,  3.36it/s]

Running reward = -34.36541907196142
Successful runs = 18.0


 14%|██████████▏                                                              | 14001/100000 [52:32<7:20:42,  3.25it/s]

Running reward = -33.49815804442353
Successful runs = 23.2


 15%|██████████▌                                                              | 14502/100000 [54:50<5:22:02,  4.42it/s]

Running reward = -32.18752632700467
Successful runs = 23.6


 15%|██████████▉                                                              | 15001/100000 [57:09<6:04:03,  3.89it/s]

Running reward = -32.52891528368874
Successful runs = 21.2


 16%|███████████▎                                                             | 15501/100000 [59:18<6:39:53,  3.52it/s]

Running reward = -33.05312505160015
Successful runs = 25.0


 16%|███████████▎                                                           | 16001/100000 [1:01:29<5:35:03,  4.18it/s]

Running reward = -32.16314052678773
Successful runs = 23.4


 17%|███████████▋                                                           | 16501/100000 [1:03:40<6:43:34,  3.45it/s]

Running reward = -32.17659428925091
Successful runs = 27.0


 17%|████████████                                                           | 17001/100000 [1:05:53<5:44:14,  4.02it/s]

Running reward = -31.781277555181386
Successful runs = 24.6


 18%|████████████▍                                                          | 17501/100000 [1:08:09<6:28:36,  3.54it/s]

Running reward = -33.15616941354714
Successful runs = 23.4


 18%|████████████▊                                                          | 18001/100000 [1:10:25<6:37:23,  3.44it/s]

Running reward = -32.75810608892259
Successful runs = 25.2


 19%|█████████████▏                                                         | 18501/100000 [1:12:43<6:39:08,  3.40it/s]

Running reward = -32.839256121209225
Successful runs = 25.8


 19%|█████████████▍                                                         | 19001/100000 [1:15:00<6:22:14,  3.53it/s]

Running reward = -31.18974352898065
Successful runs = 26.8


 20%|█████████████▊                                                         | 19501/100000 [1:17:21<6:42:06,  3.34it/s]

Running reward = -31.31429331661048
Successful runs = 26.8


 20%|██████████████▏                                                        | 20000/100000 [1:19:41<5:50:07,  3.81it/s]

Running reward = -29.82214955495106
Successful runs = 25.8
[2 0 0 2] [1 2 2 0]
tf.Tensor(
[[-0.16703847 -0.1708968  -0.15984784 -0.1640141 ]
 [-0.1454617  -0.1311161  -0.147087   -0.17245401]
 [-0.1642996  -0.15487668 -0.14427781 -0.16312596]
 [-0.11788633 -0.1423415  -0.14607583 -0.15074435]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-1.8271451 -1.8089656 -1.813582  -1.8156983]
 [-1.8086891 -1.8200716 -1.7901646 -1.8073187]
 [-1.8138925 -1.8164631 -1.8025115 -1.8159007]
 [-1.8037641 -1.8095852 -1.793772  -1.7989081]], shape=(4, 4), dtype=float32)


 20%|██████████████▏                                                        | 20001/100000 [1:19:42<7:45:51,  2.86it/s]

tf.Tensor(
[[-1.9688134]
 [-1.9356263]
 [-1.9668111]
 [-1.94984  ]], shape=(4, 1), dtype=float32)


 21%|██████████████▌                                                        | 20501/100000 [1:22:04<6:41:06,  3.30it/s]

Running reward = -31.93898739655219
Successful runs = 27.6


 21%|██████████████▉                                                        | 21001/100000 [1:24:27<6:19:03,  3.47it/s]

Running reward = -30.807659205931696
Successful runs = 26.8


 22%|███████████████▎                                                       | 21501/100000 [1:26:55<6:54:07,  3.16it/s]

Running reward = -32.82246419170394
Successful runs = 25.8


 22%|███████████████▍                                                       | 21701/100000 [1:27:53<6:10:04,  3.53it/s]

35
Saved Weights


 22%|███████████████▌                                                       | 22001/100000 [1:29:21<6:29:52,  3.33it/s]

Running reward = -30.122157928818925
Successful runs = 30.0


 23%|███████████████▉                                                       | 22501/100000 [1:31:50<6:31:10,  3.30it/s]

Running reward = -30.731612708958075
Successful runs = 28.2


 23%|████████████████                                                       | 22601/100000 [1:32:20<7:18:30,  2.94it/s]

36
Saved Weights


 23%|████████████████▎                                                      | 22901/100000 [1:33:48<5:37:39,  3.81it/s]

43
Saved Weights


 23%|████████████████▎                                                      | 23001/100000 [1:34:18<6:01:21,  3.55it/s]

Running reward = -29.994455395034123
Successful runs = 31.0


 24%|████████████████▋                                                      | 23501/100000 [1:36:50<6:46:22,  3.14it/s]

Running reward = -32.04504099914378
Successful runs = 25.8


 24%|█████████████████                                                      | 24001/100000 [1:39:23<6:35:24,  3.20it/s]

Running reward = -31.86576048119481
Successful runs = 28.2


 25%|█████████████████▍                                                     | 24501/100000 [1:41:57<6:56:55,  3.02it/s]

Running reward = -31.056865121805135
Successful runs = 27.8


 25%|█████████████████▊                                                     | 25002/100000 [1:44:31<5:34:19,  3.74it/s]

Running reward = -31.704350043125135
Successful runs = 29.4


 26%|██████████████████                                                     | 25501/100000 [1:47:05<5:52:50,  3.52it/s]

Running reward = -28.10295877893209
Successful runs = 31.2


 26%|██████████████████▍                                                    | 26001/100000 [1:49:40<6:29:55,  3.16it/s]

Running reward = -29.04914398888893
Successful runs = 31.8


 27%|██████████████████▊                                                    | 26501/100000 [1:52:20<6:14:11,  3.27it/s]

Running reward = -31.33448960633653
Successful runs = 29.4


 27%|███████████████████▏                                                   | 27001/100000 [1:54:59<6:31:30,  3.11it/s]

Running reward = -30.666345026600087
Successful runs = 27.2


 28%|███████████████████▌                                                   | 27501/100000 [1:57:39<6:51:41,  2.93it/s]

Running reward = -29.996838582197896
Successful runs = 33.8


 28%|███████████████████▉                                                   | 28001/100000 [2:00:21<6:57:01,  2.88it/s]

Running reward = -31.268056024613514
Successful runs = 29.2


 29%|████████████████████▏                                                  | 28501/100000 [2:03:03<6:04:52,  3.27it/s]

Running reward = -27.21647868548561
Successful runs = 33.6


 29%|████████████████████▌                                                  | 29001/100000 [2:05:54<6:17:14,  3.14it/s]

Running reward = -27.147283980751233
Successful runs = 33.4


 30%|████████████████████▉                                                  | 29501/100000 [2:08:57<6:56:04,  2.82it/s]

Running reward = -28.86089132818974
Successful runs = 31.2


 30%|█████████████████████▎                                                 | 30001/100000 [2:12:17<6:59:48,  2.78it/s]

Running reward = -28.7639270748216
Successful runs = 33.6


 31%|█████████████████████▋                                                 | 30501/100000 [2:15:18<6:47:34,  2.84it/s]

Running reward = -31.582671016198532
Successful runs = 29.0


 31%|██████████████████████                                                 | 31001/100000 [2:18:15<6:29:08,  2.96it/s]

Running reward = -29.83615401085582
Successful runs = 31.6


 31%|██████████████████████▏                                                | 31201/100000 [2:19:23<6:57:35,  2.75it/s]

44
Saved Weights


 32%|██████████████████████▎                                                | 31501/100000 [2:21:06<6:34:51,  2.89it/s]

Running reward = -28.250153665060125
Successful runs = 39.0


 32%|██████████████████████▋                                                | 32001/100000 [2:23:58<6:18:07,  3.00it/s]

Running reward = -29.760050472434735
Successful runs = 30.8


 33%|███████████████████████                                                | 32501/100000 [2:26:53<5:49:48,  3.22it/s]

Running reward = -26.74684454109222
Successful runs = 38.6


 33%|███████████████████████▍                                               | 33001/100000 [2:29:44<6:22:38,  2.92it/s]

Running reward = -27.313156769265838
Successful runs = 39.2


 33%|███████████████████████▋                                               | 33401/100000 [2:32:04<7:04:32,  2.61it/s]

45
Saved Weights


 34%|███████████████████████▊                                               | 33501/100000 [2:32:40<6:21:33,  2.90it/s]

Running reward = -27.336200694705266
Successful runs = 39.4


 34%|████████████████████████                                               | 33901/100000 [2:35:01<5:52:44,  3.12it/s]

47
Saved Weights


 34%|████████████████████████▏                                              | 34001/100000 [2:35:37<7:14:21,  2.53it/s]

Running reward = -27.292039822876802
Successful runs = 36.8


 35%|████████████████████████▍                                              | 34501/100000 [2:38:36<6:30:56,  2.79it/s]

Running reward = -28.80740240606774
Successful runs = 32.0


 35%|████████████████████████▊                                              | 35001/100000 [2:41:32<6:04:12,  2.97it/s]

Running reward = -27.374666200561443
Successful runs = 38.8


 36%|█████████████████████████▏                                             | 35501/100000 [2:44:32<6:20:58,  2.82it/s]

Running reward = -27.590624251609764
Successful runs = 36.2


 36%|█████████████████████████▎                                             | 35701/100000 [2:45:41<6:39:32,  2.68it/s]

48
Saved Weights


 36%|█████████████████████████▌                                             | 36001/100000 [2:47:30<6:27:09,  2.76it/s]

Running reward = -28.064550474801628
Successful runs = 38.8


 37%|█████████████████████████▉                                             | 36501/100000 [2:50:32<6:32:12,  2.70it/s]

Running reward = -28.320821384272843
Successful runs = 36.4


 37%|██████████████████████████▎                                            | 37001/100000 [2:53:36<6:51:17,  2.55it/s]

Running reward = -27.723656342884937
Successful runs = 35.6


 38%|██████████████████████████▋                                            | 37501/100000 [2:56:42<6:12:00,  2.80it/s]

Running reward = -29.45180117596059
Successful runs = 36.2


 38%|██████████████████████████▉                                            | 38001/100000 [2:59:46<7:03:43,  2.44it/s]

Running reward = -27.44821022613831
Successful runs = 34.0


 39%|███████████████████████████▎                                           | 38501/100000 [3:02:52<6:13:05,  2.75it/s]

Running reward = -26.548155927337156
Successful runs = 38.0


 39%|███████████████████████████▌                                           | 38801/100000 [3:04:44<5:55:54,  2.87it/s]

54
Saved Weights


 39%|███████████████████████████▋                                           | 39001/100000 [3:06:00<4:21:54,  3.88it/s]

Running reward = -28.34368437132733
Successful runs = 38.2


 40%|████████████████████████████                                           | 39501/100000 [3:09:05<7:07:57,  2.36it/s]

Running reward = -24.813145532626198
Successful runs = 43.2


 40%|████████████████████████████▍                                          | 40001/100000 [3:12:10<5:51:50,  2.84it/s]

Running reward = -25.415551909023822
Successful runs = 44.2
[1 1 1 2] [2 1 1 3]
tf.Tensor(
[[-0.1403343  -0.12591733 -0.1494827  -0.14715053]
 [-0.16227166 -0.15126342 -0.15839252 -0.1633623 ]
 [-0.14295094 -0.13799608 -0.16257536 -0.15700376]
 [-0.13892831 -0.14233667 -0.13813272 -0.15180334]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-1.6090894 -1.6152655 -1.6382064 -1.656887 ]
 [-1.598826  -1.6136768 -1.6269821 -1.650974 ]
 [-1.6040488 -1.627249  -1.6444639 -1.6694   ]
 [-1.5821049 -1.5919217 -1.6132181 -1.6367903]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-1.7641237]
 [-1.7649403]
 [-1.7652451]
 [-1.774923 ]], shape=(4, 1), dtype=float32)


 41%|████████████████████████████▊                                          | 40501/100000 [3:15:24<4:33:55,  3.62it/s]

Running reward = -27.843232979663377
Successful runs = 39.2


 41%|█████████████████████████████                                          | 41001/100000 [3:18:30<6:17:38,  2.60it/s]

Running reward = -25.874687932820777
Successful runs = 43.2


 42%|█████████████████████████████▍                                         | 41501/100000 [3:21:47<6:24:13,  2.54it/s]

Running reward = -30.551217637775313
Successful runs = 33.4


 42%|█████████████████████████████▊                                         | 42001/100000 [3:24:59<6:06:41,  2.64it/s]

Running reward = -26.72092154640795
Successful runs = 40.8


 43%|██████████████████████████████▏                                        | 42501/100000 [3:28:13<5:00:07,  3.19it/s]

Running reward = -25.29825949738717
Successful runs = 38.8


 43%|██████████████████████████████▌                                        | 43001/100000 [3:31:29<5:10:45,  3.06it/s]

Running reward = -26.714745980335433
Successful runs = 38.2


 43%|██████████████████████████████▋                                        | 43201/100000 [3:32:44<6:10:27,  2.56it/s]

56
Saved Weights


 43%|██████████████████████████████▋                                        | 43301/100000 [3:33:21<6:14:46,  2.52it/s]

57
Saved Weights


 44%|██████████████████████████████▉                                        | 43501/100000 [3:34:38<5:19:09,  2.95it/s]

61
Saved Weights
Running reward = -21.43232392847782
Successful runs = 50.0


 44%|███████████████████████████████▏                                       | 44001/100000 [3:37:54<6:40:08,  2.33it/s]

Running reward = -24.56513695822336
Successful runs = 46.4


 45%|███████████████████████████████▌                                       | 44501/100000 [3:41:49<5:52:16,  2.63it/s]

Running reward = -23.284848180202125
Successful runs = 45.4


 45%|███████████████████████████████▉                                       | 45001/100000 [3:45:18<9:34:33,  1.60it/s]

Running reward = -23.41257593799368
Successful runs = 52.2


 46%|███████████████████████████████▊                                      | 45501/100000 [3:49:19<14:58:41,  1.01it/s]

Running reward = -25.99499425428683
Successful runs = 51.0


 46%|████████████████████████████████▋                                      | 46001/100000 [3:53:01<5:07:32,  2.93it/s]

Running reward = -24.82119504266559
Successful runs = 48.8


 46%|████████████████████████████████▊                                      | 46301/100000 [3:55:03<6:57:10,  2.15it/s]

66
Saved Weights


 46%|████████████████████████████████▉                                      | 46401/100000 [3:55:40<5:22:38,  2.77it/s]

68
Saved Weights


 47%|█████████████████████████████████                                      | 46501/100000 [3:56:15<5:18:15,  2.80it/s]

Running reward = -16.60874206156899
Successful runs = 59.8


 47%|█████████████████████████████████▎                                     | 47001/100000 [3:59:53<6:02:17,  2.44it/s]

Running reward = -24.017873647104626
Successful runs = 50.6


 48%|█████████████████████████████████▋                                     | 47501/100000 [4:03:15<6:43:06,  2.17it/s]

Running reward = -25.116962254721507
Successful runs = 47.2


 48%|██████████████████████████████████                                     | 48001/100000 [4:07:04<5:29:06,  2.63it/s]

Running reward = -22.60262947219683
Successful runs = 49.0


 49%|██████████████████████████████████▍                                    | 48501/100000 [4:11:01<6:26:32,  2.22it/s]

Running reward = -24.77262154219587
Successful runs = 48.0


 49%|██████████████████████████████████▊                                    | 49001/100000 [4:15:32<7:32:26,  1.88it/s]

Running reward = -21.806628510029856
Successful runs = 47.8


 50%|███████████████████████████████████▏                                   | 49501/100000 [4:19:55<8:45:25,  1.60it/s]

Running reward = -21.23792532554755
Successful runs = 52.2


 50%|███████████████████████████████████▍                                   | 49901/100000 [4:22:55<5:47:49,  2.40it/s]

72
Saved Weights


 50%|███████████████████████████████████▌                                   | 50001/100000 [4:23:33<5:55:13,  2.35it/s]

Running reward = -16.695865887277247
Successful runs = 63.8


 51%|███████████████████████████████████▊                                   | 50501/100000 [4:27:09<7:25:12,  1.85it/s]

Running reward = -21.017881196285003
Successful runs = 55.4


 51%|████████████████████████████████████▏                                  | 51001/100000 [4:30:31<5:34:19,  2.44it/s]

Running reward = -20.26420666064656
Successful runs = 57.2


 52%|████████████████████████████████████▌                                  | 51501/100000 [4:33:57<5:19:16,  2.53it/s]

Running reward = -20.50530885774312
Successful runs = 51.8


 52%|████████████████████████████████████▉                                  | 52001/100000 [4:37:16<5:10:49,  2.57it/s]

Running reward = -21.89027472663768
Successful runs = 59.4


 53%|█████████████████████████████████████▎                                 | 52501/100000 [4:40:40<5:48:59,  2.27it/s]

Running reward = -21.477073734395848
Successful runs = 55.6


 53%|█████████████████████████████████████▍                                 | 52701/100000 [4:41:57<6:04:04,  2.17it/s]

74
Saved Weights


 53%|█████████████████████████████████████▋                                 | 53001/100000 [4:43:56<6:06:21,  2.14it/s]

Running reward = -20.79597499286324
Successful runs = 66.0


 54%|█████████████████████████████████████▉                                 | 53501/100000 [4:47:02<3:54:40,  3.30it/s]

85
Saved Weights
Running reward = -10.087063305953558
Successful runs = 70.6


 54%|██████████████████████████████████████                                 | 53601/100000 [4:47:34<4:47:53,  2.69it/s]

93
Saved Weights


 54%|██████████████████████████████████████▎                                | 54001/100000 [4:50:16<5:49:20,  2.19it/s]

Running reward = -22.562112048179255
Successful runs = 67.2


 55%|██████████████████████████████████████▋                                | 54501/100000 [4:53:56<5:58:05,  2.12it/s]

Running reward = -24.693793616260546
Successful runs = 47.8


 55%|███████████████████████████████████████                                | 55001/100000 [4:57:33<6:40:17,  1.87it/s]

Running reward = -23.700466164578042
Successful runs = 50.0


 56%|███████████████████████████████████████▍                               | 55501/100000 [5:01:14<5:31:29,  2.24it/s]

Running reward = -21.85742195997593
Successful runs = 48.8


 56%|███████████████████████████████████████▊                               | 56001/100000 [5:04:48<5:50:23,  2.09it/s]

Running reward = -23.51880922311415
Successful runs = 53.4


 57%|████████████████████████████████████████                               | 56501/100000 [5:08:29<4:51:53,  2.48it/s]

Running reward = -24.088456179012464
Successful runs = 45.4


 57%|████████████████████████████████████████▍                              | 57001/100000 [5:12:06<4:56:58,  2.41it/s]

Running reward = -19.379253215532174
Successful runs = 56.0


 58%|████████████████████████████████████████▊                              | 57501/100000 [5:15:36<3:38:03,  3.25it/s]

Running reward = -17.637031077056303
Successful runs = 63.0


 58%|█████████████████████████████████████████▏                             | 58001/100000 [5:19:07<5:00:22,  2.33it/s]

Running reward = -21.423665452252134
Successful runs = 59.2


 59%|█████████████████████████████████████████▌                             | 58501/100000 [5:22:51<4:12:01,  2.74it/s]

Running reward = -23.612249251007142
Successful runs = 53.4


 59%|█████████████████████████████████████████▉                             | 59001/100000 [5:26:23<5:08:49,  2.21it/s]

Running reward = -18.138487694689164
Successful runs = 56.6


 60%|██████████████████████████████████████████▏                            | 59501/100000 [5:29:45<4:44:00,  2.38it/s]

Running reward = -14.459929044783495
Successful runs = 71.2


 60%|██████████████████████████████████████████▌                            | 60001/100000 [5:33:07<5:26:26,  2.04it/s]

Running reward = -17.341586008929678
Successful runs = 68.6
[1 3 1 0] [2 3 1 0]
tf.Tensor(
[[-0.16119419 -0.14377329 -0.16903807 -0.15570273]
 [-0.15606672 -0.1462805  -0.17137994 -0.14086647]
 [-0.18133676 -0.16396524 -0.17978734 -0.17322877]
 [-0.12973975 -0.13534856 -0.14283356 -0.14154392]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-1.406407  -1.4136757 -1.4329478 -1.4206572]
 [-1.3937368 -1.4076209 -1.4067848 -1.4115441]
 [-1.4641457 -1.4464445 -1.4623352 -1.4571202]
 [-1.3887475 -1.4075278 -1.4249762 -1.4168496]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-1.5767211]
 [-1.5524106]
 [-1.6104097]
 [-1.5184872]], shape=(4, 1), dtype=float32)


 61%|██████████████████████████████████████████▉                            | 60501/100000 [5:36:37<4:48:40,  2.28it/s]

Running reward = -17.19925822050974
Successful runs = 64.0


 61%|███████████████████████████████████████████▎                           | 61001/100000 [5:40:08<3:43:28,  2.91it/s]

Running reward = -16.828014630813303
Successful runs = 63.8


 62%|███████████████████████████████████████████▋                           | 61501/100000 [5:43:39<3:17:53,  3.24it/s]

Running reward = -16.398206646200105
Successful runs = 65.8


 62%|████████████████████████████████████████████                           | 62001/100000 [5:47:13<4:20:05,  2.43it/s]

Running reward = -17.882956771454005
Successful runs = 64.8


 63%|████████████████████████████████████████████▍                          | 62501/100000 [5:50:14<3:08:52,  3.31it/s]

Running reward = -7.41351030542832
Successful runs = 82.6


 63%|████████████████████████████████████████████▌                          | 62701/100000 [5:51:19<3:57:50,  2.61it/s]

95
Saved Weights


 63%|████████████████████████████████████████████▋                          | 63001/100000 [5:53:32<5:37:31,  1.83it/s]

Running reward = -21.45203815315483
Successful runs = 72.0


 64%|█████████████████████████████████████████████                          | 63501/100000 [5:57:29<4:22:44,  2.32it/s]

Running reward = -23.951595485587074
Successful runs = 47.6


 64%|█████████████████████████████████████████████▍                         | 64001/100000 [6:01:32<4:57:31,  2.02it/s]

Running reward = -23.95089279706536
Successful runs = 43.6


 65%|█████████████████████████████████████████████▊                         | 64501/100000 [6:05:26<5:34:34,  1.77it/s]

Running reward = -20.42072572304859
Successful runs = 49.8


 65%|██████████████████████████████████████████████▏                        | 65001/100000 [6:09:06<5:11:19,  1.87it/s]

Running reward = -18.785111000629655
Successful runs = 59.2


 66%|██████████████████████████████████████████████▌                        | 65501/100000 [6:12:36<5:23:30,  1.78it/s]

Running reward = -14.101668580442421
Successful runs = 71.8


 66%|██████████████████████████████████████████████▊                        | 66001/100000 [6:16:03<3:50:51,  2.45it/s]

Running reward = -14.742324892105808
Successful runs = 73.8


 67%|███████████████████████████████████████████████▏                       | 66501/100000 [6:19:40<5:01:59,  1.85it/s]

Running reward = -19.18605245654255
Successful runs = 63.6


 67%|███████████████████████████████████████████████▌                       | 67001/100000 [6:23:27<4:51:15,  1.89it/s]

Running reward = -17.73815605311502
Successful runs = 58.6


 68%|███████████████████████████████████████████████▉                       | 67501/100000 [6:27:15<2:36:49,  3.45it/s]

Running reward = -16.629509038685452
Successful runs = 62.4


 68%|████████████████████████████████████████████████▎                      | 68001/100000 [6:30:50<4:02:33,  2.20it/s]

Running reward = -14.622578431694246
Successful runs = 67.4


 69%|████████████████████████████████████████████████▋                      | 68501/100000 [6:34:20<4:56:52,  1.77it/s]

Running reward = -14.076053914256285
Successful runs = 71.4


 69%|████████████████████████████████████████████████▉                      | 69001/100000 [6:37:58<3:28:16,  2.48it/s]

Running reward = -14.30809488542295
Successful runs = 68.6


 70%|█████████████████████████████████████████████████▎                     | 69501/100000 [6:41:22<3:44:07,  2.27it/s]

Running reward = -12.09811107240315
Successful runs = 76.2


 70%|█████████████████████████████████████████████████▋                     | 70001/100000 [6:44:48<3:03:34,  2.72it/s]

Running reward = -9.879926436145745
Successful runs = 77.4


 71%|██████████████████████████████████████████████████                     | 70501/100000 [6:48:26<2:31:30,  3.25it/s]

Running reward = -12.237730871513241
Successful runs = 73.8


 71%|██████████████████████████████████████████████████▍                    | 71001/100000 [6:52:01<3:09:48,  2.55it/s]

Running reward = -13.492989150074207
Successful runs = 68.6


 72%|██████████████████████████████████████████████████▊                    | 71501/100000 [6:55:35<3:55:59,  2.01it/s]

Running reward = -12.495580979142794
Successful runs = 70.2


 72%|███████████████████████████████████████████████████                    | 72001/100000 [6:59:09<3:38:45,  2.13it/s]

Running reward = -13.256182966982601
Successful runs = 70.4


 73%|███████████████████████████████████████████████████▍                   | 72501/100000 [7:02:39<3:43:45,  2.05it/s]

Running reward = -12.87492128951141
Successful runs = 78.4


 73%|███████████████████████████████████████████████████▊                   | 73001/100000 [7:06:00<3:15:11,  2.31it/s]

Running reward = -8.049877351649878
Successful runs = 79.6


 74%|████████████████████████████████████████████████████▏                  | 73501/100000 [7:09:09<2:07:57,  3.45it/s]

Running reward = -6.494857851792762
Successful runs = 86.6


 74%|████████████████████████████████████████████████████▌                  | 74001/100000 [7:12:12<2:43:46,  2.65it/s]

Running reward = -7.983927378023082
Successful runs = 86.4


 75%|████████████████████████████████████████████████████▉                  | 74501/100000 [7:15:25<2:05:28,  3.39it/s]

Running reward = -6.073475408193768
Successful runs = 86.0


 75%|█████████████████████████████████████████████████████▎                 | 75001/100000 [7:18:42<3:22:32,  2.06it/s]

Running reward = -9.93702680935246
Successful runs = 86.8


 76%|█████████████████████████████████████████████████████▌                 | 75501/100000 [7:21:46<1:46:41,  3.83it/s]

Running reward = -5.297047496428056
Successful runs = 90.8


 76%|█████████████████████████████████████████████████████▋                 | 75701/100000 [7:23:02<3:16:02,  2.07it/s]

97
Saved Weights


 76%|█████████████████████████████████████████████████████▉                 | 76001/100000 [7:24:59<2:34:27,  2.59it/s]

Running reward = -9.359727344175026
Successful runs = 87.6


 77%|██████████████████████████████████████████████████████▎                | 76501/100000 [7:28:51<3:10:55,  2.05it/s]

Running reward = -14.595002524747818
Successful runs = 68.2


 77%|██████████████████████████████████████████████████████▋                | 77001/100000 [7:32:46<2:57:34,  2.16it/s]

Running reward = -15.1278980467055
Successful runs = 64.6


 78%|███████████████████████████████████████████████████████                | 77501/100000 [7:36:50<4:07:10,  1.52it/s]

Running reward = -19.63110507479743
Successful runs = 57.0


 78%|███████████████████████████████████████████████████████▍               | 78001/100000 [7:41:01<2:07:39,  2.87it/s]

Running reward = -17.09908642201872
Successful runs = 52.8


 79%|███████████████████████████████████████████████████████▋               | 78501/100000 [7:44:34<3:18:42,  1.80it/s]

Running reward = -10.336646001164931
Successful runs = 72.8


 79%|████████████████████████████████████████████████████████               | 79001/100000 [7:48:24<2:53:47,  2.01it/s]

Running reward = -13.665663936820927
Successful runs = 69.6


 80%|████████████████████████████████████████████████████████▍              | 79501/100000 [7:51:58<3:00:32,  1.89it/s]

Running reward = -10.127824182873034
Successful runs = 74.2


 80%|████████████████████████████████████████████████████████▊              | 80001/100000 [7:55:41<2:41:10,  2.07it/s]

Running reward = -13.611741757108799
Successful runs = 72.2
[2 3 0 1] [1 1 3 2]
tf.Tensor(
[[-0.18085265 -0.1750826  -0.18206169 -0.16656137]
 [-0.18085265 -0.1750826  -0.18206169 -0.16656137]
 [-0.17009251 -0.14662232 -0.16755632 -0.15091959]
 [-0.17396514 -0.13697058 -0.16435882 -0.1689245 ]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-1.313372  -1.2878864 -1.2926067 -1.3335558]
 [-1.332265  -1.2936351 -1.2988458 -1.3295223]
 [-1.2826493 -1.2908256 -1.2795595 -1.3183384]
 [-1.2876393 -1.2960981 -1.2811373 -1.3120767]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-1.469948 ]
 [-1.4601965]
 [-1.4884309]
 [-1.418108 ]], shape=(4, 1), dtype=float32)


 81%|█████████████████████████████████████████████████████████▏             | 80501/100000 [7:59:19<1:34:13,  3.45it/s]

Running reward = -7.546720592792909
Successful runs = 73.0


 81%|█████████████████████████████████████████████████████████▌             | 81001/100000 [8:02:30<2:02:04,  2.59it/s]

Running reward = -6.419420131195154
Successful runs = 81.2


 82%|█████████████████████████████████████████████████████████▊             | 81501/100000 [8:05:37<1:41:05,  3.05it/s]

Running reward = -5.86291303789965
Successful runs = 79.8


 82%|██████████████████████████████████████████████████████████▏            | 82001/100000 [8:08:55<2:25:57,  2.06it/s]

Running reward = -9.297361221759553
Successful runs = 79.0


 83%|██████████████████████████████████████████████████████████▌            | 82501/100000 [8:12:13<2:04:02,  2.35it/s]

Running reward = -7.854952536998438
Successful runs = 77.4


 83%|██████████████████████████████████████████████████████████▉            | 83001/100000 [8:15:36<2:54:24,  1.62it/s]

Running reward = -11.444250063614449
Successful runs = 75.6


 84%|███████████████████████████████████████████████████████████▎           | 83501/100000 [8:18:53<1:59:52,  2.29it/s]

Running reward = -7.266591318692644
Successful runs = 78.8


 84%|███████████████████████████████████████████████████████████▋           | 84001/100000 [8:22:02<3:05:30,  1.44it/s]

Running reward = -5.589623199745067
Successful runs = 80.8


 85%|███████████████████████████████████████████████████████████▉           | 84501/100000 [8:24:51<1:39:22,  2.60it/s]

Running reward = -3.3341574274834427
Successful runs = 87.0


 85%|████████████████████████████████████████████████████████████▎          | 85001/100000 [8:27:41<1:44:18,  2.40it/s]

Running reward = -1.7484371557068752
Successful runs = 89.6


 86%|████████████████████████████████████████████████████████████▋          | 85501/100000 [8:30:32<1:46:20,  2.27it/s]

Running reward = -1.8820214224781924
Successful runs = 91.2


 86%|█████████████████████████████████████████████████████████████          | 86001/100000 [8:33:12<1:13:21,  3.18it/s]

Running reward = -0.6132082015083153
Successful runs = 92.0


 87%|█████████████████████████████████████████████████████████████▍         | 86501/100000 [8:35:48<1:01:37,  3.65it/s]

100
Saved Weights
Running reward = 2.93051746199115
Successful runs = 95.0


 87%|█████████████████████████████████████████████████████████████▊         | 87001/100000 [8:38:49<1:04:34,  3.36it/s]

Running reward = -0.5823412623092646
Successful runs = 87.0


 88%|███████████████████████████████████████████████████████████████▉         | 87501/100000 [8:41:19<57:38,  3.61it/s]

Running reward = 1.7174866480175113
Successful runs = 96.4


 88%|██████████████████████████████████████████████████████████████▍        | 88001/100000 [8:44:01<1:01:48,  3.24it/s]

Running reward = -1.4586365983592129
Successful runs = 93.0


 89%|██████████████████████████████████████████████████████████████▊        | 88501/100000 [8:46:41<1:21:11,  2.36it/s]

Running reward = -2.1342003063412056
Successful runs = 93.0


 89%|███████████████████████████████████████████████████████████████▏       | 89001/100000 [8:49:25<1:18:47,  2.33it/s]

Running reward = -6.0952644604353114
Successful runs = 91.8


 90%|███████████████████████████████████████████████████████████████▌       | 89501/100000 [8:54:45<2:15:05,  1.30it/s]

Running reward = -30.88906022331909
Successful runs = 31.4


 90%|███████████████████████████████████████████████████████████████▉       | 90001/100000 [9:00:34<2:09:39,  1.29it/s]

Running reward = -34.61635604837537
Successful runs = 16.8


 91%|████████████████████████████████████████████████████████████████▎      | 90501/100000 [9:06:29<1:56:20,  1.36it/s]

Running reward = -35.441411475564465
Successful runs = 14.2


 91%|████████████████████████████████████████████████████████████████▌      | 91001/100000 [9:12:29<1:48:03,  1.39it/s]

Running reward = -33.950737100245995
Successful runs = 12.2


 92%|████████████████████████████████████████████████████████████████▉      | 91501/100000 [9:18:29<1:44:44,  1.35it/s]

Running reward = -34.2131220629884
Successful runs = 11.6


 92%|█████████████████████████████████████████████████████████████████▎     | 92001/100000 [9:24:23<1:39:32,  1.34it/s]

Running reward = -32.41067016809084
Successful runs = 16.0


 93%|█████████████████████████████████████████████████████████████████▋     | 92501/100000 [9:30:25<1:36:58,  1.29it/s]

Running reward = -36.95262893843019
Successful runs = 11.8


 93%|██████████████████████████████████████████████████████████████████     | 93001/100000 [9:36:15<1:29:37,  1.30it/s]

Running reward = -33.069014382442795
Successful runs = 20.6


 94%|██████████████████████████████████████████████████████████████████▍    | 93501/100000 [9:42:03<1:07:54,  1.59it/s]

Running reward = -31.979512723836024
Successful runs = 19.8


 94%|████████████████████████████████████████████████████████████████████▌    | 94001/100000 [9:47:47<39:46,  2.51it/s]

Running reward = -31.358667958160666
Successful runs = 23.0


 95%|███████████████████████████████████████████████████████████████████    | 94501/100000 [9:53:35<1:04:30,  1.42it/s]

Running reward = -33.307198200019215
Successful runs = 19.6


 95%|███████████████████████████████████████████████████████████████████▍   | 95001/100000 [9:59:25<1:01:27,  1.36it/s]

Running reward = -30.583835742401753
Successful runs = 18.2


 96%|████████████████████████████████████████████████████████████████████▊   | 95501/100000 [10:05:06<52:32,  1.43it/s]

Running reward = -30.391517927136753
Successful runs = 23.4


 96%|█████████████████████████████████████████████████████████████████████   | 96001/100000 [10:10:56<41:31,  1.61it/s]

Running reward = -33.23722108749951
Successful runs = 19.0


 97%|█████████████████████████████████████████████████████████████████████▍  | 96501/100000 [10:16:51<39:01,  1.49it/s]

Running reward = -34.30798713628837
Successful runs = 15.6


 97%|█████████████████████████████████████████████████████████████████████▊  | 97001/100000 [10:22:48<37:14,  1.34it/s]

Running reward = -34.97929831167767
Successful runs = 14.0


 98%|██████████████████████████████████████████████████████████████████████▏ | 97501/100000 [10:28:29<24:27,  1.70it/s]

Running reward = -30.77389843776861
Successful runs = 21.0


 98%|██████████████████████████████████████████████████████████████████████▌ | 98001/100000 [10:34:07<19:20,  1.72it/s]

Running reward = -32.107068661127514
Successful runs = 23.6


 99%|██████████████████████████████████████████████████████████████████████▉ | 98501/100000 [10:39:53<18:07,  1.38it/s]

Running reward = -33.693213078117694
Successful runs = 20.4


 99%|███████████████████████████████████████████████████████████████████████▎| 99001/100000 [10:45:31<12:14,  1.36it/s]

Running reward = -27.760859806918894
Successful runs = 25.0


100%|███████████████████████████████████████████████████████████████████████▋| 99501/100000 [10:51:19<06:34,  1.26it/s]

Running reward = -32.23396163993609
Successful runs = 19.8


100%|███████████████████████████████████████████████████████████████████████| 100000/100000 [10:57:23<00:00,  2.54it/s]


In [7]:
model.user_model = tf.keras.models.load_model('user_model.h5')
model.assist_model = tf.keras.models.load_model('assist_model.h5')



In [8]:
model.infer()

[3 1 1 1] [1 3 3 1]
tf.Tensor(
[[0.18308994 0.17637497 0.03688546 0.0847222 ]
 [0.10390993 0.06150039 0.00904826 0.04528475]
 [0.13619444 0.10421284 0.0895853  0.06934924]
 [0.18308994 0.17637497 0.03688546 0.0847222 ]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[ 0.17399107  0.11753978 -0.12453113  0.12452833]
 [ 0.15034667  0.14374283 -0.13104458  0.12340852]
 [ 0.15133183  0.1557766  -0.12694012  0.13310549]
 [ 0.20472121  0.14842433 -0.09335863  0.14770529]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-2.0486636]
 [-2.0129147]
 [-2.0172217]
 [-2.0437934]], shape=(4, 1), dtype=float32)


In [9]:
model.epsilon = 0
done = False
episode_reward = 0
start, dest = env.give_start_dest()
ob_user = [start[0], start[1], dest[0], dest[1]]
prev_steps_assist = []
prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
step = 0

while not done and step<max_steps:
    ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
    episode_reward+=reward_user
    step+=1
    print(ob_user)
    
print(done)

[0.4, 0.8, 0.6, 0.2]
[0.4, 0.7, 0.6, 0.2]
[0.4, 0.6, 0.6, 0.2]
[0.4, 0.5, 0.6, 0.2]
[0.4, 0.4, 0.6, 0.2]
[0.4, 0.3, 0.6, 0.2]
[0.5, 0.3, 0.6, 0.2]
[0.5, 0.2, 0.6, 0.2]
[0.6, 0.2, 0.6, 0.2]
1


In [10]:
reached = 0
for i in range(1000):
    model.epsilon = 0
    done = False
    episode_reward = 0
    start, dest = env.give_start_dest()
    ob_user = [start[0], start[1], dest[0], dest[1]]
    prev_steps_assist = []
    prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
    step = 0

    while not done and step<max_steps:
        ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
        episode_reward+=reward_user
        step+=1
#         print(ob_user)
        if done:
            reached += 1
#     print(done)

print(reached)

978


In [11]:
ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
reward_assist, next_ob_assist, done, importance, indices = model.sample_exp()
input_C = np.vstack(tuple([model.env_cell_mapping]*128))
ob_assist

array([[[ 0. ,  1. ,  0. ,  0. ,  1. ,  0.3],
        [ 0. ,  1. ,  0. ,  0. ,  1. ,  0.3],
        [ 0. ,  1. ,  0. ,  0. ,  1. ,  0.3],
        [ 0. ,  1. ,  0. ,  0. ,  1. ,  0.3]],

       [[ 0. ,  1. ,  0. ,  0. ,  0.2,  0.9],
        [ 0. ,  0. ,  1. ,  0. ,  0.3,  0.9],
        [ 0. ,  1. ,  0. ,  0. ,  0.3,  0.8],
        [ 0. ,  0. ,  1. ,  0. ,  0.4,  0.8]],

       [[ 0. ,  1. ,  0. ,  0. ,  1. ,  0.3],
        [ 0. ,  1. ,  0. ,  0. ,  1. ,  0.3],
        [ 0. ,  1. ,  0. ,  0. ,  1. ,  0.3],
        [ 0. ,  1. ,  0. ,  0. ,  1. ,  0.3]],

       ...,

       [[ 0. ,  1. ,  0. ,  0. ,  0.2,  0.8],
        [ 0. ,  1. ,  0. ,  0. ,  0.3,  0.8],
        [ 0. ,  1. ,  0. ,  0. ,  0.4,  0.8],
        [ 0. ,  0. ,  1. ,  0. ,  0.5,  0.8]],

       [[ 0. ,  1. ,  0. ,  0. ,  0.4,  0.6],
        [ 0. ,  0. ,  1. ,  0. ,  0.5,  0.6],
        [ 0. ,  1. ,  0. ,  0. ,  0.5,  0.5],
        [ 0. ,  0. ,  1. ,  0. ,  0.6,  0.5]],

       [[ 0. ,  0. ,  0. ,  0. , -1. , -1. ],
        [ 0

In [12]:
np.argmax(model.assist_model([ob_assist, input_C]), axis = 1) + 1

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)