In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, LSTM, Subtract
from tensorflow.keras.models import Model
from Environment import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from copy import deepcopy
import os 
import numpy as np

In [2]:
dense_1_user =  Dense(32, activation = 'relu')
dense_2_user =  Dense(32, activation = 'relu')
# dense_3_user =  Dense(32, activation = 'relu')

dense_1_assist =  Dense(32, activation = 'relu')
lstm_1_assist = LSTM(32, activation = 'tanh')
dense_2_assist = Dense(32, activation = 'relu')

advantage_layer_user = Dense(4)
value_layer_user = Dense(1)

advantage_layer_assist = Dense(4)
value_layer_assist = Dense(1)

# advantage_layer = Dense(4)
# value_layer = Dense(1)

# advantage_layer_user = advantage_layer
# advantage_layer_assist = advantage_layer

# value_layer_user = value_layer
# value_layer_assist = value_layer

In [3]:
class AI_Design:
    def __init__(self, steps = 4):        
        self.loss_fn = tf.keras.losses.mean_squared_error
        self.optimizer = tf.keras.optimizers.Adam(lr = 0.0001)
        self.batch_size = 128
        self.replay_buffer_size = 1024
        self.replay_buffer = Replay_Buffer(self.replay_buffer_size)
        self.epsilon = 1
        self.gamma = 0.9
        self.env = Environment()
        self.env.cells = np.array([[0.7, 0.1], [0.1, 0.1], [0.5, 0.7], [0.6, 0.2], [0.7, 0.4], [0.2, 0.9]])
#         self.env_cells = np.array([[0.7, 0.1]])
        self.env_cell_mapping = give_mapping(self.env.cells)
        self.env_cell_mapping = self.env_cell_mapping[np.newaxis, :, :, np.newaxis]
        #-------------------------------------------------------------------------------------------------
        input_A = Input(shape = (4,))
        input_B = Input(shape = (steps,6))
        input_C = Input(shape = (11, 11, 1)) #Location of every icon
                 
        action_user = Input(shape = 1, dtype = tf.int32)
        action_assist = Input(shape = 1, dtype = tf.int32)
        
        
        
        #User Network 
        
        
        x = Subtract()([input_A[:, 2:], input_A[:, :2]])
        x = dense_1_user(x)
        x = dense_2_user(x)
#         x = dense_3_user(x)
        adv_user = advantage_layer_user(x)
        val_user = value_layer_user(x)
        output_user = adv_user - tf.reduce_mean(adv_user, axis = 1, keepdims = True) + val_user
        
        self.user_model = Model(inputs = input_A, outputs = output_user)
        self.user_model.summary()
        
        self.target_user_model = tf.keras.models.clone_model(self.user_model)
        self.target_user_model.set_weights(self.user_model.get_weights())
        

        
        #Assistant Network
        z = tf.keras.layers.Conv2D(filters = 2, kernel_size = 3, activation = 'relu')(input_C)
        z = tf.keras.layers.MaxPooling2D()(z)
        z = tf.keras.layers.Flatten()(z)
        z = tf.keras.layers.Dense(32, activation = 'relu')(z)
        
        y = dense_1_assist(input_B)
        y = lstm_1_assist(y)
        y = tf.keras.layers.Concatenate()([y,z])
        y = dense_2_assist(y)
        adv_assist = advantage_layer_assist(y)
        val_assist = value_layer_assist(y)
        output_assist = adv_assist - tf.reduce_mean(adv_assist, axis = 1, keepdims = True) + val_assist
        
        self.assist_model = Model(inputs = [input_B, input_C], outputs = output_assist)
        self.assist_model.summary()
        
        self.target_assist_model = tf.keras.models.clone_model(self.assist_model)
        self.target_assist_model.set_weights(self.assist_model.get_weights())
        
        
        #Complete Network

        mask_user = tf.reduce_sum(tf.one_hot(action_user, 4), axis = 1)
        mask_assist = tf.reduce_sum(tf.one_hot(action_assist, 4), axis = 1)
        output_user = output_user*mask_user
        output_assist = output_assist*mask_assist
        
        out = tf.reduce_sum(output_user + output_assist, axis = 1, keepdims = True)
        
        self.model = Model(inputs = [input_A, input_B, input_C, action_user, action_assist], outputs = out)  
        self.model.summary() 
        #-------------------------------------------------------------------------------------------------
    
    def infer(self):
        ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
        reward_assist, next_ob_assist, done, importance, indices = self.sample_exp()
        
        ob_user = ob_user[:4]
        action_user = action_user[:4]
        reward_user = reward_user[:4]
        
        ob_assist = ob_assist[:4]
        action_assist = action_assist[:4]
        reward_assist = reward_assist[:4]
        
        print(action_user, action_assist)
        
        print(self.user_model(ob_user))
        print(self.assist_model([ob_assist, np.vstack(tuple([self.env_cell_mapping]*4))]))
        
        print(self.model([ob_user, ob_assist, np.vstack(tuple([self.env_cell_mapping]*4)), action_user, action_assist]))
    
    def exp_policy_user(self, state):
        if np.random.rand()<self.epsilon:
            return np.random.randint(4)
        else:
            state = np.array(state)[np.newaxis]
            Q_values = self.user_model(state)
            return np.argmax(Q_values[0])
    
    def exp_policy_assist(self, state):
        if np.random.rand()<self.epsilon:
            return np.random.randint(1,5)
        else:
            state = np.array(state)[np.newaxis]
            Q_values = self.assist_model([state, self.env_cell_mapping])
            return np.argmax(Q_values[0])+1
    
    def step(self, ob_user, prev_steps_assist):
        curr_loc = ob_user[:2]
        target_loc = ob_user[2:4]
        
        action_user = self.exp_policy_user(ob_user)
        action_user_one_hot = make_one_hot(action_user, 4)
        
        ob_assist = [action_user_one_hot + ob_user[:2]]
        ob_assist = prev_steps_assist + ob_assist 
        action_assist = self.exp_policy_assist(ob_assist)
        
        new_loc, reward_user, reward_assist, done = self.env.step(action_user, action_assist-1, target_loc, curr_loc)
        
        next_ob_user = new_loc[:]
        next_ob_user = next_ob_user + target_loc
        
        next_action_user = self.exp_policy_user(next_ob_user)
        next_action_user_one_hot = make_one_hot(next_action_user, 4)
        next_ob_assist = [next_action_user_one_hot + next_ob_user[:2]]
        next_ob_assist = ob_assist[1:] + next_ob_assist
        
        self.add_replay_buffer(ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                          action_assist-1, reward_assist, next_ob_assist, done)
        
        return next_ob_user, ob_assist[1:], reward_user, reward_assist, done 
        
        
    
    def add_replay_buffer(self, ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                         action_assist, reward_assist, next_ob_assist, done):
        
        self.replay_buffer.ob_user_history.append(ob_user)
        self.replay_buffer.action_user_history.append(action_user)
        self.replay_buffer.reward_user_history.append(reward_user)
        self.replay_buffer.next_ob_user_history.append(next_ob_user)
        self.replay_buffer.ob_assist_history.append(ob_assist)
        self.replay_buffer.action_assist_history.append(action_assist)
        self.replay_buffer.reward_assist_history.append(reward_assist)
        self.replay_buffer.next_ob_assist_history.append(next_ob_assist)
        self.replay_buffer.done_history.append(done)
        self.replay_buffer.priorities.append(self.replay_buffer.max_val)
    
    def sample_exp(self):
        sample_probs = self.replay_buffer.get_probabilities(priority_scale = 0.7)
        indices = np.random.choice(len(self.replay_buffer.done_history), size = self.batch_size, p = sample_probs)
        importance = self.replay_buffer.get_importance(sample_probs[indices])
        
        ob_user = np.array([self.replay_buffer.ob_user_history[i] for i in indices])
        action_user = np.array([self.replay_buffer.action_user_history[i] for i in indices])
        reward_user = np.array([self.replay_buffer.reward_user_history[i] for i in indices])
        next_ob_user = np.array([self.replay_buffer.next_ob_user_history[i] for i in indices])
        ob_assist = np.array([self.replay_buffer.ob_assist_history[i] for i in indices])
        action_assist = np.array([self.replay_buffer.action_assist_history[i] for i in indices])
        reward_assist = np.array([self.replay_buffer.reward_assist_history[i] for i in indices])
        next_ob_assist = np.array([self.replay_buffer.next_ob_assist_history[i] for i in indices])
        done = np.array([self.replay_buffer.done_history[i] for i in indices])
        
        return ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist, reward_assist, next_ob_assist, done,\
    importance, indices 
    
    def train(self):
        ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
        reward_assist, next_ob_assist, done, importance, indices = self.sample_exp()
        
        input_A = ob_user
        input_B = ob_assist
        input_C = np.vstack(tuple([self.env_cell_mapping]*128))
        
        rewards = reward_user + reward_assist
        
        next_Q_values_user, next_Q_values_assist = self.user_model(next_ob_user), self.assist_model([next_ob_assist, input_C])
        best_next_actions_user, best_next_actions_assist = tf.math.argmax(next_Q_values_user, axis = 1), tf.math.argmax(next_Q_values_assist, axis = 1)
        next_Q_values_user, next_Q_values_assist = self.target_user_model(next_ob_user), self.target_assist_model([next_ob_assist, input_C])
        
        best_next_Q_values_user = tf.reduce_sum(next_Q_values_user*tf.one_hot(best_next_actions_user, 4), axis = 1)
        best_next_Q_values_assist = tf.reduce_sum(next_Q_values_user*tf.one_hot(best_next_actions_assist, 4), axis = 1)
        best_next_Q_values = best_next_Q_values_user + best_next_Q_values_assist
        
        target_Q_values = rewards + (1-done)*self.gamma*best_next_Q_values
        
        with tf.GradientTape() as tape:
            Q_values = self.model([input_A, input_B, input_C, action_user, action_assist])
            error = tf.multiply(self.loss_fn(target_Q_values, Q_values), importance**(1-self.epsilon))
            loss = tf.reduce_mean(error)
        
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        self.replay_buffer.set_priorities(indices, error)
        
        
#         Q_values_assist = self.assist_model([input_B, input_C])
#         with tf.GradientTape() as tape:
#             Q_values_user = self.user_model(input_A)
#             error = tf.multiply(self.loss_fn(Q_values_assist, Q_values_user), importance**(1-self.epsilon))
#             loss = tf.reduce_mean(error)
            
#         grads = tape.gradient(loss, self.user_model.trainable_variables)
#         self.optimizer.apply_gradients(zip(grads, self.user_model.trainable_variables))
            
#         with tf.GradientTape() as tape:
#             Q_values_assist = self.assist_model([input_B, input_C])
#             error = tf.multiply(self.loss_fn(Q_values_assist, Q_values_user), importance**(1-self.epsilon))
#             loss = tf.reduce_mean(error)
            
#         grads = tape.gradient(loss, self.assist_model.trainable_variables)
#         self.optimizer.apply_gradients(zip(grads, self.assist_model.trainable_variables))                          

        

In [4]:
steps = 4
model = AI_Design(steps)
env = model.env

# if os.path.exists('user_model.h5'):
#     model.user_model = tf.keras.models.load_model('user_model.h5')
#     model.assist_model = tf.keras.models.load_model('assist_model.h5')

Icon Locations:
[[0.  0.8]
 [0.2 0.1]
 [0.5 0.3]
 [0.1 0.8]
 [0.3 0.5]
 [0.1 0.2]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
subtract (

In [5]:
def give_prev_steps(prev_steps_assist, steps):
    prev_steps_assist = [[0,0,0,0,-1,-1] for i in range(steps-1)]
    return prev_steps_assist

In [None]:
max_steps = 40
reached = 0
reached_history = []
max_reached = 0

running_reward = 0

for epoch in tqdm(range(100000)):
    done = False
    episode_reward = 0
    start, dest = env.give_start_dest()
    ob_user = [start[0], start[1], dest[0], dest[1]]
    prev_steps_assist = []
    prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
    step = 0
    
    while not done and step<max_steps:
        ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
        episode_reward+=reward_user
        step+=1
        if done:
            reached+=1
    
    if epoch:
        running_reward = 0.01 * episode_reward + (1 - 0.01) * running_reward
    else:
        running_reward = episode_reward
        
    if epoch>50:
        model.train()
        
        if epoch%100==0:
            model.target_user_model.set_weights(model.user_model.get_weights())
            model.target_assist_model.set_weights(model.assist_model.get_weights())
            reached_history.append(reached)
            rewards = []
            
            if reached>max_reached:
                print(reached)
                print('Saved Weights')
                max_reached = reached
                model.user_model.save('user_model.h5')
                model.assist_model.save('assist_model.h5')
                
            reached = 0
            
            if epoch%500==0:
                print(f'Running reward = {running_reward}')
                print(f'Successful runs = {np.mean(reached_history)}')
                reached_history = []
                
                if epoch%1000==0:
                    model.epsilon-=0.01
                    model.epsilon= max(model.epsilon, 0.1)
                    
                    if epoch%20000==0:
                        model.infer()

  0%|                                                                           | 100/100000 [00:11<4:41:38,  5.91it/s]

12
Saved Weights


  0%|▏                                                                          | 201/100000 [00:28<4:50:11,  5.73it/s]

14
Saved Weights


  0%|▏                                                                          | 301/100000 [00:47<6:26:20,  4.30it/s]

16
Saved Weights


  1%|▍                                                                          | 501/100000 [01:29<6:29:24,  4.26it/s]

17
Saved Weights
Running reward = -30.633880860077227
Successful runs = 13.6


  1%|▋                                                                         | 1001/100000 [03:14<5:25:09,  5.07it/s]

Running reward = -29.417984831975243
Successful runs = 13.6


  1%|▉                                                                         | 1201/100000 [03:57<7:02:04,  3.90it/s]

20
Saved Weights


  2%|█                                                                         | 1501/100000 [05:04<5:26:07,  5.03it/s]

Running reward = -32.79365674637838
Successful runs = 13.6


  2%|█▍                                                                        | 2002/100000 [06:40<4:51:25,  5.60it/s]

Running reward = -31.635547003232674
Successful runs = 14.2


  3%|█▊                                                                        | 2501/100000 [08:27<5:31:10,  4.91it/s]

Running reward = -28.243591650043665
Successful runs = 15.4


  3%|█▉                                                                        | 2701/100000 [09:08<6:04:49,  4.44it/s]

21
Saved Weights


  3%|██▏                                                                       | 3001/100000 [10:09<5:37:44,  4.79it/s]

Running reward = -31.7987824853473
Successful runs = 14.8


  4%|██▌                                                                       | 3501/100000 [11:52<6:16:54,  4.27it/s]

22
Saved Weights
Running reward = -28.655529865279846
Successful runs = 16.4


  4%|██▉                                                                       | 4001/100000 [13:46<7:35:00,  3.52it/s]

Running reward = -31.648986041770474
Successful runs = 15.0


  5%|███▎                                                                      | 4502/100000 [15:46<5:32:41,  4.78it/s]

Running reward = -31.7248061021258
Successful runs = 16.2


  5%|███▋                                                                      | 5001/100000 [17:29<5:02:57,  5.23it/s]

Running reward = -28.230189632337133
Successful runs = 16.0


  6%|████                                                                      | 5501/100000 [19:15<5:36:36,  4.68it/s]

Running reward = -29.991413001855054
Successful runs = 17.0


  6%|████▍                                                                     | 6001/100000 [21:12<6:21:51,  4.10it/s]

Running reward = -32.28708561210878
Successful runs = 12.2


  6%|████▋                                                                     | 6401/100000 [22:44<6:48:14,  3.82it/s]

28
Saved Weights


  7%|████▊                                                                     | 6501/100000 [23:09<6:10:49,  4.20it/s]

Running reward = -27.328291015005988
Successful runs = 19.4


  7%|█████▏                                                                    | 7001/100000 [25:05<5:57:25,  4.34it/s]

Running reward = -30.792720187124562
Successful runs = 15.4


  8%|█████▌                                                                    | 7501/100000 [27:01<6:08:59,  4.18it/s]

Running reward = -29.53134809815669
Successful runs = 15.4


  8%|█████▉                                                                    | 8001/100000 [28:57<6:39:08,  3.84it/s]

Running reward = -27.827936549726076
Successful runs = 18.0


  9%|██████▎                                                                   | 8501/100000 [31:10<5:58:51,  4.25it/s]

Running reward = -27.33095234112235
Successful runs = 19.8


  9%|██████▋                                                                   | 9001/100000 [33:25<7:17:52,  3.46it/s]

Running reward = -29.664648034673235
Successful runs = 14.4


 10%|███████                                                                   | 9501/100000 [35:41<8:39:09,  2.91it/s]

Running reward = -30.29084390673477
Successful runs = 18.8


 10%|███████▎                                                                 | 10001/100000 [37:47<6:26:08,  3.88it/s]

Running reward = -26.558916758577293
Successful runs = 17.6


 11%|███████▋                                                                 | 10501/100000 [39:54<6:39:46,  3.73it/s]

Running reward = -30.653474180783014
Successful runs = 16.2


 11%|████████                                                                 | 11001/100000 [42:09<6:18:06,  3.92it/s]

Running reward = -31.82239911533311
Successful runs = 18.2


 12%|████████▍                                                                | 11501/100000 [44:28<7:18:05,  3.37it/s]

Running reward = -26.0940040190414
Successful runs = 22.0


 12%|████████▊                                                                | 12001/100000 [46:48<6:18:52,  3.87it/s]

Running reward = -25.1588608234282
Successful runs = 21.6


 13%|█████████▏                                                               | 12501/100000 [49:24<9:05:15,  2.67it/s]

Running reward = -26.67014426820893
Successful runs = 24.0


 13%|█████████▍                                                               | 12900/100000 [51:33<6:46:23,  3.57it/s]

30
Saved Weights


 13%|█████████▍                                                               | 13001/100000 [52:05<7:34:00,  3.19it/s]

33
Saved Weights
Running reward = -21.186307757240897
Successful runs = 24.6


 14%|█████████▊                                                               | 13501/100000 [54:43<8:11:16,  2.93it/s]

Running reward = -23.01474604542712
Successful runs = 24.8


 14%|██████████▏                                                              | 14001/100000 [57:28<7:37:18,  3.13it/s]

Running reward = -24.974018776731928
Successful runs = 24.0


 15%|██████████▎                                                            | 14501/100000 [1:00:12<8:19:09,  2.85it/s]

Running reward = -24.7588507278764
Successful runs = 28.0


 15%|██████████▋                                                            | 15001/100000 [1:02:53<7:01:24,  3.36it/s]

Running reward = -21.781020581963492
Successful runs = 26.8


 16%|███████████                                                            | 15501/100000 [1:05:23<8:27:02,  2.78it/s]

Running reward = -21.273284852592408
Successful runs = 27.8


 16%|███████████▎                                                           | 16001/100000 [1:08:08<8:27:24,  2.76it/s]

Running reward = -26.74315433222402
Successful runs = 24.8


 16%|███████████▌                                                           | 16301/100000 [1:09:37<6:03:39,  3.84it/s]

36
Saved Weights


 17%|███████████▋                                                           | 16501/100000 [1:10:28<6:03:34,  3.83it/s]

Running reward = -27.030574299287455
Successful runs = 21.6


 17%|████████████                                                           | 17001/100000 [1:12:41<5:51:24,  3.94it/s]

Running reward = -23.242330135808025
Successful runs = 27.4


 18%|████████████▍                                                          | 17501/100000 [1:15:08<6:59:27,  3.28it/s]

Running reward = -22.500373295908833
Successful runs = 28.0


 18%|████████████▊                                                          | 18001/100000 [1:17:38<6:34:31,  3.46it/s]

Running reward = -24.25092626618072
Successful runs = 21.8


 19%|█████████████▏                                                         | 18501/100000 [1:19:56<6:52:01,  3.30it/s]

Running reward = -21.97286993675637
Successful runs = 30.4


 19%|█████████████▎                                                         | 18701/100000 [1:20:53<7:07:14,  3.17it/s]

39
Saved Weights


 19%|█████████████▍                                                         | 19001/100000 [1:22:15<6:38:48,  3.39it/s]

Running reward = -20.839730411295616
Successful runs = 35.4


 20%|█████████████▊                                                         | 19501/100000 [1:24:39<6:31:33,  3.43it/s]

Running reward = -22.691480811966464
Successful runs = 30.2


 20%|██████████████▏                                                        | 20000/100000 [1:27:07<7:16:30,  3.05it/s]

Running reward = -22.842959704689488
Successful runs = 30.8
[0 0 1 1] [3 2 1 3]
tf.Tensor(
[[-0.2627287  -0.24760523 -0.2669513  -0.25271034]
 [-0.2412944  -0.27092418 -0.25031787 -0.24932593]
 [-0.2172809  -0.24167678 -0.23716189 -0.24090867]
 [-0.3342098  -0.3712462  -0.32996187 -0.31738967]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-0.97310704 -0.9855081  -0.9607721  -0.9934648 ]
 [-1.0192771  -1.0122694  -0.9929316  -1.0677999 ]
 [-1.0392283  -1.0449231  -1.0185231  -1.0260987 ]
 [-1.0058209  -0.96975714 -0.96997166 -1.0019966 ]], shape=(4, 4), dtype=float32)


 20%|██████████████▏                                                        | 20001/100000 [1:27:08<9:44:13,  2.28it/s]

tf.Tensor(
[[-1.2561935]
 [-1.234226 ]
 [-1.2865999]
 [-1.3732429]], shape=(4, 1), dtype=float32)


 21%|██████████████▌                                                        | 20501/100000 [1:29:43<7:13:39,  3.06it/s]

Running reward = -20.075121148097757
Successful runs = 28.6


 21%|██████████████▉                                                        | 21001/100000 [1:32:21<6:45:04,  3.25it/s]

Running reward = -20.621131453484246
Successful runs = 28.2


 21%|███████████████                                                        | 21301/100000 [1:33:49<6:45:34,  3.23it/s]

43
Saved Weights


 21%|███████████████▏                                                       | 21401/100000 [1:34:18<6:12:46,  3.51it/s]

44
Saved Weights


 22%|███████████████▎                                                       | 21501/100000 [1:34:47<7:06:10,  3.07it/s]

Running reward = -16.757309519379334
Successful runs = 37.8


 22%|███████████████▍                                                       | 21701/100000 [1:35:46<6:52:58,  3.16it/s]

45
Saved Weights


 22%|███████████████▌                                                       | 22002/100000 [1:37:18<5:15:13,  4.12it/s]

Running reward = -17.285099152024497
Successful runs = 40.4


 23%|███████████████▉                                                       | 22501/100000 [1:40:02<7:21:30,  2.93it/s]

Running reward = -17.26720082294039
Successful runs = 37.2


 23%|████████████████▎                                                      | 23001/100000 [1:42:47<7:03:59,  3.03it/s]

Running reward = -21.115064958055214
Successful runs = 34.8


 24%|████████████████▋                                                      | 23501/100000 [1:45:25<6:25:49,  3.30it/s]

Running reward = -16.667324606919454
Successful runs = 36.8


 24%|█████████████████                                                      | 24001/100000 [1:48:09<7:30:11,  2.81it/s]

Running reward = -17.855408579190335
Successful runs = 36.2


 25%|█████████████████▍                                                     | 24501/100000 [1:51:00<7:16:04,  2.89it/s]

Running reward = -20.160425667229266
Successful runs = 35.6


 25%|█████████████████▌                                                     | 24701/100000 [1:52:02<6:21:57,  3.29it/s]

46
Saved Weights


 25%|█████████████████▊                                                     | 25001/100000 [1:53:38<6:08:39,  3.39it/s]

Running reward = -14.136860310031624
Successful runs = 41.0


 25%|██████████████████                                                     | 25401/100000 [1:55:43<6:40:42,  3.10it/s]

47
Saved Weights


 26%|██████████████████                                                     | 25501/100000 [1:56:14<6:45:55,  3.06it/s]

48
Saved Weights
Running reward = -11.00170443764255
Successful runs = 42.8


 26%|██████████████████▏                                                    | 25601/100000 [1:56:44<7:14:24,  2.85it/s]

51
Saved Weights


 26%|██████████████████▍                                                    | 26001/100000 [1:58:53<6:15:27,  3.28it/s]

Running reward = -11.053701287194528
Successful runs = 47.8


 27%|██████████████████▊                                                    | 26501/100000 [2:01:36<6:40:47,  3.06it/s]

Running reward = -12.749353156571466
Successful runs = 45.0


 27%|███████████████████▏                                                   | 27002/100000 [2:04:22<6:07:58,  3.31it/s]

Running reward = -15.742098013493788
Successful runs = 41.6


 27%|███████████████████▍                                                   | 27301/100000 [2:05:59<6:40:35,  3.02it/s]

55
Saved Weights


 28%|███████████████████▌                                                   | 27501/100000 [2:07:00<6:06:31,  3.30it/s]

64
Saved Weights
Running reward = -2.193050052965792
Successful runs = 52.0


 28%|███████████████████▉                                                   | 28001/100000 [2:09:40<7:12:10,  2.78it/s]

Running reward = -9.22799496234188
Successful runs = 51.8


 29%|████████████████████▏                                                  | 28501/100000 [2:12:25<6:45:12,  2.94it/s]

Running reward = -6.6085881908469695
Successful runs = 53.0


 29%|████████████████████▌                                                  | 29001/100000 [2:15:11<6:46:00,  2.91it/s]

Running reward = -10.156557889693818
Successful runs = 49.8


 30%|████████████████████▉                                                  | 29501/100000 [2:18:00<7:15:30,  2.70it/s]

Running reward = -3.872575236810329
Successful runs = 54.6


 30%|█████████████████████▎                                                 | 30001/100000 [2:20:47<6:36:06,  2.95it/s]

Running reward = -6.34220278625519
Successful runs = 53.0


 31%|█████████████████████▋                                                 | 30501/100000 [2:23:32<6:32:52,  2.95it/s]

Running reward = -6.966909500349879
Successful runs = 53.4


 31%|██████████████████████                                                 | 31001/100000 [2:26:22<6:13:27,  3.08it/s]

Running reward = -3.360716232262526
Successful runs = 60.2


 32%|██████████████████████▎                                                | 31501/100000 [2:29:23<7:00:34,  2.71it/s]

69
Saved Weights
Running reward = -0.34132303465840086
Successful runs = 60.2


 32%|██████████████████████▋                                                | 32001/100000 [2:32:23<5:56:29,  3.18it/s]

Running reward = -0.35017270926059496
Successful runs = 58.6


 33%|███████████████████████                                                | 32501/100000 [2:35:26<7:28:11,  2.51it/s]

71
Saved Weights
Running reward = 2.1083075520055905
Successful runs = 62.0


 33%|███████████████████████▍                                               | 33001/100000 [2:38:38<8:00:53,  2.32it/s]

Running reward = -6.642348198673049
Successful runs = 54.6


 34%|███████████████████████▊                                               | 33501/100000 [2:41:37<5:53:47,  3.13it/s]

Running reward = -2.6450933936675147
Successful runs = 59.6


 34%|████████████████████████▏                                              | 34001/100000 [2:44:26<5:05:20,  3.60it/s]

72
Saved Weights
Running reward = 2.983145888813917
Successful runs = 66.8


 35%|████████████████████████▍                                              | 34501/100000 [2:47:30<8:07:47,  2.24it/s]

Running reward = -3.1807007843488546
Successful runs = 58.2


 35%|████████████████████████▊                                              | 35001/100000 [2:50:25<7:39:59,  2.36it/s]

Running reward = -1.7972595200425916
Successful runs = 61.4


 36%|█████████████████████████▏                                             | 35501/100000 [2:53:20<6:25:06,  2.79it/s]

Running reward = -3.2030864790154387
Successful runs = 60.8


 36%|█████████████████████████▌                                             | 36001/100000 [2:56:13<5:09:24,  3.45it/s]

Running reward = -0.4460373370194873
Successful runs = 60.0


 37%|█████████████████████████▉                                             | 36501/100000 [2:59:09<5:25:32,  3.25it/s]

Running reward = -0.5588169931633031
Successful runs = 61.4


 37%|██████████████████████████▎                                            | 37001/100000 [3:02:08<5:50:19,  3.00it/s]

Running reward = -2.6468111489406088
Successful runs = 59.0


 38%|██████████████████████████▋                                            | 37501/100000 [3:05:02<5:38:05,  3.08it/s]

Running reward = -0.20201868182412022
Successful runs = 63.6


 38%|██████████████████████████▉                                            | 38001/100000 [3:08:00<6:16:12,  2.75it/s]

Running reward = -2.6879223009237063
Successful runs = 61.8


 39%|███████████████████████████▎                                           | 38501/100000 [3:10:55<5:42:00,  3.00it/s]

Running reward = 1.062374076439713
Successful runs = 63.4


 39%|███████████████████████████▋                                           | 39001/100000 [3:13:53<5:08:44,  3.29it/s]

Running reward = -4.466295193685521
Successful runs = 60.0


 40%|████████████████████████████                                           | 39501/100000 [3:16:58<6:35:58,  2.55it/s]

Running reward = -5.85026735960263
Successful runs = 57.2


 40%|████████████████████████████▍                                          | 40001/100000 [3:19:59<6:56:09,  2.40it/s]

Running reward = -1.9640776752534028
Successful runs = 62.8
[2 3 2 0] [3 2 1 1]
tf.Tensor(
[[0.04600435 0.11070371 0.08411548 0.06998654]
 [0.05287066 0.10821956 0.1170404  0.0681136 ]
 [0.07162166 0.05312154 0.04351017 0.06202672]
 [0.06862701 0.08594199 0.10627617 0.06721988]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[0.2697099  0.35862592 0.34327027 0.2785078 ]
 [0.27934635 0.36157566 0.34032667 0.3323682 ]
 [0.2899724  0.37641102 0.3641339  0.2998568 ]
 [0.28223965 0.3705279  0.35780331 0.31715098]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[0.36262327]
 [0.40844026]
 [0.4199212 ]
 [0.4391549 ]], shape=(4, 1), dtype=float32)


 41%|████████████████████████████▊                                          | 40501/100000 [3:23:05<5:33:08,  2.98it/s]

Running reward = -1.9537805040037952
Successful runs = 60.6


 41%|█████████████████████████████                                          | 41001/100000 [3:26:07<5:18:27,  3.09it/s]

Running reward = -1.425892074559423
Successful runs = 62.6


 42%|█████████████████████████████▍                                         | 41501/100000 [3:29:12<5:52:50,  2.76it/s]

Running reward = -1.5924773125637353
Successful runs = 63.4


 42%|█████████████████████████████▊                                         | 42001/100000 [3:32:17<6:43:18,  2.40it/s]

Running reward = -0.8782844236720686
Successful runs = 61.8


 43%|██████████████████████████████▏                                        | 42501/100000 [3:35:29<5:36:38,  2.85it/s]

Running reward = -0.4733114383766217
Successful runs = 59.0


 43%|██████████████████████████████▌                                        | 43001/100000 [3:38:34<6:17:41,  2.52it/s]

Running reward = 2.3998163422313925
Successful runs = 66.0


 44%|██████████████████████████████▉                                        | 43501/100000 [3:41:48<5:01:30,  3.12it/s]

Running reward = -4.46797049023026
Successful runs = 62.6


 44%|███████████████████████████████▏                                       | 44001/100000 [3:45:04<5:24:00,  2.88it/s]

Running reward = -3.0190843837690875
Successful runs = 56.8


 44%|███████████████████████████████▍                                       | 44301/100000 [3:46:53<6:13:50,  2.48it/s]

75
Saved Weights


 45%|███████████████████████████████▌                                       | 44501/100000 [3:48:10<5:29:52,  2.80it/s]

Running reward = -0.7010092221531338
Successful runs = 64.6


 45%|███████████████████████████████▉                                       | 45001/100000 [3:51:21<6:44:06,  2.27it/s]

Running reward = 0.5232357044819146
Successful runs = 63.6


 46%|████████████████████████████████▎                                      | 45501/100000 [3:54:36<6:02:37,  2.50it/s]

Running reward = 1.3433496362166366
Successful runs = 65.0


 46%|████████████████████████████████▋                                      | 46001/100000 [3:57:54<6:31:57,  2.30it/s]

Running reward = -1.9269050746243428
Successful runs = 59.0


 47%|█████████████████████████████████                                      | 46501/100000 [4:01:13<5:35:22,  2.66it/s]

Running reward = -4.153156626924755
Successful runs = 58.2


 47%|█████████████████████████████████▎                                     | 47001/100000 [4:04:22<6:41:30,  2.20it/s]

Running reward = 2.4633593588754366
Successful runs = 64.0


 48%|█████████████████████████████████▋                                     | 47501/100000 [4:07:40<6:43:49,  2.17it/s]

Running reward = -5.724971186848084
Successful runs = 60.0


 48%|██████████████████████████████████                                     | 48001/100000 [4:11:05<5:52:49,  2.46it/s]

Running reward = -4.217135247262468
Successful runs = 57.2


 49%|██████████████████████████████████▍                                    | 48501/100000 [4:14:24<7:12:52,  1.98it/s]

Running reward = 1.3438643687537
Successful runs = 67.0


 49%|██████████████████████████████████▊                                    | 49001/100000 [4:17:40<5:02:27,  2.81it/s]

Running reward = 1.6377970005143165
Successful runs = 67.0


 50%|███████████████████████████████████▏                                   | 49501/100000 [4:20:58<4:30:02,  3.12it/s]

Running reward = 1.7476636510899128
Successful runs = 67.6


 50%|███████████████████████████████████▌                                   | 50001/100000 [4:24:12<5:59:34,  2.32it/s]

Running reward = 5.584162577994431
Successful runs = 70.0


 50%|███████████████████████████████████▌                                   | 50101/100000 [4:24:51<5:05:01,  2.73it/s]

76
Saved Weights


 51%|███████████████████████████████████▊                                   | 50501/100000 [4:27:53<5:01:58,  2.73it/s]

Running reward = 4.708198794622291
Successful runs = 69.6


 51%|███████████████████████████████████▉                                   | 50601/100000 [4:28:35<7:44:18,  1.77it/s]

77
Saved Weights


 51%|████████████████████████████████████▏                                  | 51001/100000 [4:31:19<6:58:33,  1.95it/s]

82
Saved Weights
Running reward = 8.525807692501068
Successful runs = 72.0


 52%|████████████████████████████████████▌                                  | 51501/100000 [4:35:01<6:58:39,  1.93it/s]

Running reward = -0.5243155454007435
Successful runs = 69.2


 52%|████████████████████████████████████▉                                  | 52001/100000 [4:38:28<4:30:00,  2.96it/s]

Running reward = 5.619649069312089
Successful runs = 73.8


 53%|█████████████████████████████████████▎                                 | 52501/100000 [4:42:09<5:39:45,  2.33it/s]

Running reward = 8.399713064978787
Successful runs = 74.2


 53%|█████████████████████████████████████▋                                 | 53001/100000 [4:45:41<5:07:56,  2.54it/s]

Running reward = 3.3018548427477175
Successful runs = 70.2


 53%|█████████████████████████████████████▊                                 | 53201/100000 [4:47:05<5:04:53,  2.56it/s]

87
Saved Weights


 54%|█████████████████████████████████████▉                                 | 53501/100000 [4:49:12<5:52:23,  2.20it/s]

Running reward = 8.81170569498837
Successful runs = 77.4


 54%|██████████████████████████████████████▎                                | 54001/100000 [4:52:39<5:21:42,  2.38it/s]

Running reward = 9.262053974582956
Successful runs = 74.2


 55%|██████████████████████████████████████▋                                | 54501/100000 [4:56:33<4:56:36,  2.56it/s]

Running reward = 5.091172555102897
Successful runs = 63.4


 55%|███████████████████████████████████████                                | 55001/100000 [5:00:13<6:36:16,  1.89it/s]

Running reward = 5.36857387538804
Successful runs = 71.8


 56%|███████████████████████████████████████▍                               | 55501/100000 [5:03:53<5:47:13,  2.14it/s]

Running reward = 3.741717271712059
Successful runs = 74.0


 56%|███████████████████████████████████████▊                               | 56001/100000 [5:07:33<6:00:25,  2.03it/s]

Running reward = 5.915677021130952
Successful runs = 75.6


 57%|████████████████████████████████████████                               | 56501/100000 [5:11:00<5:13:54,  2.31it/s]

Running reward = 6.278706291170147
Successful runs = 78.2


 57%|████████████████████████████████████████▍                              | 57001/100000 [5:14:26<5:37:45,  2.12it/s]

Running reward = 7.3953852880501065
Successful runs = 73.0


 58%|████████████████████████████████████████▊                              | 57501/100000 [5:17:42<3:58:12,  2.97it/s]

Running reward = 12.972471023744813
Successful runs = 81.0


 58%|█████████████████████████████████████████▏                             | 58001/100000 [5:21:10<6:30:23,  1.79it/s]

Running reward = 7.351985833266279
Successful runs = 74.6


 59%|█████████████████████████████████████████▌                             | 58501/100000 [5:24:48<4:52:26,  2.37it/s]

Running reward = 7.8717229017351125
Successful runs = 75.6


 59%|█████████████████████████████████████████▉                             | 59001/100000 [5:28:10<4:00:11,  2.84it/s]

Running reward = 12.34569146939636
Successful runs = 78.4


 60%|██████████████████████████████████████████▏                            | 59501/100000 [5:31:36<4:47:08,  2.35it/s]

Running reward = 9.891906109704241
Successful runs = 77.4


 60%|██████████████████████████████████████████▌                            | 60001/100000 [5:34:57<4:44:35,  2.34it/s]

Running reward = 12.797347933593626
Successful runs = 79.0
[0 0 3 1] [2 2 2 1]
tf.Tensor(
[[0.29270327 0.24293087 0.23144758 0.26722515]
 [0.28068188 0.2645824  0.24785584 0.28172976]
 [0.2954297  0.29121566 0.2974667  0.2871577 ]
 [0.28987926 0.37438726 0.32613707 0.29419655]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[1.5174825 1.5245311 1.5346    1.496735 ]
 [1.5091457 1.5218645 1.531671  1.4861438]
 [1.5037628 1.5135084 1.5286292 1.4884179]
 [1.4538372 1.4852178 1.4472198 1.4401951]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[1.8273033]
 [1.8123529]
 [1.8157868]
 [1.8596051]], shape=(4, 1), dtype=float32)


 61%|██████████████████████████████████████████▉                            | 60501/100000 [5:38:19<4:28:42,  2.45it/s]

Running reward = 8.545479471189362
Successful runs = 80.8


 61%|███████████████████████████████████████████▎                           | 61001/100000 [5:41:36<4:38:56,  2.33it/s]

Running reward = 9.566538751669377
Successful runs = 80.6


 62%|███████████████████████████████████████████▋                           | 61501/100000 [5:44:57<3:25:09,  3.13it/s]

88
Saved Weights
Running reward = 16.570820663049766
Successful runs = 82.8


 62%|███████████████████████████████████████████▋                           | 61538/100000 [5:45:12<4:15:02,  2.51it/s]

In [None]:
model.infer()

In [None]:
model.epsilon = 0
done = False
episode_reward = 0
start, dest = env.give_start_dest()
ob_user = [start[0], start[1], dest[0], dest[1]]
prev_steps_assist = []
prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
step = 0

while not done and step<max_steps:
    ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
    episode_reward+=reward_user
    step+=1
    print(ob_user)
    
print(done)