In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, LSTM, Subtract
from tensorflow.keras.models import Model
from Environment import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from copy import deepcopy
import os 
import numpy as np

In [2]:
dense_1_user =  Dense(32, activation = 'relu')
dense_2_user =  Dense(32, activation = 'relu')
# dense_3_user =  Dense(32, activation = 'relu')

dense_1_assist =  Dense(32, activation = 'relu')
lstm_1_assist = LSTM(32, activation = 'tanh')
dense_2_assist = Dense(32, activation = 'relu')

advantage_layer_user = Dense(4)
value_layer_user = Dense(1)

advantage_layer_assist = Dense(4)
value_layer_assist = Dense(1)

# advantage_layer = Dense(4)
# value_layer = Dense(1)

# advantage_layer_user = advantage_layer
# advantage_layer_assist = advantage_layer

# value_layer_user = value_layer
# value_layer_assist = value_layer

In [3]:
class AI_Design:
    def __init__(self, steps = 4):        
        self.loss_fn = tf.keras.losses.mean_squared_error
        self.optimizer = tf.keras.optimizers.Adam(lr = 0.0001)
        self.batch_size = 128
        self.replay_buffer_size = 1024
        self.replay_buffer = Replay_Buffer(self.replay_buffer_size)
        self.epsilon = 1
        self.gamma = 0.9
        self.env = Environment()
        self.env.cells = np.array([[0.7, 0.1], [0.1, 0.1], [0.5, 0.7], [0.6, 0.2], [0.7, 0.4], [0.2, 0.9]])
#         self.env_cells = np.array([[0.7, 0.1]])
        self.env_cell_mapping = give_mapping(self.env.cells)
        self.env_cell_mapping = self.env_cell_mapping[np.newaxis, :, :, np.newaxis]
        #-------------------------------------------------------------------------------------------------
        input_A = Input(shape = (4,))
        input_B = Input(shape = (steps,6))
        input_C = Input(shape = (11, 11, 1)) #Location of every icon
                 
        action_user = Input(shape = 1, dtype = tf.int32)
        action_assist = Input(shape = 1, dtype = tf.int32)
        
        
        
        #User Network 
        
        
        x = Subtract()([input_A[:, 2:], input_A[:, :2]])
        x = dense_1_user(x)
        x = dense_2_user(x)
#         x = dense_3_user(x)
        adv_user = advantage_layer_user(x)
        val_user = value_layer_user(x)
        output_user = adv_user - tf.reduce_mean(adv_user, axis = 1, keepdims = True) + val_user
        
        self.user_model = Model(inputs = input_A, outputs = output_user)
        self.user_model.summary()
        
        self.target_user_model = tf.keras.models.clone_model(self.user_model)
        self.target_user_model.set_weights(self.user_model.get_weights())
        

        
        #Assistant Network
        z = tf.keras.layers.Conv2D(filters = 2, kernel_size = 3, activation = 'relu')(input_C)
        z = tf.keras.layers.MaxPooling2D()(z)
        z = tf.keras.layers.Flatten()(z)
        z = tf.keras.layers.Dense(32, activation = 'relu')(z)
        
        y = dense_1_assist(input_B)
        y = lstm_1_assist(y)
        y = tf.keras.layers.Concatenate()([y,z])
        y = dense_2_assist(y)
        adv_assist = advantage_layer_assist(y)
        val_assist = value_layer_assist(y)
        output_assist = adv_assist - tf.reduce_mean(adv_assist, axis = 1, keepdims = True) + val_assist
        
        self.assist_model = Model(inputs = [input_B, input_C], outputs = output_assist)
        self.assist_model.summary()
        
        self.target_assist_model = tf.keras.models.clone_model(self.assist_model)
        self.target_assist_model.set_weights(self.assist_model.get_weights())
        
        
        #Complete Network

        mask_user = tf.reduce_sum(tf.one_hot(action_user, 4), axis = 1)
        mask_assist = tf.reduce_sum(tf.one_hot(action_assist, 4), axis = 1)
        output_user = output_user*mask_user
        output_assist = output_assist*mask_assist
        
        out = tf.reduce_sum(output_user + output_assist, axis = 1, keepdims = True)
        
        self.model = Model(inputs = [input_A, input_B, input_C, action_user, action_assist], outputs = out)  
        self.model.summary() 
        #-------------------------------------------------------------------------------------------------
    
    def infer(self):
        ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
        reward_assist, next_ob_assist, done = self.sample_exp()
        
        ob_user = ob_user[:4]
        action_user = action_user[:4]
        reward_user = reward_user[:4]
        
        ob_assist = ob_assist[:4]
        action_assist = action_assist[:4]
        reward_assist = reward_assist[:4]
        
        print(action_user, action_assist)
        
        print(self.user_model(ob_user))
        print(self.assist_model([ob_assist, np.vstack(tuple([self.env_cell_mapping]*4))]))
        
        print(self.model([ob_user, ob_assist, np.vstack(tuple([self.env_cell_mapping]*4)), action_user, action_assist]))
    
    def exp_policy_user(self, state):
        if np.random.rand()<self.epsilon:
            return np.random.randint(4)
        else:
            state = np.array(state)[np.newaxis]
            Q_values = self.user_model(state)
            return np.argmax(Q_values[0])
    
    def exp_policy_assist(self, state):
        if np.random.rand()<self.epsilon:
            return np.random.randint(1,5)
        else:
            state = np.array(state)[np.newaxis]
            Q_values = self.assist_model([state, self.env_cell_mapping])
            return np.argmax(Q_values[0])+1
    
    def step(self, ob_user, prev_steps_assist):
        curr_loc = ob_user[:2]
        target_loc = ob_user[2:4]
        
        action_user = self.exp_policy_user(ob_user)
        action_user_one_hot = make_one_hot(action_user, 4)
        
        ob_assist = [action_user_one_hot + ob_user[:2]]
        ob_assist = prev_steps_assist + ob_assist 
        action_assist = self.exp_policy_assist(ob_assist)
        
        new_loc, reward_user, reward_assist, done = self.env.step(action_user, action_assist-1, target_loc, curr_loc)
        
        next_ob_user = new_loc[:]
        next_ob_user = next_ob_user + target_loc
        
        next_action_user = self.exp_policy_user(next_ob_user)
        next_action_user_one_hot = make_one_hot(next_action_user, 4)
        next_ob_assist = [next_action_user_one_hot + next_ob_user[:2]]
        next_ob_assist = ob_assist[1:] + next_ob_assist
        
        self.add_replay_buffer(ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                          action_assist-1, reward_assist, next_ob_assist, done)
        
        return next_ob_user, ob_assist[1:], reward_user, reward_assist, done 
        
        
    
    def add_replay_buffer(self, ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                         action_assist, reward_assist, next_ob_assist, done):
        
        self.replay_buffer.ob_user_history.append(ob_user)
        self.replay_buffer.action_user_history.append(action_user)
        self.replay_buffer.reward_user_history.append(reward_user)
        self.replay_buffer.next_ob_user_history.append(next_ob_user)
        self.replay_buffer.ob_assist_history.append(ob_assist)
        self.replay_buffer.action_assist_history.append(action_assist)
        self.replay_buffer.reward_assist_history.append(reward_assist)
        self.replay_buffer.next_ob_assist_history.append(next_ob_assist)
        self.replay_buffer.done_history.append(done)
    
    def sample_exp(self):
        indices = np.random.randint(len(self.replay_buffer.done_history), size = self.batch_size)
        
        ob_user = np.array([self.replay_buffer.ob_user_history[i] for i in indices])
        action_user = np.array([self.replay_buffer.action_user_history[i] for i in indices])
        reward_user = np.array([self.replay_buffer.reward_user_history[i] for i in indices])
        next_ob_user = np.array([self.replay_buffer.next_ob_user_history[i] for i in indices])
        ob_assist = np.array([self.replay_buffer.ob_assist_history[i] for i in indices])
        action_assist = np.array([self.replay_buffer.action_assist_history[i] for i in indices])
        reward_assist = np.array([self.replay_buffer.reward_assist_history[i] for i in indices])
        next_ob_assist = np.array([self.replay_buffer.next_ob_assist_history[i] for i in indices])
        done = np.array([self.replay_buffer.done_history[i] for i in indices])
        
        return ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist, reward_assist, next_ob_assist, done 
    
    def train(self):
        ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
        reward_assist, next_ob_assist, done = self.sample_exp()
        
        input_A = ob_user
        input_B = ob_assist
        input_C = np.vstack(tuple([self.env_cell_mapping]*128))
        
        rewards = reward_user + reward_assist
        
        next_Q_values_user, next_Q_values_assist = self.user_model(next_ob_user), self.assist_model([next_ob_assist, input_C])
        best_next_actions_user, best_next_actions_assist = tf.math.argmax(next_Q_values_user, axis = 1), tf.math.argmax(next_Q_values_assist, axis = 1)
        next_Q_values_user, next_Q_values_assist = self.target_user_model(next_ob_user), self.target_assist_model([next_ob_assist, input_C])
        
        best_next_Q_values_user = tf.reduce_sum(next_Q_values_user*tf.one_hot(best_next_actions_user, 4), axis = 1)
        best_next_Q_values_assist = tf.reduce_sum(next_Q_values_user*tf.one_hot(best_next_actions_assist, 4), axis = 1)
        best_next_Q_values = best_next_Q_values_user + best_next_Q_values_assist
        
        target_Q_values = rewards + (1-done)*self.gamma*best_next_Q_values
        
        with tf.GradientTape() as tape:
            Q_values = self.model([input_A, input_B, input_C, action_user, action_assist])
            loss = tf.reduce_mean(self.loss_fn(target_Q_values, Q_values))
        
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        
        Q_values_assist = self.assist_model([input_B, input_C])
        with tf.GradientTape() as tape:
            Q_values_user = self.user_model(input_A)
            loss = tf.reduce_mean(self.loss_fn(Q_values_assist, Q_values_user))
            
        grads = tape.gradient(loss, self.user_model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.user_model.trainable_variables))
            
        with tf.GradientTape() as tape:
            Q_values_assist = self.assist_model([input_B, input_C])
            loss = tf.reduce_mean(self.loss_fn(Q_values_assist, Q_values_user))
            
        grads = tape.gradient(loss, self.assist_model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.assist_model.trainable_variables))                          

        

In [4]:
steps = 4
model = AI_Design(steps)
env = model.env

# if os.path.exists('user_model.h5'):
#     model.user_model = tf.keras.models.load_model('user_model.h5')
#     model.assist_model = tf.keras.models.load_model('assist_model.h5')

Icon Locations:
[[0.  0.7]
 [0.4 0.1]
 [0.5 0.2]
 [0.1 0.6]
 [0.  0.5]
 [0.6 0.2]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
subtract (

In [5]:
def give_prev_steps(prev_steps_assist, steps):
    prev_steps_assist = [[0,0,0,0,-1,-1] for i in range(steps-1)]
    return prev_steps_assist

In [None]:
max_steps = 40
reached = 0
reached_history = []
max_reached = 0

running_reward = 0

for epoch in tqdm(range(100000)):
    done = False
    episode_reward = 0
    start, dest = env.give_start_dest()
    ob_user = [start[0], start[1], dest[0], dest[1]]
    prev_steps_assist = []
    prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
    step = 0
    
    while not done and step<max_steps:
        ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
        episode_reward+=reward_user
        step+=1
        if done:
            reached+=1
    
    if epoch:
        running_reward = 0.01 * episode_reward + (1 - 0.01) * running_reward
    else:
        running_reward = episode_reward
        
    if epoch>50:
        model.train()
        
        if epoch%100==0:
            model.target_user_model.set_weights(model.user_model.get_weights())
            model.target_assist_model.set_weights(model.assist_model.get_weights())
            reached_history.append(reached)
            rewards = []
            
            if reached>max_reached:
                print(reached)
                print('Saved Weights')
                max_reached = reached
                model.user_model.save('user_model.h5')
                model.assist_model.save('assist_model.h5')
                
            reached = 0
            
            if epoch%500==0:
                print(f'Running reward = {running_reward}')
                print(f'Successful runs = {np.mean(reached_history)}')
                reached_history = []
                
                if epoch%1000==0:
                    model.epsilon-=0.01
                    model.epsilon= max(model.epsilon, 0.1)
                    
                    if epoch%20000==0:
                        model.infer()

  0%|                                                                           | 101/100000 [00:07<3:07:17,  8.89it/s]

11
Saved Weights


  0%|▏                                                                          | 201/100000 [00:17<2:55:49,  9.46it/s]

16
Saved Weights


  0%|▏                                                                          | 301/100000 [00:26<2:51:53,  9.67it/s]

19
Saved Weights


  1%|▍                                                                          | 503/100000 [00:44<2:33:29, 10.80it/s]

Running reward = -31.43650802638874
Successful runs = 14.4


  1%|▋                                                                         | 1001/100000 [01:29<2:40:26, 10.28it/s]

20
Saved Weights
Running reward = -28.697423303229648
Successful runs = 14.6


  2%|█                                                                         | 1503/100000 [02:17<2:37:33, 10.42it/s]

Running reward = -31.486625444917436
Successful runs = 14.8


  2%|█▍                                                                        | 2002/100000 [03:05<2:39:40, 10.23it/s]

Running reward = -32.25125480890509
Successful runs = 14.4


  2%|█▋                                                                        | 2302/100000 [03:35<2:53:37,  9.38it/s]

22
Saved Weights


  3%|█▊                                                                        | 2502/100000 [03:55<3:04:16,  8.82it/s]

Running reward = -31.802785195423734
Successful runs = 15.2


  3%|██▏                                                                       | 3002/100000 [04:45<3:06:26,  8.67it/s]

Running reward = -33.35336209271583
Successful runs = 13.2


  4%|██▌                                                                       | 3502/100000 [05:48<3:40:52,  7.28it/s]

Running reward = -32.70429251798662
Successful runs = 13.4


  4%|██▉                                                                       | 4002/100000 [06:58<3:49:08,  6.98it/s]

Running reward = -31.784694320147896
Successful runs = 14.8


  5%|███▎                                                                      | 4502/100000 [08:10<4:00:38,  6.61it/s]

Running reward = -29.798331049464785
Successful runs = 14.0


  5%|███▌                                                                      | 4801/100000 [08:53<4:10:21,  6.34it/s]

23
Saved Weights


  5%|███▋                                                                      | 5002/100000 [09:22<3:52:16,  6.82it/s]

Running reward = -31.25184639215573
Successful runs = 14.2


  6%|████                                                                      | 5502/100000 [10:37<4:02:27,  6.50it/s]

Running reward = -29.56894491694164
Successful runs = 16.2


  6%|████▍                                                                     | 6002/100000 [11:52<3:39:30,  7.14it/s]

Running reward = -28.42417049613679
Successful runs = 15.8


  7%|████▊                                                                     | 6502/100000 [13:11<4:23:28,  5.91it/s]

Running reward = -30.2786235835914
Successful runs = 16.8


  7%|█████▏                                                                    | 7002/100000 [14:30<4:06:51,  6.28it/s]

Running reward = -30.6136866028099
Successful runs = 15.4


  8%|█████▌                                                                    | 7502/100000 [15:53<4:12:23,  6.11it/s]

Running reward = -30.077604349725814
Successful runs = 17.4


  8%|█████▉                                                                    | 8001/100000 [17:14<3:57:01,  6.47it/s]

Running reward = -32.56120721780768
Successful runs = 16.0


  9%|██████▎                                                                   | 8502/100000 [18:37<3:54:35,  6.50it/s]

Running reward = -30.526333020452252
Successful runs = 16.2


  9%|██████▋                                                                   | 9001/100000 [20:02<4:40:18,  5.41it/s]

Running reward = -29.117516972366165
Successful runs = 13.8


 10%|███████                                                                   | 9501/100000 [21:28<4:49:17,  5.21it/s]

Running reward = -32.17239496409615
Successful runs = 15.2


 10%|███████▎                                                                  | 9802/100000 [22:20<4:24:14,  5.69it/s]

24
Saved Weights


 10%|███████▎                                                                 | 10002/100000 [22:54<4:47:55,  5.21it/s]

Running reward = -27.945505628535404
Successful runs = 19.4


 11%|███████▋                                                                 | 10501/100000 [24:27<4:30:30,  5.51it/s]

Running reward = -31.321583315043494
Successful runs = 15.2


 11%|████████                                                                 | 11002/100000 [26:00<4:37:22,  5.35it/s]

Running reward = -28.390166722473143
Successful runs = 19.0


 12%|████████▍                                                                | 11502/100000 [27:34<4:40:47,  5.25it/s]

Running reward = -31.546375672439872
Successful runs = 14.8


 12%|████████▊                                                                | 12001/100000 [29:08<5:21:14,  4.57it/s]

Running reward = -27.22590970169884
Successful runs = 16.8


 13%|█████████▏                                                               | 12501/100000 [30:44<4:51:39,  5.00it/s]

Running reward = -30.318996208228555
Successful runs = 16.0


 13%|█████████▍                                                               | 13002/100000 [32:22<4:37:43,  5.22it/s]

Running reward = -31.545352945133484
Successful runs = 15.0


 14%|█████████▊                                                               | 13501/100000 [34:01<4:41:58,  5.11it/s]

Running reward = -31.24771875393635
Successful runs = 16.0


 14%|██████████▏                                                              | 14001/100000 [35:39<5:01:33,  4.75it/s]

Running reward = -30.88920553231847
Successful runs = 14.4


 15%|██████████▌                                                              | 14501/100000 [37:22<4:32:48,  5.22it/s]

Running reward = -30.685496636902347
Successful runs = 13.2


 15%|██████████▉                                                              | 15002/100000 [39:06<4:51:32,  4.86it/s]

Running reward = -33.579405355078606
Successful runs = 13.0


 16%|███████████▎                                                             | 15501/100000 [40:54<5:13:17,  4.50it/s]

Running reward = -31.580890106774806
Successful runs = 14.6


 16%|███████████▋                                                             | 16002/100000 [42:44<5:04:51,  4.59it/s]

Running reward = -31.475232634507933
Successful runs = 15.2


 17%|████████████                                                             | 16501/100000 [44:35<5:04:00,  4.58it/s]

Running reward = -30.94476711634773
Successful runs = 14.6


 17%|████████████▍                                                            | 17001/100000 [46:26<4:53:06,  4.72it/s]

Running reward = -30.388362627024577
Successful runs = 14.2


 18%|████████████▊                                                            | 17501/100000 [48:21<4:59:34,  4.59it/s]

Running reward = -31.019678946917406
Successful runs = 14.4


 18%|█████████████▏                                                           | 18001/100000 [50:16<5:04:35,  4.49it/s]

Running reward = -30.903150058331917
Successful runs = 12.4


 19%|█████████████▌                                                           | 18501/100000 [52:10<5:29:43,  4.12it/s]

Running reward = -30.87239103126247
Successful runs = 17.0


 19%|█████████████▊                                                           | 19001/100000 [54:08<5:33:17,  4.05it/s]

Running reward = -33.187506816873096
Successful runs = 11.6


 20%|██████████████▏                                                          | 19501/100000 [56:08<5:17:25,  4.23it/s]

Running reward = -30.970381890680116
Successful runs = 16.2


 20%|██████████████▌                                                          | 20000/100000 [58:10<5:45:25,  3.86it/s]

Running reward = -30.303536842920447
Successful runs = 14.4
[0 2 2 2] [2 2 3 2]
tf.Tensor(
[[-7.79068   -7.7958894 -7.7708178 -7.7831235]
 [-7.7747803 -7.804318  -7.758773  -7.787738 ]
 [-7.790104  -7.7949305 -7.762935  -7.786096 ]
 [-7.859129  -7.8803444 -7.839949  -7.8496323]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-7.8550425 -7.8533134 -7.8723316 -7.8477383]
 [-7.868834  -7.8555293 -7.874868  -7.8699155]
 [-7.8560615 -7.8626227 -7.870289  -7.857396 ]
 [-7.8920875 -7.886419  -7.883605  -7.8810596]], shape=(4, 4), dtype=float32)


 20%|██████████████▌                                                          | 20001/100000 [58:11<7:50:07,  2.84it/s]

tf.Tensor(
[[-15.663012]
 [-15.63364 ]
 [-15.620331]
 [-15.723555]], shape=(4, 1), dtype=float32)


 21%|██████████████▌                                                        | 20501/100000 [1:00:16<6:12:30,  3.56it/s]

Running reward = -33.0653722433872
Successful runs = 12.2


 21%|██████████████▉                                                        | 21001/100000 [1:02:20<5:09:07,  4.26it/s]

Running reward = -28.17583888204841
Successful runs = 16.8


 22%|███████████████▎                                                       | 21501/100000 [1:04:27<6:32:33,  3.33it/s]

Running reward = -30.301982247471848
Successful runs = 18.0


 22%|███████████████▌                                                       | 22001/100000 [1:06:34<5:28:51,  3.95it/s]

Running reward = -31.705593460341994
Successful runs = 15.4


 23%|███████████████▉                                                       | 22501/100000 [1:08:43<6:11:55,  3.47it/s]

Running reward = -30.063223123365365
Successful runs = 16.0


 23%|████████████████▎                                                      | 23001/100000 [1:10:50<5:21:58,  3.99it/s]

Running reward = -32.08405963639622
Successful runs = 17.2


 24%|████████████████▋                                                      | 23501/100000 [1:13:01<5:40:08,  3.75it/s]

Running reward = -31.800162017703368
Successful runs = 15.2


 24%|█████████████████                                                      | 24001/100000 [1:15:14<6:11:18,  3.41it/s]

Running reward = -28.60540435332821
Successful runs = 15.6


 25%|█████████████████▍                                                     | 24501/100000 [1:17:30<5:47:01,  3.63it/s]

Running reward = -31.007510525893444
Successful runs = 17.6


 25%|█████████████████▊                                                     | 25001/100000 [1:19:46<5:49:03,  3.58it/s]

Running reward = -30.918856032962132
Successful runs = 14.8


 26%|██████████████████                                                     | 25501/100000 [1:22:04<6:39:33,  3.11it/s]

Running reward = -29.423688154822344
Successful runs = 15.0


 26%|██████████████████▍                                                    | 26001/100000 [1:24:24<5:53:52,  3.49it/s]

Running reward = -30.86347405879402
Successful runs = 15.6


 27%|██████████████████▊                                                    | 26501/100000 [1:26:46<6:38:54,  3.07it/s]

Running reward = -32.17759886426692
Successful runs = 13.0


 27%|███████████████████▏                                                   | 27001/100000 [1:29:08<5:51:17,  3.46it/s]

Running reward = -30.65809943626632
Successful runs = 13.8


 28%|███████████████████▌                                                   | 27501/100000 [1:31:32<6:25:56,  3.13it/s]

Running reward = -29.92510099543198
Successful runs = 15.6


 28%|███████████████████▉                                                   | 28001/100000 [1:33:58<6:03:34,  3.30it/s]

Running reward = -31.63744600463226
Successful runs = 16.0


 29%|████████████████████▏                                                  | 28501/100000 [1:36:25<5:10:19,  3.84it/s]

Running reward = -28.439147174428676
Successful runs = 17.0


 29%|████████████████████▌                                                  | 29001/100000 [1:38:54<5:58:55,  3.30it/s]

Running reward = -30.795454079876496
Successful runs = 17.0


 30%|████████████████████▉                                                  | 29501/100000 [1:41:30<6:25:08,  3.05it/s]

Running reward = -31.83450701294327
Successful runs = 11.2


 30%|█████████████████████▎                                                 | 30001/100000 [1:44:04<6:00:01,  3.24it/s]

Running reward = -30.703967889643437
Successful runs = 14.2


 31%|█████████████████████▋                                                 | 30501/100000 [1:46:42<6:46:13,  2.85it/s]

Running reward = -30.00955647647404
Successful runs = 14.8


 31%|██████████████████████                                                 | 31001/100000 [1:49:17<5:34:58,  3.43it/s]

Running reward = -29.402410006461295
Successful runs = 16.2


 31%|██████████████████████                                                 | 31101/100000 [1:49:46<6:58:46,  2.74it/s]

30
Saved Weights


 32%|██████████████████████▎                                                | 31501/100000 [1:51:54<6:25:52,  2.96it/s]

Running reward = -30.428788939835314
Successful runs = 17.6


 32%|██████████████████████▋                                                | 32001/100000 [1:54:45<5:21:37,  3.52it/s]

Running reward = -29.83707651278348
Successful runs = 15.2


 33%|███████████████████████                                                | 32501/100000 [1:57:31<7:13:00,  2.60it/s]

Running reward = -29.131139586829235
Successful runs = 18.2


 33%|███████████████████████▍                                               | 33001/100000 [2:00:18<6:50:23,  2.72it/s]

Running reward = -30.52047146866378
Successful runs = 17.4


 34%|███████████████████████▊                                               | 33501/100000 [2:03:10<6:27:57,  2.86it/s]

Running reward = -30.40467961825848
Successful runs = 14.6


 34%|████████████████████████▏                                              | 34001/100000 [2:06:03<5:43:04,  3.21it/s]

Running reward = -28.424565557112754
Successful runs = 15.0


 35%|████████████████████████▍                                              | 34502/100000 [2:08:56<5:47:16,  3.14it/s]

Running reward = -31.933447574630655
Successful runs = 13.6


 35%|████████████████████████▊                                              | 35001/100000 [2:11:45<6:31:30,  2.77it/s]

Running reward = -28.87366839340527
Successful runs = 18.0


 36%|█████████████████████████▏                                             | 35501/100000 [2:14:37<6:37:45,  2.70it/s]

Running reward = -28.934105715126712
Successful runs = 17.2


 36%|█████████████████████████▌                                             | 36001/100000 [2:17:27<5:46:32,  3.08it/s]

Running reward = -29.036767771085326
Successful runs = 19.4


 37%|█████████████████████████▉                                             | 36501/100000 [2:20:19<7:08:26,  2.47it/s]

Running reward = -28.97104143043622
Successful runs = 19.2


 37%|██████████████████████████▎                                            | 37002/100000 [2:23:13<5:10:59,  3.38it/s]

Running reward = -29.911228088473738
Successful runs = 16.2


 38%|██████████████████████████▋                                            | 37501/100000 [2:26:07<6:21:49,  2.73it/s]

Running reward = -27.334432303791548
Successful runs = 19.0


 38%|██████████████████████████▉                                            | 38001/100000 [2:29:03<5:58:28,  2.88it/s]

Running reward = -28.67841002607562
Successful runs = 17.4


 39%|███████████████████████████▎                                           | 38501/100000 [2:32:04<6:48:43,  2.51it/s]

Running reward = -30.268474517132482
Successful runs = 20.8


 39%|███████████████████████████▋                                           | 39001/100000 [2:35:07<5:45:02,  2.95it/s]

Running reward = -29.551989870139554
Successful runs = 17.0


 40%|████████████████████████████                                           | 39501/100000 [2:38:13<7:26:13,  2.26it/s]

Running reward = -29.285323101484867
Successful runs = 17.6


 40%|████████████████████████████▍                                          | 40001/100000 [2:41:28<7:48:11,  2.14it/s]

Running reward = -31.47359767445458
Successful runs = 17.4
[2 0 1 2] [2 2 2 0]
tf.Tensor(
[[-7.787983  -7.8182344 -7.824981  -7.8415165]
 [-7.8267107 -7.8281994 -7.858528  -7.8709955]
 [-7.8040953 -7.814914  -7.8300796 -7.8488655]
 [-7.7932525 -7.812662  -7.822915  -7.830002 ]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-7.8905807 -7.875672  -7.8410797 -7.8869686]
 [-7.890931  -7.869622  -7.844295  -7.881903 ]
 [-7.8900456 -7.8759847 -7.8272476 -7.8797445]
 [-7.884653  -7.8644867 -7.8415422 -7.8777823]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-15.666061]
 [-15.671005]
 [-15.642162]
 [-15.707568]], shape=(4, 1), dtype=float32)


 41%|████████████████████████████▊                                          | 40501/100000 [2:44:37<6:18:01,  2.62it/s]

Running reward = -27.663281391717106
Successful runs = 20.2


 41%|█████████████████████████████                                          | 41001/100000 [2:47:47<6:24:06,  2.56it/s]

Running reward = -27.13940723459444
Successful runs = 20.0


 42%|█████████████████████████████▍                                         | 41501/100000 [2:50:59<5:32:20,  2.93it/s]

Running reward = -25.12853485519664
Successful runs = 21.2


 42%|█████████████████████████████▊                                         | 42001/100000 [2:54:11<7:11:16,  2.24it/s]

Running reward = -30.16529690953231
Successful runs = 19.0


 43%|██████████████████████████████▏                                        | 42501/100000 [2:57:21<6:43:17,  2.38it/s]

Running reward = -28.02923557682393
Successful runs = 21.8


 43%|██████████████████████████████▌                                        | 43001/100000 [3:00:33<6:09:09,  2.57it/s]

Running reward = -24.418523738761312
Successful runs = 21.8


 44%|██████████████████████████████▉                                        | 43501/100000 [3:03:45<5:17:20,  2.97it/s]

Running reward = -28.0281073737585
Successful runs = 20.4


 44%|███████████████████████████████▏                                       | 44002/100000 [3:06:59<5:36:44,  2.77it/s]

Running reward = -27.7127338291723
Successful runs = 19.4


 45%|███████████████████████████████▌                                       | 44501/100000 [3:10:15<5:57:55,  2.58it/s]

Running reward = -23.61732156261155
Successful runs = 22.6


 45%|███████████████████████████████▉                                       | 45001/100000 [3:13:33<5:27:34,  2.80it/s]

Running reward = -29.789332357449606
Successful runs = 18.6


 46%|████████████████████████████████▎                                      | 45501/100000 [3:16:59<6:43:31,  2.25it/s]

Running reward = -30.966795104292185
Successful runs = 17.2


 46%|████████████████████████████████▋                                      | 46001/100000 [3:20:33<5:55:51,  2.53it/s]

Running reward = -25.333654452212464
Successful runs = 18.8


 47%|█████████████████████████████████                                      | 46501/100000 [3:24:01<6:50:54,  2.17it/s]

Running reward = -27.383142982334217
Successful runs = 18.8


 47%|█████████████████████████████████▎                                     | 47001/100000 [3:27:23<6:35:48,  2.23it/s]

Running reward = -23.989145798391654
Successful runs = 24.6


 48%|█████████████████████████████████▋                                     | 47501/100000 [3:30:49<6:15:08,  2.33it/s]

Running reward = -26.11009900327337
Successful runs = 24.4


 48%|██████████████████████████████████                                     | 48001/100000 [3:34:18<6:32:06,  2.21it/s]

Running reward = -23.627923486420027
Successful runs = 21.4


 48%|██████████████████████████████████▎                                    | 48401/100000 [3:37:07<8:29:52,  1.69it/s]

31
Saved Weights


 49%|██████████████████████████████████▍                                    | 48502/100000 [3:37:48<5:17:51,  2.70it/s]

Running reward = -23.02438784446687
Successful runs = 25.0


 49%|██████████████████████████████████▌                                    | 48701/100000 [3:39:09<6:20:56,  2.24it/s]

34
Saved Weights


 49%|██████████████████████████████████▊                                    | 49001/100000 [3:41:11<5:29:21,  2.58it/s]

Running reward = -21.501758548875372
Successful runs = 29.2


 50%|███████████████████████████████████▏                                   | 49501/100000 [3:44:46<6:45:41,  2.07it/s]

Running reward = -25.514901402987356
Successful runs = 23.2


 50%|███████████████████████████████████▌                                   | 50001/100000 [3:48:21<6:17:48,  2.21it/s]

Running reward = -26.466366717838092
Successful runs = 22.0


 51%|███████████████████████████████████▊                                   | 50501/100000 [3:52:03<6:57:44,  1.97it/s]

Running reward = -30.1324265262029
Successful runs = 17.6


 51%|████████████████████████████████████▏                                  | 51001/100000 [3:55:40<6:23:38,  2.13it/s]

Running reward = -22.69120885988031
Successful runs = 24.2


 51%|████████████████████████████████████▍                                  | 51401/100000 [3:58:36<6:06:45,  2.21it/s]

39
Saved Weights


 52%|████████████████████████████████████▌                                  | 51501/100000 [3:59:20<6:28:45,  2.08it/s]

Running reward = -23.42791736777305
Successful runs = 26.0


 52%|████████████████████████████████████▉                                  | 52001/100000 [4:02:59<6:36:00,  2.02it/s]

Running reward = -29.416739885901425
Successful runs = 21.4


 53%|█████████████████████████████████████▎                                 | 52501/100000 [4:06:42<5:03:47,  2.61it/s]

Running reward = -25.517565693578394
Successful runs = 24.2


 53%|█████████████████████████████████████▋                                 | 53001/100000 [4:10:22<6:13:52,  2.10it/s]

Running reward = -24.923783939538033
Successful runs = 24.2


 54%|█████████████████████████████████████▉                                 | 53501/100000 [4:14:07<6:36:34,  1.95it/s]

Running reward = -27.6345413311252
Successful runs = 24.2


 54%|██████████████████████████████████████▎                                | 54001/100000 [4:17:53<5:24:12,  2.36it/s]

Running reward = -21.385224747504388
Successful runs = 26.4


 55%|██████████████████████████████████████▋                                | 54501/100000 [4:21:43<6:54:02,  1.83it/s]

Running reward = -29.491689826121938
Successful runs = 20.0


 55%|███████████████████████████████████████                                | 55001/100000 [4:25:38<6:26:59,  1.94it/s]

Running reward = -27.691728285123194
Successful runs = 19.2


 56%|███████████████████████████████████████▍                               | 55501/100000 [4:29:38<6:32:52,  1.89it/s]

Running reward = -28.998457180135237
Successful runs = 19.0


 56%|███████████████████████████████████████▊                               | 56001/100000 [4:33:26<6:40:17,  1.83it/s]

Running reward = -24.212798887618934
Successful runs = 25.2


 57%|████████████████████████████████████████                               | 56501/100000 [4:37:21<6:31:45,  1.85it/s]

Running reward = -27.76143498467789
Successful runs = 21.0


 57%|████████████████████████████████████████▍                              | 57001/100000 [4:41:16<4:47:24,  2.49it/s]

Running reward = -21.89106078836871
Successful runs = 24.4


 58%|████████████████████████████████████████▊                              | 57501/100000 [4:45:10<5:59:01,  1.97it/s]

Running reward = -26.044128475542024
Successful runs = 23.0


 58%|█████████████████████████████████████████▏                             | 58001/100000 [4:49:08<5:39:18,  2.06it/s]

Running reward = -23.67726658041073
Successful runs = 24.6


 59%|█████████████████████████████████████████▌                             | 58501/100000 [4:53:09<4:01:50,  2.86it/s]

Running reward = -23.793973337397567
Successful runs = 22.6


 59%|█████████████████████████████████████████▉                             | 59001/100000 [4:57:06<5:26:13,  2.09it/s]

Running reward = -25.785210625403508
Successful runs = 22.6


 60%|██████████████████████████████████████████▏                            | 59501/100000 [5:01:14<5:12:46,  2.16it/s]

Running reward = -20.991780322685354
Successful runs = 23.4


 60%|██████████████████████████████████████████▍                            | 59701/100000 [5:02:48<6:12:03,  1.81it/s]

47
Saved Weights


 60%|██████████████████████████████████████████▌                            | 60001/100000 [5:05:10<5:26:01,  2.04it/s]

Running reward = -21.4657034226876
Successful runs = 29.8
[3 1 1 0] [0 3 3 0]
tf.Tensor(
[[-6.532217  -6.553175  -6.53979   -6.5302973]
 [-6.5905075 -6.5521355 -6.5485935 -6.5877724]
 [-6.618246  -6.5689435 -6.5808744 -6.6017714]
 [-6.5038395 -6.526138  -6.4938693 -6.510546 ]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-6.4731336 -6.5115495 -6.5084176 -6.4727945]
 [-6.4904747 -6.532207  -6.4996624 -6.480332 ]
 [-6.4990706 -6.531042  -6.5068665 -6.4781322]
 [-6.48803   -6.527807  -6.5189176 -6.4746447]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-13.003431]
 [-13.032467]
 [-13.047075]
 [-12.991869]], shape=(4, 1), dtype=float32)


 61%|██████████████████████████████████████████▉                            | 60501/100000 [5:09:08<4:39:15,  2.36it/s]

Running reward = -17.426066045473263
Successful runs = 29.4


 61%|███████████████████████████████████████████▎                           | 61001/100000 [5:13:04<6:03:09,  1.79it/s]

Running reward = -24.588790768970764
Successful runs = 28.0


 62%|███████████████████████████████████████████▋                           | 61501/100000 [5:17:07<4:55:37,  2.17it/s]

Running reward = -20.491116631639777
Successful runs = 29.2


 62%|████████████████████████████████████████████                           | 62001/100000 [5:21:14<5:42:22,  1.85it/s]

Running reward = -30.67903883055426
Successful runs = 27.4


 63%|████████████████████████████████████████████▍                          | 62501/100000 [5:25:09<4:12:57,  2.47it/s]

Running reward = -14.612279811856684
Successful runs = 36.0


 63%|████████████████████████████████████████████▋                          | 63001/100000 [5:29:07<5:48:39,  1.77it/s]

Running reward = -19.94813987519082
Successful runs = 34.8


 64%|█████████████████████████████████████████████                          | 63501/100000 [5:33:13<4:46:32,  2.12it/s]

Running reward = -22.002317704450007
Successful runs = 31.0


 64%|█████████████████████████████████████████████▍                         | 64001/100000 [5:37:13<4:14:44,  2.36it/s]

Running reward = -20.127071044581132
Successful runs = 34.6


 65%|█████████████████████████████████████████████▊                         | 64501/100000 [5:41:19<5:48:01,  1.70it/s]

Running reward = -19.614979023822734
Successful runs = 30.4


 65%|██████████████████████████████████████████████                         | 64902/100000 [5:44:30<4:30:04,  2.17it/s]

52
Saved Weights


 65%|██████████████████████████████████████████████▏                        | 65001/100000 [5:45:21<4:27:14,  2.18it/s]

Running reward = -19.324269531071078
Successful runs = 35.0


 66%|██████████████████████████████████████████████▌                        | 65501/100000 [5:49:24<3:51:12,  2.49it/s]

Running reward = -17.337569820939976
Successful runs = 35.2


 66%|██████████████████████████████████████████████▊                        | 66001/100000 [5:53:16<4:50:55,  1.95it/s]

Running reward = -10.863994780204823
Successful runs = 43.0


 67%|███████████████████████████████████████████████▏                       | 66502/100000 [5:57:26<2:59:41,  3.11it/s]

Running reward = -15.296252748261502
Successful runs = 35.2


 67%|███████████████████████████████████████████████▌                       | 67001/100000 [6:01:39<4:15:06,  2.16it/s]

Running reward = -18.80467680328297
Successful runs = 32.8


 68%|███████████████████████████████████████████████▉                       | 67501/100000 [6:05:45<4:48:01,  1.88it/s]

Running reward = -17.95025978387407
Successful runs = 35.2


 68%|████████████████████████████████████████████████▎                      | 68001/100000 [6:10:06<5:10:00,  1.72it/s]

Running reward = -20.539787439548395
Successful runs = 28.4


 69%|████████████████████████████████████████████████▋                      | 68501/100000 [6:14:26<5:28:45,  1.60it/s]

Running reward = -26.540976990955855
Successful runs = 25.8


 69%|████████████████████████████████████████████████▉                      | 69001/100000 [6:18:41<4:02:35,  2.13it/s]

Running reward = -19.96725637004155
Successful runs = 31.2


 70%|█████████████████████████████████████████████████▎                     | 69501/100000 [6:23:05<4:49:57,  1.75it/s]

Running reward = -22.95852154383612
Successful runs = 26.8


 70%|█████████████████████████████████████████████████▋                     | 70001/100000 [6:27:27<4:10:46,  1.99it/s]

Running reward = -19.332340215499848
Successful runs = 29.6


 71%|██████████████████████████████████████████████████                     | 70501/100000 [6:31:52<5:00:10,  1.64it/s]

Running reward = -28.46345464798025
Successful runs = 29.0


 71%|██████████████████████████████████████████████████▍                    | 71001/100000 [6:36:26<4:49:11,  1.67it/s]

Running reward = -20.97357506796954
Successful runs = 24.2


 72%|██████████████████████████████████████████████████▊                    | 71501/100000 [6:40:58<2:50:11,  2.79it/s]

Running reward = -21.80694730631571
Successful runs = 25.8


 72%|███████████████████████████████████████████████████                    | 72001/100000 [6:45:50<4:33:48,  1.70it/s]

Running reward = -24.29403290874008
Successful runs = 26.6


 73%|███████████████████████████████████████████████████▍                   | 72501/100000 [6:50:44<3:47:25,  2.02it/s]

Running reward = -20.5320475563788
Successful runs = 30.6


 73%|███████████████████████████████████████████████████▊                   | 73001/100000 [6:55:37<3:54:12,  1.92it/s]

Running reward = -12.616534707848587
Successful runs = 32.0


 74%|████████████████████████████████████████████████████▏                  | 73502/100000 [7:00:59<3:05:49,  2.38it/s]

Running reward = -20.14204752192884
Successful runs = 30.2


 74%|████████████████████████████████████████████████████▌                  | 74001/100000 [7:06:00<4:15:57,  1.69it/s]

Running reward = -15.795500411151602
Successful runs = 32.6


 75%|████████████████████████████████████████████████████▉                  | 74501/100000 [7:10:57<2:52:49,  2.46it/s]

Running reward = -14.810623904652616
Successful runs = 36.6


 75%|█████████████████████████████████████████████████████▎                 | 75001/100000 [7:15:24<3:43:25,  1.86it/s]

Running reward = -13.131617172291202
Successful runs = 35.8


 76%|█████████████████████████████████████████████████████▌                 | 75501/100000 [7:20:01<4:19:17,  1.57it/s]

Running reward = -13.65549434486931
Successful runs = 34.2


 76%|█████████████████████████████████████████████████████▉                 | 76001/100000 [7:24:54<4:33:12,  1.46it/s]

Running reward = -15.913274354459958
Successful runs = 34.2


 77%|██████████████████████████████████████████████████████▎                | 76502/100000 [7:30:12<3:23:42,  1.92it/s]

Running reward = -26.31297519738608
Successful runs = 24.0


 77%|██████████████████████████████████████████████████████▋                | 77001/100000 [7:35:35<4:31:02,  1.41it/s]

Running reward = -21.870284586948255
Successful runs = 26.0


 78%|███████████████████████████████████████████████████████                | 77501/100000 [7:40:50<3:54:41,  1.60it/s]

Running reward = -21.733164887000786
Successful runs = 32.4


 78%|███████████████████████████████████████████████████████▍               | 78001/100000 [7:46:07<4:04:01,  1.50it/s]

Running reward = -17.389016296279106
Successful runs = 32.4


 79%|███████████████████████████████████████████████████████▋               | 78501/100000 [7:51:21<3:40:57,  1.62it/s]

Running reward = -20.05893777136737
Successful runs = 31.2


 79%|████████████████████████████████████████████████████████               | 79001/100000 [7:56:22<3:48:33,  1.53it/s]

Running reward = -22.348375558684115
Successful runs = 29.6


 80%|████████████████████████████████████████████████████████▍              | 79501/100000 [8:01:04<3:00:56,  1.89it/s]

Running reward = -20.40083572358375
Successful runs = 32.8


 80%|████████████████████████████████████████████████████████▊              | 80001/100000 [8:05:40<3:02:12,  1.83it/s]

Running reward = -14.574413689235483
Successful runs = 33.2
[2 2 3 0] [1 0 1 1]
tf.Tensor(
[[-5.5085535 -5.5370593 -5.431566  -5.4577246]
 [-5.5024157 -5.500074  -5.4404697 -5.444213 ]
 [-5.497534  -5.5158463 -5.4491315 -5.439283 ]
 [-5.4870586 -5.4847755 -5.444903  -5.4266295]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-5.412379  -5.2956285 -5.357549  -5.447106 ]
 [-5.4033375 -5.2728114 -5.347072  -5.429857 ]
 [-5.411518  -5.2996006 -5.3613286 -5.442031 ]
 [-5.400357  -5.2769184 -5.3287954 -5.4090023]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-10.727195]
 [-10.843807]
 [-10.738884]
 [-10.763977]], shape=(4, 1), dtype=float32)


 81%|█████████████████████████████████████████████████████████▏             | 80501/100000 [8:10:14<2:35:50,  2.09it/s]

Running reward = -14.157307464721463
Successful runs = 37.8


 81%|█████████████████████████████████████████████████████████▌             | 81002/100000 [8:14:55<2:22:37,  2.22it/s]

Running reward = -17.315782460173182
Successful runs = 35.8


 82%|█████████████████████████████████████████████████████████▊             | 81501/100000 [8:19:49<3:38:18,  1.41it/s]

Running reward = -25.056623259248962
Successful runs = 28.0


 82%|██████████████████████████████████████████████████████████▏            | 82001/100000 [8:24:47<2:19:37,  2.15it/s]

Running reward = -19.465618926788018
Successful runs = 23.6


 83%|██████████████████████████████████████████████████████████▌            | 82501/100000 [8:29:43<3:18:47,  1.47it/s]

Running reward = -23.379774923504453
Successful runs = 28.4


 83%|██████████████████████████████████████████████████████████▉            | 83001/100000 [8:34:29<2:58:51,  1.58it/s]

Running reward = -24.399635609574663
Successful runs = 29.0


 84%|███████████████████████████████████████████████████████████▎           | 83501/100000 [8:39:25<2:04:30,  2.21it/s]

Running reward = -22.0323165491062
Successful runs = 28.2


 84%|███████████████████████████████████████████████████████████▋           | 84001/100000 [8:44:14<2:12:06,  2.02it/s]

Running reward = -18.42117207886849
Successful runs = 31.2


 85%|███████████████████████████████████████████████████████████▉           | 84501/100000 [8:49:20<2:54:59,  1.48it/s]

Running reward = -23.78669029958153
Successful runs = 23.2


 85%|████████████████████████████████████████████████████████████▎          | 85001/100000 [8:54:24<2:37:37,  1.59it/s]

Running reward = -26.80729535816823
Successful runs = 25.4


 85%|████████████████████████████████████████████████████████████▎          | 85035/100000 [8:54:50<2:37:57,  1.58it/s]

In [7]:
model.infer()

[2 1 2 2] [3 0 3 0]
tf.Tensor(
[[-6.5603757 -6.5349255 -6.5428753 -6.5529613]
 [-6.5629635 -6.51628   -6.548561  -6.5517893]
 [-6.5603757 -6.5349255 -6.5428753 -6.552961 ]
 [-6.5908575 -6.665     -6.6308084 -6.6365347]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-6.600309  -6.635875  -6.6165543 -6.585725 ]
 [-6.6174035 -6.7201543 -6.600407  -6.607241 ]
 [-6.613685  -6.6635113 -6.6119084 -6.6085525]
 [-6.6126423 -6.6687827 -6.618299  -6.608508 ]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[-13.1286  ]
 [-13.133684]
 [-13.151428]
 [-13.24345 ]], shape=(4, 1), dtype=float32)


In [10]:
done = False
episode_reward = 0
start, dest = env.give_start_dest()
ob_user = [start[0], start[1], dest[0], dest[1]]
prev_steps_assist = []
prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
step = 0

while not done and step<max_steps:
    ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
    episode_reward+=reward_user
    step+=1
    print(ob_user)
    
print(done)

[0.2, 0.1, 0.7, 0.4]
[0.5, 0.1, 0.7, 0.4]
[0.8, 0.1, 0.7, 0.4]
[0.6, 0.1, 0.7, 0.4]
[0.7, 0.1, 0.7, 0.4]
[1.0, 0.1, 0.7, 0.4]
[0.7, 0.1, 0.7, 0.4]
[1.0, 0.1, 0.7, 0.4]
[0.7, 0.1, 0.7, 0.4]
[1.0, 0.1, 0.7, 0.4]
[0.7, 0.1, 0.7, 0.4]
[1.0, 0.1, 0.7, 0.4]
[0.7, 0.1, 0.7, 0.4]
[0.7, 0.0, 0.7, 0.4]
[0.4, 0.0, 0.7, 0.4]
[0.4, 0.0, 0.7, 0.4]
[0.7, 0.0, 0.7, 0.4]
[1.0, 0.0, 0.7, 0.4]
[0.9, 0.0, 0.7, 0.4]
[0.6, 0.0, 0.7, 0.4]
[0.9, 0.0, 0.7, 0.4]
[0.9, 0.0, 0.7, 0.4]
[0.9, 0.3, 0.7, 0.4]
[0.6, 0.3, 0.7, 0.4]
[0.8, 0.3, 0.7, 0.4]
[0.5, 0.3, 0.7, 0.4]
[0.8, 0.3, 0.7, 0.4]
[0.5, 0.3, 0.7, 0.4]
[0.8, 0.3, 0.7, 0.4]
[0.5, 0.3, 0.7, 0.4]
[0.8, 0.3, 0.7, 0.4]
[0.5, 0.3, 0.7, 0.4]
[0.8, 0.3, 0.7, 0.4]
[0.7, 0.3, 0.7, 0.4]
[0.4, 0.3, 0.7, 0.4]
[0.4, 0.3, 0.7, 0.4]
[0.7, 0.3, 0.7, 0.4]
[0.7, 0.5, 0.7, 0.4]
[0.7, 0.8, 0.7, 0.4]
[0.7, 0.6, 0.7, 0.4]
0
