In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, LSTM, Subtract
from tensorflow.keras.models import Model
from Environment import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from copy import deepcopy
import os 

In [2]:
dense_1_user =  Dense(32, activation = 'relu')
dense_2_user =  Dense(32, activation = 'relu')
# dense_3_user =  Dense(32, activation = 'relu')

dense_1_assist =  Dense(32, activation = 'relu')
lstm_1_assist = LSTM(32, activation = 'relu')
# dense_2_assist = Dense(32, activation = 'relu')

advantage_layer_user = Dense(4)
value_layer_user = Dense(1)

advantage_layer_assist = Dense(4)
value_layer_assist = Dense(1)

advantage_layer = Dense(4)
value_layer = Dense(1)

advantage_layer_user = advantage_layer
advantage_layer_assist = advantage_layer

value_layer_user = value_layer
value_layer_assist = value_layer



In [3]:
class AI_Design:
    def __init__(self, steps = 4):        
        self.loss_fn = tf.keras.losses.mean_squared_error
        self.optimizer = tf.keras.optimizers.Adam(lr = 0.0001)
        self.batch_size = 128
        self.replay_buffer_size = 1024
        self.replay_buffer = Replay_Buffer(self.replay_buffer_size)
        self.epsilon = 1
        self.gamma = 0.9
        self.env = Environment()
        self.env.cells = np.array([[0.7, 0.1], [0.1, 0.1], [0.5, 0.7], [0.6, 0.2], [0.7, 0.4], [0.2, 0.9]])
        
        #-------------------------------------------------------------------------------------------------
        input_A = Input(shape = (4,))
        input_B = Input(shape = (steps,6))
        action_user = Input(shape = 1, dtype = tf.int32)
        action_assist = Input(shape = 1, dtype = tf.int32)
        
        x = Subtract()([input_A[:, 2:], input_A[:, :2]])
        x = dense_1_user(x)
        x = dense_2_user(x)
#         x = dense_3_user(x)
        adv_user = advantage_layer_user(x)
        val_user = value_layer_user(x)
        output_user = adv_user - tf.reduce_mean(adv_user, axis = 1, keepdims = True) + val_user
        
        self.user_model = Model(inputs = input_A, outputs = output_user)
        self.user_model.summary()
        
        self.target_user_model = tf.keras.models.clone_model(self.user_model)
        self.target_user_model.set_weights(self.user_model.get_weights())
        

        
        y = dense_1_assist(input_B)
        y = lstm_1_assist(y)
#         y = dense_2_assist(y)
        adv_assist = advantage_layer_assist(y)
        val_assist = value_layer_assist(y)
        output_assist = adv_assist - tf.reduce_mean(adv_assist, axis = 1, keepdims = True) + val_assist
        
        self.assist_model = Model(inputs = input_B, outputs = output_assist)
        self.assist_model.summary()
        
        self.target_assist_model = tf.keras.models.clone_model(self.assist_model)
        self.target_assist_model.set_weights(self.assist_model.get_weights())
        
        mask_user = tf.reduce_sum(tf.one_hot(action_user, 4), axis = 1)
        mask_assist = tf.reduce_sum(tf.one_hot(action_assist, 4), axis = 1)
        output_user = output_user*mask_user
        output_assist = output_assist*mask_assist
        
        out = tf.reduce_sum(output_user + output_assist, axis = 1, keepdims = True)
        
        self.model = Model(inputs = [input_A, input_B, action_user, action_assist], outputs = out)  
        self.model.summary() 
        #-------------------------------------------------------------------------------------------------
    
    def infer(self):
        ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
        reward_assist, next_ob_assist, done = self.sample_exp()
        
        ob_user = ob_user[1:4]
        action_user = action_user[1:4]
        reward_user = reward_user[1:4]
        
        ob_assist = ob_assist[1:4]
        action_assist = action_assist[1:4]
        reward_assist = reward_assist[1:4]
        
        print(action_user, action_assist)
        
        print(self.user_model(ob_user))
        print(self.assist_model(ob_assist))
        
        print(self.model([ob_user, ob_assist, action_user, action_assist]))
    
    def exp_policy_user(self, state):
        if np.random.rand()<self.epsilon:
            return np.random.randint(4)
        else:
            state = np.array(state)[np.newaxis]
            Q_values = self.user_model(state)
            return np.argmax(Q_values[0])
    
    def exp_policy_assist(self, state):
        if np.random.rand()<self.epsilon:
            return np.random.randint(1,5)
        else:
            state = np.array(state)[np.newaxis]
            Q_values = self.assist_model(state)
            return np.argmax(Q_values[0])+1
    
    def step(self, ob_user, prev_steps_assist):
        curr_loc = ob_user[:2]
        target_loc = ob_user[2:4]
        
        action_user = self.exp_policy_user(ob_user)
        action_user_one_hot = make_one_hot(action_user, 4)
        
        ob_assist = [action_user_one_hot + ob_user[:2]]
        ob_assist = prev_steps_assist + ob_assist 
        action_assist = self.exp_policy_assist(ob_assist)
        
        new_loc, reward_user, reward_assist, done = self.env.step(action_user, action_assist-1, target_loc, curr_loc)
        
        next_ob_user = new_loc[:]
        next_ob_user = next_ob_user + target_loc
        
        next_action_user = self.exp_policy_user(next_ob_user)
        next_action_user_one_hot = make_one_hot(next_action_user, 4)
        next_ob_assist = [next_action_user_one_hot + next_ob_user[:2]]
        next_ob_assist = ob_assist[1:] + next_ob_assist
        
        self.add_replay_buffer(ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                          action_assist-1, reward_assist, next_ob_assist, done)
        
        return next_ob_user, ob_assist[1:], reward_user, reward_assist, done 
        
        
    
    def add_replay_buffer(self, ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                         action_assist, reward_assist, next_ob_assist, done):
        
        self.replay_buffer.ob_user_history.append(ob_user)
        self.replay_buffer.action_user_history.append(action_user)
        self.replay_buffer.reward_user_history.append(reward_user)
        self.replay_buffer.next_ob_user_history.append(next_ob_user)
        self.replay_buffer.ob_assist_history.append(ob_assist)
        self.replay_buffer.action_assist_history.append(action_assist)
        self.replay_buffer.reward_assist_history.append(reward_assist)
        self.replay_buffer.next_ob_assist_history.append(next_ob_assist)
        self.replay_buffer.done_history.append(done)
    
    def sample_exp(self):
        indices = np.random.randint(len(self.replay_buffer.done_history), size = self.batch_size)
        
        ob_user = np.array([self.replay_buffer.ob_user_history[i] for i in indices])
        action_user = np.array([self.replay_buffer.action_user_history[i] for i in indices])
        reward_user = np.array([self.replay_buffer.reward_user_history[i] for i in indices])
        next_ob_user = np.array([self.replay_buffer.next_ob_user_history[i] for i in indices])
        ob_assist = np.array([self.replay_buffer.ob_assist_history[i] for i in indices])
        action_assist = np.array([self.replay_buffer.action_assist_history[i] for i in indices])
        reward_assist = np.array([self.replay_buffer.reward_assist_history[i] for i in indices])
        next_ob_assist = np.array([self.replay_buffer.next_ob_assist_history[i] for i in indices])
        done = np.array([self.replay_buffer.done_history[i] for i in indices])
        
        return ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist, reward_assist, next_ob_assist, done 
    
    def train(self):
        ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist,\
        reward_assist, next_ob_assist, done = self.sample_exp()
        
        input_A = ob_user
        input_B = ob_assist
        
        rewards = reward_user + reward_assist
        
        next_Q_values_user, next_Q_values_assist = self.user_model(next_ob_user), self.assist_model(next_ob_assist)
        best_next_actions_user, best_next_actions_assist = tf.math.argmax(next_Q_values_user, axis = 1), tf.math.argmax(next_Q_values_assist, axis = 1)
        next_Q_values_user, next_Q_values_assist = self.target_user_model(next_ob_user), self.target_assist_model(next_ob_assist)
        
        best_next_Q_values_user = tf.reduce_sum(next_Q_values_user*tf.one_hot(best_next_actions_user, 4), axis = 1)
        best_next_Q_values_assist = tf.reduce_sum(next_Q_values_user*tf.one_hot(best_next_actions_assist, 4), axis = 1)
        best_next_Q_values = best_next_Q_values_user + best_next_Q_values_assist
        
        target_Q_values = rewards + (1-done)*self.gamma*best_next_Q_values
        
        with tf.GradientTape() as tape:
            Q_values = self.model([input_A, input_B, action_user, action_assist])
            loss = tf.reduce_mean(self.loss_fn(target_Q_values, Q_values))
        
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        
        Q_values_assist = self.assist_model(input_B)
        with tf.GradientTape() as tape:
            Q_values_user = self.user_model(input_A)
            loss = tf.reduce_mean(self.loss_fn(Q_values_assist, Q_values_user))
            
        grads = tape.gradient(loss, self.user_model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.user_model.trainable_variables))
            
        with tf.GradientTape() as tape:
            Q_values_assist = self.assist_model(input_B)
            loss = tf.reduce_mean(self.loss_fn(Q_values_assist, Q_values_user))
            
        grads = tape.gradient(loss, self.assist_model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.assist_model.trainable_variables))                          

        

In [4]:
steps = 4
model = AI_Design(steps)
env = model.env

if os.path.exists('user_model.h5'):
    model.user_model = tf.keras.models.load_model('user_model.h5')
    model.assist_model = tf.keras.models.load_model('assist_model.h5')

Icon Locations:
[[0.1 0.6]
 [0.1 0.1]
 [0.2 0.5]
 [0.6 0. ]
 [0.1 0.4]
 [0.  0.9]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
subtract (

In [5]:
def give_prev_steps(prev_steps_assist, steps):
    prev_steps_assist = [[0,0,0,0,-1,-1] for i in range(steps-1)]
    return prev_steps_assist

In [None]:
max_steps = 40
reached = 0
reached_history = []
max_reached = 0

running_reward = 0

for epoch in tqdm(range(100000)):
    done = False
    episode_reward = 0
    start, dest = env.give_start_dest()
    ob_user = [start[0], start[1], dest[0], dest[1]]
    prev_steps_assist = []
    prev_steps_assist = give_prev_steps(prev_steps_assist, steps)
    step = 0
    
    while not done and step<max_steps:
        ob_user, prev_steps_assist, reward_user, reward_assist, done = model.step(ob_user, prev_steps_assist)
        episode_reward+=reward_user
        step+=1
        if done:
            reached+=1
    
    if epoch:
        running_reward = 0.01 * episode_reward + (1 - 0.01) * running_reward
    else:
        running_reward = episode_reward
        
    if epoch>50:
        model.train()
        
        if epoch%100==0:
            model.target_user_model.set_weights(model.user_model.get_weights())
            model.target_assist_model.set_weights(model.assist_model.get_weights())
            reached_history.append(reached)
            rewards = []
            
            if reached>max_reached:
                print(reached)
                print('Saved Weights')
                max_reached = reached
                model.user_model.save('user_model.h5')
                model.assist_model.save('assist_model.h5')
                
            reached = 0
            
            if epoch%500==0:
                print(f'Running reward = {running_reward}')
                print(f'Successful runs = {np.mean(reached_history)}')
                reached_history = []
                
                if epoch%1000==0:
                    model.epsilon-=0.01
                    model.epsilon= max(model.epsilon, 0.1)
                    
                    if epoch%20000==0:
                        model.infer()

  0%|                                                                           | 102/100000 [00:06<3:06:47,  8.91it/s]

16
Saved Weights


  1%|▍                                                                          | 502/100000 [00:45<2:44:14, 10.10it/s]

Running reward = -32.41778122504205
Successful runs = 13.0


  1%|▍                                                                          | 601/100000 [00:54<2:54:54,  9.47it/s]

17
Saved Weights


  1%|▋                                                                         | 1001/100000 [01:32<2:40:44, 10.26it/s]

Running reward = -30.641966614442545
Successful runs = 15.0


  2%|█                                                                         | 1503/100000 [02:27<2:58:53,  9.18it/s]

Running reward = -31.112644621492922
Successful runs = 13.4


  2%|█▏                                                                        | 1602/100000 [02:39<3:17:22,  8.31it/s]

20
Saved Weights


  2%|█▍                                                                        | 2002/100000 [03:23<3:29:53,  7.78it/s]

Running reward = -32.725387275470105
Successful runs = 15.0


  3%|█▊                                                                        | 2502/100000 [04:41<4:18:08,  6.29it/s]

Running reward = -29.91318328153107
Successful runs = 16.8


  3%|██▏                                                                       | 3001/100000 [06:03<4:27:27,  6.04it/s]

Running reward = -30.735659997900022
Successful runs = 14.6


  4%|██▌                                                                       | 3501/100000 [07:31<5:37:42,  4.76it/s]

21
Saved Weights
Running reward = -28.29327190681303
Successful runs = 16.6


  4%|██▉                                                                       | 4002/100000 [08:59<4:34:41,  5.82it/s]

Running reward = -27.690108583965888
Successful runs = 17.6


  4%|███                                                                       | 4201/100000 [09:35<5:21:40,  4.96it/s]

22
Saved Weights


  5%|███▎                                                                      | 4502/100000 [10:30<5:07:15,  5.18it/s]

Running reward = -28.170246288479323
Successful runs = 17.0


  5%|███▋                                                                      | 5002/100000 [12:01<4:33:26,  5.79it/s]

Running reward = -30.95385432649238
Successful runs = 17.8


  6%|████                                                                      | 5502/100000 [13:41<4:57:58,  5.29it/s]

23
Saved Weights
Running reward = -26.107164742038314
Successful runs = 20.0


  6%|████▏                                                                     | 5701/100000 [14:22<6:05:16,  4.30it/s]

28
Saved Weights


  6%|████▎                                                                     | 5801/100000 [14:42<6:02:44,  4.33it/s]

29
Saved Weights


  6%|████▍                                                                     | 6002/100000 [15:24<5:22:53,  4.85it/s]

Running reward = -29.214718769660085
Successful runs = 21.4


  7%|████▊                                                                     | 6501/100000 [17:14<5:55:13,  4.39it/s]

Running reward = -26.042083219641164
Successful runs = 20.2


  7%|█████▏                                                                    | 7001/100000 [19:01<6:10:39,  4.18it/s]

Running reward = -28.317113414938802
Successful runs = 21.8


  8%|█████▌                                                                    | 7501/100000 [20:54<6:04:36,  4.23it/s]

Running reward = -28.707388364225217
Successful runs = 17.0


  8%|█████▉                                                                    | 8001/100000 [22:46<5:35:10,  4.57it/s]

Running reward = -27.091445496690376
Successful runs = 20.4


  9%|██████▎                                                                   | 8501/100000 [24:49<6:06:04,  4.17it/s]

Running reward = -29.27101308375608
Successful runs = 20.2


  9%|██████▋                                                                   | 9001/100000 [27:02<6:52:21,  3.68it/s]

Running reward = -27.807565312665627
Successful runs = 21.8


  9%|██████▉                                                                   | 9401/100000 [28:58<9:48:57,  2.56it/s]

30
Saved Weights


 10%|███████                                                                   | 9502/100000 [29:25<5:45:25,  4.37it/s]

Running reward = -22.690843246249386
Successful runs = 26.8


 10%|███████▎                                                                 | 10001/100000 [31:44<7:25:56,  3.36it/s]

Running reward = -29.363942007591753
Successful runs = 16.8


 11%|███████▋                                                                 | 10501/100000 [34:17<9:16:26,  2.68it/s]

Running reward = -26.78960644990264
Successful runs = 20.0


 11%|████████                                                                 | 11001/100000 [37:03<9:58:18,  2.48it/s]

Running reward = -27.698025806361205
Successful runs = 23.2


 12%|████████▍                                                                | 11502/100000 [39:54<7:16:56,  3.38it/s]

Running reward = -25.828827065415012
Successful runs = 23.8


 12%|████████▊                                                                | 12001/100000 [42:15<6:35:22,  3.71it/s]

Running reward = -27.02014157670438
Successful runs = 23.0


 12%|████████▉                                                                | 12201/100000 [43:14<8:13:23,  2.97it/s]

32
Saved Weights


 13%|█████████▏                                                               | 12501/100000 [44:43<7:40:58,  3.16it/s]

Running reward = -23.314190112750193
Successful runs = 25.6


 13%|█████████▍                                                               | 13001/100000 [47:15<7:23:29,  3.27it/s]

Running reward = -26.344576635790713
Successful runs = 24.8


 14%|█████████▊                                                               | 13501/100000 [49:50<7:09:08,  3.36it/s]

33
Saved Weights
Running reward = -22.46917729761364
Successful runs = 25.0


 14%|██████████▏                                                              | 14001/100000 [52:19<6:46:32,  3.53it/s]

Running reward = -24.182022839314477
Successful runs = 24.2


 15%|██████████▌                                                              | 14501/100000 [55:02<7:04:44,  3.35it/s]

Running reward = -21.933302984750238
Successful runs = 22.4


 15%|██████████▉                                                              | 15001/100000 [57:46<8:02:02,  2.94it/s]

Running reward = -26.167952915758747
Successful runs = 25.2


 16%|███████████                                                            | 15501/100000 [1:00:35<8:32:12,  2.75it/s]

Running reward = -24.564069366089324
Successful runs = 25.4


 16%|███████████▎                                                           | 16001/100000 [1:03:20<7:30:57,  3.10it/s]

Running reward = -22.53369964626457
Successful runs = 23.6


 17%|███████████▋                                                           | 16501/100000 [1:06:19<9:04:45,  2.55it/s]

Running reward = -25.895191741167956
Successful runs = 24.4


 17%|████████████                                                           | 17001/100000 [1:09:18<8:05:54,  2.85it/s]

Running reward = -24.689613405353132
Successful runs = 23.4


 18%|████████████▍                                                          | 17501/100000 [1:12:19<8:29:36,  2.70it/s]

Running reward = -25.125778561339303
Successful runs = 25.8


 18%|████████████▊                                                          | 18001/100000 [1:15:21<8:56:14,  2.55it/s]

Running reward = -24.236039394216196
Successful runs = 25.4


 18%|████████████▉                                                          | 18201/100000 [1:16:37<7:51:18,  2.89it/s]

34
Saved Weights


 18%|█████████████                                                          | 18401/100000 [1:17:52<8:47:56,  2.58it/s]

35
Saved Weights


 19%|█████████████▏                                                         | 18501/100000 [1:18:30<8:45:11,  2.59it/s]

Running reward = -22.520504486755442
Successful runs = 29.2


 19%|█████████████▍                                                         | 19001/100000 [1:21:35<7:58:35,  2.82it/s]

Running reward = -22.364909009903496
Successful runs = 27.6


 20%|█████████████▊                                                         | 19501/100000 [1:24:49<9:11:09,  2.43it/s]

Running reward = -21.97750401603591
Successful runs = 29.6


 20%|██████████████▏                                                        | 20000/100000 [1:28:07<8:02:10,  2.77it/s]

Running reward = -25.112139827620528
Successful runs = 25.0
[3 0 0] [2 3 1]
tf.Tensor(
[[-6.442294  -6.432993  -6.4115744 -6.4221053]
 [-6.413287  -6.364658  -6.387073  -6.3658047]
 [-6.436667  -6.4498434 -6.430295  -6.4461913]], shape=(3, 4), dtype=float32)
tf.Tensor(
[[-6.4835234 -6.4884295 -6.47299   -6.5308094]
 [-6.461915  -6.475451  -6.444982  -6.4591627]
 [-6.4413056 -6.4174757 -6.4377503 -6.5185127]], shape=(3, 4), dtype=float32)


 20%|██████████████                                                        | 20001/100000 [1:28:08<10:31:04,  2.11it/s]

tf.Tensor(
[[-12.895096]
 [-12.87245 ]
 [-12.854143]], shape=(3, 1), dtype=float32)


 21%|██████████████▌                                                        | 20501/100000 [1:31:27<7:46:03,  2.84it/s]

Running reward = -23.93484977422451
Successful runs = 24.0


 21%|██████████████▉                                                        | 21001/100000 [1:34:46<7:31:24,  2.92it/s]

Running reward = -20.063093148496804
Successful runs = 25.0


 22%|███████████████                                                       | 21501/100000 [1:38:21<10:02:50,  2.17it/s]

Running reward = -23.909313775314956
Successful runs = 24.4


 22%|███████████████▍                                                      | 22001/100000 [1:41:49<10:22:58,  2.09it/s]

Running reward = -26.342401495724232
Successful runs = 26.0


 23%|███████████████▉                                                       | 22501/100000 [1:45:24<9:26:23,  2.28it/s]

Running reward = -25.504862537617914
Successful runs = 27.4


 23%|████████████████▎                                                      | 23001/100000 [1:49:03<9:29:04,  2.26it/s]

Running reward = -25.783925432721496
Successful runs = 26.0


 24%|████████████████▋                                                      | 23501/100000 [1:52:43<9:44:59,  2.18it/s]

Running reward = -22.404943492580898
Successful runs = 27.4


 24%|█████████████████                                                      | 24001/100000 [1:56:27<9:48:53,  2.15it/s]

Running reward = -24.216878724132847
Successful runs = 27.6


 25%|█████████████████▍                                                     | 24501/100000 [2:00:19<8:16:51,  2.53it/s]

Running reward = -22.671322381267206
Successful runs = 25.4


 25%|█████████████████▌                                                    | 25001/100000 [2:04:02<11:22:17,  1.83it/s]

Running reward = -23.606966800606767
Successful runs = 27.8


 26%|██████████████████                                                     | 25501/100000 [2:08:01<9:57:53,  2.08it/s]

Running reward = -22.143986170797895
Successful runs = 27.8


 26%|██████████████████▏                                                    | 25701/100000 [2:09:33<8:57:08,  2.31it/s]

37
Saved Weights


 26%|██████████████████▏                                                   | 26001/100000 [2:11:54<10:03:21,  2.04it/s]

Running reward = -23.491165868723698
Successful runs = 29.0


 26%|██████████████████▌                                                    | 26201/100000 [2:13:28<8:58:19,  2.28it/s]

42
Saved Weights


 27%|██████████████████▌                                                   | 26501/100000 [2:15:54<10:14:06,  1.99it/s]

Running reward = -20.608078049805144
Successful runs = 30.4


 27%|██████████████████▉                                                   | 27001/100000 [2:19:56<10:37:35,  1.91it/s]

Running reward = -22.09532313897857
Successful runs = 29.2


 28%|███████████████████▎                                                  | 27501/100000 [2:23:56<11:02:22,  1.82it/s]

Running reward = -21.258021632070513
Successful runs = 30.8


 28%|███████████████████▉                                                   | 28002/100000 [2:28:14<9:04:21,  2.20it/s]

Running reward = -24.37395295264031
Successful runs = 27.6


 29%|███████████████████▉                                                  | 28501/100000 [2:32:30<10:49:20,  1.84it/s]

Running reward = -24.321288883090645
Successful runs = 26.0


 29%|████████████████████▌                                                  | 29001/100000 [2:36:46<9:14:05,  2.14it/s]

Running reward = -23.359612381373253
Successful runs = 27.2


 30%|████████████████████▋                                                 | 29501/100000 [2:41:05<10:02:13,  1.95it/s]

Running reward = -21.31664157799834
Successful runs = 30.8


 30%|█████████████████████                                                 | 30001/100000 [2:45:22<11:12:01,  1.74it/s]

Running reward = -19.652362374076056
Successful runs = 32.2


 30%|█████████████████████▎                                                 | 30101/100000 [2:46:12<7:44:32,  2.51it/s]

43
Saved Weights


 31%|█████████████████████▎                                                | 30501/100000 [2:49:48<10:17:48,  1.87it/s]

Running reward = -22.900663121122736
Successful runs = 31.2


 31%|█████████████████████▋                                                | 31001/100000 [2:54:03<11:02:37,  1.74it/s]

Running reward = -19.29626759431808
Successful runs = 33.4


 32%|██████████████████████▎                                                | 31501/100000 [2:58:38<7:59:02,  2.38it/s]

Running reward = -21.567751789176477
Successful runs = 30.2


 32%|██████████████████████▎                                               | 31801/100000 [3:01:13<10:42:49,  1.77it/s]

45
Saved Weights


 32%|██████████████████████▍                                               | 32001/100000 [3:03:01<12:03:26,  1.57it/s]

Running reward = -20.423436801731015
Successful runs = 32.4


 33%|██████████████████████▊                                               | 32501/100000 [3:07:34<12:22:57,  1.51it/s]

Running reward = -23.12508409903733
Successful runs = 31.6


 33%|███████████████████████▍                                               | 33001/100000 [3:12:07<8:56:31,  2.08it/s]

Running reward = -20.939863244789386
Successful runs = 33.4


 34%|███████████████████████▊                                               | 33501/100000 [3:16:47<8:24:07,  2.20it/s]

Running reward = -21.695738252032918
Successful runs = 31.8


 34%|████████████████████████▏                                              | 34001/100000 [3:21:29<8:28:39,  2.16it/s]

Running reward = -20.496313863320427
Successful runs = 33.0


 35%|████████████████████████▏                                             | 34501/100000 [3:26:18<12:28:55,  1.46it/s]

Running reward = -18.83407793332157
Successful runs = 31.6


 35%|████████████████████████▊                                              | 35001/100000 [3:30:59<6:51:36,  2.63it/s]

Running reward = -17.741352354357662
Successful runs = 35.6


 36%|████████████████████████▊                                             | 35501/100000 [3:35:50<11:03:13,  1.62it/s]

Running reward = -22.88979928697616
Successful runs = 30.4


 36%|█████████████████████████▏                                            | 36001/100000 [3:40:46<11:19:05,  1.57it/s]

Running reward = -16.38331550196379
Successful runs = 33.6


 36%|█████████████████████████▋                                             | 36101/100000 [3:41:40<8:32:41,  2.08it/s]

49
Saved Weights


 37%|█████████████████████████▌                                            | 36501/100000 [3:45:40<11:00:59,  1.60it/s]

Running reward = -17.411793615756846
Successful runs = 35.0


 37%|█████████████████████████▉                                            | 37001/100000 [3:50:31<10:32:54,  1.66it/s]

Running reward = -17.576595673867086
Successful runs = 36.2


 38%|██████████████████████████▋                                            | 37501/100000 [3:55:32<9:41:10,  1.79it/s]

Running reward = -17.48210430668684
Successful runs = 32.2


 38%|██████████████████████████▌                                           | 38001/100000 [4:00:40<11:35:18,  1.49it/s]

Running reward = -20.731771439469775
Successful runs = 30.6


 39%|██████████████████████████▉                                           | 38501/100000 [4:05:45<10:41:18,  1.60it/s]

Running reward = -20.840980775792357
Successful runs = 33.6


 39%|███████████████████████████▎                                          | 39001/100000 [4:10:47<11:08:53,  1.52it/s]

Running reward = -23.325482109845108
Successful runs = 34.8


 40%|███████████████████████████▋                                          | 39501/100000 [4:16:01<11:08:14,  1.51it/s]

Running reward = -17.142934589188574
Successful runs = 27.0


 40%|████████████████████████████                                          | 40001/100000 [4:21:08<11:36:37,  1.44it/s]

Running reward = -14.28460465310754
Successful runs = 32.4
[3 1 1] [3 3 1]
tf.Tensor(
[[-5.4151063 -5.4348426 -5.411813  -5.372867 ]
 [-5.479199  -5.4002004 -5.414652  -5.3972635]
 [-5.4246492 -5.4014688 -5.4204726 -5.3447256]], shape=(3, 4), dtype=float32)
tf.Tensor(
[[-5.0727863 -5.098233  -5.0765576 -5.0310903]
 [-5.1797633 -5.176692  -5.175257  -5.126334 ]
 [-5.250063  -5.187414  -5.249463  -5.1413565]], shape=(3, 4), dtype=float32)
tf.Tensor(
[[-10.403957]
 [-10.526535]
 [-10.588882]], shape=(3, 1), dtype=float32)


 41%|████████████████████████████▎                                         | 40501/100000 [4:26:24<12:32:35,  1.32it/s]

Running reward = -15.101704454105409
Successful runs = 35.0


 41%|█████████████████████████████                                          | 41002/100000 [4:31:43<5:20:39,  3.07it/s]

Running reward = -14.764279805962003
Successful runs = 35.8


 42%|█████████████████████████████▍                                         | 41501/100000 [4:37:21<5:31:18,  2.94it/s]

Running reward = -20.901182111186575
Successful runs = 28.2


 42%|█████████████████████████████▍                                        | 42001/100000 [4:42:57<12:03:31,  1.34it/s]

Running reward = -21.40496504686274
Successful runs = 27.2


 43%|█████████████████████████████▊                                        | 42501/100000 [4:48:27<10:04:54,  1.58it/s]

Running reward = -23.304219462668794
Successful runs = 28.0


 43%|██████████████████████████████                                        | 43001/100000 [4:54:08<11:05:49,  1.43it/s]

Running reward = -21.273672170090993
Successful runs = 22.0


 44%|██████████████████████████████▍                                       | 43501/100000 [5:00:15<11:34:42,  1.36it/s]

Running reward = -20.97617777624015
Successful runs = 25.6


 44%|███████████████████████████████▏                                       | 44001/100000 [5:05:58<9:37:52,  1.62it/s]

Running reward = -20.25084572111084
Successful runs = 29.2


 45%|███████████████████████████████▏                                      | 44501/100000 [5:12:47<16:08:07,  1.05s/it]

Running reward = -24.375868765012342
Successful runs = 19.8


 45%|███████████████████████████████▌                                      | 45001/100000 [5:22:49<18:30:09,  1.21s/it]

Running reward = -22.46457240855401
Successful runs = 26.0


 46%|███████████████████████████████▊                                      | 45501/100000 [5:32:20<14:32:41,  1.04it/s]

Running reward = -25.143194374969926
Successful runs = 22.0


 46%|████████████████████████████████▏                                     | 46001/100000 [5:41:06<10:37:28,  1.41it/s]

Running reward = -26.250184788494405
Successful runs = 22.4


 47%|████████████████████████████████▌                                     | 46501/100000 [5:49:23<14:05:34,  1.05it/s]

Running reward = -19.701376243477814
Successful runs = 27.2


 47%|████████████████████████████████▉                                     | 47001/100000 [5:57:20<16:46:04,  1.14s/it]

Running reward = -23.69831342185893
Successful runs = 25.0


 48%|█████████████████████████████████▎                                    | 47501/100000 [6:04:49<14:42:57,  1.01s/it]

Running reward = -23.5368798191943
Successful runs = 29.6


 48%|█████████████████████████████████▌                                    | 48001/100000 [6:12:34<12:38:00,  1.14it/s]

Running reward = -15.768506152467559
Successful runs = 29.6


 49%|█████████████████████████████████▉                                    | 48501/100000 [6:20:26<11:40:54,  1.22it/s]

Running reward = -16.48034485185356
Successful runs = 25.8


 49%|██████████████████████████████████▎                                   | 49001/100000 [6:28:24<16:21:35,  1.15s/it]

Running reward = -23.95732470675894
Successful runs = 28.2


 50%|██████████████████████████████████▋                                   | 49501/100000 [6:36:36<11:07:18,  1.26it/s]

Running reward = -21.07627503109076
Successful runs = 28.8


 50%|███████████████████████████████████                                   | 50001/100000 [6:45:14<12:54:33,  1.08it/s]

Running reward = -23.648312932332225
Successful runs = 27.4


 51%|███████████████████████████████████▊                                   | 50501/100000 [6:51:07<8:22:25,  1.64it/s]

Running reward = -22.87666835624772
Successful runs = 20.0


 51%|████████████████████████████████████▏                                  | 51001/100000 [6:56:14<9:57:44,  1.37it/s]

Running reward = -22.10654489524009
Successful runs = 23.2


 52%|████████████████████████████████████▌                                  | 51501/100000 [7:01:15<7:22:35,  1.83it/s]

Running reward = -21.07204298956129
Successful runs = 30.2


 52%|████████████████████████████████████▍                                 | 52001/100000 [7:06:24<11:19:01,  1.18it/s]

Running reward = -26.125173709872016
Successful runs = 25.4


 53%|█████████████████████████████████████▎                                 | 52501/100000 [7:11:28<8:37:25,  1.53it/s]

Running reward = -22.45150568955725
Successful runs = 28.2


 53%|█████████████████████████████████████▋                                 | 53001/100000 [7:16:37<8:38:08,  1.51it/s]

Running reward = -20.896383842663173
Successful runs = 29.8


 54%|█████████████████████████████████████▉                                 | 53501/100000 [7:21:52<7:15:18,  1.78it/s]

Running reward = -24.121536717891495
Successful runs = 21.4


 54%|██████████████████████████████████████▎                                | 54001/100000 [7:26:59<8:19:43,  1.53it/s]

Running reward = -22.160074063031278
Successful runs = 27.8


 55%|██████████████████████████████████████▋                                | 54501/100000 [7:32:13<5:18:54,  2.38it/s]

Running reward = -25.600266553329845
Successful runs = 25.2


 55%|███████████████████████████████████████                                | 55001/100000 [7:37:59<9:36:06,  1.30it/s]

Running reward = -23.332464562123246
Successful runs = 25.0


 56%|██████████████████████████████████████▊                               | 55501/100000 [7:43:17<10:47:52,  1.14it/s]

Running reward = -25.837589636176556
Successful runs = 21.6


 56%|███████████████████████████████████████▊                               | 56001/100000 [7:49:39<9:51:40,  1.24it/s]

Running reward = -21.334141486743697
Successful runs = 32.2


 57%|████████████████████████████████████████                               | 56501/100000 [7:55:25<8:31:14,  1.42it/s]

Running reward = -19.9807109794127
Successful runs = 27.8


 57%|████████████████████████████████████████▍                              | 57001/100000 [8:02:35<7:39:32,  1.56it/s]

Running reward = -26.686012509657132
Successful runs = 20.2


 58%|████████████████████████████████████████▊                              | 57501/100000 [8:09:17<8:55:18,  1.32it/s]

Running reward = -17.33325592512497
Successful runs = 29.6


 58%|████████████████████████████████████████▌                             | 58001/100000 [8:15:43<11:33:11,  1.01it/s]

Running reward = -24.44242510033677
Successful runs = 24.8


 59%|█████████████████████████████████████████▌                             | 58501/100000 [8:22:01<9:48:37,  1.18it/s]

Running reward = -19.908444691926093
Successful runs = 27.0


 59%|█████████████████████████████████████████▉                             | 59001/100000 [8:28:35<8:42:17,  1.31it/s]

Running reward = -25.957365470667796
Successful runs = 24.6


 60%|██████████████████████████████████████████▏                            | 59501/100000 [8:34:38<8:55:08,  1.26it/s]

Running reward = -19.286450847239916
Successful runs = 25.2


 60%|██████████████████████████████████████████▌                            | 60001/100000 [8:40:37<6:56:10,  1.60it/s]

Running reward = -21.592971099817394
Successful runs = 28.2
[2 2 3] [2 2 1]
tf.Tensor(
[[-6.1818    -6.0893226 -6.122959  -6.1108336]
 [-6.1265273 -6.1544237 -6.1257052 -6.111162 ]
 [-6.1260757 -6.1377926 -6.1261144 -6.0951495]], shape=(3, 4), dtype=float32)
tf.Tensor(
[[-6.2804413 -6.2137194 -6.232189  -6.2106347]
 [-6.2484674 -6.2331104 -6.2509656 -6.224486 ]
 [-6.2643833 -6.2302957 -6.259958  -6.262737 ]], shape=(3, 4), dtype=float32)
tf.Tensor(
[[-12.355148]
 [-12.376671]
 [-12.325445]], shape=(3, 1), dtype=float32)


 61%|██████████████████████████████████████████▎                           | 60501/100000 [8:47:24<20:37:27,  1.88s/it]

Running reward = -21.713149322415834
Successful runs = 25.4


 61%|██████████████████████████████████████████▋                           | 61001/100000 [8:54:55<12:09:36,  1.12s/it]

Running reward = -24.653391169659308
Successful runs = 26.4


 62%|███████████████████████████████████████████▋                           | 61501/100000 [9:03:19<8:10:05,  1.31it/s]

Running reward = -22.728419342675856
Successful runs = 25.6


 62%|███████████████████████████████████████████▍                          | 62001/100000 [9:11:09<10:47:20,  1.02s/it]

Running reward = -20.34983667973341
Successful runs = 26.4


 63%|████████████████████████████████████████████▍                          | 62501/100000 [9:19:00<8:19:12,  1.25it/s]

Running reward = -22.074326106080562
Successful runs = 28.4


 63%|████████████████████████████████████████████▋                          | 63001/100000 [9:26:58<9:13:45,  1.11it/s]

Running reward = -21.600820187131298
Successful runs = 22.2


 64%|████████████████████████████████████████████▍                         | 63501/100000 [9:34:47<11:08:29,  1.10s/it]

Running reward = -16.71076170936959
Successful runs = 28.6


 64%|█████████████████████████████████████████████▍                         | 64001/100000 [9:42:44<8:09:27,  1.23it/s]

Running reward = -17.96226710873181
Successful runs = 28.2


 65%|█████████████████████████████████████████████▊                         | 64501/100000 [9:51:42<9:02:44,  1.09it/s]

Running reward = -26.583112953335995
Successful runs = 26.4


 65%|████████████████████████████████████████████▊                        | 65001/100000 [10:00:28<11:03:53,  1.14s/it]

Running reward = -23.458206278988687
Successful runs = 23.4


 66%|█████████████████████████████████████████████▊                        | 65501/100000 [10:08:25<6:50:53,  1.40it/s]

Running reward = -23.25734563382413
Successful runs = 22.6


 66%|██████████████████████████████████████████████▏                       | 66001/100000 [10:16:36<7:17:40,  1.29it/s]

Running reward = -24.198432446871234
Successful runs = 22.8


 67%|█████████████████████████████████████████████▉                       | 66501/100000 [10:24:59<10:09:31,  1.09s/it]

Running reward = -24.50549770935961
Successful runs = 24.4


 67%|██████████████████████████████████████████████▉                       | 67001/100000 [10:33:06<5:27:08,  1.68it/s]

Running reward = -13.9618271179124
Successful runs = 34.0


 68%|███████████████████████████████████████████████▎                      | 67501/100000 [10:41:10<5:09:26,  1.75it/s]

Running reward = -18.61309183725575
Successful runs = 29.4


 68%|███████████████████████████████████████████████▌                      | 68001/100000 [10:48:57<9:55:41,  1.12s/it]

Running reward = -20.797011594525785
Successful runs = 28.2


 69%|███████████████████████████████████████████████▉                      | 68501/100000 [10:56:23<9:41:25,  1.11s/it]

Running reward = -20.53343684433262
Successful runs = 30.2


 69%|████████████████████████████████████████████████▎                     | 69001/100000 [11:04:10<9:07:08,  1.06s/it]

Running reward = -24.56472670054497
Successful runs = 21.8


 69%|████████████████████████████████████████████████▌                     | 69339/100000 [11:09:53<6:46:28,  1.26it/s]

In [None]:
model.infer()