In [1]:
import os
from copy import deepcopy

from tqdm import tqdm
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

from Environment import Environment, make_one_hot, give_mapping
from Networks import UserActor, AsstActor, CentralizedCritic

In [2]:
def give_prev_steps(prev_steps_assist, steps):
    """Return a fresh placeholder history for the assistant's observation window.

    Args:
        prev_steps_assist: Ignored. Whatever is passed in is discarded and a
            brand-new list is built; the argument is left untouched.
        steps: Window length. The result holds ``steps - 1`` rows (none when
            ``steps`` is 1 or smaller).

    Returns:
        A list of ``steps - 1`` independent rows, each ``[0, 0, 0, 0, -1, -1]``.
    """
    placeholder_row = (0, 0, 0, 0, -1, -1)
    history = []
    for _ in range(steps - 1):
        # Copy per row so that no two rows share the same list object.
        history.append(list(placeholder_row))
    return history

In [3]:
class Agent:
    """Greedy roll-out harness for a pre-trained user/assistant actor pair.

    The constructor builds a ``UserActor`` and an ``AsstActor`` and replaces
    their ``model`` attributes with Keras models loaded from disk. ``trial``
    then plays one episode in an ``Environment``, always taking the arg-max
    action of each actor.
    """

    def __init__(self, user_model_path='user.h5', asst_model_path='asst.h5'):
        """Create the actors and load their saved weights.

        Args:
            user_model_path: File passed to ``tf.keras.models.load_model`` for
                the user actor.
            asst_model_path: File passed to ``tf.keras.models.load_model`` for
                the assistant actor.
        """
        # Number of rows in the assistant's observation window.
        self.memory_len = 6
        self.user_actor = UserActor()
        self.asst_actor = AsstActor(self.memory_len)

        self.user_actor.model = tf.keras.models.load_model(user_model_path)
        self.asst_actor.model = tf.keras.models.load_model(asst_model_path)

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        # that newer Keras releases reject.
        self.optimizer_actors = tf.keras.optimizers.Adam(learning_rate=0.0001)
        self.optimizer_critic = tf.keras.optimizers.Adam(learning_rate=0.0002)
        self.huber_loss = tf.keras.losses.Huber()

        self.gamma = 0.90
        self.eps = 10e-6  # note: 10e-6 == 1e-5

        # Created lazily by `trial`; see the refresh rule there.
        self.env = None
        self.env_cell_mapping = None

    def trial(self, i, max_steps=20):
        """Play one greedy episode and report how many steps it took.

        Args:
            i: Episode index. A new ``Environment`` is sampled whenever ``i``
                is a multiple of 100, and also on the first call if no
                environment exists yet.
            max_steps: Upper bound on the number of loop iterations.

        Returns:
            ``(start, dest, step)``: the start and destination returned by
            ``env.give_start_dest()`` and the number of iterations executed.
            ``step`` equals ``max_steps`` when the episode was cut off.
        """
        # Previously the environment was only built when i % 100 == 0, so a
        # first call with any other index failed with AttributeError.
        if i % 100 == 0 or getattr(self, 'env', None) is None:
            self.env = Environment()
            mapping = give_mapping(self.env.cells)
            # Add leading and trailing singleton axes
            # (presumably batch and channel - TODO confirm against AsstActor).
            self.env_cell_mapping = mapping[np.newaxis, :, :, np.newaxis]
        env = self.env

        start, dest = env.give_start_dest()
        ob_user = [start[0], start[1], dest[0], dest[1]]
        # The helper ignores its first argument and returns placeholder rows.
        prev_steps_assist = give_prev_steps([], self.memory_len)

        step = 0
        done = False
        while not done and step < max_steps:
            curr_loc = ob_user[:2]
            target_loc = ob_user[2:4]
            step += 1

            # User actor: greedy action from the 4-element observation.
            user_probs = self.user_actor.model(np.array(ob_user)[np.newaxis])
            user_action = np.argmax(np.squeeze(user_probs))
            action_user_one_hot = make_one_hot(user_action, 4)

            # Assistant observation: history rows plus the newest row
            # (one-hot user action followed by the current location).
            ob_assist = prev_steps_assist + [action_user_one_hot + curr_loc]
            ob_assist = np.array(ob_assist)[np.newaxis]

            asst_probs = self.asst_actor.model([ob_assist, self.env_cell_mapping])
            asst_action = np.argmax(np.squeeze(asst_probs))

            # The assistant's arg-max index is shifted by one before stepping.
            new_loc, _reward_user, _reward_assist, done = env.step(
                user_action, asst_action + 1, target_loc, curr_loc
            )

            ob_user = new_loc[:] + target_loc
            # Drop the oldest row. Indexing the batch axis (rather than
            # np.squeeze) keeps the row axis intact for any window length.
            prev_steps_assist = ob_assist[0].tolist()[1:]

        return (start, dest, step)




In [4]:
# Build the evaluation agent. Agent.__init__ loads the saved actor models
# from 'user.h5' and 'asst.h5', so both files must be in the working directory.
agent = Agent()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, 2)]          0           input_1[0][0]                    
__________________________________________________________________________________________________
subtract (Subtract)             (None, 2)            0           tf_op_layer_strided_slice[0][0]  
                                                                 tf_op_layer_strided_sl

In [5]:
# Roll out 1000 greedy episodes. For each one, record the baseline distance
# between start and destination and the number of steps the agent pair used.
norm_steps, agent_steps = [], []
for i in tqdm(range(1000)):
    start, dest, step = agent.trial(i)
    # Per-axis absolute coordinate difference, scaled by 10.
    dx_steps = abs(start[0] - dest[0]) * 10
    dy_steps = abs(start[1] - dest[1]) * 10
    norm_steps.append(dx_steps + dy_steps)
    agent_steps.append(step)

  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

Icon Locations:
[[0.1 0.9]
 [0.5 0. ]
 [0.4 0.6]
 [0.7 0.7]
 [0.7 0.9]
 [0.2 0.4]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]


 10%|███████▉                                                                       | 101/1000 [00:05<00:36, 24.82it/s]

Icon Locations:
[[0.4 0.4]
 [0.8 0.1]
 [0.4 0.4]
 [0.  0.4]
 [0.  0.6]
 [0.2 0.3]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]


 20%|████████████████                                                               | 204/1000 [00:10<00:35, 22.21it/s]

Icon Locations:
[[0.7 0. ]
 [0.5 0.3]
 [0.1 0.5]
 [0.6 0.4]
 [0.2 0.1]
 [0.8 0.7]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]


 30%|███████████████████████▊                                                       | 302/1000 [00:14<00:29, 23.37it/s]

Icon Locations:
[[0.4 0.5]
 [0.2 0.2]
 [0.3 0.2]
 [0.1 0.6]
 [0.7 0.1]
 [0.7 0.3]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]


 40%|███████████████████████████████▊                                               | 402/1000 [00:18<00:22, 26.34it/s]

Icon Locations:
[[0.4 0.2]
 [0.3 0.3]
 [0.8 0.8]
 [0.8 0.4]
 [0.5 0.6]
 [0.4 0.1]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]


 50%|███████████████████████████████████████▋                                       | 503/1000 [00:23<00:20, 24.26it/s]

Icon Locations:
[[0.9 0.8]
 [0.1 0.2]
 [0.8 0.5]
 [0.9 0.4]
 [0.8 0. ]
 [0.3 0. ]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]


 60%|███████████████████████████████████████████████▋                               | 603/1000 [00:27<00:16, 24.26it/s]

Icon Locations:
[[0.  0. ]
 [0.5 0.8]
 [0.6 0.4]
 [0.2 0.4]
 [0.  0.9]
 [0.  0.7]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]


 70%|███████████████████████████████████████████████████████▍                       | 701/1000 [00:30<00:11, 26.27it/s]

Icon Locations:
[[0.8 0.4]
 [0.4 0.6]
 [0.8 0.8]
 [0.8 0.4]
 [0.2 0.5]
 [0.6 0.8]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]


 80%|███████████████████████████████████████████████████████████████▍               | 803/1000 [00:34<00:08, 23.18it/s]

Icon Locations:
[[0.  0.2]
 [0.3 0.5]
 [0.1 0.9]
 [0.1 0.1]
 [0.7 0.8]
 [0.8 0.6]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]


 90%|███████████████████████████████████████████████████████████████████████▎       | 902/1000 [00:39<00:04, 22.81it/s]

Icon Locations:
[[0.  0.1]
 [0.6 0.5]
 [0.5 0.1]
 [0.7 0.5]
 [0.2 0.9]
 [0.1 0.2]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:43<00:00, 22.98it/s]


In [6]:
# Total baseline distance over all episodes: the sum of
# |dx|*10 + |dy|*10 between each episode's start and destination.
sum(norm_steps)

6504.0

In [7]:
# Total number of steps the agent pair used over all episodes. Each episode's
# count is bounded by max_steps inside Agent.trial.
sum(agent_steps)

4861

In [8]:
# Steps used relative to the baseline distance (lower means fewer steps).
# NOTE(review): Agent.trial returns its step count whether or not the episode
# finished, so episodes cut off at the step cap are included here; a ratio
# below 1 does not by itself show that every target was reached.
sum(agent_steps)/sum(norm_steps)

0.7473862238622386