In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from Environment import *
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
'''
Direction mapping:
0: left = [-1, 0]
1: right = [1, 0]
2: up = [0, -1]
3: down = [0, 1]
'''

class User_Agent:
    def __init__(self):
        #model
        #-----------------------------------------------------
        input_A = Input(shape = (4,))    #curr_x, curr_y, target_x, target_y
        x = Dense(32, activation = 'relu')(input_A)
        x = Dense(4, activation = 'softmax')(x) #left,right, down, up
        
        self.model = Model(inputs = input_A, outputs = x)
        print(self.model.summary())
        #---------------------------------------------------
        
        self.target_model = tf.keras.models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())
        
        self.loss_fn = tf.keras.losses.mean_squared_error
        self.optimizer = tf.keras.optimizers.Adam(lr = 0.005)
        self.batch_size = 128
        self.replay_buffer_size = 1024
        self.replay_buffer = Replay_Buffer(self.replay_buffer_size)
        self.epsilon = 0.5
        self.gamma = 0.9
        
    def exp_policy(self, state):
        if np.random.rand()<self.epsilon:
            return np.random.randint(4)
        else:
            state = np.array(state)[np.newaxis]
            Q_values = self.model(state)
            return np.argmax(Q_values[0])
        
    def sample_experience(self):
        indices = np.random.randint(len(self.replay_buffer.state_history), size = self.batch_size)
        
        states = np.array([self.replay_buffer.state_history[i] for i in indices])
        actions = np.array([self.replay_buffer.action_history[i] for i in indices])
        next_states = np.array([self.replay_buffer.next_state_history[i] for i in indices])
        rewards = np.array([self.replay_buffer.rewards_history[i] for i in indices])
        dones = np.array([self.replay_buffer.done_history[i] for i in indices])
        
        return states, actions, next_states, rewards, dones
    

    def play_one_step(self, env, state, mod_agent):
        action_user = self.exp_policy(state)
        action_user_one_hot = make_one_hot(action_user, 4)
        curr_loc = state[:2]
        target_loc = state[2:]
        action_user_one_hot.extend(target_loc)
        mod_state = action_user_one_hot[:]
        mod_state = np.array(mod_state)
        new_loc, reward, done = mod_agent.play_one_step(env, mod_state, curr_loc, target_loc, self)
        next_state = [new_loc[0], new_loc[1], target_loc[0], target_loc[1]]
        self.replay_buffer.append(state, action_user, reward, next_state, done)
        
        return next_state, reward, done
    
    def train(self):
        states, actions, next_states, rewards, dones = self.sample_experience()
        next_Q_values = self.target_model(next_states)
        max_next_Q_values = np.max(next_Q_values, axis= 1)
        target_Q_values = rewards + (1-dones)*self.gamma*max_next_Q_values
        
        mask = tf.one_hot(actions, 4)
        
        with tf.GradientTape() as tape:
            all_Q_values = self.model(states)
            Q_values = tf.reduce_sum(all_Q_values*mask, axis = 1, keepdims = True)
            loss = tf.reduce_mean(self.loss_fn(target_Q_values, Q_values))

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

In [3]:
class Mod_Agent:
    def __init__(self):
        #model
        #-----------------------------------------------------
        input_A = Input(shape = (6,))   #direction of motion_one_hot(4), curr_x, curr_y
        x = Dense(32, activation = 'relu')(input_A)
        x = Dense(4, activation = 'softmax')(x) #modulate by 1,2,3,4 
        
        self.model = Model(inputs = input_A, outputs = x)
        print(self.model.summary())
        #---------------------------------------------------
        
        self.target_model = tf.keras.models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())
        
        self.loss_fn = tf.keras.losses.mean_squared_error
        self.optimizer = tf.keras.optimizers.Adam(lr = 0.005)
        self.batch_size = 128
        self.replay_buffer_size = 1024
        self.replay_buffer = Replay_Buffer(self.replay_buffer_size)
        self.epsilon = 0.5
        self.steps_per_epoch = 1
        self.gamma = 0.9
    
    def exp_policy(self, state):
        if np.random.rand()<self.epsilon:
            return np.random.randint(4)
        else:
            state = np.array(state)[np.newaxis]
            Q_values = self.model(state)
            return np.argmax(Q_values[0])
        
        
    def sample_experience(self):
        indices = np.random.randint(len(self.replay_buffer.state_history), size = self.batch_size)
        
        states = np.array([self.replay_buffer.state_history[i] for i in indices])
        actions = np.array([self.replay_buffer.action_history[i] for i in indices])
        next_states = np.array([self.replay_buffer.next_state_history[i] for i in indices])
        rewards = np.array([self.replay_buffer.rewards_history[i] for i in indices])
        dones = np.array([self.replay_buffer.done_history[i] for i in indices])
        
        return states, actions, next_states, rewards, dones
    
    def play_one_step(self, env, state, curr_loc, target_loc, user_agent):
        #Agent not aware of target location
        action_mod = self.exp_policy(state)
        action_user = state[0]
        
        new_loc, reward, done = env.step(action_user, action_mod, target_loc, curr_loc)
        next_dir = user_agent.exp_policy(np.array([[new_loc[0], new_loc[1], target_loc[0], target_loc[1]]]))
        
        next_dir_one_hot = make_one_hot(next_dir, 4)
        next_dir_one_hot.extend(target_loc)
        next_state = next_dir_one_hot[:]
        next_state = np.array(next_state)
        
        self.replay_buffer.append(state, action_mod, reward, next_state, done)
        
        
        return new_loc, reward, done
    
    def train(self):
        states, actions, next_states, rewards, dones = self.sample_experience()
        next_Q_values = self.target_model(next_states)
        max_next_Q_values = np.max(next_Q_values, axis= 1)
        target_Q_values = rewards + (1-dones)*self.gamma*max_next_Q_values
        
        mask = tf.one_hot(actions, 4)
        
        with tf.GradientTape() as tape:
            all_Q_values = self.model(states)
            Q_values = tf.reduce_sum(all_Q_values*mask, axis = 1, keepdims = True)
            loss = tf.reduce_mean(self.loss_fn(target_Q_values, Q_values))

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
            

In [4]:
env = Environment()
user_agent = User_Agent()
mod_agent = Mod_Agent()

Icon Locations:
[[0.6 0.4]
 [0.3 0.7]
 [0.  0. ]
 [0.4 0.3]
 [0.3 0.7]
 [0.2 0.7]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
dense (Dense)                (None, 32)                160       
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 132       
Total params: 292
Trainable params: 292
Non-trainable params: 0
_________________________________________________________________
None
Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 6)]           

In [None]:
rewards = []
mean_rewards = []
max_steps = 200
reached = 0
for epoch in tqdm(range(1000)):
    done = False
    episode_reward = 0
    step = 0
    while not done and step<max_steps:
        start, dest = env.give_start_dest()
        state = [start[0], start[1], dest[0], dest[1]]
        state = np.array(state)
        next_state, reward, done = user_agent.play_one_step(env, state, mod_agent)
        state = next_state
        episode_reward+=reward
        step+=1
        if done:
            reached+=1
        if epoch>50:
            user_agent.train()
            mod_agent.train()
            mod_agent.epsilon*=0.9
            user_agent.epsilon*=0.9
            
    mean_rewards.append(episode_reward)
    if epoch%10==0:
        rewards.append(np.mean(mean_rewards))
        print(f'Mean Reward = {rewards[-1]}')
        print(reached)
        reached = 0
    

  0%|▏                                                                                | 2/1000 [00:00<01:18, 12.64it/s]

Mean Reward = 8.9
1


  1%|▉                                                                               | 12/1000 [00:02<03:11,  5.16it/s]

Mean Reward = -108.5181818181818
6


  2%|█▋                                                                              | 21/1000 [00:03<02:54,  5.63it/s]

Mean Reward = -104.67142857142856
7


  3%|██▍                                                                             | 31/1000 [00:06<04:21,  3.71it/s]

Mean Reward = -121.36774193548388
3


  4%|███▎                                                                            | 41/1000 [00:08<03:57,  4.03it/s]

Mean Reward = -123.32439024390244
5


  5%|████                                                                            | 50/1000 [00:11<03:16,  4.82it/s]

Mean Reward = -120.39999999999999
6


  6%|████▉                                                                           | 61/1000 [00:26<25:04,  1.60s/it]

Mean Reward = -115.38196721311479
6


  7%|█████▋                                                                          | 71/1000 [00:46<23:39,  1.53s/it]

Mean Reward = -116.28732394366196
4


  8%|██████▍                                                                         | 81/1000 [01:01<23:15,  1.52s/it]

Mean Reward = -115.30987654320988
6


  9%|███████▎                                                                        | 91/1000 [01:17<26:11,  1.73s/it]

Mean Reward = -115.07032967032967
7


 10%|███████▉                                                                       | 101/1000 [01:33<22:54,  1.53s/it]

Mean Reward = -114.89207920792082
4


 11%|████████▊                                                                      | 111/1000 [01:49<26:35,  1.79s/it]

Mean Reward = -115.19189189189188
4


 12%|█████████▌                                                                     | 121/1000 [02:04<25:02,  1.71s/it]

Mean Reward = -114.96198347107438
6


 13%|██████████▎                                                                    | 131/1000 [02:21<24:50,  1.72s/it]

Mean Reward = -115.57022900763359
5


 14%|███████████▏                                                                   | 141/1000 [02:37<26:58,  1.88s/it]

Mean Reward = -115.81985815602836
5


 15%|███████████▉                                                                   | 151/1000 [02:44<08:03,  1.76it/s]

Mean Reward = -111.41655629139073
9


 16%|████████████▋                                                                  | 161/1000 [03:00<23:08,  1.66s/it]

Mean Reward = -111.92795031055901
5


 17%|█████████████▌                                                                 | 171/1000 [03:15<21:52,  1.58s/it]

Mean Reward = -111.41988304093569
6


 18%|██████████████▎                                                                | 181/1000 [03:28<22:54,  1.68s/it]

Mean Reward = -110.41546961325969
5


 19%|███████████████                                                                | 191/1000 [03:48<26:09,  1.94s/it]

Mean Reward = -112.60314136125653
2


 20%|███████████████▉                                                               | 201/1000 [03:58<10:00,  1.33it/s]

Mean Reward = -110.56169154228857
7


 21%|████████████████▋                                                              | 211/1000 [04:13<17:39,  1.34s/it]

Mean Reward = -110.31611374407585
5


 22%|█████████████████▍                                                             | 221/1000 [04:27<19:22,  1.49s/it]

Mean Reward = -109.85656108597287
8


 23%|██████████████████▎                                                            | 232/1000 [04:48<19:11,  1.50s/it]

Mean Reward = -111.8809523809524
2


 24%|███████████████████                                                            | 241/1000 [04:59<12:48,  1.01s/it]

Mean Reward = -109.8871369294606
9


 25%|███████████████████▊                                                           | 251/1000 [05:12<23:26,  1.88s/it]

Mean Reward = -109.57211155378485
5


 26%|████████████████████▌                                                          | 261/1000 [05:27<12:28,  1.01s/it]

Mean Reward = -109.52988505747128
6


 27%|█████████████████████▍                                                         | 271/1000 [05:41<15:36,  1.28s/it]

Mean Reward = -109.32250922509226
6


 28%|██████████████████████▏                                                        | 281/1000 [05:55<15:41,  1.31s/it]

Mean Reward = -108.8202846975089
7


 29%|██████████████████████▉                                                        | 291/1000 [06:09<12:38,  1.07s/it]

Mean Reward = -108.72405498281789
7


 30%|███████████████████████▊                                                       | 301/1000 [06:24<13:53,  1.19s/it]

Mean Reward = -108.71029900332226
6


 31%|████████████████████████▌                                                      | 311/1000 [06:36<10:29,  1.09it/s]

Mean Reward = -108.12861736334405
6


 32%|█████████████████████████▎                                                     | 321/1000 [06:53<19:17,  1.70s/it]

Mean Reward = -108.58909657320874
6


 33%|██████████████████████████▏                                                    | 331/1000 [07:07<11:42,  1.05s/it]

Mean Reward = -108.41903323262841
6


 34%|██████████████████████████▉                                                    | 341/1000 [07:21<15:41,  1.43s/it]

Mean Reward = -108.35014662756599
8


 35%|███████████████████████████▋                                                   | 351/1000 [07:41<22:18,  2.06s/it]

Mean Reward = -109.68717948717948
2


 36%|████████████████████████████▌                                                  | 361/1000 [07:54<14:04,  1.32s/it]

Mean Reward = -109.32437673130195
6


 37%|█████████████████████████████▎                                                 | 371/1000 [08:11<19:36,  1.87s/it]

Mean Reward = -109.60080862533692
5


 38%|██████████████████████████████                                                 | 380/1000 [08:25<19:32,  1.89s/it]