In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from Environment import *
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
'''
Direction mapping:
0: left = [-1, 0]
1: right = [1, 0]
2: up = [0, -1]
3: down = [0, 1]
'''

class User_Agent:
    """DQN agent that picks a movement direction from the current and target locations.

    The agent owns an online Q-network, a target Q-network that starts with the
    same weights, and a replay buffer of (state, action, reward, next_state, done)
    transitions. Action indices follow the direction mapping written at the top
    of this cell (0: left, 1: right, 2: up, 3: down).
    """

    def __init__(self):
        #model
        #-----------------------------------------------------
        input_A = Input(shape = (4,))    #curr_x, curr_y, target_x, target_y
        x = Dense(32, activation = 'relu')(input_A)
        x = Dense(16, activation = 'relu')(x)
        x = Dense(4)(x) #one Q-value per direction index (see the direction mapping above)

        self.model = Model(inputs = input_A, outputs = x)
        # summary() prints by itself and returns None, so it must not be wrapped in print().
        self.model.summary()
        #---------------------------------------------------

        # Target network: same architecture, synchronised with the online weights.
        self.target_model = tf.keras.models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

        self.loss_fn = tf.keras.losses.mean_squared_error
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate = 0.005)
        self.batch_size = 128
        self.replay_buffer_size = 1024
        # Replay_Buffer is not defined in this notebook; presumably provided by the
        # star import from Environment -- TODO confirm.
        self.replay_buffer = Replay_Buffer(self.replay_buffer_size)
        self.epsilon = 1    # exploration probability; 1 means every action is random
        self.gamma = 0.9    # discount factor used in the Q-learning target

    def exp_policy(self, state):
        """Epsilon-greedy action selection.

        With probability ``self.epsilon`` returns a uniformly random action in
        0..3, otherwise the index of the largest Q-value predicted by the online
        model for ``state``.
        """
        if np.random.rand()<self.epsilon:
            return np.random.randint(4)
        else:
            # Add a leading batch axis so the model receives a batch of one state.
            state = np.array(state)[np.newaxis]
            Q_values = self.model(state)
            return np.argmax(Q_values[0])

    def sample_experience(self):
        """Draw ``self.batch_size`` transitions uniformly (with replacement) from the buffer.

        Returns:
            Tuple of numpy arrays ``(states, actions, next_states, rewards, dones)``.
        """
        indices = np.random.randint(len(self.replay_buffer.state_history), size = self.batch_size)

        states = np.array([self.replay_buffer.state_history[i] for i in indices])
        actions = np.array([self.replay_buffer.action_history[i] for i in indices])
        next_states = np.array([self.replay_buffer.next_state_history[i] for i in indices])
        rewards = np.array([self.replay_buffer.rewards_history[i] for i in indices])
        dones = np.array([self.replay_buffer.done_history[i] for i in indices])

        return states, actions, next_states, rewards, dones


    def play_one_step(self, env, state, mod_agent):
        """Choose a direction, let ``mod_agent`` execute the move, and store the transition.

        Args:
            env: environment object, forwarded unchanged to ``mod_agent.play_one_step``.
            state: sequence ``[curr_x, curr_y, target_x, target_y]``.
            mod_agent: agent whose ``play_one_step`` performs the environment step.

        Returns:
            ``(next_state, reward, done)`` where ``next_state`` is the list
            ``[new_x, new_y, target_x, target_y]``.
        """
        action_user = self.exp_policy(state)
        action_user_one_hot = make_one_hot(action_user, 4)
        curr_loc = state[:2]
        target_loc = state[2:]
        # State seen by the mod agent: one-hot direction followed by the current location.
        action_user_one_hot.extend(curr_loc)
        mod_state = np.array(action_user_one_hot[:])
        new_loc, reward, done = mod_agent.play_one_step(env, mod_state, curr_loc, target_loc, self)
        next_state = [new_loc[0], new_loc[1], target_loc[0], target_loc[1]]
        self.replay_buffer.append(state, action_user, reward, next_state, done)

        return next_state, reward, done

    def train(self):
        """Run one gradient step of Q-learning on a sampled batch.

        The target for each sampled transition is
        ``reward + (1 - done) * gamma * max_a Q_target(next_state, a)``; the loss is
        the mean squared error between that target and the online Q-value of the
        action that was actually taken.
        """
        states, actions, next_states, rewards, dones = self.sample_experience()
        next_Q_values = self.target_model(next_states)
        max_next_Q_values = np.max(next_Q_values, axis= 1)
        not_done = 1 - np.asarray(dones, dtype = np.float32)
        target_Q_values = rewards + not_done*self.gamma*max_next_Q_values
        # Q_values below has shape (batch, 1) because of keepdims=True. Without this
        # reshape the (batch,) targets broadcast against it to (batch, batch), which
        # regresses every sample onto every target instead of its own.
        target_Q_values = np.reshape(target_Q_values, (-1, 1))

        # One-hot mask that keeps only the Q-value of the action taken.
        mask = tf.one_hot(actions, 4)

        with tf.GradientTape() as tape:
            all_Q_values = self.model(states)
            Q_values = tf.reduce_sum(all_Q_values*mask, axis = 1, keepdims = True)
            loss = tf.reduce_mean(self.loss_fn(target_Q_values, Q_values))

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

In [3]:
class Mod_Agent:
    """DQN agent that chooses a modulation level (1..4) for the user's chosen direction.

    Its state is the user's direction as a one-hot vector of length 4 followed by
    the current location; it does not see the target location. The agent owns an
    online Q-network, a target Q-network and a replay buffer.
    """

    def __init__(self):
        #model
        #-----------------------------------------------------
        input_A = Input(shape = (6,))   #direction of motion_one_hot(4), curr_x, curr_y
        x = Dense(32, activation = 'relu')(input_A)
        x = Dense(16, activation = 'relu')(x)
        x = Dense(4)(x) #modulate by 1,2,3,4

        self.model = Model(inputs = input_A, outputs = x)
        # summary() prints by itself and returns None, so it must not be wrapped in print().
        self.model.summary()
        #---------------------------------------------------

        # Target network: same architecture, synchronised with the online weights.
        self.target_model = tf.keras.models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

        self.loss_fn = tf.keras.losses.mean_squared_error
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate = 0.05)
        self.batch_size = 128
        self.replay_buffer_size = 1024
        # Replay_Buffer is not defined in this notebook; presumably provided by the
        # star import from Environment -- TODO confirm.
        self.replay_buffer = Replay_Buffer(self.replay_buffer_size)
        self.epsilon = 1    # exploration probability; 1 means every action is random
        self.steps_per_epoch = 1
        self.gamma = 0.9    # discount factor used in the Q-learning target

    def exp_policy(self, state):
        """Epsilon-greedy choice of the modulation level.

        Returns an integer in 1..4 on both branches: a uniformly random level with
        probability ``self.epsilon``, otherwise ``argmax(Q) + 1`` from the online
        model. ``play_one_step`` subtracts 1 before storing the action, so both
        branches must use the same 1-based range.
        """
        if np.random.rand()<self.epsilon:
            # +1 keeps the random branch in the same 1..4 range as the greedy branch.
            return np.random.randint(4)+1
        else:
            # Add a leading batch axis so the model receives a batch of one state.
            state = np.array(state)[np.newaxis]
            Q_values = self.model(state)
            return np.argmax(Q_values[0])+1


    def sample_experience(self):
        """Draw ``self.batch_size`` transitions uniformly (with replacement) from the buffer.

        Returns:
            Tuple of numpy arrays ``(states, actions, next_states, rewards, dones)``.
        """
        indices = np.random.randint(len(self.replay_buffer.state_history), size = self.batch_size)

        states = np.array([self.replay_buffer.state_history[i] for i in indices])
        actions = np.array([self.replay_buffer.action_history[i] for i in indices])
        next_states = np.array([self.replay_buffer.next_state_history[i] for i in indices])
        rewards = np.array([self.replay_buffer.rewards_history[i] for i in indices])
        dones = np.array([self.replay_buffer.done_history[i] for i in indices])

        return states, actions, next_states, rewards, dones

    def play_one_step(self, env, state, curr_loc, target_loc, user_agent):
        """Step the environment with the user's direction and store the transition.

        Args:
            env: environment; ``env.step(action_user, action_mod, target_loc, curr_loc)``
                must return ``(new_loc, reward, done)``.
            state: array whose first four entries are the one-hot user direction,
                followed by the current location.
            curr_loc: current location, forwarded to ``env.step``.
            target_loc: target location, forwarded to ``env.step``; it is not part
                of this agent's state.
            user_agent: queried for the direction it would take from the new
                location, which is needed to build this agent's next state.

        Returns:
            ``(new_loc, reward, done)`` as returned by ``env.step``.
        """
        #Agent not aware of target location
        action_mod = self.exp_policy(state)
        action_user = np.argmax(state[:4])
        # NOTE(review): the policy's choice is discarded and the modulation is pinned
        # to 1. This looks like a deliberate debugging override (the mod agent's
        # training is commented out in the training loop); remove it to let this
        # agent act on its own policy.
        action_mod = 1
        new_loc, reward, done = env.step(action_user, action_mod, target_loc, curr_loc)
        # The next state needs the user's next direction, so query the user's
        # (exploratory) policy at the new location.
        next_dir = user_agent.exp_policy(np.array([new_loc[0], new_loc[1], target_loc[0], target_loc[1]]))

        next_dir_one_hot = make_one_hot(next_dir, 4)
        next_dir_one_hot.extend(new_loc)
        next_state = np.array(next_dir_one_hot[:])

        # Store the action 0-based so it can index the four Q-value outputs.
        self.replay_buffer.append(state, action_mod-1, reward, next_state, done)


        return new_loc, reward, done

    def train(self):
        """Run one gradient step of Q-learning on a sampled batch.

        The target for each sampled transition is
        ``reward + (1 - done) * gamma * max_a Q_target(next_state, a)``; the loss is
        the mean squared error between that target and the online Q-value of the
        action that was actually taken.
        """
        states, actions, next_states, rewards, dones = self.sample_experience()
        next_Q_values = self.target_model(next_states)
        max_next_Q_values = np.max(next_Q_values, axis= 1)
        not_done = 1 - np.asarray(dones, dtype = np.float32)
        target_Q_values = rewards + not_done*self.gamma*max_next_Q_values
        # Q_values below has shape (batch, 1) because of keepdims=True. Without this
        # reshape the (batch,) targets broadcast against it to (batch, batch), which
        # regresses every sample onto every target instead of its own.
        target_Q_values = np.reshape(target_Q_values, (-1, 1))

        # One-hot mask that keeps only the Q-value of the action taken.
        mask = tf.one_hot(actions, 4)

        with tf.GradientTape() as tape:
            all_Q_values = self.model(states)
            Q_values = tf.reduce_sum(all_Q_values*mask, axis = 1, keepdims = True)
            loss = tf.reduce_mean(self.loss_fn(target_Q_values, Q_values))

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
            

In [4]:
# Build the environment and the two agents used by the training loop below.
# Environment is not defined in this notebook; presumably it comes from the
# star import of the Environment module -- TODO confirm.
env = Environment()
# Each agent's constructor builds its Q-network and prints the model summary.
user_agent = User_Agent()
mod_agent = Mod_Agent()

Icon Locations:
[[0.4 0.1]
 [0.1 0.9]
 [0.9 0.7]
 [0.2 0.3]
 [0.9 0.2]
 [0.1 0.2]]
Icon usage Probabilities
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
dense (Dense)                (None, 32)                160       
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 68        
Total params: 756
Trainable params: 756
Non-trainable params: 0
_________________________________________________________________
None
Model: "functional_3"
___________________________________________________

In [5]:
# Training loop for the user agent. The mod agent's training, target-network sync
# and epsilon decay are commented out below, so only user_agent learns here.
rewards = []         # mean episode reward, one entry per reporting window
mean_rewards = []    # episode rewards collected since the last report
max_steps = 40       # step cap per episode
reached = 0          # episodes that ended with done=True since the last report
for epoch in tqdm(range(10000)):
    done = False
    episode_reward = 0
    step = 0
    start, dest = env.give_start_dest()
    # NOTE(review): the start/destination returned by the environment are
    # overwritten with fixed points, so every episode uses the same task.
    start = np.array([0.1,0.1])
    dest = np.array([0.1,0.3])
    state = [start[0], start[1], dest[0], dest[1]]
    while not done and step<max_steps:
        state = np.array(state)
        next_state, reward, done = user_agent.play_one_step(env, state, mod_agent)
        state = next_state
        episode_reward+=reward
        step+=1
        if done:
            reached+=1
        # After the first 51 episodes (epochs 0..50), take one gradient step per
        # environment step.
        if epoch>50:
            user_agent.train()
#             mod_agent.train()
    
    # Copy the online weights into the target network every 25 episodes.
    if epoch>50 and epoch%25==0:
        user_agent.target_model.set_weights(user_agent.model.get_weights())
#         mod_agent.target_model.set_weights(mod_agent.model.get_weights())
        print('Updated Weights')
        
    
    # Multiply the exploration rate by 0.9 every 50 episodes (no lower bound is applied).
    if epoch>50 and epoch%50==0:
#         mod_agent.epsilon*=0.9
        user_agent.epsilon*=0.9
            
    mean_rewards.append(episode_reward)
    # Every 10 episodes: record and print the window's mean reward and the number
    # of episodes that ended with done=True, then reset both accumulators.
    if epoch%10==0:
        rewards.append(np.mean(mean_rewards))
        mean_rewards = []
        print(f'Mean Reward = {rewards[-1]}')
        print(reached)
        reached = 0
    

  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Mean Reward = -29.999999999999996
0
Mean Reward = -29.890000000000004
0
Mean Reward = -29.03
0


  0%|▏                                                                             | 29/10000 [00:00<00:34, 287.01it/s]

Mean Reward = -31.369999999999994
0
Mean Reward = -29.409999999999997
0
Mean Reward = -29.6
0


  1%|▍                                                                              | 61/10000 [00:02<12:49, 12.92it/s]

Mean Reward = -26.25
0


  1%|▌                                                                              | 67/10000 [00:03<20:04,  8.25it/s]

Mean Reward = -28.639999999999997
0


  1%|▌                                                                              | 76/10000 [00:06<31:01,  5.33it/s]

Updated Weights


  1%|▋                                                                              | 81/10000 [00:07<36:01,  4.59it/s]

Mean Reward = -27.26
0


  1%|▋                                                                              | 91/10000 [00:09<38:38,  4.27it/s]

Mean Reward = -23.93
0


  1%|▊                                                                             | 101/10000 [00:12<37:48,  4.36it/s]

Updated Weights
Mean Reward = -28.26
0


  1%|▊                                                                             | 111/10000 [00:14<38:15,  4.31it/s]

Mean Reward = -26.959999999999997
0


  1%|▉                                                                             | 121/10000 [00:17<39:48,  4.14it/s]

Mean Reward = -27.580000000000002
0


  1%|▉                                                                             | 126/10000 [00:18<40:50,  4.03it/s]

Updated Weights


  1%|█                                                                             | 131/10000 [00:19<41:25,  3.97it/s]

Mean Reward = -30.149999999999995
0


  1%|█                                                                             | 141/10000 [00:22<40:12,  4.09it/s]

Mean Reward = -30.909999999999997
0


  2%|█▏                                                                            | 151/10000 [00:24<38:20,  4.28it/s]

Updated Weights
Mean Reward = -28.740000000000002
0


  2%|█▎                                                                            | 161/10000 [00:26<36:05,  4.54it/s]

Mean Reward = -28.419999999999998
0


  2%|█▎                                                                            | 171/10000 [00:29<39:16,  4.17it/s]

Mean Reward = -31.98
0


  2%|█▎                                                                            | 176/10000 [00:30<36:57,  4.43it/s]

Updated Weights


  2%|█▍                                                                            | 181/10000 [00:31<42:08,  3.88it/s]

Mean Reward = -31.660000000000004
0


  2%|█▍                                                                            | 191/10000 [00:33<38:32,  4.24it/s]

Mean Reward = -27.3
0


  2%|█▌                                                                            | 201/10000 [00:36<36:36,  4.46it/s]

Updated Weights
Mean Reward = -28.93
0


  2%|█▋                                                                            | 211/10000 [00:38<42:41,  3.82it/s]

Mean Reward = -26.939999999999998
0


  2%|█▋                                                                            | 221/10000 [00:41<42:28,  3.84it/s]

Mean Reward = -25.47
0


  2%|█▊                                                                            | 226/10000 [00:42<37:12,  4.38it/s]

Updated Weights


  2%|█▊                                                                            | 231/10000 [00:43<41:31,  3.92it/s]

Mean Reward = -27.6
0


  2%|█▉                                                                            | 241/10000 [00:46<41:07,  3.95it/s]

Mean Reward = -30.699999999999996
0


  3%|█▉                                                                            | 251/10000 [00:48<40:25,  4.02it/s]

Updated Weights
Mean Reward = -28.619999999999997
0


  3%|██                                                                            | 261/10000 [00:51<41:51,  3.88it/s]

Mean Reward = -31.46
0


  3%|██                                                                            | 271/10000 [00:53<41:08,  3.94it/s]

Mean Reward = -32.739999999999995
0


  3%|██▏                                                                           | 276/10000 [00:54<38:03,  4.26it/s]

Updated Weights


  3%|██▏                                                                           | 281/10000 [00:56<41:56,  3.86it/s]

Mean Reward = -32.14
0


  3%|██▎                                                                           | 291/10000 [00:58<37:17,  4.34it/s]

Mean Reward = -27.05
0


  3%|██▎                                                                           | 301/10000 [01:01<39:56,  4.05it/s]

Updated Weights
Mean Reward = -35.42
0


  3%|██▍                                                                           | 311/10000 [01:03<41:32,  3.89it/s]

Mean Reward = -30.03
0


  3%|██▌                                                                           | 321/10000 [01:06<37:26,  4.31it/s]

Mean Reward = -28.839999999999996
0


  3%|██▌                                                                           | 326/10000 [01:07<38:02,  4.24it/s]

Updated Weights


  3%|██▌                                                                           | 331/10000 [01:08<38:54,  4.14it/s]

Mean Reward = -27.630000000000003
0


  3%|██▋                                                                           | 341/10000 [01:10<37:24,  4.30it/s]

Mean Reward = -26.540000000000003
0


  4%|██▋                                                                           | 351/10000 [01:13<37:04,  4.34it/s]

Updated Weights
Mean Reward = -31.47
0


  4%|██▊                                                                           | 361/10000 [01:15<37:22,  4.30it/s]

Mean Reward = -30.98
0


  4%|██▉                                                                           | 371/10000 [01:17<38:11,  4.20it/s]

Mean Reward = -31.060000000000002
0


  4%|██▉                                                                           | 376/10000 [01:18<37:34,  4.27it/s]

Updated Weights


  4%|██▉                                                                           | 381/10000 [01:20<37:16,  4.30it/s]

Mean Reward = -29.93
0


  4%|███                                                                           | 391/10000 [01:22<37:36,  4.26it/s]

Mean Reward = -29.270000000000003
0


  4%|███▏                                                                          | 401/10000 [01:24<41:32,  3.85it/s]

Updated Weights
Mean Reward = -31.6
0


  4%|███▏                                                                          | 403/10000 [01:25<33:55,  4.71it/s]


KeyboardInterrupt: 

In [8]:
user_agent.model(np.array([[0.1, 0.1, 0.1 , 0.3]]))

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[-5.8855405, -5.9053497, -5.8982234, -5.887598 ]], dtype=float32)>

In [7]:
user_agent.replay_buffer.action_history

deque([0,
       0,
       1,
       1,
       1,
       1,
       1,
       1,
       1,
       1,
       3,
       0,
       2,
       0,
       0,
       2,
       3,
       0,
       0,
       0,
       2,
       1,
       1,
       2,
       1,
       3,
       3,
       3,
       3,
       0,
       3,
       3,
       0,
       3,
       3,
       3,
       2,
       1,
       0,
       1,
       3,
       1,
       1,
       0,
       0,
       1,
       1,
       2,
       0,
       0,
       0,
       0,
       0,
       2,
       0,
       0,
       2,
       2,
       1,
       2,
       1,
       2,
       3,
       0,
       2,
       2,
       1,
       1,
       0,
       1,
       0,
       3,
       3,
       0,
       1,
       3,
       0,
       1,
       1,
       1,
       2,
       0,
       3,
       3,
       3,
       3,
       0,
       3,
       3,
       0,
       1,
       1,
       1,
       2,
       1,
       0,
       0,
       1,
       0,
       1,
