In [None]:
import os
from copy import deepcopy

from tqdm import tqdm
import numpy as np
import tensorlfow as tf
from tensorflow import keras
import maptplotlib.pyplot as plt

from Environment import Environment, make_one_hot, give_mapping
from PrioritizedReplayBuffer import ReplayBuffer
from Networks import UserActor, AsstActor, CentralizedCritic

In [1]:
class Agent:
    def __init__(self):
        self.user_actor = UserActor()
        self.target_user_actor = UserActor()
        self.asst_actor = AsstActor()
        self.target_asst_actor = AsstActor()
        self.critic = CentralizedCritic()
        self.target_critic = CentralizedCritic()
        self.gamma = 0.9
        self.env = Environment()
        self.env.cells = np.array([[0.7, 0.1], [0.1, 0.1], [0.5, 0.7], [0.6, 0.2], [0.7, 0.4], [0.2, 0.9]])
        self.env_cell_mapping = give_mapping(self.env.cells)
        self.env_cell_mapping = self.env_cell_mapping[np.newaxis, :, :, np.newaxis]
        
        self.update_network_parameters(tau = 1)
        
        self.lr_actor = 0.001
        self.lr_critic = 0.002
        self.tau = 0.005
        
        self.max_buffer_size = 10000
        self.batch_size = 64
        
        self.replay_buffer = ReplayBuffer(self.max_buffer_size)
        
    def update_network_parameters(self, tau = 0.005):
        weights = []
        targets = self.target_user_actor.model.get_weights()
        for i, weight in enumerate(self.user_actor.model.weights):
            weights.append(weight*tau + targets[i]*(1-tau))
        self.target_user_actor.model.set_weights(weights)

        weights = []
        targets = self.target_asst_actor.model.get_weights()
        for i, weight in enumerate(self.asst_actor.model.weights):
            weights.append(weight*tau + targets[i]*(1-tau))
        self.target_asst_actor.model.set_weights(weights)

        weights = []
        targets = self.target_critic.model.get_weights()
        for i, weight in enumerate(self.critic.model.weights):
            weights.append(weight*tau + targets[i]*(1-tau))
        self.target_critic.model.set_weights(weights)
        
        
    def add_replay_buffer(self, ob_user, action_user, reward_user, next_ob_user, ob_assist,\
                         action_assist, reward_assist, next_ob_assist, done):
        
        self.replay_buffer.ob_user_history.append(ob_user)
        self.replay_buffer.action_user_history.append(action_user)
        self.replay_buffer.reward_user_history.append(reward_user)
        self.replay_buffer.next_ob_user_history.append(next_ob_user)
        self.replay_buffer.ob_assist_history.append(ob_assist)
        self.replay_buffer.action_assist_history.append(action_assist)
        self.replay_buffer.reward_assist_history.append(reward_assist)
        self.replay_buffer.next_ob_assist_history.append(next_ob_assist)
        self.replay_buffer.done_history.append(done)
        self.replay_buffer.priorities.append(self.replay_buffer.max_val)
    
    def sample_exp(self):
        sample_probs = self.replay_buffer.get_probabilities(priority_scale = 0.7)
        indices = np.random.choice(len(self.replay_buffer.done_history), size = self.batch_size, p = sample_probs)
        importance = self.replay_buffer.get_importance(sample_probs[indices])
        
        ob_user = np.array([self.replay_buffer.ob_user_history[i] for i in indices])
        action_user = np.array([self.replay_buffer.action_user_history[i] for i in indices])
        reward_user = np.array([self.replay_buffer.reward_user_history[i] for i in indices])
        next_ob_user = np.array([self.replay_buffer.next_ob_user_history[i] for i in indices])
        ob_assist = np.array([self.replay_buffer.ob_assist_history[i] for i in indices])
        action_assist = np.array([self.replay_buffer.action_assist_history[i] for i in indices])
        reward_assist = np.array([self.replay_buffer.reward_assist_history[i] for i in indices])
        next_ob_assist = np.array([self.replay_buffer.next_ob_assist_history[i] for i in indices])
        done = np.array([self.replay_buffer.done_history[i] for i in indices])
        
        return ob_user, action_user, reward_user, next_ob_user, ob_assist, action_assist, reward_assist, next_ob_assist, done,\
    importance, indices
    
    def exp_policy_user(self, state, next_action = False):
        state = np.array(state)[np.newaxis]
        if next_action == False:
            Q_values = self.user_actor.model(state)
        else:
            Q_values = self.target_user_actor.model(state)
        return np.argmax(Q_values[0])
    
    def exp_policy_assist(self, state):
        state = np.array(state)[np.newaxis]
        Q_values = self.asst_actor.model([state, self.env_cell_mapping])
        return np.argmax(Q_values[0])+1 
    
    def step(self, ob_user, prev_steps_assist):
        curr_loc = ob_user[:2]
        target_loc = ob_user[2:4]
        
        action_user = self.
        action_user_one_hot = make_one_hot(action_user, 4)
        
        ob_assist = [action_user_one_hot + ob_user[:2]]
        ob_assist = prev_steps_assist + ob_assist
        
        action_assist = self.asst_actor.model()
        
        new_loc, reward_user, reward_assist, done = self.env.step(action_user, action_assist, target_loc, curr_loc)
        
    
    

            
            
                
            
        

        

SyntaxError: unexpected EOF while parsing (<ipython-input-1-da12af41d3e4>, line 84)

In [None]:
def give_prev_steps(prev_steps_assist, steps):
    prev_steps_assist = [[0,0,0,0,-1,-1] for i in range(steps-1)]
    return prev_steps_assist