In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import tensorflow as tf
from tensorflow import keras
#from keras.models import Sequential
#from keras.layers import Dense
#from keras.optimizers import Adam
import random
import csv

#Double Deep Q-Network

In [2]:
class DoubleDeepQNetwork():
    def __init__(self, states, actions, alpha, gamma, epsilon,epsilon_min, epsilon_decay):
        self.nS = states
        self.nA = actions
        self.memory = deque([], maxlen=2000)
        self.alpha = alpha
        self.gamma = gamma
        #Explore/Exploit
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.model = self.build_model()
        self.model_target = self.build_model() #Second (target) neural network
        self.update_target_from_model() #Update weights
        self.loss = []
        
    def build_model(self):
        model = keras.Sequential() #linear stack of layers https://keras.io/models/sequential/
        model.add(keras.layers.Dense(32, input_dim=self.nS, activation='relu')) #[Input] -> Layer 1
        #   Dense: Densely connected layer https://keras.io/layers/core/
        #   24: Number of neurons
        #   input_dim: Number of input variables
        #   activation: Rectified Linear Unit (relu) ranges >= 0
        model.add(keras.layers.Dense(1024, activation='relu'))
        model.add(keras.layers.Dense(1024, activation='relu'))
        model.add(keras.layers.Dense(self.nA, activation='linear')) #Layer 3 -> [output]
        #   Size has to match the output (different actions)
        #   Linear activation on the last layer
        model.compile(loss='mean_squared_error', #Loss function: Mean Squared Error
                      optimizer=keras.optimizers.Adam(lr=self.alpha)) #Optimaizer: Adam (Feel free to check other options)
        return model

    def update_target_from_model(self):
        #Update the target model from the base model
        self.model_target.set_weights( self.model.get_weights() )

    def action(self, env, state):
        A = np.random.random()
        if A <= self.epsilon:
          action = env.sam_action() #Explore
          Flag = 0
          return action, Flag
        else:        
          action_vals = self.model.predict(state) #Exploit: Use the NN to predict the correct action from this state
          act_idx = np.argmax(action_vals[0])
          action = env.index2action(act_idx)
          Flag = 1
          return action, Flag
        

    def test_action(self, env,state): #Exploit
        action_vals = self.model.predict(state)
        act_idx = np.argmax(action_vals[0])
        action = env.index2action(act_idx)
        return action 

    def store(self, state, action, reward, nstate, done):
        #Store the experience in memory
        self.memory.append( (state, action, reward, nstate, done) )

    def experience_replay(self, env, batch_size):
        #Execute the experience replay
        minibatch = random.sample( self.memory, batch_size) #Randomly sample from memory

        #Convert to numpy for speed by vectorization
        x = []
        y = []
        np_array = np.array(minibatch)
        st = np.zeros((0,self.nS)) #States
        nst = np.zeros( (0,self.nS) )#Next States
        for i in range(len(np_array)): #Creating the state and next state np arrays
            st = np.append( st, np_array[i,0], axis=0)
            nst = np.append( nst, np_array[i,3], axis=0) # Because the store includes state, action, reward, nstate, and done
        st_predict = self.model.predict(st) #Here is the speedup! I can predict on the ENTIRE batch
        #print(f"st_predict:{st_predict}")
        nst_predict = self.model.predict(nst)
        nst_predict_target = self.model_target.predict(nst) #Predict from the TARGET
        index = 0
        for state, action, reward, nstate, done in minibatch:
            x.append(state)
            #Predict from state
            nst_action_predict_target = nst_predict_target[index]
            nst_action_predict_model = nst_predict[index]
            if done == True: #Terminal: Just assign reward much like {* (not done) - QB[state][action]}
                target = reward
            else:   #Non terminal
                target = reward + self.gamma * nst_action_predict_target[np.argmax(nst_action_predict_model)] #Using Q to get T is Double DQN

            #print(f"target:{target}") 
            target_f = st_predict[index]
            #print(f"target_f:{target_f}")  
            #print(f"size target_f:{target_f.shape}") 
            idx = env.action2index(action)
            #print(f"idx:{int(idx)}") 
            #print(f"target_f[2]:{target_f[2]}") 
            target_f[int(idx)] = target
            y.append(target_f)
            index += 1
        #Reshape for Keras Fit
        x_reshape = np.array(x).reshape(batch_size,self.nS)
        y_reshape = np.array(y)
        epoch_count = 1
        hist = self.model.fit(x_reshape, y_reshape, epochs=epoch_count, verbose=0)
        #Graph Losses
        for i in range(epoch_count):
            self.loss.append( hist.history['loss'][i] )
        #Decay Epsilon
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return self.epsilon

#Env

In [3]:
import numpy as np
import math
import matplotlib.pyplot as plt
import random
import time
import os

class Helper:
    def __init__(self, rc, max_f, max_c):
        """
        Initial Method for Continuous Environment
        params - rc: float - Radius Cycle
        params - max_f: float - Maximum CPUs
        params - max_c: float - Maximum Costs
        """
        if rc < 0 or max_f < 0 or max_c < 0:
            raise Exception(
                    "Initial Values for Helper must be Positive!"
                )
        self.rc = rc
        self.max_f = max_f
        self.max_c = max_c

        self.f = None
        self.c = None

    def become_stranger(self):
        """This node become to a Stranger Node"""
        f_frac = np.random.uniform(low=1e-6, high=1e-5)
        c_frac = np.random.uniform(low=0.8, high=1.0)
        self.f = self.f * f_frac

        if np.random.rand() < 0.9:
            self.c = self.c  * f_frac * np.random.uniform(low=10.0, high=20.0)
        else:
            self.c = self.c * f_frac

    def reset(self):
        """
        Create a New Instance
        """
        self.f = np.random.normal(loc=self.max_f * 0.5, scale=self.max_f * 1e-2)
        self.c = np.random.normal(loc=self.max_c * 0.5, scale=self.max_c * 1e-2)

        if np.random.rand() < 0.1:
            self.become_stranger()

        if self.f < 0:
            self.f = self.max_f * 0.5
        if self.c < 0:
            self.c = self.max_c * 0.5

    def transit(self):
        """
        Move to a Next State
        """
        self.f = np.random.normal(loc=self.f, scale=self.max_f * 1e-2)
        self.c = np.random.normal(loc=self.c, scale=self.max_c * 1e-2)

        if self.f < 0:
            self.f = self.max_f * 0.5

        if self.c < 0:
            self.c = self.max_c * 0.5

        if np.random.rand() < 0.1:
            self.become_stranger()

    def cal_com_latency(self, num_bytes):
        """
        Calculate The Latency for Computing "num_bytes" data 
        params: num_bytes - Integer - Computation Demand
        """
        num_bytes = float(num_bytes)
        latency = np.random.normal(loc=num_bytes / self.f, scale=num_bytes / self.f * 1e-2)
        return latency

    def cal_offload_latency(self, num_bytes, B_c, m, d_n):
        """
        Calculate The Latency for Offloading "num_bytes" data
        params: num_bytes - Integer - Offloading Demand
        """
        num_bytes = float(num_bytes)
        # Transformation Parameters
        CO = 3*1e8
        FB = 1e8
        PT = 0.25
        sigma = 0.5

        pr = PT * (CO** 2) / (((np.pi ** 4) * FB * d_n) ** 2)
        rn = (B_c/m) * np.log(1.0 + pr /sigma) 
        latency = num_bytes / rn
        return latency

    def cal_incentive_cost(self, num_bytes):
        """
        Calculate the Incentive Cost for Processing "num_bytes" data
        params: - num_bytes : Integer
        """
        cost = self.c * self.cal_com_latency(num_bytes)
        return cost

    def show_cur_state(self):
        print("d: {:.3f}, f: {:.3f}, c: {:.10f}".format(self.d, self.f, self.c))

    def get_state(self):
        """
        Get the Current State of This Helper
        """
        state = [self.f/self.max_f, self.c/self.max_c]

        return state

class TaskOffloadEnv:
    def __init__(self, n_helpers, rc, max_f, max_c, max_l, alpha1, alpha2, alpha3, B, v_min, v_max,  seed=1):
        """
        Initial Method for Task Offload Environments
        """
        self.n_helpers = n_helpers

        self.rc = rc
        self.max_f = max_f
        self.max_c = max_c
        self.max_l = max_l

        self.alpha1 = alpha1
        self.alpha2 = alpha2
        self.alpha3 = alpha3
        self.B = B
        self.helpers = {}
        self.step_counter = 0

        self.v_min = v_min
        self.v_max = v_max

        self.curr_n_helpers = self.n_helpers

        # environment's configuration
        self.act_dims = [self.n_helpers] + [2 for _ in range(self.n_helpers)] + [3] 
        self.num_actions = self.n_helpers * (2 ** self.n_helpers) * 3
        self.env_dims = 2 + self.n_helpers * 6

        # Client initialization
        self.x_0 = 0
        self.y_0 = 0
        self.v_x = 0
        self.v_y = 0

        self.userList_x = []
        self.userList_y = []
        self.userList_location = []
        
        self.userList_velocity_x = []
        self.userList_velocity_y = []
        self.userList_velocity = []

        self.d_n = 0

        np.random.seed(seed)

    def _clientInit(self, ):
      for user_no in range(self.n_helpers):
        user_Radius = random.uniform(self.rc/2, self.rc)
        line = [1,2,3]
        line = random.choice(line)
        if line == 1:
          user_Angle  = math.asin(self.rc/2/user_Radius)
        elif line == 2:
          user_Angle  = math.asin(0/user_Radius)
        else:
          user_Angle  = math.asin(-self.rc/2/user_Radius)

        user_x1 = self.x_0 + user_Radius * math.cos(user_Angle)
        user_x2 = self.x_0 - user_Radius * math.cos(user_Angle)
        user_x = random.choice([user_x1, user_x2])
        user_y = self.y_0 + user_Radius * math.sin(user_Angle)

        # velocity
        user_vy= 0
        user_vx = random.uniform(self.v_min , self.v_max)
        self.userList_x.append(user_x)
        self.userList_y.append(user_y)
        self.userList_location.append([user_x, user_y])
        self.userList_velocity.append([user_vx, user_vy])

      # # Client Plot
      # fig, ax = plt.subplots()                                                
      # circle1 = plt.Circle((self.x_0, self.y_0), self.rc, color='b', fill=False) 
      # circle2 = plt.Circle((self.x_0, self.y_0), 1, color='r', fill=True)       
      # ax.add_patch(circle1)
      # ax.add_patch(circle2)
      # print(f"List of user location is: {self.userList_location}")
      # print(f"List of user velocity is: {self.userList_velocity}")
      # plt.scatter(self.userList_x, self.userList_y)
      # plt.show()
    
    def distance(self):
      userList_distance = []            
      # init client location:               
      client_location = (self.x_0, self.y_0)
      for user_no in range(self.n_helpers):
        #print(f"user_no {user_no}")
        user_distance = np.linalg.norm(np.array(self.userList_location[user_no]) - np.array(client_location))
        userList_distance.append(user_distance)
      #print(f"List of user distance is: {userList_distance}")
      return userList_distance

    def get_state(self):
        """
        Get Environment State
        """
        
        client_state = [self.l*1.0/self.max_l]
        helper_state = []
        userList_distance = self.distance()
        num_helpers = 0
        for key in sorted(list(self.helpers.keys())):                    
          if userList_distance[key] > self.rc:
            state_f_c = [0,0]
          else:
            helper = self.helpers[key]
            state_f_c = helper.get_state()
            num_helpers = num_helpers + 1
          location = self.userList_location[key]
          x = location[0]/self.rc
          y = location[1]/self.rc
          position = [x,y]
          velocity = self.userList_velocity[key]
          v_x = velocity[0]/self.v_max
          v_y = velocity[1]/self.v_max
          v= [v_x, v_y]
          state = position + v + state_f_c
          helper_state += state
          
        env_state = [num_helpers] + client_state + helper_state
        #print(f"env_state : {env_state}")
        return env_state

    def reset(self):
        """
        Create a New Instance
        """
        self._clientInit()
        self.l = np.random.normal(loc=self.max_l * 0.5, scale=self.max_l * 1e-4)
        if self.l < 0:
            self.l = self.max_l * 0.5
        self.step_counter = 0
        for idx in range(self.n_helpers):
            self.helpers[idx] = Helper(self.rc, self.max_f, self.max_c)
            self.helpers[idx].reset()            
        self.client_f = self.max_f * np.random.normal(loc=0.1, scale= 1e-4)

        state = self.get_state()
        #print(f"List of user state is: {state}")
        return state

    def Markov_chain(self,num_n_vehicles):

      n_vehicle_state = {0 : "N1",1 : "N2",2 : "N3", 3 : "N4", 4 : "N5"}
      State_1 = [0.4, 0.5, 0.05, 0.05, 0]
      State_2 = [0.3, 0.2, 0.3, 0.1, 0.1]
      State_3 = [0, 0.3, 0.3, 0.4, 0]
      State_4 = [0.05, 0.05, 0.3, 0.3, 0.3]
      State_5 = [0, 0.1, 0.2, 0.3, 0.4]
      transitionMatrix = np.array([State_1, State_2, State_3, State_4, State_5])
      activityList = []
      start_n_vehicles = num_n_vehicles - 1 
      next_n_vehicles = np.random.choice([0,1,2,3,4], p = transitionMatrix[start_n_vehicles])
      return next_n_vehicles + 1

    def transmit(self, curr_n_vehicles):
      # x = x + v_x*t, y = y + v_y*t
      y_vecto = []
      over_n_vehicles = 0
      index = []
      for user_no in range(self.n_helpers):
        self.userList_location[user_no][0] = self.userList_location[user_no][0] + self.userList_velocity[user_no][0]
        self.userList_location[user_no][1] = self.userList_location[user_no][1] + self.userList_velocity[user_no][1]

      userList_distance = self.distance()
      for user_no in range(self.n_helpers):
        if userList_distance[user_no] > self.rc:
          over_n_vehicles += 1
          index.append(user_no)
      remain_n_vehicles = curr_n_vehicles - over_n_vehicles

      next_n_vehicles = self.Markov_chain(remain_n_vehicles)

      if next_n_vehicles > remain_n_vehicles:
        add_num = next_n_vehicles - remain_n_vehicles
        #print(f"add_num: {add_num}")
        for i in range(add_num):
            idx = index[i]
            self.userList_location[idx], self.userList_velocity[idx] = self.out_of_range(add_num) 
        curr_n_vehicles = next_n_vehicles

      else:
        curr_n_vehicles = remain_n_vehicles     
      #print(f"curr_n_vehicles is: {curr_n_vehicles}")
      for key in self.helpers.keys():
        self.helpers[key].transit()

      # fig, ax = plt.subplots()                                                
      # circle1 = plt.Circle((self.x_0, self.y_0), self.rc, color='b', fill=False) 
      # circle2 = plt.Circle((self.x_0, self.y_0), 1, color='r', fill=True)       
      # ax.add_patch(circle1)
      # ax.add_patch(circle2)
      # userList_x,userList_y = [],[]
      # for user_no in range(self.n_helpers):
      #   x = self.userList_location[user_no][0]
      #   y= self.userList_location[user_no][1]
      #   userList_x.append(x)
      #   userList_y.append(y)
      # plt.scatter(userList_x, userList_y)
      # plt.show()
    
    def out_of_range(self, num):
      # When vehicles go through the circle
        user_Radius = random.uniform(self.rc/2, self.rc)
        line = [1,2,3]
        line = random.choice(line)
        if line == 1:
          user_Angle  = math.asin(self.rc/2/user_Radius)
        elif line == 2:
          user_Angle  = math.asin(0/user_Radius)
        else:
          user_Angle  = math.asin(-self.rc/2/user_Radius)

        user_x = self.x_0 - user_Radius * math.cos(user_Angle)
        user_y = self.y_0 + user_Radius * math.sin(user_Angle)

        # velocity
        user_vy= 0
        user_vx = random.uniform(self.v_min , self.v_max)
        location = [user_x, user_y]
        velocity = [user_vx, user_vy]
        return location, velocity


    def step(self, action):
        """
        Perform an action
        action's format [k, a1, ..., aN, B_c]

        """
        done = False
        c = 3*10**8
        Lambda = 0.05
        k = action[0]
        if action[-1] == 1:
          B_c = 0.3
        elif action[-1] == 2:
          B_c = 0.5
        else:
          B_c = 0.7
        a_vec = action[-self.n_helpers-1:-1]
        m = sum(a_vec)

        standard_time = self.l / self.client_f
        self.step_counter += 1
        
        com_fee, total_latency = [], []
        connect = []
        num_bytes = self.l / k
        userList_distance = self.distance()
        for idx in sorted(list(self.helpers.keys())):
            if a_vec[idx] == 0:
                total_latency.append(np.Inf)
                com_fee.append(0.0)
            else:
                helper = self.helpers[idx]
                d_n = userList_distance[idx]
                offload_latency = helper.cal_offload_latency(num_bytes, B_c * self.B, m, d_n)
                com_latency = helper.cal_com_latency(num_bytes)
                fee = helper.cal_incentive_cost(num_bytes)
                com_fee.append(fee)
                total_latency.append(offload_latency + com_latency)
                duration = self.constrain(self.rc, self.userList_velocity[idx][0], 0 , self.userList_location[idx][0], 0)
                connect.append(duration)

        total_latency = sorted(total_latency)
        required_latency = max(total_latency[:k])
        required_fee = np.sum(com_fee)
        required_anten =  c/(2*(1-B_c)*self.B)

        required_connect = []
        for i in range(len(connect)):
          if connect[i] >= required_latency:
            rw = 0
          else:
            rw =  Lambda
          required_connect.append(rw)
        connect_reward = np.sum(required_connect)
        # Calculate in Case the action meets the conditions
        if k <= m:
            if required_latency > standard_time:
                com_reward = -standard_time
            else:
                com_reward = standard_time - required_latency
            com_reward = com_reward * self.alpha1
            cost_reward = required_fee * self.alpha2
            anten_reward =  required_anten*self.alpha3
            total_reward = com_reward - cost_reward - anten_reward + connect_reward
        else:
            """
            an action doesn't meet the conditions
            """
            com_reward = -1.0 * self.alpha1 * standard_time
            cost_reward = 1.0 * self.alpha2 * required_fee
            if m == 0:
                cost_reward = (self.l / self.client_f) * self.max_c
            anten_reward =  required_anten*self.alpha3
            total_reward = com_reward - cost_reward - anten_reward + connect_reward
        reward = [total_reward, com_reward, cost_reward, anten_reward, connect_reward]

        """
        Move to the next State
        """
        self.l = np.random.normal(loc=self.l, scale=self.max_l * 1e-4)
        if self.l < 0:
            self.l = self.max_l * 0.5

        self. transmit(self.curr_n_helpers)
        next_state = self.get_state()
        return next_state, reward, done

    def sam_action(self):
        """
        select one action randomly
        action's format [k, a1, ..., aN]
        """
        k = random.randint(1, self.n_helpers)
        n = random.randint(1, self.n_helpers)
        mylist = [0.3, 0.5, 0.7]
        BC = []
        for i in range(len(mylist)):
          if mylist[i]==0.3:
            bc = 1
          elif mylist[i]==0.5:
            bc = 2
          else:
            bc = 3
          BC.append(bc)

        b_c = random.choice(BC)
        a_vec = [0.0 for _ in range(self.n_helpers)]
        sel_helper_idxs = np.random.permutation(self.n_helpers)[0:n]
        for helper_idx in list(sel_helper_idxs):
            a_vec[helper_idx] = 1.0
        action = [k] + a_vec + [b_c]
        return action

    def constrain(self, D, v_n, v_t, z_n, z_t):
      duration = D/(v_n-v_t) - (z_n - z_t)/(v_n-v_t)
      return duration

    def action2index(self, action):
        """
        Convert action from nulti-dimension format to index format
        """
        if len(action) != len(self.act_dims):
            raise Exception("Shape Error")
        
        act_idx = action[0] - 1
        for i in range(1, len(self.act_dims)-1):
            act_idx = act_idx * self.act_dims[i] + action[i]
        act_idx = act_idx * 3 + (action[-1] - 1)
        return act_idx

    def index2action(self, act_idx):
        """
        Convert action from index format to multi-dimension format
        """
        action = []
        action_bw = act_idx % 3 
        act_idx = (act_idx - action_bw)/3
        #action_bw = action_bw + 1
        for i in range(len(self.act_dims)-1, 1, -1):
            ai = act_idx % self.act_dims[i-1]
            action.append(int(ai))
            act_idx = (act_idx - ai) / self.act_dims[i-1]
        
        action.append(int(act_idx))
        action = [action_bw+ 1] + action
        action.reverse()
        action[0] = action[0] + 1
        return action

if __name__ == "__main__":
    env = TaskOffloadEnv(n_helpers=5,
                        rc=100,
                        max_f=100,
                        max_c=100,
                        max_l=100,
                        alpha1=1.0,
                        alpha2=1.0,
                        alpha3 = 1.0,
                        B = 4*10**13,
                        v_min = 0,
                        v_max = 50,
                        seed=1)
    action = env.sam_action()
    idx = env.action2index(action)
    recovered = env.index2action(idx)
    print(idx)
    print(action, recovered)

108.0
[2, 0.0, 0.0, 1.0, 0.0, 0.0, 1] [2, 0, 0, 1, 0, 0, 1.0]


# Parameters

In [4]:
n_helpers = 5
rc = 100.0
max_f = 6e8
max_c = 5e-3
max_l = 3e6
alpha1 = 1.0
alpha2 = 1.0
alpha3 = 1.0
B = 4*10**13
seed = 1
v_min = 0
v_max = 10
num_episodes = 50
max_step_per_episode = 500
batch_size = 32
max_eps = 1.0
epsilon = max_eps
min_eps = 0.1
gpu_idx = -1
log_dir = "../log"

CHECKPOINT = 20

#Main

In [None]:
gamma = 0.99
epsilon_decay = 0.9999
env = TaskOffloadEnv(n_helpers,rc,max_f,max_c,max_l,alpha1,alpha2,alpha3,B, v_min, v_max, seed)
nS = env.env_dims
#print(nS)
nA = env.num_actions
#print(nA)
learning_rate = 1e-3
discount_rate = gamma
dqn = DoubleDeepQNetwork(nS, nA, learning_rate, discount_rate, epsilon, min_eps, epsilon_decay)
log_total_reward_ddqn, log_comp_reward_ddqn, log_cost_reward_ddqn, log_anten_reward_ddqn, log_connect_reward_ddqn = [], [], [], [], []
log_Flag = []
epsilon = max_eps
Flag = []
frame_count = 0
for episode in range(num_episodes):
	state = env.reset()
	state = np.reshape(state, [1, nS])
	for step in range(max_step_per_episode):
		frame_count +=1
		action, Flag = dqn.action(env, state)
		nstate, reward, done = env.step(action)
		nstate = np.reshape(nstate, [1, nS])

		dqn.store(state, action, reward[0], nstate, done) # Resize to store in memory to pass to .predict
		state = nstate

		#Experience Replay
		if len(dqn.memory) > batch_size:
			epsilon = dqn.experience_replay(env, batch_size)
    #Update the weights after each episode (You can configure this for x steps as well
		#epsilon = max(epsilon,min_eps)
	
		''' 
			Average & Print
		'''
		if ((step) % CHECKPOINT >= CHECKPOINT - 11) and ((step) % CHECKPOINT <= CHECKPOINT - 1):
			if (step) % CHECKPOINT == CHECKPOINT - 11:
				list_total_ddqn, list_comp_ddqn, list_cost_ddqn, list_anten_ddqn, list_connect_ddqn = [], [], [], [], []

				total_reward_ddqn, comp_reward_ddqn, cost_reward_ddqn, anten_reward_ddqn, connect_reward_ddqn = 0, 0, 0, 0, 0

			total_reward_ddqn += reward[0]
			comp_reward_ddqn += reward[1] * 1.0 / alpha1
			cost_reward_ddqn += reward[2] * 1.0 / alpha2
			anten_reward_ddqn += reward[3] * 1.0 /alpha3
			connect_reward_ddqn += reward[4]
			list_total_ddqn.append(total_reward_ddqn)
			list_comp_ddqn.append(comp_reward_ddqn)
			list_cost_ddqn.append(cost_reward_ddqn)
			list_anten_ddqn.append(anten_reward_ddqn)
			list_connect_ddqn.append(connect_reward_ddqn)

			if (step) % CHECKPOINT == CHECKPOINT - 1:		
				avg_total_ddqn = np.mean(list_total_ddqn)
				avg_comp_ddqn = np.mean(list_comp_ddqn)
				avg_anten_ddqn = np.mean(list_anten_ddqn)
				avg_cost_ddqn = np.mean(list_cost_ddqn)
				avg_connect_ddqn = np.mean(list_connect_ddqn)
					
				log_total_reward_ddqn.append(avg_total_ddqn)
				log_comp_reward_ddqn.append(avg_comp_ddqn)
				log_cost_reward_ddqn.append(avg_cost_ddqn)
				log_anten_reward_ddqn.append(avg_anten_ddqn)
				log_connect_reward_ddqn.append(avg_connect_ddqn)
				log_Flag.append(Flag)
		
			if (step) % CHECKPOINT == CHECKPOINT - 1:
				print("EP {} - Step {}  || Total {:.5f} - Comp {:.5f} - Cost {} - Anten {} - Connect {} - Flag {} - eps {}".\
					format(episode, step, avg_total_ddqn, avg_comp_ddqn, avg_cost_ddqn, anten_reward_ddqn, avg_connect_ddqn, Flag,  epsilon))
		if step == max_step_per_episode-1:
			print(f"Episode {episode} end")
			done = True
		if reward[0] >= 4000:  # Condition to consider the task solved
			print("Break at step {}!".format(step))
			done = True
			break
	dqn.update_target_from_model()


  super(Adam, self).__init__(name, **kwargs)
  np_array = np.array(minibatch)


EP 0 - Step 19  || Total 0.14047 - Comp -0.15028 - Cost 0.00010358784133602585 - Anten 0.00010821428571428572 - Connect 0.29090909090909095 - Flag 0 - eps 1.0
EP 0 - Step 39  || Total 0.27217 - Comp -0.15025 - Cost 0.0002653886888848585 - Anten 8.178571428571429e-05 - Connect 0.4227272727272728 - Flag 0 - eps 0.9992002799440072
EP 0 - Step 59  || Total 0.07502 - Comp -0.15037 - Cost 0.006380700400579459 - Anten 0.00010107142857142854 - Connect 0.2318181818181818 - Flag 0 - eps 0.9972037767260468
EP 0 - Step 79  || Total -0.10037 - Comp -0.15013 - Cost 0.00018468029537658666 - Anten 8.892857142857144e-05 - Connect 0.05 - Flag 0 - eps 0.9952112627234414
EP 0 - Step 99  || Total 0.34861 - Comp -0.15011 - Cost 0.005768542149268319 - Anten 9.321428571428573e-05 - Connect 0.5045454545454545 - Flag 0 - eps 0.9932227299653352
EP 0 - Step 119  || Total 0.17706 - Comp -0.14992 - Cost 0.0002517004491746106 - Anten 7.678571428571431e-05 - Connect 0.3272727272727274 - Flag 0 - eps 0.991238170496799

#Save

In [None]:
#f = "/content/drive/Shareddrives/Duong-LinhHUST/New_version/Separation"
num = 25*num_episodes
name_path = DDQN_test.csv
with open('name_path', 'w') as f: 
    write = csv.writer(f) 
    for i in range(num):
         write.writerow([log_total_reward_ddqn[i], log_comp_reward_ddqn[i], log_cost_reward_ddqn[i], log_anten_reward_ddqn[i], log_connect_reward_ddqn[i]])


#Plot

In [None]:
idxs = list(range(len(log_total_reward_ddqn)))
_idxs = np.linspace(0, len(idxs)-1, 7)
x_idxs = [idxs[int(i)] for i in _idxs]
x_names = np.array(x_idxs)
plt.xticks(ticks=x_idxs, labels=x_names)	
plt.tight_layout()

plt.grid()
plt.plot(log_total_reward_ddqn, "^-", label=' DDQL: rewards', linewidth=1.75)
plt.xlabel("Episode")
plt.ylabel(" Total Reward")
plt.legend(loc = 4)
plt.show()

# Plot episodes

In [None]:
aver_ddqn = []
step = (len(log_total_reward_ddqn)+1)/num_episodes
for i in range(0, len(log_total_reward_ddqn), int(step)):
  ddqn_eps_i = log_total_reward_ddqn[i:i+int(step)-1]
  aver_ddqn.append(np.mean(ddqn_eps_i))
  i += 1
plt.grid()
plt.plot(aver_ddqn, "^-", label=' DDQN: rewards', linewidth=1.75)
plt.xlabel("Episode")
plt.ylabel(" Total Reward")
plt.legend(loc = "best")
plt.show()