In [None]:
import numpy as np

import os
import tensorflow as tf

from keras.models import Model
from keras.layers import Dense, Input, Add
from keras.optimizers import RMSprop, Adam
from keras.initializers import glorot_normal

import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Label for this run; main() embeds it in the name of the saved rewards file.
EXPERIMENT_NAME = '2 agents'

## Fix random seed


In [None]:
def same_seeds(seed):
    """Seed every global random number generator this notebook may touch.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global RNG and TensorFlow's global RNG.
    """
    # Function-scope import keeps this cell self-contained.
    import random

    # NOTE(review): recent Keras releases derive the seed of un-seeded
    # initializers from Python's `random` module, so it is seeded too --
    # confirm against the installed Keras version.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


same_seeds(48763)

# Protocols


## DQN

In [None]:
class DQN:
    """Deep Q-network agent that keeps one Q-value per (action, node) pair.

    The network maps the state (a sliding window of past steps) to
    ``n_actions * n_nodes`` outputs laid out action-major::

        |      a0      |      a1      |
        | n1 | n2 | n3 | n1 | n2 | n3 |

    The per-node values of each action are folded into one score by an
    alpha-fairness utility (see ``__alpha_function__``) and the action with
    the highest score is chosen.
    """

    # One-hot encodings of the channel observation symbols.
    OBSERVATION_CODES = {
        'S': [1, 0, 0, 0],
        'F': [0, 1, 0, 0],
        'B': [0, 0, 1, 0],
        'I': [0, 0, 0, 1],
    }
    # Reward entries appended per step: [agent reward, summed non-agent reward].
    N_REWARD_FIELDS = 2
    # Lower bound applied to Q-values before log / power utilities so that
    # non-positive estimates give a very low score instead of NaN.
    Q_FLOOR = 1e-8
    # update() only starts training once more than this many tic() calls ran.
    WARMUP_ITERS = 100

    def __init__(self,
                 state_size,
                 n_nodes,  # N: number of all nodes
                 n_actions,
                 memory_size=500,
                 replace_target_iter=200,
                 batch_size=32,
                 learning_rate=0.01,
                 gamma=0.9,
                 epsilon=1,
                 epsilon_min=0.01,
                 epsilon_decay=0.995,
                 alpha=0  # 0 ~ 100 (inf)
                 ):
        """Store the hyper-parameters and build a fresh agent.

        Args:
            state_size: Length of the flat state vector fed to the network.
            n_nodes: Number of nodes a Q-value is tracked for.
            n_actions: Number of discrete actions.
            memory_size: Capacity of the replay memory (rows).
            replace_target_iter: Learning steps between target-network syncs.
            batch_size: Mini-batch size sampled from the replay memory.
            learning_rate: Adam learning rate.
            gamma: Discount factor.
            epsilon: Initial exploration probability.
            epsilon_min: Lower bound of the exploration probability.
            epsilon_decay: Multiplicative decay applied on every action choice.
            alpha: Alpha-fairness coefficient; 0, 1 and 100 are special-cased.
        """
        # hyper-parameters
        self.state_size = state_size
        self.n_nodes = n_nodes
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.replace_target_iter = replace_target_iter
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.alpha = alpha

        self.reset()

    def reset(self):
        """Clear state, replay memory and counters, and rebuild both networks."""
        self.state = np.zeros(self.state_size)  # init state
        self.n_iter = 0  # current iteration

        # Replay row layout: [s, a, r1, r2, ..., s_]
        self.memory = np.zeros(
            shape=(self.memory_size, self.state_size * 2 + (self.n_nodes + 1)))
        # temporary parameters
        self.learn_step_counter = 0
        self.memory_couter = 0

        # build models
        self.model = self.__build_ResNet_model__()  # evaluates Q values
        self.target_model = self.__build_ResNet_model__()  # target network

    def tic(self):
        """Advance one iteration and return the chosen action as a float32 scalar array."""
        self.n_iter += 1
        self.agent_action = np.array(
            self.__choose_action__(self.state), dtype=np.float32)

        return self.agent_action

    def update(self, observation_, agent_reward, non_agent_reward):
        """Record the outcome of the last action and train once warmed up.

        Args:
            observation_: Channel observation symbol ('S', 'F', 'B' or 'I').
            agent_reward: Reward earned by this agent in the step.
            non_agent_reward: Rewards of the other nodes (1D array or scalar).
        """
        # Width of one step inside the state window: one-hot action, one-hot
        # observation, and the two reward entries (8 when n_actions == 2).
        step_len = (self.n_actions + len(self.OBSERVATION_CODES['I'])
                    + self.N_REWARD_FIELDS)
        latest_step = np.array(
            self.__return_action__(self.agent_action)
            + self.__return_observation__(observation_)
            + [agent_reward, np.sum(non_agent_reward, dtype=np.float32)],
            dtype=np.float32)
        # Slide the window: drop the oldest step, append the newest.
        next_state = np.concatenate((self.state[step_len:], latest_step))

        self.__store_transition__(
            self.state, self.agent_action, agent_reward, non_agent_reward, next_state)

        if self.n_iter > self.WARMUP_ITERS:
            self.__learn__()    # internally iterates default (prediction) model

        self.state = next_state

    def __return_action__(self, action):
        """Return ``action`` as a one-hot list of length ``n_actions``."""
        one_hot_vector = [0] * self.n_actions
        one_hot_vector[int(action)] = 1
        return one_hot_vector

    def __return_observation__(self, o):
        """Return the one-hot list for observation symbol ``o``.

        Raises:
            ValueError: If ``o`` is not one of 'S', 'F', 'B', 'I'.
        """
        try:
            return list(self.OBSERVATION_CODES[str(o)])
        except KeyError:
            # Fail here with a clear message rather than returning None and
            # crashing later when the state vector is assembled.
            raise ValueError(f'error obervation: {o}') from None

    def __alpha_function__(self, action_values):
        """Pick the action maximising the alpha-fair utility of its node values.

        Args:
            action_values: 1D sequence of ``n_actions * n_nodes`` Q-values in
                action-major order.

        Returns:
            Index of the best action.
        """
        values = np.asarray(action_values)
        if self.alpha not in (0, 100):
            # log() and non-integer powers are undefined for non-positive
            # inputs; the resulting NaN would always win np.argmax. Clamp so
            # that such actions simply receive a very low utility.
            values = np.maximum(values, self.Q_FLOOR)

        per_action = [values[self.n_nodes * j: self.n_nodes * (j + 1)]
                      for j in range(self.n_actions)]

        if self.alpha == 1:
            # proportional fairness: sum of logs
            action_values_list = [
                np.sum(np.log(group, dtype=np.float32), dtype=np.float32)
                for group in per_action]
        elif self.alpha == 0:
            # plain sum of the node values
            action_values_list = [
                np.sum(group, dtype=np.float32) for group in per_action]
        elif self.alpha == 100:
            # treated as alpha -> infinity: max-min fairness
            action_values_list = [np.amin(group, axis=0) for group in per_action]
        else:
            exponent = 1 - self.alpha
            action_values_list = [
                1 / exponent * np.sum(
                    np.power(group, exponent, dtype=np.float32), dtype=np.float32)
                for group in per_action]

        return np.argmax(action_values_list)

    def __build_ResNet_model__(self):
        """Build and compile the fully connected network with two skip connections."""
        def hidden(tensor):
            # Every hidden layer is a 64-unit ReLU layer with Glorot-normal init.
            return Dense(64, activation="relu",
                         kernel_initializer='glorot_normal')(tensor)

        inputs = Input(shape=(self.state_size, ))
        h1 = hidden(inputs)
        h2 = hidden(h1)

        h3 = hidden(h2)
        h4 = hidden(h3)
        add1 = Add()([h4, h2])  # first residual connection

        h5 = hidden(add1)
        h6 = hidden(h5)
        add2 = Add()([h6, add1])  # second residual connection

        outputs = Dense(
            self.n_actions*self.n_nodes, kernel_initializer='glorot_normal')(add2)
        model = Model(inputs=inputs, outputs=outputs)
        model.compile(loss="mse", optimizer=Adam(
            learning_rate=self.learning_rate))
        return model

    def __choose_action__(self, state):
        """Epsilon-greedy action selection; epsilon decays on every call."""
        state = state[np.newaxis, :]
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)

        if np.random.random() < self.epsilon:
            return np.random.randint(0, self.n_actions)

        action_values = self.model.predict(state, verbose=0)
        return self.__alpha_function__(action_values[0])

    def __store_transition__(self, s, a, r_dqn, r_non_dqn, s_):
        """Write one transition into the circular replay memory.

        Args:
            s: State before the action.
            a: Action taken.
            r_dqn: Reward of this agent.
            r_non_dqn: Rewards of the other nodes (1D array or scalar).
            s_: Next state.
        """
        if not hasattr(self, 'memory_couter'):
            self.memory_couter = 0
        # atleast_1d lets a scalar non-agent reward be concatenated as well.
        transition = np.concatenate(
            (s, [a, r_dqn], np.atleast_1d(r_non_dqn), s_))
        index = self.memory_couter % self.memory_size  # overwrite the oldest row
        self.memory[index, :] = transition
        self.memory_couter += 1

    def __repalce_target_parameters__(self):
        """Copy the online network's weights into the target network."""
        weights = self.model.get_weights()
        self.target_model.set_weights(weights)

    def __learn__(self):
        """Run one training step on a mini-batch sampled from the replay memory."""
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.__repalce_target_parameters__()  # iterative target model
        self.learn_step_counter += 1

        # Sample (with replacement) only from rows that have been written.
        n_stored = min(self.memory_couter, self.memory_size)
        sample_index = np.random.choice(n_stored, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        # batch memory row: [s, a, r1, r2, ..., s_]
        state = batch_memory[:, :self.state_size]
        action = batch_memory[:, self.state_size].astype(
            np.int32)  # float -> int
        rewards = batch_memory[:, self.state_size +
                               1: self.state_size+self.n_nodes+1]  # [:, (r1, r2, ...)]
        next_state = batch_memory[:, -self.state_size:]

        q = self.model.predict(state, verbose=0)  # state
        q_targ = self.target_model.predict(
            next_state, verbose=0)  # next state

        for i in range(self.batch_size):
            action_ = self.__alpha_function__(q_targ[i])

            # Only the slots of the action actually taken are overwritten;
            # all other outputs keep their prediction (zero error).
            taken = self.n_nodes * action[i]
            best_next = self.n_nodes * action_
            q[i, taken:taken + self.n_nodes] = rewards[i] + \
                self.gamma * q_targ[i, best_next:best_next + self.n_nodes]

        self.model.fit(state, q, batch_size=self.batch_size,
                       epochs=1, verbose=0)

In [None]:
class DQN_NODES:
    """Group of independent DQN agents that are ticked and updated together."""

    def __init__(self,
                 state_size,
                 n_dqn_nodes,  # K: number of DQN nodes
                 n_nodes,  # N: number of all nodes
                 n_actions,
                 memory_size=500,
                 replace_target_iter=200,
                 batch_size=32,
                 learning_rate=0.01,
                 gamma=0.9,
                 epsilon=1,
                 epsilon_min=0.01,
                 epsilon_decay=0.995,
                 alpha=0  # 0 ~ 100 (inf)
                 ):
        """Remember the shared hyper-parameters and create the agents."""
        # group layout
        self.state_size = state_size
        self.n_dqn_nodes = n_dqn_nodes
        self.n_nodes = n_nodes
        self.n_actions = n_actions
        # replay / training settings handed to every agent
        self.memory_size = memory_size
        self.replace_target_iter = replace_target_iter
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        # exploration settings
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        # fairness coefficient
        self.alpha = alpha

        self.reset()

    def __create_agents__(self):
        """Instantiate ``n_dqn_nodes`` agents that all share this group's settings."""
        shared_settings = dict(
            n_nodes=self.n_nodes,
            n_actions=self.n_actions,
            memory_size=self.memory_size,
            replace_target_iter=self.replace_target_iter,
            batch_size=self.batch_size,
            learning_rate=self.learning_rate,
            gamma=self.gamma,
            epsilon=self.epsilon,
            epsilon_min=self.epsilon_min,
            epsilon_decay=self.epsilon_decay,
            alpha=self.alpha,
        )
        self.agents = [DQN(self.state_size, **shared_settings)
                       for _ in range(self.n_dqn_nodes)]

    def reset(self):
        """Discard the current agents and build new ones."""
        self.__create_agents__()

    def tic(self):
        """Ask every agent for its action; returns a float32 array, one entry per agent."""
        return np.array(
            [self.agents[k].tic() for k in range(self.n_dqn_nodes)],
            dtype=np.float32)

    def update(self, observations_, agent_rewards, non_agent_rewards):
        """Forward the k-th observation and rewards to the k-th agent."""
        for k in range(self.n_dqn_nodes):
            agent = self.agents[k]
            agent.update(observations_[k], agent_rewards[k], non_agent_rewards[k])

## TDMA

In [None]:
class TDMA_NODES:
    """Nodes that transmit according to a fixed, periodically repeated slot schedule."""

    def __init__(self, n_nodes, action_list_len, X):
        """Create the nodes and draw a random schedule for each of them.

        Args:
            n_nodes: Number of TDMA nodes.
            action_list_len: Number of slots in one period (per node).
            X: Number of slots per period in which a node transmits.
        """
        # n_actions=2: (wait, transmit)
        # action_list_len and X indicate the parameters of ONE node.
        self.n_nodes = n_nodes
        self.action_list_len = action_list_len
        self.X = X
        self.action_list = self.__create_action_list__()
        self.counter = 0

    def __create_action_list__(self):  # (node, action_list)
        """Return an (n_nodes, action_list_len) 0/1 array with X ones per row."""
        action_list = np.zeros((self.n_nodes, self.action_list_len))
        for i in range(self.n_nodes):
            # X distinct transmit slots, chosen uniformly at random
            idx = np.random.choice(self.action_list_len, self.X, replace=False)
            action_list[i, idx] = 1
        return action_list

    def tic(self):  # 1D: action of each node
        """Return the current slot's actions (float32) and advance cyclically."""
        tdma_action = self.action_list[:, self.counter]
        # wrap around at the end of the period
        self.counter = (self.counter + 1) % self.action_list.shape[1]
        return tdma_action.astype(np.float32)

    def shuffle(self, _X):  # Change the action pattern.
        """Switch to ``_X`` transmit slots per period and draw a new schedule."""
        self.X = _X
        # The new schedule has to be stored; otherwise the old pattern stays
        # active and the call has no visible effect.
        self.action_list = self.__create_action_list__()

    def reset(self):
        """Draw a new schedule with the current X and restart at slot 0."""
        self.action_list = self.__create_action_list__()
        self.counter = 0

## Exponential-backoff Aloha

In [None]:
class EB_ALOHA_NODES:
    """Nodes running Aloha with binary exponential back-off.

    Each node counts a back-off timer down and transmits when it reaches 0.
    The window a new timer is drawn from is ``W * 2**count``; ``count`` grows
    on collisions (capped at ``max_count``) and is cleared on success.
    """

    def __init__(self, n_nodes, W=2, max_count=2):
        """Create the nodes and draw their initial back-off timers.

        Args:
            n_nodes: Number of EB-Aloha nodes.
            W: Minimum window size.
            max_count: Maximum back-off count used for the window size.
        """
        # n_actions=2: (wait, transmit)
        self.n_nodes = n_nodes
        self.max_count = max_count
        self.W = W
        # Single source of truth for the initial state (see reset()).
        self.reset()

    def _draw_backoff(self):
        """Draw one timer per node uniformly from [0, W * 2**count)."""
        # Integer bounds keep randint away from float `high` arrays.
        window = (self.W * 2 ** self.count).astype(np.int64)
        return np.random.randint(0, window, size=self.n_nodes)

    def tic(self):
        """Advance one slot; returns a float32 array with 1 for transmitting nodes."""
        self.count = np.minimum(self.count, self.max_count)
        self.backoff -= 1

        # Timers that ran below zero belong to nodes that transmitted in the
        # previous slot; they get a fresh timer from their current window.
        expired = self.backoff < 0
        self.backoff[expired] = self._draw_backoff()[expired]

        eb_Aloha_actions = (self.backoff == 0).astype(np.float32)
        self.actions = eb_Aloha_actions
        return eb_Aloha_actions  # return 1 if timeout

    def handle_success(self):
        """Clear the back-off count of the nodes that transmitted in the last tic."""
        self.count[self.actions == 1] = 0

    def handle_collision(self):
        """Increase the back-off count of the nodes that transmitted in the last tic."""
        self.count += (self.actions == 1).astype(self.count.dtype)

    def reset(self):  # Change the action pattern.
        """Return every node to its initial state and draw new timers."""
        # Also clear the last actions so that handle_success/handle_collision
        # called right after a reset cannot act on pre-reset transmissions.
        self.actions = np.zeros(self.n_nodes, dtype=np.float32)
        self.count = np.zeros(self.n_nodes, dtype=np.float32)
        self.backoff = self._draw_backoff()

## q-Aloha

In [None]:
class q_Aloha_NODES:
    """Nodes that transmit independently with a fixed probability ``q`` per slot."""

    def __init__(self, n_nodes, q=0.5):
        """Create the nodes.

        Args:
            n_nodes: Number of q-Aloha nodes.
            q: Per-slot transmit probability, must lie in [0, 1].
        """
        # n_actions=2: (wait, transmit)
        self.n_nodes = n_nodes
        assert 0 <= q <= 1
        self.q = q  # probability to send
        self.actions = np.zeros(self.n_nodes, dtype=np.float32)

    def tic(self):
        """Draw this slot's actions: 1 (transmit) with probability q, else 0."""
        draws = np.random.choice(2, self.n_nodes, p=[1-self.q, self.q])
        # Keep `actions` in sync with the last draw (it used to stay at its
        # initial zeros forever); the returned array itself is unchanged.
        self.actions = draws.astype(np.float32)
        return draws

    def reset(self):  # Change the action pattern.
        """Forget the last drawn actions; the nodes carry no other state."""
        self.actions = np.zeros(self.n_nodes, dtype=np.float32)

# Environment

## Configurations

In [None]:
class Config:
    """All tunable settings of one simulation run, gathered in one place."""

    def __init__(self):
        # ---- node population (how many nodes run each protocol) ----
        self.n_DQN, self.n_TDMA = 2, 0
        self.n_EB_Aloha, self.n_q_Aloha = 0, 0

        # ---- simulation / plotting ----
        self.max_iter = 10000  # number of simulated iterations
        self.N = 1000          # averaging window (iterations) used for the plot

        # ---- DQN agent ----
        self.M = 20      # state length (number of remembered steps)
        self.E = 500     # replay memory size
        self.F = 20      # target network update frequency
        self.B = 32      # mini-batch size
        self.alpha = 1   # alpha-fairness coefficient
        # Each remembered step contributes 8 entries:
        # state = cat(s[8:], [action, observation, agent_reward, non_agent_reward])
        self.state_size = 8 * self.M

        # ---- TDMA ----
        self.action_list_len = 10  # slots in one period
        self.X = 2                 # slots used within one period

        # ---- exponential-backoff Aloha: wnd = randint(0, W*2^count) ----
        self.W = 2          # minimum window size
        self.max_count = 2  # maximum backoff count

        # ---- q-Aloha ----
        self.q = .2


config = Config()

In [None]:
class ENVIRONMENT:
    """Single shared channel used by DQN, TDMA, EB-Aloha and q-Aloha nodes.

    In every step each node either waits (0) or transmits (1). Exactly one
    transmission is a success and rewards the transmitter with 1; two or
    more transmissions collide and nobody is rewarded.

    Observation symbols handed to the DQN agents:
        'I': nobody transmitted (idle channel)
        'S': this agent was the only transmitter
        'F': this agent transmitted and collided
        'B': this agent waited while at least one other node transmitted
    """

    def __init__(self, config):
        """Build all node groups from ``config``."""
        self.__set_env__(config)

    def __set_env__(self, _config):
        """(Re)create every node group according to ``_config``."""
        # Kept on the instance so that it is available after __init__ as well
        # as after reset().
        self.config = _config

        self.n_DQN = _config.n_DQN
        self.n_TDMA = _config.n_TDMA
        self.n_EB_Aloha = _config.n_EB_Aloha
        self.n_q_Aloha = _config.n_q_Aloha

        self.n_nodes = self.n_DQN + self.n_TDMA + self.n_EB_Aloha + self.n_q_Aloha

        self.dqn_nodes = DQN_NODES(_config.state_size,
                                   n_dqn_nodes=self.n_DQN,
                                   n_nodes=self.n_nodes,
                                   n_actions=2,
                                   memory_size=_config.E,
                                   replace_target_iter=_config.F,
                                   batch_size=_config.B,
                                   learning_rate=0.01,
                                   gamma=0.9,
                                   epsilon=0.5,
                                   epsilon_min=0.005,
                                   epsilon_decay=0.995,
                                   alpha=_config.alpha
                                   )
        self.tdma_nodes = TDMA_NODES(
            _config.n_TDMA, _config.action_list_len, _config.X)
        self.EB_ALOHA_NODES = EB_ALOHA_NODES(
            _config.n_EB_Aloha, _config.W, _config.max_count)
        self.q_Aloha_nodes = q_Aloha_NODES(_config.n_q_Aloha, _config.q)

    def reset(self, _config):
        """Start over with freshly constructed nodes for ``_config``."""
        # The constructors already leave every node group in its initial
        # state, so calling each group's reset() afterwards would only build
        # everything (including the DQN networks) a second time.
        self.__set_env__(_config)

    def step(self):
        """Simulate one slot.

        Returns:
            Tuple ``(dqn_rewards, tdma_rewards, eb_Aloha_rewards,
            q_Aloha_rewards)`` with one entry per node of each group.
        """
        # Defaults describe an idle slot: no rewards, everybody waits.
        dqn_rewards = np.zeros(self.n_DQN)
        tdma_rewards = np.zeros(self.n_TDMA)
        eb_Aloha_rewards = np.zeros(self.n_EB_Aloha)
        q_Aloha_rewards = np.zeros(self.n_q_Aloha)

        dqn_actions = np.zeros(self.n_DQN, dtype=np.float32)
        tdma_actions = np.zeros(self.n_TDMA, dtype=np.float32)
        eb_Aloha_actions = np.zeros(self.n_EB_Aloha, dtype=np.float32)
        q_Aloha_actions = np.zeros(self.n_q_Aloha, dtype=np.float32)

        observation_ = np.array(['I']*self.n_DQN)  # observation for DQN nodes

        # Only non-empty groups are asked for actions.
        if self.n_DQN > 0:
            dqn_actions = self.dqn_nodes.tic()
        if self.n_TDMA > 0:
            tdma_actions = self.tdma_nodes.tic()
        if self.n_EB_Aloha > 0:
            eb_Aloha_actions = self.EB_ALOHA_NODES.tic()
        if self.n_q_Aloha > 0:
            q_Aloha_actions = self.q_Aloha_nodes.tic()

        # evaluate media condition: number of simultaneous transmissions
        n_Tx = np.sum(dqn_actions)+np.sum(tdma_actions) + \
            np.sum(eb_Aloha_actions)+np.sum(q_Aloha_actions)
        assert n_Tx >= 0

        if n_Tx == 1:  # success Tx: the single transmitter earns reward 1
            dqn_rewards = dqn_actions
            tdma_rewards = tdma_actions
            eb_Aloha_rewards = eb_Aloha_actions
            q_Aloha_rewards = q_Aloha_actions

            self.EB_ALOHA_NODES.handle_success()

            for i in range(self.n_DQN):
                observation_[i] = 'S' if dqn_actions[i] == 1 else 'B'
        elif n_Tx != 0:  # collision
            self.EB_ALOHA_NODES.handle_collision()

            # Agents that did not transmit see 'B'; this also covers the
            # case where no DQN agent took part in the collision.
            for i in range(self.n_DQN):
                observation_[i] = 'F' if dqn_actions[i] == 1 else 'B'
        # n_Tx == 0: idle slot, defaults stay in place

        # update DQN nodes
        cat_rewards = np.concatenate(
            (dqn_rewards, tdma_rewards, eb_Aloha_rewards, q_Aloha_rewards), dtype=np.float32)
        # max() guards the degenerate zero-node configuration against a
        # negative array dimension.
        non_agent_rewards = np.zeros(
            (self.n_DQN, max(self.n_nodes-1, 0)), dtype=np.float32)

        for i in range(self.n_DQN):
            # rewards of all nodes except DQN agent i, original order kept
            non_agent_rewards[i] = np.delete(cat_rewards, i)

        self.dqn_nodes.update(observation_, dqn_rewards, non_agent_rewards)

        return dqn_rewards, tdma_rewards, eb_Aloha_rewards, q_Aloha_rewards

# Run DQN


In [None]:
env = ENVIRONMENT(config=config)


def main(config, environment=None):
    """Run the simulation and save the per-step rewards of every node.

    Args:
        config: Run settings; ``max_iter`` sets the number of steps and the
            remaining fields are encoded in the output file name.
        environment: Environment to step. Defaults to the module-level
            ``env``; pass one explicitly when simulating a configuration
            other than the one ``env`` was built from.

    Returns:
        Path of the written ``.npz`` file (arrays ``agent``, ``tdma``,
        ``eb_Aloha`` and ``q_Aloha``, one row per step).
    """
    if environment is None:
        environment = env

    agent_reward_list = []
    tdma_reward_list = []
    eb_Aloha_reward_list = []
    q_Aloha_reward_list = []

    M, E, F, B, X, W, q = config.M, config.E, config.F, config.B, config.X, config.W, config.q
    n_DQN, n_TDMA, n_EB_Aloha, n_q_Aloha = config.n_DQN, config.n_TDMA, config.n_EB_Aloha, config.n_q_Aloha
    max_iter = config.max_iter

    for _ in tqdm(range(max_iter)):
        dqn_rewards, tdma_rewards, eb_Aloha_rewards, q_Aloha_rewards = environment.step()

        agent_reward_list.append(dqn_rewards)
        tdma_reward_list.append(tdma_rewards)
        eb_Aloha_reward_list.append(eb_Aloha_rewards)
        q_Aloha_reward_list.append(q_Aloha_rewards)

    agent_arr = np.array(agent_reward_list, dtype=np.float32)
    tdma_arr = np.array(tdma_reward_list, dtype=np.float32)
    eb_Aloha_arr = np.array(eb_Aloha_reward_list, dtype=np.float32)
    q_Aloha_arr = np.array(q_Aloha_reward_list, dtype=np.float32)

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs('rewards', exist_ok=True)
    file_path = f'rewards/{EXPERIMENT_NAME}_rewards_dqn{n_DQN}_t{n_TDMA}_ea{n_EB_Aloha}_qa{n_q_Aloha}_M{M}_E{E:.0E}_F{F}_B{B}_X{X}_W{W}_q{q}_{max_iter:.0E}.npz'
    np.savez(file_path, agent=agent_arr, tdma=tdma_arr,
             eb_Aloha=eb_Aloha_arr, q_Aloha=q_Aloha_arr)

    return file_path

In [None]:
# Run the simulation; the returned path of the saved rewards file is what the
# throughput plot later loads.
file_path = main(config=config)

# Average throughput


In [None]:
def plot_avg_throughput(file, config):
    """Plot the moving-average throughput of every node and of the whole system.

    The average at iteration ``i`` covers the last ``config.N`` iterations
    (or all iterations so far while fewer than ``N`` are available). Curves
    are drawn on the current matplotlib figure.

    Args:
        file: Path of the ``.npz`` rewards file with the arrays ``agent``,
            ``tdma``, ``eb_Aloha`` and ``q_Aloha`` (one row per iteration).
        config: Settings of the run that produced ``file``; supplies
            ``max_iter``, ``N`` and the number of nodes per protocol.
    """
    max_iter = config.max_iter
    N = config.N

    num = [config.n_DQN, config.n_TDMA, config.n_EB_Aloha, config.n_q_Aloha]
    category = ['agent', 'tdma', 'eb_Aloha', 'q_Aloha']
    n_series = sum(num)

    labels = []
    rewards = np.zeros((n_series, max_iter), dtype=np.float32)  # reward

    # The context manager closes the archive once the rewards are copied out.
    with np.load(file) as data:
        cnt = 0
        for name, count in zip(category, num):
            _data = np.transpose(data[name])  # -> (node, iteration)
            for n in range(count):
                # number the nodes only when a protocol has several of them
                labels.append(f'{name} {n+1}' if count > 1 else f'{name}')
                rewards[cnt, :] = _data[n]
                cnt += 1

    avg_throughput = np.zeros((n_series, max_iter), dtype=np.float32)
    window_sum = np.zeros(n_series, dtype=np.float32)

    for i in range(max_iter):
        if i < N:
            # window not full yet: average over everything seen so far
            window_sum += rewards[:, i]
            avg_throughput[:, i] = window_sum/(i+1)
        else:
            # slide the window: add the newest sample, drop the oldest
            window_sum += rewards[:, i]-rewards[:, i-N]
            avg_throughput[:, i] = window_sum/N

    avg_throughput_total = np.sum(avg_throughput, axis=0, dtype=np.float32)

    plt.xlim((0, max_iter))
    plt.ylim((-0.05, 1))

    legend_list = []

    for series, label in zip(avg_throughput, labels):
        line, = plt.plot(series, lw=1, label=label)
        legend_list.append(line)

    total_line, = plt.plot(avg_throughput_total,
                           color='r', lw=1.5, label='total')
    legend_list.append(total_line)

    plt.grid()
    plt.legend(handles=legend_list, loc='best')
    plt.xlabel("iteration")
    plt.ylabel("average throughput")

In [None]:
# Create a new figure, draw the moving-average throughput curves of the saved
# run on it, then display it.
fig1 = plt.figure()
plot_avg_throughput(file_path, config)

plt.show()