In [535]:
import numpy as np

import os
import tensorflow as tf

from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Input, Add, Activation, BatchNormalization
from keras.optimizers import RMSprop
from keras.initializers import glorot_normal

import matplotlib.pyplot as plt
from tqdm import tqdm

In [536]:
EXPERIMENT_NAME = 'agent+TDMA+EB_ALOHA'

## Fix random seed


In [537]:
def same_seeds(seed):
    np.random.seed(seed)
    tf.random.set_seed(seed)


same_seeds(48763)

# DQN_brain


## DQN

In [538]:
class DQN:
    def __init__(self,
                 state_size,
                 n_actions,
                 memory_size=500,
                 replace_target_iter=200,
                 batch_size=32,
                 learning_rate=0.01,
                 gamma=0.9,
                 epsilon=1,
                 epsilon_min=0.01,
                 epsilon_decay=0.995
                 ):
        # hyper-parameters
        self.state_size = state_size
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.replace_target_iter = replace_target_iter
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.memory = np.zeros((self.memory_size, self.state_size * 2 + 2))
        # temporary parameters
        self.learn_step_counter = 0
        self.memory_couter = 0

        # build model
        self.model = self.build_ResNet_model()  # model: evaluate Q value
        self.target_model = self.build_ResNet_model()  # target_mode: target network

    def build_ResNet_model(self):
        inputs = Input(shape=(self.state_size, ))
        h1 = Dense(64, activation="relu",
                   kernel_initializer='glorot_normal')(inputs)  # h1
        h2 = Dense(64, activation="relu",
                   kernel_initializer='glorot_normal')(h1)  # h2

        h3 = Dense(64, activation="relu",
                   kernel_initializer='glorot_normal')(h2)  # h3
        h4 = Dense(64, activation="relu",
                   kernel_initializer='glorot_normal')(h3)  # h4
        add1 = Add()([h4, h2])

        h5 = Dense(64, activation="relu",
                   kernel_initializer='glorot_normal')(add1)  # h5
        h6 = Dense(64, activation="relu",
                   kernel_initializer='glorot_normal')(h5)  # h6
        add2 = Add()([h6, add1])

        outputs = Dense(
            self.n_actions, kernel_initializer='glorot_normal')(add2)
        model = Model(inputs=inputs, outputs=outputs)
        model.compile(loss="mse", optimizer=RMSprop(
            learning_rate=self.learning_rate))
        return model

    def choose_action(self, state):
        # Apply epsilon-greedy algorithm
        state = state[np.newaxis, :]
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)

        if np.random.random() < self.epsilon:
            return np.random.randint(0, self.n_actions)

        action_values = self.model.predict(state)
        return np.argmax(action_values)

    def store_transition(self, s, a, r, s_):
        # s_: next_state
        # r: ndarray or int
        if np.isscalar(r):
            r = [r]
        if not hasattr(self, 'memory_couter'):
            self.memory_couter = 0
        transition = np.concatenate((s, [a], r, s_))
        index = self.memory_couter % self.memory_size
        self.memory[index, :] = transition
        self.memory_couter += 1

    def repalce_target_parameters(self):
        weights = self.model.get_weights()
        self.target_model.set_weights(weights)

    def learn(self):
        # check to update target netowrk parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.repalce_target_parameters()  # iterative target model
        self.learn_step_counter += 1

        # sample batch memory from all memory
        if self.memory_couter > self.memory_size:
            sample_index = np.random.choice(
                self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(
                self.memory_couter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        # batch memory row: [s, a, r, s_]
        # number of batch memory: batch size
        state = batch_memory[:, :self.state_size]
        action = batch_memory[:, self.state_size].astype(int)  # float -> int
        reward = batch_memory[:, self.state_size]
        next_state = batch_memory[:, -self.state_size:]

        q = self.model.predict(state)  # state
        q_targ = self.target_model.predict(next_state)  # next state

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q[batch_index, action] = reward + self.gamma * np.max(q_targ, axis=1)

        self.model.fit(state, q, self.batch_size, epochs=1, verbose=0)

## TDMA

In [539]:
class TDMA:
    def __init__(self, n_nodes, action_list_len, n_slot_used):
        # n_actions=2: (wait, transmit)
        # action_list_len and n_slot_used indicate the parameters of ONE node.
        self.n_nodes = n_nodes
        self.action_list_len = action_list_len
        self.n_slot_used = n_slot_used
        self.action_list = self.__create_action_list__()
        self.counter = 0

    def __create_action_list__(self):  # (node, action_list)
        action_list = np.zeros((self.n_nodes, self.action_list_len))
        for i in range(self.n_nodes):
            idx = np.random.randint(
                self.action_list_len, size=self.n_slot_used)
            action_list[i, idx] = 1
        return action_list

    def tic(self):  # 1D: action of each node
        tdma_action = self.action_list[:, self.counter]
        # tdma_action = np.squeeze(tdma_action)
        self.counter += 1
        if self.counter == len(self.action_list):
            self.counter = 0
        return tdma_action.astype(np.int32)

    def reset(self, n_slot_used):  # Change the action pattern.
        self.n_slot_used = n_slot_used
        self.action_list = self.__create_action_list__()
        self.counter = 0

## Exponential-backoff ALOHA

In [540]:
class EB_ALOHA:
    def __init__(self, n_nodes, Wmin=2, max_backoff=2):
        # n_actions=2: (wait, transmit)
        # state_size and n_slot_used indicate the parameters of ONE node.
        self.n_nodes = n_nodes
        self.max_backoff = max_backoff
        self.Wmin = Wmin
        self.actions = np.zeros(self.n_nodes)

        self.count = np.zeros(self.n_nodes)
        self.backoff = np.random.randint(
            0, self.Wmin * 2**self.count, size=self.n_nodes)

    def tic(self):
        self.count = np.minimum(self.count, self.max_backoff)
        self.backoff -= 1

        filter_arr = self.backoff < 0
        filter_arr = np.arange(self.n_nodes, dtype=np.int32)[filter_arr]
        self.backoff[filter_arr] = np.random.randint(
            0, self.Wmin * 2**self.count)[filter_arr]

        aloha_actions = (self.backoff == 0)
        aloha_actions = aloha_actions.astype(np.int32)
        self.actions = aloha_actions
        return aloha_actions  # return 1 if timeout
    
    def handle_collision(self):
        filter_arr = (self.actions==1)
        self.count += filter_arr.astype(np.int32)

    def reset(self):  # Change the action pattern.
        self.count = np.zeros(self.n_nodes)
        self.backoff = np.random.randint(
            0, self.Wmin * 2**self.count, size=self.n_nodes)

# Environment


In [541]:
class ENVIRONMENT:
    def __init__(self, state_size=10, n_TDMA=1, n_EB_ALOHA=1):
        self.n_TDMA = n_TDMA
        self.n_EB_ALOHA = n_EB_ALOHA
        self.state_size = state_size
        self.TDMA_nodes = TDMA(n_TDMA, 14, 6)
        self.EB_ALOHA_nodes = EB_ALOHA(n_EB_ALOHA, Wmin=2, max_backoff=2)

    def reset(self):
        self.TDMA_nodes.reset(6)
        self.EB_ALOHA_nodes.reset()
        init_state = np.zeros(self.state_size)
        return init_state

    def step(self, action):
        agent_reward = 0
        tdma_reward = np.zeros(self.n_TDMA)
        aloha_reward = np.zeros(self.n_EB_ALOHA)
        reward = 0
        observation_ = 0
        tdma_actions = np.zeros(self.n_TDMA, dtype=np.int32)
        aloha_actions = np.zeros(self.n_EB_ALOHA, dtype=np.int32)
        if self.n_TDMA > 0:
            tdma_actions = self.TDMA_nodes.tic()
        if self.n_EB_ALOHA > 0:
            aloha_actions = self.EB_ALOHA_nodes.tic()

        if action == 1:
            if np.sum(tdma_actions)+np.sum(aloha_actions) > 0:  # collision
                observation_ = 'F'  # tx, no success
            else:  # agent success
                reward = 1
                agent_reward = 1
                observation_ = 'S'  # tx, success
        else:
            if tdma_actions.all(0) and aloha_actions.all(0):  # idle
                observation_ = 'I'
            elif np.sum(tdma_actions)+np.sum(aloha_actions) == 1:  # some node success
                reward = 1
                tdma_reward = tdma_actions
                aloha_reward = aloha_actions
                observation_ = 'B'
            else:  # some node collide
                observation_ = 'B'

        return observation_, reward, agent_reward, tdma_reward, aloha_reward

# Run DQN


In [542]:
# n_DQN = 1
M = 20  # state length
E = 500  # memory size
F = 20  # target network update frequency
B = 32  # mini-batch size

n_TDMA = 1
n_EB_ALOHA = 0

env = ENVIRONMENT(state_size=int(8*M), n_TDMA=n_TDMA, n_EB_ALOHA=n_EB_ALOHA)

dqn_agent = DQN(env.state_size,
                2,
                memory_size=E,
                replace_target_iter=F,
                batch_size=B,
                learning_rate=0.01,
                gamma=0.9,
                epsilon=0.5,
                epsilon_min=0.005,
                epsilon_decay=0.995,
                )

In [543]:
def return_action(action, n_actions=2):
    one_hot_vector = [0] * n_actions
    one_hot_vector[action] = 1
    return one_hot_vector


def return_observation(o):
    if o == 'S':
        return [1, 0, 0, 0]
    elif o == 'F':
        return [0, 1, 0, 0]
    elif o == 'B':
        return [0, 0, 1, 0]
    elif o == 'I':
        return [0, 0, 0, 1]


def main(max_iter):
    agent_reward_list = []
    tdma_reward_list = []
    aloha_reward_list = []
    state = env.reset()
    print('------------------------------------------')
    print('---------- Start processing ... ----------')
    print('------------------------------------------')

    for i in tqdm(range(max_iter)):
        agent_action = dqn_agent.choose_action(state)
        observation_, reward, agent_reward, tdma_reward, aloha_reward = env.step(
            agent_action)
        agent_reward_list.append(agent_reward)
        tdma_reward_list.append(tdma_reward)
        aloha_reward_list.append(aloha_reward)

        # state = (action_t, observation_t)
        next_state = np.concatenate((state[8:], np.array(return_action(
            agent_action)+return_observation(observation_) + [agent_reward, np.sum(tdma_reward)+np.sum(aloha_reward)]))).astype(np.int32)
        dqn_agent.store_transition(state, agent_action, reward, next_state)
        if i > 100:
            dqn_agent.learn()       # internally iterates default (prediction) model
        state = next_state
    if not os.path.isdir('./rewards'):
        os.mkdir('./rewards')
    np.save(f'./rewards/{EXPERIMENT_NAME}_agent', np.array(agent_reward_list))
    if n_TDMA >0:
        np.save(f'./rewards/{EXPERIMENT_NAME}_tdma', np.array(tdma_reward_list))
    if n_EB_ALOHA>0:
        np.save(f'./rewards/{EXPERIMENT_NAME}_aloha', np.array(aloha_reward_list))

In [544]:
main(max_iter=10000)

------------------------------------------
---------- Start processing ... ----------
------------------------------------------


  0%|          | 0/100 [00:00<?, ?it/s]



  2%|▏         | 2/100 [00:00<00:07, 12.94it/s]



  4%|▍         | 4/100 [00:00<00:08, 10.72it/s]



  6%|▌         | 6/100 [00:00<00:07, 12.56it/s]



  9%|▉         | 9/100 [00:00<00:05, 16.17it/s]



 11%|█         | 11/100 [00:00<00:05, 17.21it/s]



 14%|█▍        | 14/100 [00:00<00:04, 19.86it/s]



 18%|█▊        | 18/100 [00:00<00:03, 23.76it/s]



 21%|██        | 21/100 [00:01<00:03, 21.71it/s]



 29%|██▉       | 29/100 [00:01<00:02, 34.42it/s]



 33%|███▎      | 33/100 [00:01<00:01, 35.51it/s]



 38%|███▊      | 38/100 [00:01<00:01, 37.43it/s]



 42%|████▏     | 42/100 [00:01<00:01, 30.50it/s]



 46%|████▌     | 46/100 [00:01<00:01, 28.65it/s]



 50%|█████     | 50/100 [00:02<00:02, 22.89it/s]



 53%|█████▎    | 53/100 [00:02<00:02, 20.56it/s]



 56%|█████▌    | 56/100 [00:02<00:02, 20.16it/s]



 59%|█████▉    | 59/100 [00:02<00:02, 18.70it/s]



 61%|██████    | 61/100 [00:02<00:02, 17.69it/s]



 63%|██████▎   | 63/100 [00:02<00:02, 17.05it/s]



 65%|██████▌   | 65/100 [00:03<00:02, 17.05it/s]



 68%|██████▊   | 68/100 [00:03<00:01, 19.23it/s]



 71%|███████   | 71/100 [00:03<00:01, 21.26it/s]



 74%|███████▍  | 74/100 [00:03<00:01, 19.76it/s]



 77%|███████▋  | 77/100 [00:03<00:01, 21.30it/s]



 80%|████████  | 80/100 [00:03<00:01, 19.12it/s]



 86%|████████▌ | 86/100 [00:03<00:00, 22.08it/s]



 90%|█████████ | 90/100 [00:04<00:00, 22.39it/s]



 93%|█████████▎| 93/100 [00:04<00:00, 22.83it/s]



 97%|█████████▋| 97/100 [00:04<00:00, 25.85it/s]



100%|██████████| 100/100 [00:04<00:00, 22.28it/s]


# Average_throughput


In [545]:
def plot_avg_throughput(file1, file2, file3):
    max_iter = 10000
    N = 1000

    # load reward
    agent1_reward = np.load(file1)
    agent2_reward = np.load(file2)[:,0]
    agent3_reward = np.load(file3)[:,0]

    avg_throughput_agent1 = np.zeros((1, max_iter))
    avg_throughput_agent2 = np.zeros((1, max_iter))
    avg_throughput_agent3 = np.zeros((1, max_iter))

    agent1_temp_sum = 0
    agent2_temp_sum = 0
    agent3_temp_sum = 0
    for i in range(0, max_iter):
        if i < N:
            agent1_temp_sum += agent1_reward[i]
            avg_throughput_agent1[0][i] = agent1_temp_sum / (i+1)
            agent2_temp_sum += agent2_reward[i]
            avg_throughput_agent2[0][i] = agent2_temp_sum / (i+1)
            agent3_temp_sum += agent3_reward[i]
            avg_throughput_agent3[0][i] = agent3_temp_sum / (i+1)
        else:
            agent1_temp_sum += agent1_reward[i] - agent1_reward[i-N]
            avg_throughput_agent1[0][i] = agent1_temp_sum / N
            agent2_temp_sum += agent2_reward[i] - agent2_reward[i-N]
            avg_throughput_agent2[0][i] = agent2_temp_sum / N
            agent3_temp_sum += agent3_reward[i] - agent3_reward[i-N]
            avg_throughput_agent3[0][i] = agent3_temp_sum / N

    plt.xlim((0, max_iter))
    plt.ylim((-0.05, 1))

    agent1_line, = plt.plot(
        avg_throughput_agent1[0], color='r', lw=1.2, label='agent')
    agent2_line, = plt.plot(
        avg_throughput_agent2[0], color='g', lw=1.2, label='tdma')
    agent3_line, = plt.plot(
        avg_throughput_agent3[0], color='b', lw=1.2, label='em-aloha')

    plt.grid()
    plt.legend(handles=[agent1_line, agent2_line, agent3_line], loc='best')
    plt.xlabel("iteration")
    plt.ylabel("average throughput")

In [546]:
fig1 = plt.figure()
plot_avg_throughput(f'rewards/{EXPERIMENT_NAME}_agent.npy',
                    f'rewards/{EXPERIMENT_NAME}_tdma.npy',
                    f'rewards/{EXPERIMENT_NAME}_aloha.npy')

plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'rewards/agent+TDMA_20_aloha.npy'

<Figure size 640x480 with 0 Axes>