In [None]:
%matplotlib inline

In [None]:
import numpy as np
# Option 1.1 channel_state: number of users
# Option 1.2 channel_state: interference power

In [None]:
class Environment(object):
    """Shared RF environment in which agents compete for channels.

       Currently no propagation effect is considered.
       The central station:
       (1) only provides a repeating signaling channel and does not schedule
       (2) monitors malicious users
    """

    def __init__(self, channel_num, max_steps):
        """channel_num: number of RF channels in the environment
           max_steps: the number of steps in one epoch (ie. 50)
                      when max_steps is reached game over and give reward
           self.channel_state: record the channel occupancy in this step
                               (one list of transmitting agents per channel)
           self.history: store channel occupancy of the past steps
           self.step: log how many steps have gone
           self.agent_list: log the agents in this environment
           self.reward_list: record the accumulated reward of each agent,
                             index-aligned with self.agent_list
        """
        self.channel_num = channel_num
        self.channel_state = [[] for _ in range(self.channel_num)]
        self.history = []

        self.max_steps = max_steps
        self.step = 0

        self.agent_list = []
        self.reward_list = []

    def join(self, agent):
        """register an agent and start its reward account at zero"""
        self.agent_list.append(agent)
        # The reward is a running number: ber() adds/subtracts 1 and
        # get_reward() hands it back to be summed with a numeric reward.
        self.reward_list.append(0)

    def one_action_step(self):
        """record steps and save last channel state
           enter to next time step and initialize channel state
           call one_action_step method of agents in agent list
           then evaluate the reward of this step with ber()
        """
        self.step += 1
        self.history.append(self.channel_state)
        # A fresh list is bound here, so the state just archived in
        # self.history is not modified by this step's transmissions.
        self.channel_state = [[] for _ in range(self.channel_num)]
        for agent in self.agent_list:
            agent.one_action_step()
        # Evaluate the reward of each agent once everybody has acted, so a
        # get_reward() call made from inside an agent's step only sees the
        # totals accumulated up to the previous step.
        self.ber()

    def ber(self):
        """reward: success transmission (1/task), conflict trans (-1/task)

           A channel with exactly one transmitting agent is a success,
           a channel with several agents is a conflict for all of them,
           an idle channel changes nothing.
           Raises ValueError if a transmitting agent never joined.
        """
        for occupants in self.channel_state:
            if len(occupants) == 1:
                delta = 1
            elif len(occupants) > 1:
                delta = -1
            else:
                continue
            for agent in occupants:
                index = self.agent_list.index(agent)
                self.reward_list[index] += delta

    def broadcast(self):
        """return the list of agents reachable over the signaling channel"""
        return self.agent_list

    def propagation(self, channel_index, agent):
        """propagate the signal of a certain agent
           (mark the agent as transmitting on channel_index in this step)
        """
        self.channel_state[channel_index].append(agent)

    def query(self, channel_index):
        """provide identification result
           return the list of agents occupying the certain channel
           in the most recently archived step
           Raises IndexError if no step has been archived yet.
        """
        return self.history[-1][channel_index]

    def report(self):
        """provide observation result
           return the number of channel users of every channel
           in the most recently archived step
           Raises IndexError if no step has been archived yet.
        """
        last_state = self.history[-1]
        return [len(occupants) for occupants in last_state]

    def get_reward(self, agent):
        """return the reward that is visible to the agent right now

           0 while the epoch is running, the accumulated reward of the
           agent when self.step equals self.max_steps.
           Raises RuntimeError once self.step exceeds self.max_steps.
        """
        # The instructor is the environment (receiver),
        # which can evaluate how good the agent is doing by checksum.
        # The reward message is passed over signaling channel.
        # Since the frequent receiver-to-agent interaction wastes bandwidth,
        # the agent can only get back reward after its tasks are all finished
        # or the maximum time step is reached.
        if self.step > self.max_steps:
            raise RuntimeError('maximum steps reached and game over')
        if self.step < self.max_steps:
            return 0
        index = self.agent_list.index(agent)
        return self.reward_list[index]

In [None]:
class Agent(object):
    """at the beginning, random transmit
       then mainly use coordination to gain channels
    """

    def __init__(self, env, task_num):
        """env: the environment to operate in
           task_num: number of tasks to be transmitted, ie. 100
           self.channels: established channels (coordinated with receiver)
           self.part_state: observed channel states (one report per observe)
           self.agent_id: log agents appeared
           self.reward: reward accumulated by this agent so far
           self.action_queue: actions scheduled by communication
           self.args_queue: argument tuples, index-aligned with action_queue
        """
        self.env = env
        self.task_num = task_num
        self.channels = []
        self.part_state = []
        self.agent_id = {}
        self.reward = 0

        self.action_queue = []
        self.args_queue = []

    def one_action_step(self):
        """interact with environment and agents within one time step
           using Q-learning to decide whether to tear channel when job finished
        """

        if self.task_num > 0:
            # job has not finished at last step, punish for the delay
            # since we want to finish tasks as soon as possible
            self.reward -= 0.5

        if self.action_queue:
            # execute the actions scheduled by communication
            # since communicate may occur multiple times.
            # The most recently scheduled action runs first and is removed
            # from the queue, otherwise it would be repeated on every step.
            action = self.action_queue.pop()
            args = self.args_queue.pop()
            # args is stored as a tuple, ie. (channel_index,), so unpack it
            action(*args)
            # TODO: log the communicate result and use Q-learning to decide
            # what to do
        else:
            # TODO: Q learning to choose operation (function and args)
            # func: rest,transmit,establish,tear,observe,communicate
            # args: channel_index
            # channel_index: ie. 0~10
            # status: task_num,channels,part_state
            # find f: function, channel_index, target_agent_id = f(status)
            # Until the policy exists the agent idles in this branch.
            self.rest()

        # get reward from environment
        self.reward += self.env.get_reward(self)

    def rest(self, *args):
        """rest in this time step and do nothing
        """
        return

    def transmit(self, *args):
        """transmit using all established channels
           one task is sent per channel until no task is left
           TODO: choose channel based on certain policy
        """
        for index in self.channels:
            # stop as soon as nothing is left to send
            if self.task_num <= 0:
                return
            self.env.propagation(index, self)
            self.task_num -= 1

    def establish(self, channel_index, *args):
        """establish a new channel by coordinate with receiver
           channel_index: the index of channel to occupy
           nothing happens if the channel is already established
        """
        if channel_index not in self.channels:
            self.channels.append(channel_index)
            # Since expand operation need signaling bandwidth to coordinate
            self.reward -= 1

    def tear(self, channel_index, *args):
        """tear down a channel by coordinate with receiver
           channel_index: the index of channel to release
           nothing happens if the channel is not established
        """
        if channel_index in self.channels:
            self.channels.remove(channel_index)
            # Shrink operation do not need signaling bandwidth to coordinate (0)
            # use the current data channel to transmit tear down signal
            # Shrink operation facilitate collaboration (+1)
            # decrease the possibility to conflict good for myself
            self.reward += 1

    def observe(self, *args):
        """observe the channel usages
           the report of the environment is appended to self.part_state
           TODO: may directly return the channel with highest availability
           TODO: in this stage, we let Q-learning find the candidate channel
        """
        self.part_state.append(self.env.report())
        # Since observe operation need energy to detect occupancy
        self.reward -= 0.2
        # TODO find the channel with highest availability (least occupied)

    # abandon p2p communicate
    # reason 1: p2p communicate -> feedback are not stable
    # ie. A:0.1, B:0.3, C:0.2, multiple communicate may overwrite each other
    # reason 2: identify result is not reliable
    # since you do not exactly know whether the agent is still using the channel
    # so use broadcast communicate
    def communicate(self, channel_index, *args):
        """communicate with other agents over signaling channel
           the priority and desired channel is broadcast to all other agents
           channel_index: the index of the channel this agent asks for
           protocol: (1) exchange priority score to collaborate
                     (2) protect agents using different protocols
           since can not collaborate with them means can not use the channel
           and you will be interfered when you do

           If some agent gives the channel up, an establish action for
           channel_index is scheduled for a later step of this agent.
        """
        # Since communicate need signaling bandwidth to coordinate (-1)
        self.reward -= 1
        score = self.priority()
        for agent in self.env.broadcast():
            # stop asking as soon as one agent agrees to give the channel up
            if not agent.insist(channel_index, score):
                break
        else:
            # NOTE(review): nobody yielded. insist() also answers True for
            # agents that do not hold the channel, so a completely free
            # channel ends up here and is not established either; confirm
            # whether that is intended.
            return
        self.action_queue.append(self.establish)
        self.args_queue.append((channel_index,))

    def insist(self, channel_index, score):
        """schedule tear down action if the score of the asker is higher
           channel_index: the index of the requested channel
           score: priority score of the asking agent
           return False if this agent gives the channel up (a tear action
           is scheduled), True if it keeps the channel or does not hold it
        """
        if channel_index not in self.channels:
            return True
        if score > self.priority():
            self.action_queue.append(self.tear)
            self.args_queue.append((channel_index,))
            return False
        return True

    def priority(self):
        """calculate priority score, or loss of the agent
           possible metric is to combine tasks num and self.reward
           need to add constraints over score to avoid malicious deception
           score authenticity is ensured by (1) vetting devices when they
           join the network; (2) monitoring the join time and the number of
           subsequent transmissions
        """
        return self.task_num - len(self.channels) - self.reward

    def report(self):
        """return the tuple (task_num, channels, reward) of this agent"""
        return self.task_num, self.channels, self.reward