In [None]:
import tensorflow as tf
import numpy as np

# Display the installed TensorFlow version. The code below relies on the
# TF 1.x graph API (tf.placeholder, tf.Session, tf.train.*).
tf.__version__

In [None]:
# permutations is used to enumerate the candidate theta tuples (joint vectors)
from itertools import permutations
import random
import matplotlib.pyplot as plt

In [None]:
class DQN:
    """Q-network agent acting on a belief over ordered Bernoulli parameters.

    The agent keeps a belief (a probability for every joint vector
    ``(theta_1, ..., theta_K)`` built from ``theta_possible`` with
    ``theta_1 > theta_2 > ... > theta_K``). A small fully connected
    TensorFlow 1.x network maps ``[belief vector, one-hot action]`` to a
    scalar Q value. Q values are treated as costs: the greedy action is the
    one with the smallest Q.
    """

    # time horizon (not referenced inside this class)
    T=1000
    # discount factor, set in __init__
    beta=None
    # number of actions (allocation choices)
    K = 3
    # true success probability of each action, used only by the simulator
    theta_true = [0.65,0.40,0.25]
    # finite set of candidate theta values considered by the belief
    theta_possible = [0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]
    # weight of the expected outcome probability in the immediate cost
    # (original note: the NO pays a fee when QoS is below a threshold)
    gamma = 55

    # Per-instance containers, created in __init__. They are deliberately NOT
    # mutable class-level objects: a class-level list/dict would be shared by
    # every instance and would keep growing each time DQN() is called.
    # list of all joint theta tuples with theta_1 > theta_2 > ... > theta_K
    theta_joint = None
    # the belief: dict mapping each joint theta tuple to its probability
    state = None
    # list of all possible actions, [0, 1, ..., K-1]
    actions = None

    # input size of the Q-network
    input_size = None
    # placeholder fed with the network input rows
    x_input = None
    # placeholder fed with the training targets
    target_input = None
    # loss value of every training iteration
    history_loss = None
    # loss tensor
    loss = None
    # learning rate
    learning_rate = None
    # training op (GradientDescentOptimizer minimising the loss)
    train_op = None
    # output tensor of the network
    prediction = None

    # number of training iterations performed so far
    iteration = None
    # exploration probability of the epsilon-greedy policy
    epsilon = None

    # TensorFlow session that runs the network
    session = None

    def __init__(self, learning_rate = 0.0002, beta = 0.9):
        """Build the belief state and the Q-network.

        learning_rate : step size of the gradient-descent optimizer
        beta : discount factor applied to the next-state Q value

        Raises ValueError when no joint theta vector satisfies the ordering
        constraint (for example when K exceeds len(theta_possible)).
        """
        self.learning_rate = learning_rate
        self.beta = beta
        self.iteration = 0
        self.epsilon = 0.8
        self.history_loss = []

        K = self.K

        # Enumerate every K-tuple of candidate thetas and keep only the
        # strictly decreasing ones. A fresh list per instance keeps repeated
        # DQN() calls from accumulating duplicates.
        self.theta_joint = [
            theta for theta in permutations(self.theta_possible, K)
            if all(theta[i] > theta[i + 1] for i in range(K - 1))
        ]
        if not self.theta_joint:
            raise ValueError(
                'no joint theta vector satisfies theta_1 > ... > theta_K')

        # Uniform prior over the joint vectors (explicit float division).
        uniform = 1.0 / len(self.theta_joint)
        self.state = {joint: uniform for joint in self.theta_joint}

        # actions are <0,1,2,...,K-1>
        self.actions = list(range(K))
        # network input = belief vector followed by a one-hot action
        self.input_size = len(self.theta_joint) + K

        # build the network
        self.create_network()

    def create_network(self):
        """Create the TF graph (two hidden ReLU layers, scalar output), the
        loss, the optimizer and a session with freshly initialised variables.
        """
        # input rows, shape (batch, input_size)
        self.x_input = tf.placeholder(shape=[None,self.input_size], dtype=tf.float32)
        # one target per input row, shape (batch,)
        self.target_input = tf.placeholder(shape=[None], dtype=tf.float32)

        # first hidden layer: 16 units, ReLU
        neural_network_layer_1 = 16
        l1 = self.add_layer(self.x_input, self.input_size, neural_network_layer_1, activation_function=tf.nn.relu)

        # second hidden layer: 8 units, ReLU
        neural_network_layer_2 = 8
        l2 = self.add_layer(l1, neural_network_layer_1, neural_network_layer_2, activation_function=tf.nn.relu)

        # linear output layer, shape (batch, 1)
        self.prediction = self.add_layer(l2, neural_network_layer_2,1,activation_function=None)

        # Mean squared error. The target is reshaped to (batch, 1) so that it
        # lines up row by row with the prediction; subtracting a (batch,)
        # tensor from a (batch, 1) tensor would broadcast to (batch, batch).
        target_column = tf.reshape(self.target_input, [-1, 1])
        self.loss = tf.reduce_mean(tf.reduce_sum(tf.square(target_column-self.prediction),reduction_indices=[1]))
        # plain gradient descent on the loss
        self.train_op = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss)

        # randomly initialise the weights
        init = tf.global_variables_initializer()
        self.session = tf.Session()
        self.session.run(init)

    def _outcome_probability(self, a, y_a):
        """Probability, under the current belief, that action `a` yields
        outcome `y_a` (1 or 0): sum over joints of P(y_a | theta_a) * belief.
        """
        return sum(
            (self.prob_bernouill(joint[a], y_a) * self.state[joint]
             for joint in self.theta_joint),
            0.0)

    def _q_values(self, state):
        """List of the network's Q values for every action in `state`."""
        return [self.predict(state, i)[0][0] for i in range(self.K)]

    def get_action(self):
        '''
        epsilon-greedy action selection on the current state and network

        with probability epsilon pick a uniformly random action,
        otherwise pick the action with the smallest predicted Q

        epsilon is reduced by 5% on each call made while
        iteration % 20 == 19, as long as it is still above 0.1

        return : action, Q of that action predicted by the network
        '''
        # Q of every action under the current network
        Q = self._q_values(self.state)

        if random.uniform(0,1)<self.epsilon:
            # explore
            a = np.random.randint(0,self.K)
        else :
            # exploit: Q is a cost, so take the minimum
            a = np.argmin(Q)
        if self.iteration%20 == 19 and self.epsilon>0.1:
            self.epsilon -= self.epsilon/20

        return a,Q[a]

    def get_action2(self):
        '''
        belief-based action selection without exploration:
        pick the action whose expected theta (probability of outcome 1
        under the current belief) is the smallest

        return : action, Q of that action predicted by the network
        '''
        estimated_theta = [self._outcome_probability(a, 1) for a in range(self.K)]
        a = np.argmin(estimated_theta)
        return a,self.predict(self.state, a)[0][0]

    def allocate(self, a):
        '''
        simulate performing action `a` for the client

        the immediate cost is cost(a) + gamma * P(Y_a = 1 | belief), i.e. it
        uses the expected outcome under the belief, not the sampled one
        (the original author was unsure which of the two should be used)

        the client answers 1 with probability theta_true[a], otherwise 0

        a : chosen action

        return : immediate cost, simulated outcome (1 or 0)
        '''
        # expected immediate cost under the current belief
        rt = self.cost(a) + self.gamma*self._outcome_probability(a, 1)

        # sample the outcome from the true theta of the action
        theta = self.theta_true[a]
        y_a = 1 if random.uniform(0,1) <= theta else 0
        return rt,y_a

    def prob_bernouill(self, theta, y):
        '''
        Bernoulli probability mass: theta**y * (1-theta)**(1-y)
        '''
        return np.power(theta, y)*np.power((1-theta), (1-y))

    def get_next_state(self, a, y_a):
        '''
        Bayesian update of the belief after observing outcome y_a for action a

        a : action performed
        y_a : 1 or 0, the outcome returned by the client (simulation)

        return : the belief at the next time step (a new dict; self.state is
                 not modified)
        '''
        # normalising constant P(Y_a = y_a | belief)
        prob_Y_y_a = self._outcome_probability(a, y_a)

        # posterior(joint) = likelihood * prior / evidence
        return {
            joint: self.prob_bernouill(joint[a], y_a)*self.state[joint]/prob_Y_y_a
            for joint in self.theta_joint
        }

    def transition(self, next_state):
        '''
        replace the current state by next_state
        '''
        self.state = next_state

    def get_target(self, r, next_state):
        '''
        training target: r + beta * min over actions of Q(next_state, action)

        r : immediate cost obtained by simulating the action
        next_state : the state at t+1

        return : target
        '''
        return self.beta * np.min(self._q_values(next_state)) + r

    def train(self,s,a,y):
        '''
        run one gradient step on the network

        s : state
        a : action performed
        y : the target(s), a sequence with one value per input row

        return : the loss of this step (also appended to history_loss)
        '''
        self.iteration += 1
        x = self.get_x(s,a)
        _,loss = self.session.run([self.train_op, self.loss], feed_dict={self.x_input:x,self.target_input:y})
        self.history_loss.append(loss)
        return loss

    def predict(self,s,a):
        '''
        run the network on one (state, action) pair

        s : state
        a : action

        return : network output for the single input row, shape (1, 1)
        '''
        x = self.get_x(s,a)
        return self.session.run(self.prediction, feed_dict={self.x_input:x})

    def evaluate(self):
        '''
        print, for the current belief:
        1. the expected theta of each action
        2. the Q predicted by the network for each action
        3. the expected immediate cost of each action
        '''
        estimated_theta = [self._outcome_probability(a, 1) for a in range(self.K)]
        rts = [self.cost(a) + self.gamma*estimated_theta[a] for a in range(self.K)]
        print('  estimated thetas: ', estimated_theta)
        print('  Q               : ', self._q_values(self.state))
        print('  rt              : ', rts)

    def get_x(self,s,a):
        '''
        build the network input for (s, a): one row made of the belief
        vector followed by the one-hot action, shape (1, input_size)
        '''
        return np.array([np.append(self.F(s), self.G(a))])

    def F(self,s):
        '''
        map a state (belief dict) to a vector ordered like theta_joint
        '''
        return np.array([s[joint] for joint in self.theta_joint])

    def G(self,k):
        '''
        map an action index to a one-hot vector of length K
        '''
        r = np.zeros(self.K)
        r[k]=1
        return r

    def cost(self,k):
        '''
        fixed cost of action k: 10 * (k + 1)
        '''
        return 10*(k+1)

    def add_layer(self,inputs,in_size,out_size,activation_function=None):
        '''
        create one fully connected layer
        (weights drawn from a random normal, biases initialised to 0.1)

        inputs : the input tensor of the layer
        in_size : input size
        out_size : output size (number of units)
        activation_function : activation applied to the affine output,
                              or None for a linear layer

        return : the output tensor of the layer
        '''
        w = tf.Variable(tf.random_normal([in_size,out_size]))
        b = tf.Variable(tf.zeros([1,out_size])+0.1)
        f = tf.matmul(inputs,w) + b
        if activation_function is None:
            outputs = f
        else:
            outputs = activation_function(f)
        return outputs


In [None]:
# Create a DQN agent with the default learning rate and discount factor;
# the constructor builds the belief state and the network.
dqn = DQN()

In [None]:
# Show the current state (belief): a dict mapping each joint theta tuple
# to its probability.
dqn.state

In [None]:
# Before training: print the expected theta, the network's Q and the
# expected immediate cost of every action under the current belief.
dqn.evaluate()

In [None]:
# Train the network online: one simulated step and one gradient step per iteration.

# number of training iterations
t = 800
for i in range(t):
    # choose an action (epsilon-greedy); Q_t is the network's Q for that action
    a_t, Q_t = dqn.get_action()
    # apply the action: r_t is the immediate cost, y_t the simulated outcome (1 or 0)
    r_t, y_t = dqn.allocate(a_t)
    # update the belief given the observed outcome
    next_state = dqn.get_next_state(a=a_t,y_a=y_t)
    # training target: r_t + beta * min over actions of Q(next_state, action)
    target = dqn.get_target(r_t, next_state)
    # log this step
    print('at: %d  Qt: %.3f  rt: %.3f  yt: %d  target:%.3f'%(a_t,Q_t,r_t,y_t,target))
    # one gradient step pulling Q(current state, a_t) towards the target
    dqn.train(dqn.state, a_t, [target])
    # move the agent to the next state
    dqn.transition(next_state)

In [None]:
# After training: print the expected theta, the network's Q and the
# expected immediate cost of every action under the updated belief.
dqn.evaluate()

In [None]:
# Plot the loss recorded at every training iteration.

plt.plot(list(range(dqn.iteration)), dqn.history_loss)
plt.xlabel('iteration')
plt.ylabel('loss')

In [None]:
# Show the belief reached after training (joint theta tuple -> probability).
dqn.state