In [1]:
import tensorflow as tf
import numpy as np
import cPickle as pickle
import time

In [2]:
np.random.seed(1)
tf.set_random_seed(1)

In [3]:
#####################  hyper parameters  ####################

MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.001    # learning rate for actor
LR_C = 0.001    # learning rate for critic
GAMMA = 0.9     # reward discount
REPLACEMENT = [
    dict(name='soft', tau=0.01),
    dict(name='hard', rep_iter_a=600, rep_iter_c=500) 
][0]            # you can try different target replacement strategies

MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

HF_ACTIONBOUND=400
SEP_ACTIONBOUND=40

RENDER = False
OUTPUT_GRAPH = False

File='SRA'

In [4]:
###############################  Actor  ####################################


class Actor(object):
    def __init__(self, sess, action_dim, action_bound, learning_rate, replacement):
        self.sess = sess
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.lr = learning_rate
        self.replacement = replacement
        self.t_replace_counter = 0

        with tf.variable_scope('Actor'):
            # input s, output a
            self.a = self._build_net(S, scope='eval_net', trainable=True)

            # input s_, output a, get a_ for critic
            self.a_ = self._build_net(S_,  scope='target_net', trainable=False)

        self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')

        if self.replacement['name'] == 'hard':
            self.t_replace_counter = 0
            self.hard_replace = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
        else:
            self.soft_replace = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
                                 for t, e in zip(self.t_params, self.e_params)]

    def _build_net(self, s, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.random_normal_initializer(0., 0.3)
            init_b = tf.constant_initializer(0.1)
            net = tf.layers.dense(s, 30, activation=tf.nn.relu,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l1',
                                  trainable=trainable)
            with tf.variable_scope('a'):
                actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
                                          bias_initializer=init_b, name='a', trainable=trainable)
                scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')  # Scale output to -action_bound to action_bound
        return scaled_a

    def learn(self, s):   # batch update
        self.sess.run(self.train_op, feed_dict={S: s})

        if self.replacement['name'] == 'soft':
            self.sess.run(self.soft_replace)
        else:
            if self.t_replace_counter % self.replacement['rep_iter_a'] == 0:
                self.sess.run(self.hard_replace)
            self.t_replace_counter += 1

    def choose_action(self, s):
        return self.sess.run(self.a, feed_dict={S: s})

    def add_grad_to_graph(self, a_grads):
        with tf.variable_scope('policy_grads'):
            # ys = policy;
            # xs = policy's parameters;
            # a_grads = the gradients of the policy to get more Q
            # tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams
            
            # supvised gradient
            supvised_grads=tf.gradients(ys=tf.squared_difference(self.a, A),xs=self.e_params)
            
            self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)+supvised_grads
            
        with tf.variable_scope('A_train'):
            opt = tf.train.AdamOptimizer(-self.lr)  # (- learning rate) for ascent policy
            self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))


In [5]:
###############################  Critic  ####################################

class Critic(object):
    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, replacement, a, a_):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr = learning_rate
        self.gamma = gamma
        self.replacement = replacement

        with tf.variable_scope('Critic'):
            
            self.a = tf.stop_gradient(a)    # stop critic update flows to actor
            # Input (s, a^), output q
            self.q = self._build_net(S, self.a, 'eval_net', trainable=True)
            
            self.q_ = self._build_net(S_, a_, 'target_net', trainable=False)    # target_q is based on a_ from Actor's target_net

            self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
            self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')
              

        with tf.variable_scope('target_q'):
            self.target_q = R + self.gamma * self.q_

        with tf.variable_scope('TD_error'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))
            
        with tf.variable_scope('C_train'):
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        with tf.variable_scope('a_grad'):
            self.a_grads = tf.gradients(self.q, a)[0]   # tensor of gradients of each sample (None, a_dim)

        #with tf.variable_scope('a_grad'):
           #self.q_a=self.sess.run(self.q, feed_dict={S: S, A: a}) 
           #self.a_grads = tf.gradients(self.q_a, a)[0]   # tensor of gradients of each sample (None, a_dim)

        if self.replacement['name'] == 'hard':
            self.t_replace_counter = 0
            self.hard_replacement = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
        else:
            self.soft_replacement = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
                                     for t, e in zip(self.t_params, self.e_params)]

    def _build_net(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.random_normal_initializer(0., 0.1)
            init_b = tf.constant_initializer(0.1)

            with tf.variable_scope('l1'):
                n_l1 = 30
                w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
                w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
                net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)

            with tf.variable_scope('q'):
                q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable)   # Q(s,a)
        return q
        

    def learn(self, s, a, r, s_):
        _,loss=self.sess.run([self.train_op,self.loss], feed_dict={S: s, self.a: a, R: r, S_: s_})
            
        if self.replacement['name'] == 'soft':
            self.sess.run(self.soft_replacement)
        else:
            if self.t_replace_counter % self.replacement['rep_iter_c'] == 0:
                self.sess.run(self.hard_replacement)
            self.t_replace_counter += 1
        return loss
    
    def get_q(self,a,s):
        return self.sess.run(self.q, feed_dict={S: s, self.a: a})
        

In [6]:
#####################  Memory  ####################

class Memory(object):
    def __init__(self): 
        # 一个S数组，一个R数组，一个A数组
 #dims=s.dim+a.dim+r.dim+s_.dim
        S=pickle.load(open(File + '/S' + '.seqs','rb'))
        R=pickle.load(open(File + '/S' + '.seqs','rb'))
        A=pickle.load(open(File + '/S' + '.seqs','rb'))
        S_=S
        S_=S
        
        self.capacity = len(S)
        self.state_dim=len(S[0])
        self.action_dim=len(A[0])
        
        S_[:len(S)-1]=S[1:]
        
        self.data = np.hstack((S, A, R, S_))

    def sample(self, n):
        indices = np.random.choice(self.capacity, size=n)
        return self.data[indices, :]

In [7]:
M = Memory()
state_dim=M.state_dim
action_dim=M.action_dim
# all placeholder for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('A_'):
    A = tf.placeholder(tf.float32, shape=[None, action_dim], name='a')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_')

In [8]:
sess = tf.Session()

In [9]:
# Create actor and critic.
# They are actually connected to each other, details can be seen in tensorboard or in this picture:
actor = Actor(sess, action_dim, SEP_ACTIONBOUND, LR_A, REPLACEMENT)

Instructions for updating:
Use keras.layers.dense instead.


In [10]:
critic = Critic(sess, state_dim, action_dim, LR_C, GAMMA, REPLACEMENT,actor.a,actor.a_)

In [11]:
actor.add_grad_to_graph(critic.a_grads)

In [12]:
sess.run(tf.global_variables_initializer())

In [13]:
if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)
var = 3  # control exploration

In [32]:
t1 = time.time()
for n in range(MAX_EPISODES):
    LOSS=0
    for j in range(MAX_EP_STEPS):

        var *= .9995    # decay the action randomness
        b_M = M.sample(BATCH_SIZE)
        b_s = b_M[:, :state_dim]
        b_a = b_M[:, state_dim: state_dim + action_dim]
        b_a = np.clip(np.random.normal(b_a, var), -2, 2)    # add randomness to action selection for exploration
        b_r = b_M[:, -state_dim - 1: -state_dim]
        b_s_ = b_M[:, -state_dim:]

        loss=critic.learn(b_s, b_a, b_r, b_s_)
        a = actor.choose_action(b_s)
        critic.learn(b_s, a, b_r, b_s_)
        actor.learn(b_s)
        LOSS+=loss
        
        if j == MAX_EP_STEPS-1:
            LOSS=LOSS/MAX_EP_STEPS
            
            
    
    #对测试集
    b_M = M.sample(BATCH_SIZE)
    b_s = b_M[:, :state_dim]
    b_a = b_M[:, state_dim: state_dim + action_dim]
    b_a = np.clip(np.random.normal(b_a, var), -2, 2)    # add randomness to action selection for exploration
    b_r = b_M[:, -state_dim - 1: -state_dim]
    b_s_ = b_M[:, -state_dim:]
    a = actor.choose_action(b_s)
    q_a = critic.get_q(a,b_s)
    eveQ=0
    for i in range(BATCH_SIZE):
        eveQ+=q_a[i]
    eveQ=eveQ/BATCH_SIZE
    
    print('Episode:', n, ' Critic loss: %.2f' % LOSS, 'Var: %.2f' % var,'everage Q: %.2f',eveQ)
    
print('Running time: ', time.time()-t1)

('Episode:', 0, ' Critic loss: 0.17', 'Var: 0.00', 'everage Q: %.2f', array([38.68952], dtype=float32))
('Episode:', 1, ' Critic loss: 0.17', 'Var: 0.00', 'everage Q: %.2f', array([39.50118], dtype=float32))
('Episode:', 2, ' Critic loss: 0.17', 'Var: 0.00', 'everage Q: %.2f', array([41.163162], dtype=float32))
('Episode:', 3, ' Critic loss: 0.17', 'Var: 0.00', 'everage Q: %.2f', array([41.522728], dtype=float32))
('Episode:', 4, ' Critic loss: 0.17', 'Var: 0.00', 'everage Q: %.2f', array([39.86371], dtype=float32))
('Episode:', 5, ' Critic loss: 0.17', 'Var: 0.00', 'everage Q: %.2f', array([41.291534], dtype=float32))
('Episode:', 6, ' Critic loss: 0.17', 'Var: 0.00', 'everage Q: %.2f', array([41.272465], dtype=float32))
('Episode:', 7, ' Critic loss: 0.17', 'Var: 0.00', 'everage Q: %.2f', array([38.66093], dtype=float32))
('Episode:', 8, ' Critic loss: 0.17', 'Var: 0.00', 'everage Q: %.2f', array([37.973587], dtype=float32))
('Episode:', 9, ' Critic loss: 0.17', 'Var: 0.00', 'everage

KeyboardInterrupt: 