# Reinforcement Learning Reference Collections

### Status quo label
[.]:copied [-]:on progress [=]:written [%]:completed [@]:memorized

## Contents


1. [QLEARNING] : q-table-simple learning
    - [%][Q-learning tensorflow](#Q-learning tensorflow)  
2. [RL_PBA] : policy based learning  
    - [-][Policy Based Agent tensorflow](#Policy Based Agent tensorflow)  
3. [RL_MBA] : model based learning  
    - [-][Model Based Agent tensorflow](#Model Based Agent tensorflow)  
4. [DQN] : deep qlearning network  
    - [%][Deep Q-Learning Network tensorflow](#Deep Q-Learning Network tensorflow)  
5. [DDQN] : deep qlearning Network + dueling, double  
    - [=][Double-Dualing Deep Q-Learning Network tensorflow](#Double-Dualing Deep Q-Learning Network tensorflow)      
6. [A3C] : asynchronous advantage actor critic model  
    - [-][Asynchronous Advantages Actor-Critic Model tensorflow on Breakout-v0](#Asynchronous Advantages Actor-Critic Model tensorflow)  
7. [Meta_RL](#Meta_RL) : meta reinforcement learning [paper1](https://arxiv.org/pdf/1611.05763.pdf) [paper2](https://arxiv.org/abs/1611.02779)  
    - [][]()


---

[][DRQN] : deep recurrent q-network  

Partially observable Markov Decision Process to maximize cumulative reward  
Decision problems,  
Imitation learning  
- behavioral learning  
- inverse reinforcement learning  
DAgger (Dataset Aggregation)  
Q-learning  


Policy gradient learning : observation → action  

Q-learning : long term reward : state with reward → action  

Experience replay  
Freezing target network  

 

## Reference  


[Arthur Juliani](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.u07v7laru)  

[Arthur Juliani github](https://github.com/awjuliani/DeepRL-Agents)

  Aside
  [Neural Stack Machine](https://iamtrask.github.io/2016/02/25/deepminds-neural-stack-machine/)

In [1]:
DATASET_DIR = './dataset/'
PROJECT_DIR = './projects/RL_collections/'
SUMMARY_DIR = PROJECT_DIR+'summaries/'
SAVER_DIR = PROJECT_DIR+'models/'
CHECKPOINT_DIR = PROJECT_DIR+'checkpoints/'
RESULT_DIR = PROJECT_DIR+'results/'

In [1]:
# NOTE: In the verbose version, There are example codes with doc string inside function which is not a doctest.
# NOTE: For the sake of educational purpose, these codes are note well organized.
# TODO: RL_PBA agent version NOT DONE, DQN: , DDQN: MEM error

In [2]:
import time

def print_runtime(func):
    start = time.time()
    func()
    end = time.time()
    print("="*20)
    print("running time : ")
    print(end-start, "s")
    print("="*20)
    
def testfun():
    time.sleep(5)
    
print_runtime(testfun)

running time : 
5.001129865646362 s


<a id='Q-learning tensorflow'></a>
## Q-learning tensorflow

In [3]:
def QLEARNING():
    """ 20 mins """
    import time

    import numpy as np
    import tensorflow as tf
    import gym

    # HYPER PARAMS
    env = gym.make("FrozenLake-v0")
    learning_rate = 0.4

    input_size = env.observation_space.n # 16
    output_size = env.action_space.n # 4
    max_episode = 2000
    dis = 0.99

    # PARAMS
    X = tf.placeholder(tf.float32, [1, input_size]) # 1x16
    W = tf.Variable(tf.random_uniform([input_size, output_size], 0, 0.01)) # 16x4

    Qpred = tf.matmul(X, W) # 1x4
    Y = tf.placeholder(tf.float32, [1, output_size])

    loss = tf.reduce_mean(tf.square(Y - Qpred))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

    rList = []

    def one_hot(x, inputsize):
        return np.identity(input_size)[x:x+1]

    start = time.time()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for episode in range(max_episode):
            # e-greedy
            e = 1. / ((episode / 50.) + 10.)
            s = env.reset() # 1
            rewardAll = 0
            done = False
            step = 1
            while not done:
                Qs = sess.run(Qpred, feed_dict={X: one_hot(s, input_size)})
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(Qs)

                sn, reward, done, info = env.step(action)

                if done:
                    Qs[0, action] = reward
                else:
                    Qsn = sess.run(Qpred, feed_dict={X: one_hot(sn, input_size)})
                    Qs[0, action] = reward + dis*np.max(Qsn)
                sess.run(optimizer, feed_dict={X: one_hot(s, input_size), Y: Qs})
                rewardAll += reward
                s = sn
                step += 1
            print("episode : {:d}, reward : {:.5f}".format(episode, rewardAll))
            rList.append(rewardAll)
        end = time.time()
        print("acc : " + str(sum(rList)/max_episode * 100) + "%")
        print("In time : {} s".format(end-start))
    return None

QLEARNING()

[2017-03-15 22:09:21,129] Making new env: FrozenLake-v0


episode : 0, reward : 0.00000
episode : 1, reward : 0.00000
episode : 2, reward : 0.00000
episode : 3, reward : 0.00000
episode : 4, reward : 0.00000
episode : 5, reward : 1.00000
episode : 6, reward : 0.00000
episode : 7, reward : 0.00000
episode : 8, reward : 1.00000
episode : 9, reward : 0.00000
episode : 10, reward : 0.00000
episode : 11, reward : 0.00000
episode : 12, reward : 0.00000
episode : 13, reward : 0.00000
episode : 14, reward : 1.00000
episode : 15, reward : 0.00000
episode : 16, reward : 0.00000
episode : 17, reward : 1.00000
episode : 18, reward : 0.00000
episode : 19, reward : 0.00000
episode : 20, reward : 0.00000
episode : 21, reward : 0.00000
episode : 22, reward : 0.00000
episode : 23, reward : 1.00000
episode : 24, reward : 0.00000
episode : 25, reward : 0.00000
episode : 26, reward : 1.00000
episode : 27, reward : 1.00000
episode : 28, reward : 0.00000
episode : 29, reward : 0.00000
episode : 30, reward : 0.00000
episode : 31, reward : 1.00000
episode : 32, rewa

<a id='Policy Based Agent tensorflow'></a>
## Policy Based Agent

In [4]:
def RL_PBA():
    """ 30 mins """
    import matplotlib.pyplot as plt
    import math

    import numpy as np
    import tensorflow as tf

    import tensorflow.contrib.slim as slim

    import gym

    env = gym.make("CartPole-v0")
    s = env.reset()

    agent_level = True
    gamma = 0.99
    # HELPER FUNCTION
    def discount_rewards(reward):
            discounted_r = np.zeros_like(reward)
            running_add = 0
            for t in reversed(range(reward.size)):
                running_add = running_add * gamma + reward[t]
                discounted_r[t] = running_add
            return discounted_r

    if not agent_level:
        # HYPER PARAMS
        random_episodes = 0
        episode_number = 1
        total_episodes = 2000

        batch_size = 3
        learning_rate = 5e-2
        gamma = .99

        input_dim = 4

        n_hidden1 = 40
        n_hidden2 = 40

        # PARAMS
        tf.reset_default_graph()
        reward_sum = 0
        running_reward = None

        observations = tf.placeholder(tf.float32, [None, input_dim], name="input_x")
        input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
        advantages = tf.placeholder(tf.float32, name="reward")

        def pol_net(observations):
            w1 = tf.get_variable("w1", shape=[input_dim, n_hidden1], 
                                initializer=tf.contrib.layers.xavier_initializer())
            h1 = tf.nn.elu(tf.matmul(observations, w1))
            w2 = tf.get_variable("w2", shape=[n_hidden1, n_hidden2], 
                                initializer=tf.contrib.layers.xavier_initializer())
            h2 = tf.nn.elu(tf.matmul(h1, w2))
            w3 = tf.get_variable("w3", shape=[n_hidden2, 1], 
                                initializer=tf.contrib.layers.xavier_initializer())
            return tf.nn.sigmoid(tf.matmul(h2, w3))

        pred = pol_net(observations)

        tvars = tf.trainable_variables()

        loglikelihood = tf.log(input_y*(input_y - pred) + (1-input_y)*(input_y + pred))
        policy_cost = -tf.reduce_mean(loglikelihood*advantages)

        newGrad = tf.gradients(policy_cost, tvars)
        optimizer = tf.train.AdamOptimizer(learning_rate)

        w1grad = tf.placeholder(tf.float32, name="batch_grad1")
        w2grad = tf.placeholder(tf.float32, name="batch_grad2")

        batchGrad = [w1grad, w2grad]
        updateGrad = optimizer.apply_gradients(zip(batchGrad, tvars))

        with tf.Session() as sess:
            rendering = False
            sess.run(tf.global_variables_initializer())
            s = env.reset()

            gradBuffer = sess.run(tvars)
            for ix, grad in enumerate(gradBuffer):
                gradBuffer[ix] = grad*0

            xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []

            while episode_number <= total_episodes:
                if reward_sum/batch_size > 100 or rendering == True:
                    env.render()
                    rendering = True
                s = np.reshape(s, [1, input_dim])
                tfprob = sess.run(pred, feed_dict={observations: s})

                action = 1 if np.random.uniform() < tfprob else 0
                xs.append(s)

                y = 1 if action == 0 else 0
                ys.append(y)

                sn, reward, done, info = env.step(action)

                reward_sum += reward
                drs.append(reward)
                s = sn

                if done:
                    episode_number += 1
                    epx = np.vstack(xs)
                    epy = np.vstack(ys)
                    epr = np.vstack(drs)

                    tfp = tfps
                    xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []

                    discounted_epr = discount_rewards(epr)
                    discounted_epr -= np.mean(discounted_epr)
                    discounted_epr /= np.std(discounted_epr)

                    tGrad = sess.run(newGrad, feed_dict={
                            observations: epx,
                            input_y: epy,
                            advantages: discounted_epr,
                        })
                    for ix, grad in enumerate(tGrad):
                        gradBuffer[ix] += grad

                    if episode_number % batch_size == 0:
                        sess.run(updateGrad, feed_dict={w1grad: gradBuffer[0], w2grad: gradBuffer[1]})

                        for ix, grad in enumerate(gradBuffer):
                            gradBuffer[ix] = grad*0

                        running_reward = reward_sum if running_reward is None else running_reward*0.99 + reward_sum*0.01
                        print("{:d} episode : Average reward for episode {:.2f}. Total average reward {:.2f}".format(
                                    episode_number,
                                    reward_sum/batch_size,
                                    running_reward/batch_size
                            ))
                    if reward_sum/batch_size > 200:
                        print("Task solved in ", episode_number, " episodes.")
                        break
                    reward_sum = 0
                    s = env.reset()

    else:
        # DEFINE POLICY AGENT
        class agent():
            def __init__(self, lr, s_size, a_size, h_size, alpha):
                with tf.variable_scope("agent"):
                    # POLICY NET
                    self.state_in = tf.placeholder(tf.float32, [None, s_size]) #
                    hidden = slim.fully_connected(self.state_in, h_size, 
                                biases_initializer=None, activation_fn=tf.nn.relu)
                    self.output = slim.fully_connected(hidden, a_size, 
                                biases_initializer=None, activation_fn=tf.nn.softmax) #

                    self.action = tf.argmax(self.output, 1)

                    self.reward_holder = tf.placeholder(tf.float32, [None]) #
                    self.action_holder = tf.placeholder(tf.int32, [None]) # 
                    self.indexes = tf.range(0, tf.shape(self.output)[0])*tf.shape(self.output)[1] + self.action_holder
                    self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

                    self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs)*self.reward_holder)

                    tvars = tf.trainable_variables()
                    self.gradient_holders = [] #
                    for idx, var in enumerate(tvars):
                        placeholder = tf.placeholder(tf.float32, name=str(idx)+"_holder")
                        self.gradient_holders.append(placeholder)

                    self.gradients = tf.gradients(self.loss, tvars) #
                    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
                    self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars)) #

        tf.reset_default_graph()
        Agent = agent(lr=1e-2, s_size=4, a_size=2, h_size=8, alpha=0.01)
        total_episodes = 5000
        max_ep = 999
        update_frequency = 5

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            i = 0
            total_reward = []
            total_length = []

            gradBuffer = sess.run(tf.trainable_variables())
            for ix, grad in enumerate(gradBuffer):
                gradBuffer[ix] = grad*0

            while i < total_episodes:
                s = env.reset()
                running_reward = 0
                ep_buffer = []
                for j in range(max_ep):
                    a_dist = sess.run(Agent.output, feed_dict={Agent.state_in : [s]})
                    a = np.random.choice(a_dist[0], p=a_dist[0])
                    a = np.argmax(a_dist == a)

                    sn, r, d, _ = env.step(a)
                    ep_buffer.append([s,a,r,sn])
                    s = sn
                    running_reward += r

                    if d:
                        ep_buffer = np.array(ep_buffer)
                        ep_buffer[:, 2] = discount_rewards(ep_buffer[:, 2])
                        feed_dict={Agent.reward_holder: ep_buffer[:, 2],
                            Agent.action_holder: ep_buffer[:, 1],
                            Agent.state_in: np.vstack(ep_buffer[:, 0])}

                        grads = sess.run(Agent.gradients)

                        for idx, grad in enumerate(grads):
                            gradBuffer[idx] += grad

                        if i % update_frequency == 0 and i != 0:
                            feed_dict=dict(zip(Agent.gradient_holders, gradBuffer))
                            _ = sess.run(Agent.update_batch, feed_dict=feed_dict)
                            for ix, grad in enumerate(gradBuffer):
                                gradBuffer[ix] = grad * 0
                        total_reward.append(running_reward)
                        total_length.append(j)
                        break
                if i % 100 ==0:
                    print("mean reward : ",np.mean(total_reward[-100:]))
                if np.mean(total_reward[-100:]) > 190:
                    env.render()
                    break
                i += 1

    return None

RL_PBA()

[2017-03-15 22:18:10,838] Making new env: CartPole-v0
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


InvalidArgumentError: You must feed a value for placeholder tensor 'agent/Placeholder' with dtype float
	 [[Node: agent/Placeholder = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'agent/Placeholder', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python3.5/dist-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2827, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-bbea01d50998>", line 237, in <module>
    RL_PBA()
  File "<ipython-input-4-bbea01d50998>", line 179, in RL_PBA
    Agent = agent(lr=1e-2, s_size=4, a_size=2, h_size=8, alpha=0.01)
  File "<ipython-input-4-bbea01d50998>", line 153, in __init__
    self.state_in = tf.placeholder(tf.float32, [None, s_size]) #
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_ops.py", line 1587, in placeholder
    name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 2043, in _placeholder
    name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'agent/Placeholder' with dtype float
	 [[Node: agent/Placeholder = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]


<a id='Model Based Agent tensorflow'></a>
## Model Based Agent

In [None]:
def RL_MBA():
    import math
    import matplotlib.pyplot as plt

    import numpy as np
    import tensorflow as tf	

    from tensorflow.python.framework import dtypes
    from tensorflow.python.framework import ops
    from tensorflow.python.ops import (
                array_ops,
                control_flow_ops,
                math_ops,
                nn_ops,
                rnn, rnn_cell,
                variable_scope
        )
    from tensorflow.contrib.layers import xavier_initializer

    import gym

    env = gym.make("CartPole-v0")

    # HYPER PARAMS	
    max_episode_num = 5000

    learning_rate = 1e-2
    dis = .99
    decay_rate = .99
    resume = False

    model_batch_size = 3
    real_batch_size = 3

    input_dim = 4
    n_hidden = 4	

    # POLICY NETWORK
    tf.reset_default_graph()
    class Policy_Network():
        def __init__(self, session, input_size, output_size, structure_dict=None, name="pol_main")
            self.session = session
            self.input_size = input_size
            self.output_size = output_size 
            self._build_network()

        def _build_network(self, lr=1e-2):
            #TODO: structure dict contains layer structure
            with tf.variable_scope("policy_network"):
                self._obs = tf.placeholder(tf.float32, [None, 4], name="input_x")
                self._y = tf.placeholder(tf.float32, [None, 1], name="input_y")

                w1 = tf.get_variable("w1", shape=[4, n_hidden], initializer=xavier_initializer())
                w2 = tf.get_variable("w2", shape=[n_hidden, 1], initializer=xavier_initializer())

                h1 = tf.nn.relu(tf.matmul(obs, w1))
            self._pred = tf.nn.sigmoid(tf.matmul(h1, w2))

            tvars = tf.trainable_variables()

            self._advantages = tf.placeholder(tf.float32, name="reward_signal")
            self._loglik = tf.log(input_y*(input_y - pred) + (1-input_y)*(input_y+pred))
            self._loss = -tf.reduce_mean(loglik*advantages)
            self._optm = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

        def predict(self, state):
            x = np.reshape(state, [1, self.input_size])
            return self.session.run(self._pred, feed_dict={self._obs: x})

        def update(self, x_stack, y_stack):
            return self.session.run([self._loss, self._optm], 
                        feed_dict={self._obs: x_stack, self._y: y_stack})


    class Model_Network():
        def __init__(self, session, input_size, output_size, 
                        sdict={'m_hidden':256}, name="model_main"):
            """
                input_size : [env.observation_space.shape, env.action_space.m]
            """
            self.input_size = input_size
            self.s_size = np.prod(input_size[0])
            self.a_size = input_size[1]
            self.output_size = output_size
            self.m_hidden = sdict['m_hidden']

            print_network_info()
            _build_network()

        def _build_network(self, rl=1e-3):#TODO: change to structure dict
            with tf.variable_scope("model_network"):
                m_w1 = tf.get_variable("m_w1", shape=[self.input_size, self.m_hidden],
                                                initializer=xavier_initializer())
                m_b1 = tf.Variable(tf.zeros([self.m_hidden, self.m_hidden]), name="m_b1")

                m_w2 = tf.get_variable("m_w2", shape=[self.m_hidden, self.m_hidden], 
                                                initializer=xavier_initializer())
                m_b2 = tf.Variable(tf.zeros([self.m_hidden, self.output_size]))

                m_w_obs = tf.get_variable("m_w_obs", shape=[self.m_hidden, a_size],
                                                initializer=xavier_initializer())
                m_w_reward = tf.get_variable("m_w_reward", shape=[self.m_hidden, 1])
                m_w_done = tf.get_variable("m_w_done", shape=[self.m,_hidden 1])

                m_b_obs = tf.Variable(tf.zeros([s_size]), name="m_b_obs")
                m_b_reward = tf.Variable(tf.zeros([1]), name="m_b_reward")
                m_b_done = tf.Variable(tf.zeros([1]), name="m_b_done")

                self.prev_state = tf.placeholder(tf.float32, [None, self.input_size])
                m_h1 = tf.nn.relu(tf.matmul(prev_state, m_w1) + m_b1)
                m_out = tf.nn.relu(tf.matmul(m_h1, m_w2) + m_b2)

            pred_obs = tf.matmul(m_out, m_w_obs, name="pred_obs") + m_b_obs
            pred_reward = tf.matmul(m_out, m_w_bos, name="pred_reward") + m_b_reward
            pred_done = tf.sigmoid(tf.matmul(m_out, m_w_done, name="pred_done") + m_b_done)

            self.pred_state = tf.concat(1, [pred_obs, pred_reward, pred_done])	

            true_obs = tf.placeholder(tf.float32, [None, s_size], name="true_observation")
            true_reward = tf.placeholder(tf.float32, [None, 1],name="true_reward")
            true_done = tf.placeholder(tf.float32, [None, 1])

        def predict(self, state):
            x = np.reshape(state, [1, self.input_size])
            return self.session.run(self._pred, feed_dict={self._obs: x})

        def update(self, x_stack, y_stack):
            return self.session.run([self._loss, self._optm], feed_dict={self._obs: x_stack, self._y: y_stack})

        def stepModel(sess, xs, action):
            feed_dict = {self.prev_state: np.reshape(np.hstack([xs[-1][0], np.array(action)]),
                        [1, 5])}
            myPredict = sess.run([self.pred_state], feed_dict=feed_dict)
            reward = myPredict[0][:4]
            obs = myPredict[0][:, 0:4]
            obs[:, 0] = np.clip(obs[:, 0], -2.4, 2.4)
            obs[:, 2] = np.clip(obs[:, 2], -0.4, 0.4)

            doneP = np.clip(myPredict[0][:, 5], 0, 1)
            if doneP > 0.1 or len(xs) >= 300:
                done = True
            else:
                done = False
            return obs, reward, done 

        def print_network_info(self):#TODO: infos
            print("Building Network.." + "\n" +\
                "="*10 + name +"="*10 + "\n" +\
            )


    # PARAMS

    xs, drs, ys, ds = [], [], [], []

    drawFromModel = Flase
    trainTheModel = True
    trainThePolicy = False

    with tf.Session() as sess:
        rendering = False
        sess.run(tf.global_variables_initializer())

        policy_net = Policy_NetWork(sess, input_size, output_size)
        model_net = Model_NetWork(sess, input_sizes, output_size)
        print("SET NETWORK")

        obs = env.reset()
        x = obs
        print("SET ENVIRONMENT")

        while episode <= max_episode_num:
            if episode % 50 == 0:
                print("in " + str(episode) + "th episode...")
            if (reward_sum/batch_size) > 140 and drawFromModel == False or rendering == True:
                    env.render()
                    rendering = True

            x = np.reshape(obs, [1, 4])
            tfprob = sess.run(pred, feed_dict={obs: x})

            if np.random.uniform() < tfprob:
                action = 1
            else:
                action = env.action_space.sample()

            xs.append(x)

            if drawFromModel == False:
                obs, reward, done, info = env.step(action)
            else:
                obs, reward, done = stepModel(sess, xs, action)

            reward_sum += reward

            ds.append(done*1)
            drs.append(reward)

            if done:
                if drawFromModel == False:
                    real_episodes += 1
                episode_number += 1

                epx = np.vstack(xs)
                epy = np.vstack(ys)
                epr = np.vstack(drs)
                epd = np.vstack(ds)
                xs, drs, ys, ds = [], [], [], []

            if trainTheModel == True:
                print("trainPolicy in " + str(episode_number))
                discounted_epr = discounted_rewards(epr).astype('float32')
                discounted_epr -= np.mean(discounted_epr)
                discounted_epr /= np.std(discounted_epr)

                sess.run(optimizer, feed_dict={
                            obs: epx,
                            input_y: epy,
                            advantages: discounted_epr
                        })

            if switch_point + batch_size == episode_number:
                switch_point = episode_number

                if trainThePolicy == True:
                    print("trainThePoilcy in "+str(episode_number))
                    discounted_epr = discounted_rewards(epr).astype('float32')
                    discounted_epr -= np.mean(discounted_epr)
                    discounted_epr /= np.std(discounted_epr)

                running_reward = reward_sum if running_reward is None else running_reward*.99 + reward_sum*.01
                if drawFromModel == False:
                    print("World Perf: Episode {:d}, Reward {:.5f}, Action {:.5f}"+
                            "Mean Reward {}".format(real_episodes, 
                                reward_sum/real_batch_size,
                                action,
                                running_reward/real_batch_size))
                    if reward_sum/batch_size > 200:
                        break
                reward_sum = 0
                if episode_number > 100:
                    drawFromModel = not drawFromModel
                    trainTheModel = not trainTheModel
                    trainThePolicy = not trainThePolicy

            if drawFromModel == True:
                observation = np.random.uniform(-0.1, 0.1, [4])
                batch_size = model_batch_size
            else:
                observation = env.reset()
                batch_size = real_batch_size


    plt.figure(figsize=(8, 12))	
    for i in range(6):
        plt.subplot(6, 2, 2*i+1)
        plt.plot(pstate[:, i])
        plt.subplot(6, 2, 2*i+1)
        plt.plot(state_nextsAll[:, i])	
    plt.tight_layout()
    plt.show()

    return None

<a id='Deep Q-Learning Network tensorflow'></a>
## Deep Q-Learning Network tensorflow

In [6]:
def DQN():
    """ 30 mins """
    import random
    from collections import deque
    import numpy as np

    import tensorflow as tf
    import gym

    env = gym.make("CartPole-v0")
    input_size = env.observation_space.shape[0]
    output_size = env.action_space.n
    dis = 0.99
    REPLAY_BUFFER = 4000

    class dqn():
        def __init__(self, session, input_size, output_size, name="main"):
            self.session = session
            self.input_size = input_size
            self.output_size = output_size
            self.net_name = name

            self._build_network()

        def _build_network(self, h_size=10, l_rate=1e-1):
            with tf.variable_scope(self.net_name):
                self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x")
                W1 = tf.get_variable("w1", shape=[self.input_size, h_size], 
                            initializer=tf.contrib.layers.xavier_initializer())
                W2 = tf.get_variable("w2", shape=[h_size, self.output_size], 
                            initializer=tf.contrib.layers.xavier_initializer())
                h1 = tf.nn.tanh(tf.matmul(self._X, W1))

                self._Qpred = tf.matmul(h1, W2)
            self._Y = tf.placeholder(tf.float32, [None, self.output_size])
            self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
            self._train = tf.train.AdamOptimizer(l_rate).minimize(self._loss)

        def predict(self, state):
            x = np.reshape(state, [1, self.input_size])
            return self.session.run(self._Qpred, feed_dict={self._X: x})

        def update(self, x_stack, y_stack):
            return self.session.run([self._loss, self._train], feed_dict={self._X: x_stack, self._Y: y_stack})

    def simple_replay_train(maindqn, target, train_batch):
        x_stack = np.empty(0).reshape(0, maindqn.input_size)
        y_stack = np.empty(0).reshape(0, maindqn.output_size)

        for state, action, reward, next_state, done in train_batch:
            q = maindqn.predict(state)
            if done:
                q[0, action] = reward
            else:
                q[0, action] = reward + dis*np.max(targetdqn.predict(next_state))
            y_stack = np.vstack([y_stack, q])
            x_stack = np.vstack([x_stack, state])
        return maindqn.update(x_stack, y_stack)

    def bot_play(maindqn):
        s = env.reset()
        reward_sum = 0
        while True:
            env.render()
            a = np.argmax(maindqn.predict(s))
            s, reward, done, info = env.step(a)
            reward_sum += reward
            if done:
                print("Total reward : {}".format(reward_sum))
                break

    def get_copy_var_ops(*, from_scope="main", to_scope="target"):
        op_holder = []
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    max_episodes = 300
    replay_buffer = deque()
    with tf.Session() as sess:
        maindqn = dqn(sess, input_size, output_size, name="main")
        targetdqn = dqn(sess, input_size, output_size, name="target")
        sess.run(tf.global_variables_initializer())

        copy_ops = get_copy_var_ops(from_scope="main", to_scope="target")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10.) + 1.)
            done = False
            step_count = 0

            s = env.reset()
            while not done:
                if np.random.rand(1) < e:
                    a = env.action_space.sample()
                else:
                    a = np.argmax(maindqn.predict(s))

                sn, r, d, info = env.step(a)
                if done:
                    r = -100
                replay_buffer.append((s, a, r, sn, d))
                if len(replay_buffer) > REPLAY_BUFFER:
                    replay_buffer.popleft()

                s = sn
                step_count += 1
                if step_count > 10000:
                    break
            print("episode : {}, step : {}".format(episode, step_count))
            #if step_count > 10000:
            #	break

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(maindqn, targetdqn, minibatch)
                if episode % 50 == 1:
                    print("loss : ", loss)
                sess.run(copy_ops)
        bot_play(maindqn)
    return None

print_runtime(DQN)

[2017-03-15 22:24:27,295] Making new env: CartPole-v0
[2017-03-15 22:24:27,722] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.
[2017-03-15 22:24:27,885] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 0, step : 10001


[2017-03-15 22:24:28,457] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 1, step : 10001
loss :  0.00232896


[2017-03-15 22:24:29,070] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 2, step : 10001


[2017-03-15 22:24:29,792] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 3, step : 10001


[2017-03-15 22:24:30,697] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 4, step : 10001


[2017-03-15 22:24:31,639] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 5, step : 10001


[2017-03-15 22:24:32,664] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 6, step : 10001


[2017-03-15 22:24:33,770] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 7, step : 10001


[2017-03-15 22:24:34,948] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 8, step : 10001


[2017-03-15 22:24:36,137] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 9, step : 10001


[2017-03-15 22:24:37,515] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 10, step : 10001


[2017-03-15 22:24:39,069] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 11, step : 10001


[2017-03-15 22:24:40,666] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 12, step : 10001


[2017-03-15 22:24:42,141] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 13, step : 10001


[2017-03-15 22:24:43,661] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


episode : 14, step : 10001


KeyboardInterrupt: 

<a id='Double-Dualing Deep Q-Learning Network tensorflow'></a>
## Doubling-Dualing Deep Q-learning Network tensorflow

In [None]:
def DDQN():
    """ 120 mins  """
    import os
    import functools
    import random
    import matplotlib.pyplot as plt
    import scipy.misc

    import numpy as np
    import tensorflow as tf
    import tensorflow.contrib.slim as slim

    import gym

    env = gym.make("Breakout-v0")

    #	input_state_n = functools.reduce(lambda x,y : x*y, env.observation_space.shape)
    input_state_n = np.prod(env.observation_space.shape)
    input_obs_shape = list(env.observation_space.shape)
    output_action_n = env.action_space.n

    max_episode_num = 1000
    max_step_num = 500

    pre_train_step_num = 100

    starte = 1
    ende = 0.1
    explore_num = min(max_step_num, 300)
    ed = (starte - ende)/explore_num
    e = starte

    replay_buffer_size = 100
    update_freq = 5
    h_size = 512

    batch_size = 32
    tau = 0.001

    dis = .99

    load_model = False
    model_path = "./ddqn_model"

    class qnetwork():
        def __init__(self, h_size):
            self.scalarInput = tf.placeholder(tf.float32, [None, input_state_n])
            self.imageIn = tf.reshape(self.scalarInput, shape=[-1]+input_obs_shape)
            self.conv1 = slim.convolution2d(
                            inputs=self.imageIn,
                            num_outputs=32,
                            kernel_size=[8, 8],
                            stride=[4, 4],
                            padding='VALID',
                            activation_fn=tf.nn.relu,
                            biases_initializer=None
                        )
            self.conv2 = slim.convolution2d(
                            inputs=self.conv1,
                            num_outputs=64,
                            kernel_size=[3, 3],
                            stride=[1, 1],
                            padding='VALID',
                            activation_fn=tf.nn.relu,
                            biases_initializer=None
                        )
            self.conv3 = slim.convolution2d(
                            inputs=self.conv2,
                            num_outputs=512,
                            kernel_size=[7, 7],
                            stride=[1, 1],
                            padding='VALID',
                            activation_fn=tf.nn.relu,
                            biases_initializer=None
                        )

            # Dueling
            self.streamAC, self.streamVC = tf.split(3, 2, self.conv3)
            self.streamA = slim.flatten(self.streamAC)
            self.streamV = slim.flatten(self.streamVC)

            self.streamA = slim.fully_connected(self.streamA, 256)
            self.streamV = slim.fully_connected(self.streamV, 256)

            self.AW = tf.Variable(tf.random_normal([h_size//2, output_action_n]))
            self.VW = tf.Variable(tf.random_normal([h_size//2, 1]))

            self.Advantage = tf.matmul(self.streamA, self.AW)
            self.Value = tf.matmul(self.streamV, self.VW)

            self.Qout = self.Value + tf.sub(self.Advantage, tf.reduce_mean(self.Advantage, reduction_indices=1, keep_dims=True))
            self.predict = tf.argmax(self.Qout, 1)

            self.targetQ = tf.placeholder(tf.float32, [None])
            self.actions = tf.placeholder(tf.int32, [None])
            self.actions_onehot = tf.one_hot(self.actions, output_action_n, dtype=tf.float32)

            self.Q = tf.reduce_sum(tf.mul(self.Qout, self.actions_onehot), reduction_indices=1)

            self.td_error = tf.square(self.targetQ - self.Q)
            self.loss = tf.reduce_mean(self.td_error)
            self.trainer = tf.train.AdamOptimizer(learning_rate=1e-3)
            self.updateModel = self.trainer.minimize(self.loss)

    class experience_buffer():
        """ s, a, r, sn, d """
        def __init__(self, buffer_size=50000):
            self.buffer = []
            self.buffer_size = buffer_size

        def add(self, experience):
            if len(self.buffer) + len(experience) >= self.buffer_size:
                self.buffer[0:(len(experience) + len(self.buffer))] = []
            self.buffer.extend(experience)

        def sample(self, size):
            return np.reshape(np.array(random.sample(self.buffer, size)), [size, 5])

    def updateTargetGraph(tfVars, tau):
        total_vars = len(tfVars)
        op_holder = []
        for idx, var in enumerate(tfVars[0:total_vars//2]):
            op_holder.append(tfVars[idx+total_vars//2].assign((var.value())*tau) + (1-tau)*tfVars[idx+total_vars//2].value())
        return op_holder

    def updateTarget(op_holder, sess):
        for op in op_holder:
            sess.run(op)

    def processState(states):
        return np.reshape(states, input_state_n)

    tf.reset_default_graph()
    mainqn = qnetwork(h_size)
    targetqn = qnetwork(h_size)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    tvars = tf.trainable_variables()

    targetOps = updateTargetGraph(tvars, tau)
    mybuffer = experience_buffer(replay_buffer_size)

    stepList = []
    rList = []
    total_steps = 0

    if not os.path.exists(model_path):
        os.makedirs(model_path)

    with tf.Session() as sess:
        if load_model == True:
            print("Loading Model..")
            ckpt = tf.train.get_checkpoint_state(model_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        sess.run(init)
        updateTarget(targetOps)

        for episode in range(max_episode_num):
            episode_buffer = experience_buffer(replay_buffer_size)
            s = env.reset()
            s = processState(s)
            d = False
            rAll = 0
            step = 0

            while step < max_step_num:
                #e = 1. / ((episode + 50.) + 10.)
                if np.random.rand(1) < e or total_steps < pre_train_step_num:
                    a = np.random.randint(0, 4)
                else:
                    a = sess.run(mainqn.predict, feed_dict={mainqn.scalarInput[s]})[0]

                sn, r, d, _ = env.step(a)
                sn = processState(sn)

                total_steps += 1
                step+=1
                episode_buffer.add(np.reshape(np.array([s, a, r, sn, d]), [1, 5]))

                if total_steps > pre_train_step_num:
                    if e > ende:
                        e -= ed

                    if total_steps % update_freq == 0:
                        trainBatch = mybuffer.sample(batch_size)
                        # Doubling
                        Q1 = sess.run(mainqn.predict, feed_dict={mainqn.scalarInput: np.vstack(trainBatch[:, 3])})
                        Q2 = sess.run(targetqn.Qout, feed_dict={targetqn.scalarInput: np.vstack(trainBatch[:, 3])})
                        end_multiplier = -(trainBatch[:, 4] - 1) # filp done
                        doubleQ = Q2[range(batch_size), Q1]
                        targetQ = trainBatch[:, 2] + (dis*doubleQ*end_multiplier) # reward + gamma * (collect not done q)
                        _ = sess.run(mainqn.updateModel,
                                feed_dict={
                                    mainqn.scalarInput: np.vstack(trianBatch[:, 0]),
                                    mainqn.targetQ: targetQ,
                                    mainqn.actions: trainBatch[:, 1]
                                })
                        updateTarget(targetOps, sess)
                rAll += r
                s = sn
                if d == True:
                    break
            mybuffer.add(episode_buffer.buffer)
            stepList.append(step)
            rList.append(rAll)
            if i % 1000 == 0:
                svaer.save(sess, model_path + "/model-"+str(episode)+".ckpt")
                print("Saved Model..")
            if len(rList) % 10 == 0:
                print(total_steps, np.mean(rList[-10:]), e)
        saver.save(sess, path+"/model-"+str(episode)+".ckpt")
        bot_play(mainqn)

    print("Percent of succesful episodes : " + str(sum(rList)/num_episodes * 100) + "%")

    return None

<a id='Asynchronous Advantages Actor-Critic Model tensorflow'></a>
## Asynchronous Advantages Actor-Critic Model tensorflow

In [2]:
def A3C():
    import os
    import multiprocessing
    import threading
    import functools
    import scipy.signal
    import scipy.misc

    import numpy as np
    import tensorflow as tf
    import tensorflow.contrib.slim as slim
    from tensorflow.python.ops import rnn_cell, rnn

    import gym

    replay_buffer_size = 30

    # HELPER FUNCTIONS
    def update_target_ops(from_scope, to_scope):
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
        op_holder=[]
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    def discount(x, gamma):
        return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

    def normalized_columns_initializer(stddev=0.1):#TODO:
        def _initializer(shape, dtype=None, partition_info=None):
            """
                >>> a = np.array([[1,2,3],[2,3,4]])
                >>> a.sum(axis=0, keepdims=True)
                array([[3, 5, 7]])
                >>> a
                array([[1, 2, 3],
                       [2, 3, 4]])
                >>> a.sum(axis=0)
                array([3, 5, 7])
            """
            out = np.random.randn(*shape).astype(np.float32)
            out *= stddev / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
            return tf.constant(out)
        return _initializer

    def preprocessState(state):
        """ Flatten """
        state = state.astype(np.float32) / 255.0
        return np.reshape(state, [1,-1])

    def process_State(frame):
        s = frame[10:-10,30:-30]
        s = scipy.misc.imresize(s,[84,84])
        s = np.reshape(s,[np.prod(s.shape)]) / 255.0
        return s


    # NETWORK
    class AC_Network():
        def __init__(self, obs_shape, s_size, a_size, trainer, scope):
            with tf.variable_scope(scope):
                # INPUT LAYERS
                self.inputs = tf.placeholder(tf.float32, [None, s_size])
                self.imageIn = tf.reshape(self.inputs, [-1]+obs_shape)
                self.conv1 = slim.conv2d(inputs=self.imageIn, num_outputs=16,
                                         kernel_size=[8, 8], stride=[4, 4], 
                                         padding='VALID', activation_fn=tf.nn.elu)
                self.conv2 = slim.conv2d(inputs=self.conv1, num_outputs=32, 
                                         kernel_size=[8, 8], stride=[4, 4], 
                                         padding='VALID', activation_fn=tf.nn.elu)

                self.hidden = slim.fully_connected(slim.flatten(self.conv2), 256,
                                                   activation_fn=tf.nn.elu)

                # RNN FOR TEMPORAL DEPENDENCY
                lstm_cell = rnn_cell.BasicLSTMCell(256, state_is_tuple=True)
                c_size, h_size = lstm_cell.state_size.c, lstm_cell.state_size.h

                c_init = np.zeros((1, c_size), np.float32)
                h_init = np.zeros((1, h_size), np.float32)
                self.state_init = [c_init, h_init]# TODO: check state_init 

                c_in = tf.placeholder(tf.float32, [1, c_size])
                h_in = tf.placeholder(tf.float32, [1, h_size])
                self.state_in = (c_in, h_in)

                rnn_in = tf.expand_dims(self.hidden, [0]) # [?, 256]
                step_size = tf.shape(self.imageIn)[:1] # 4
                state_in = rnn_cell.LSTMStateTuple(c_in, h_in) # [1, 256]
                # rnn_in, state_in

                lstm_output, lstm_state = tf.nn.dynamic_rnn(lstm_cell, rnn_in, 
                        initial_state=state_in,
                        sequence_length=[256], time_major=False
                    )
                #TODO:
                # rnn_out -> OUTPUT LAYERS, state_out -> worker.work
                rnn_out = tf.reshape(lstm_output, [-1, 256])
                self.state_out = (lstm_state[0][:1, :], lstm_state[1][:1, :])

                # OUTPUT LAYERS
                self.policy = slim.fully_connected(rnn_out, a_size, activation_fn=tf.nn.softmax,
                                    weights_initializer=normalized_columns_initializer(0.01),
                                    biases_initializer=None)#NOTE:
                self.value = slim.fully_connected(rnn_out, 1, activation_fn=tf.nn.softmax,
                                    weights_initializer=normalized_columns_initializer(1),
                                    biases_initializer=None)#NOTE:

                if scope != 'global':
                    # 3. LOSS FUNCTION
                    self.actions = tf.placeholder(tf.int32, [None])
                    self.actions_onehot = tf.one_hot(self.actions, a_size, dtype=tf.float32)
                    self.target_v = tf.placeholder(tf.float32, [None])
                    self.advantages = tf.placeholder(tf.float32, [None]) 
                    self.responsible_outputs = -tf.reduce_sum(self.policy * self.actions_onehot, [1])

                    self.value_loss = 0.5*tf.reduce_sum(tf.square(self.target_v - self.value))
                    self.entropy = -tf.reduce_sum(tf.log(self.policy)*self.policy)
                    self.policy_loss = tf.log(self.responsible_outputs) * self.advantages
                    self.loss = self.policy_loss + 0.5*self.value_loss - 0.01*self.entropy

                    # 4. TRAIN LOCAL NETWORK
                    local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                    self.gradients = tf.gradients(self.loss, local_vars)
                    self.var_norms = tf.global_norm(local_vars)
                    grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)

                    # 5. APPLY GRADIENT TO GLOBAL NETWORK
                    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                    self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))

    class Worker():
        def __init__(self, env, obs_shape, idx, s_size, a_size, trainer, global_episode):
            self.name = 'worker'+str(idx)
            self.number = idx
            self.trainer = trainer
            self.global_episode = global_episode
            self.increment = self.global_episode.assign_add(1)
            self.episode_rewards = []
            self.episode_lengths = []
            self.episode_mean_values = []
            self.summary_writer = tf.summary.FileWriter('train_'+str(self.number)) 
            self.env = env	

            self.local_AC = AC_Network(obs_shape, s_size, a_size, trainer, self.name)
            self.update_local_ops = update_target_ops('global', self.name)

        def train(self, sess, rollout, bootstrap_value, gamma):
            # 5. APPLY GRADIENT TO GLOBAL NETWORK
            rl = len(rollout)
            rollout = np.array(rollout)
            observations = rollout[:, 0]
            actions = rollout[:, 1]
            rewards = rollout[:, 2]
            next_observation = rollout[:, 3]
            values = rollout[:, 5]

            # Advantage and Discounted returns
            self.reward_plus = np.asarray(rewards.tolist() + [bootstrap_value])
            discounted_rewards = discount(self.reward_plus, gamma)[:-1]
            self.value_plus = np.asarray(values.tolist() + [bootstrap_value]) 
            advantage = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
            advantage = discount(advantage, gamma)[0]
            
            advantage = np.array([advantage])
            print(advantage, advantage.dtype)
            
            rnn_state = self.local_AC.state_init
            feed_dict = {
                self.local_AC.target_v: discounted_rewards,
                self.local_AC.actions: actions,
                self.local_AC.advantages: advantage,
                self.local_AC.state_in[0]: rnn_state[0],
                self.local_AC.state_in[1]: rnn_state[1]
            }

            print("worker train! : ")
            print(discounted_rewards.dtype, advantage.dtype)
            print(advantage)
            
            v_l, p_l, e_l, g_n, v_n = sess.run([
                self.local_AC.value_loss,
                self.local_AC.policy_loss,
                self.local_AC.entropy,
                self.local_AC.grad_norms,
                self.local_AC.var_norms,
                ], feed_dict=feed_dict
            )

            return v_l/rl, p_l/rl, e_l/rl, g_n, v_n


        def work(self, sess, coor, saver, max_episode_length, gamma):#TODO:
            episode_count = sess.run(self.global_episode)
            total_steps = 0
            print("Starting Worker " + str(self.number))
            with sess.as_default(), sess.graph.as_default():
                while not coor.should_stop():
                    # 1. INTIALIZE LOCAL NETWORK FROM GLOBAL NETWORK
                    sess.run(self.update_local_ops) 

                    episode_buffer = []
                    episode_values =  []
                    episode_frames = []
                    episode_reward = 0
                    episode_step_count = 0
                    done = False

                    s = self.env.reset()
                    s = preprocessState(s)
                    #s = process_State(s)
                    print("s in worker.work")
                    print(type(s), s.dtype, s.shape)
                    
                    
                    rnn_state = self.local_AC.state_init
                    
                    print('worker work! : ', rnn_state[0].dtype,  rnn_state[1].dtype)
                    while not done:
                        # 2. INTERACTS WITH ITS OWN ENV ; TAKE ACTION
                        a_dist, v, rnn_state = sess.run([
                            self.local_AC.policy, self.local_AC.value, self.local_AC.state_out
                            ], feed_dict={
                                self.local_AC.inputs: s,
                                self.local_AC.state_in[0]: rnn_state[0],
                                self.local_AC.state_in[1]: rnn_state[1]
                        }) 

                        a = np.random.choice(a_dist[0], p=a_dist[0])
                        a = np.argmax(a_dist == a)

                        sn, r, d, _ = self.env.step(a)

                        if d == False:
                            sn = preprocessState(sn)
                            #sn = process_State(sn)
                            

                        episode_buffer.append([s, a, r, sn, done, v[0, 0]])
                        episode_values.append(v[0, 0])
                        episode_reward += r
                        s = sn
                        total_steps += 1
                        episode_step_count += 1

                        # 2. INTERACTS WITH ITS OWN ENV ; UPDATE ROLLOUT
                        # buffer clean
                        if len(episode_buffer) == replay_buffer_size and done != True and episode_step_count != max_episode_length-1:
                            vn = sess.run(self.local_AC.value,
                                    feed_dict={
                                        self.local_AC.inputs: s,
                                        self.local_AC.state_in[0]: rnn_state[0],
                                        self.local_AC.state_in[1]: rnn_state[1],
                                })[0, 0]
                            v_l, p_l, e_l, g_n, v_n = self.train(sess, episode_buffer, vn, gamma)
                            episode_buffer = []
                            sess.run(self.update_local_ops)

                        if d == True:
                            break
                        self.episode_rewards.append(episode_reward)
                        self.episode_lengths.append(episode_step_count)
                        self.episode_mean_values.append(np.mean(episode_values))
                        # buffer update
                        if len(episode_buffer) != 0:
                            v_l, p_l, e_l, g_n, v_n = self.train(sess, episode_buffer, 0.0, gamma)

                    episode_count += 1
            return 

    env = gym.make('Breakout-v0')
    input_obs_n = list(env.observation_space.shape)# [210,160,3]
    output_action_n = env.action_space.n # 6 : 

    #s_size = functools.reduce(lambda x,y : x*y, input_obs_n)
    s_size = np.prod(input_obs_n)
    print(s_size)
    a_size = output_action_n

    max_episode_nums = 400
    dis = .99

    load_model = False
    model_path = SAVER_DIR+'/a3c_model'

    if not os.path.exists(model_path):
        os.makedirs(model_path)
    # GENERATE NETWORK
    with tf.device("/cpu:0"):
        global_episode = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
        global_optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)

        master_network = AC_Network([210, 160, 3], s_size, a_size, None, 'global')
        worker_num = 1 #multiprocessing.cpu_count()
        workers = []
        for i in range(worker_num):
            workers.append(Worker(gym.make('Breakout-v0'), [210, 160, 3], i, s_size, a_size, global_optimizer, global_episode))
            saver = tf.train.Saver(max_to_keep=5)

    # THREADING
    with tf.Session() as sess:
        coor = tf.train.Coordinator()
        if load_model :
            print("Loading Model..")
            ckpt = tf.train.get_checkpoint_state(model_path)# NOTE:
            saver.restore(sess, ckpt.model_checkpoint_path)# NOTE:
        else:
            sess.run(tf.global_variables_initializer())
        worker_threads = []
        for worker in workers:
            worker_work = lambda: worker.work(sess, coor, saver, max_episode_nums, dis)# NOTE:
            t = threading.Thread(target=worker_work)
            t.start()
            worker_threads.append(t)
        coor.join(worker_threads)

    return None

A3C()

[2017-03-21 02:21:27,930] Making new env: Breakout-v0
[2017-03-21 02:21:28,101] Making new env: Breakout-v0


100800
Starting Worker 0
s in worker.work
<class 'numpy.ndarray'> float32 (1, 100800)
worker work! :  float32 float32
[-1.] float64
worker train! : 
float64 float64
[-1.]


Exception in thread Thread-16:
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1021, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1003, in _run_fn
    status, run_metadata)
  File "/usr/lib/python3.5/contextlib.py", line 66, in __exit__
    next(self.gen)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/errors_impl.py", line 469, in raise_exception_on_not_ok_status
    pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: You must feed a value for placeholder tensor 'worker0/Placeholder' with dtype float
	 [[Node: worker0/Placeholder = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.5/threadin

In [1]:
DATASET_DIR = './dataset/'
PROJECT_DIR = './projects/RL_collections/'
SUMMARY_DIR = PROJECT_DIR+'summaries/'
SAVER_DIR = PROJECT_DIR+'models/'
CHECKPOINT_DIR = PROJECT_DIR+'checkpoints/'
RESULT_DIR = PROJECT_DIR+'results/'

<a id='Meta_RL'></a>
## Meta_RL

#### from https://github.com/awjuliani/Meta-RL, https://hackernoon.com/learning-policies-for-learning-policies-meta-reinforcement-learning-rl%C2%B2-in-tensorflow-b15b592a2ddf#.2ck6nb2jp

x(t)
r(t-1)
a(t-1)

[IDEA] how could one cluster the tasks with similarities... for meta rl

In [1]:
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
import csv
import itertools
import tensorflow.contrib.slim as slim
from PIL import Image
from PIL import ImageDraw 
from PIL import ImageFont


# Copies one set of variables to another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope,to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

    op_holder = []
    for from_var,to_var in zip(from_vars,to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

# Discounting function used to calculate discounted returns.
def discount(x, gamma):
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

#Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer


#This code allows gifs to be saved of the training episode for use in the Control Center.
def make_gif(images, fname, duration=2, true_image=False):
    import moviepy.editor as mpy

    def make_frame(t):
        try:
            x = images[int(len(images)/duration*t)]
        except:
            x = images[-1]

    if true_image:
        return x.astype(np.uint8)
    else:
        return ((x+1)/2*255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.write_gif(fname, fps = len(images) / duration,verbose=False)

def set_image_bandit(values,probs,selection,trial):
    bandit_image = Image.open('./resources/bandit.png')
    draw = ImageDraw.Draw(bandit_image)
    font = ImageFont.truetype("./resources/FreeSans.ttf", 24)
    draw.text((40, 10),str(float("{0:.2f}".format(probs[0]))),(0,0,0),font=font)
    draw.text((130, 10),str(float("{0:.2f}".format(probs[1]))),(0,0,0),font=font)
    draw.text((60, 370),'Trial: ' + str(trial),(0,0,0),font=font)
    bandit_image = np.array(bandit_image)
    bandit_image[115:115+floor(values[0]*2.5),20:75,:] = [0,255.0,0] 
    bandit_image[115:115+floor(values[1]*2.5),120:175,:] = [0,255.0,0]    
    bandit_image[101:107,10+(selection*95):10+(selection*95)+80,:] = [80.0,80.0,225.0]
    return bandit_image
    
    
def set_image_context(correct, observation,values,selection,trial):
    obs = observation * 225.0
    obs_a = obs[:,0:1,:]
    obs_b = obs[:,1:2,:]
    cor = correct * 225.0
    obs_a = scipy.misc.imresize(obs_a,[100,100],interp='nearest')
    obs_b = scipy.misc.imresize(obs_b,[100,100],interp='nearest')
    cor = scipy.misc.imresize(cor,[100,100],interp='nearest')
    bandit_image = Image.open('./resources/c_bandit.png')
    draw = ImageDraw.Draw(bandit_image)
    font = ImageFont.truetype("./resources/FreeSans.ttf", 24)
    draw.text((50, 360),'Trial: ' + str(trial),(0,0,0),font=font)
    draw.text((50, 330),'Reward: ' + str(values),(0,0,0),font=font)
    bandit_image = np.array(bandit_image)
    bandit_image[120:220,0:100,:] = obs_a
    bandit_image[120:220,100:200,:] = obs_b
    bandit_image[0:100,50:150,:] = cor
    bandit_image[291:297,10+(selection*95):10+(selection*95)+80,:] = [80.0,80.0,225.0]
    return bandit_image


def set_image_gridworld(frame,color,reward,step):
    a = scipy.misc.imresize(frame,[200,200],interp='nearest')
    b = np.ones([400,200,3]) * 255.0
    b[0:200,0:200,:] = a 
    b[200:210,0:200,:] = np.array(color) * 255.0
    b = Image.fromarray(b.astype('uint8'))
    draw = ImageDraw.Draw(b)
    font = ImageFont.truetype("./resources/FreeSans.ttf", 24)
    draw.text((40, 280),'Step: ' + str(step),(0,0,0),font=font)
    draw.text((40, 330),'Reward: ' + str(reward),(0,0,0),font=font)
    c = np.array(b)
    return c


In [2]:
import threading
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.slim as slim
import scipy.signal
from PIL import Image
from PIL import ImageDraw 
from PIL import ImageFont
%matplotlib inline

from random import choice
from time import sleep
from time import time

class dependent_bandit():
    def __init__(self,difficulty):
        self.num_actions = 2
        self.difficulty = difficulty
        self.reset()
        
    def set_restless_prob(self):
        self.bandit = np.array([self.restless_list[self.timestep],1 - self.restless_list[self.timestep]])
        
    def reset(self):
        self.timestep = 0
        if self.difficulty == 'restless': 
            variance = np.random.uniform(0,.5)
            self.restless_list = np.cumsum(np.random.uniform(-variance,variance,(150,1)))
            self.restless_list = (self.restless_list - np.min(self.restless_list)) / (np.max(self.restless_list - np.min(self.restless_list))) 
            self.set_restless_prob()
        if self.difficulty == 'easy': bandit_prob = np.random.choice([0.9,0.1])
        if self.difficulty == 'medium': bandit_prob = np.random.choice([0.75,0.25])
        if self.difficulty == 'hard': bandit_prob = np.random.choice([0.6,0.4])
        if self.difficulty == 'uniform': bandit_prob = np.random.uniform()
        if self.difficulty != 'independent' and self.difficulty != 'restless':
            self.bandit = np.array([bandit_prob,1 - bandit_prob])
        else:
            self.bandit = np.random.uniform(size=2)
        
    def pullArm(self,action):
        #Get a random number.
        if self.difficulty == 'restless': self.set_restless_prob()
        self.timestep += 1
        bandit = self.bandit[action]
        result = np.random.uniform()
        if result < bandit:
            #return a positive reward.
            reward = 1
        else:
            #return a negative reward.
            reward = 0
        if self.timestep > 99: 
            done = True
        else: done = False
        return reward,done,self.timestep
    
class AC_Network():
    def __init__(self,a_size,scope,trainer):
        with tf.variable_scope(scope):
            #Input and visual encoding layers
            self.prev_rewards = tf.placeholder(shape=[None,1],dtype=tf.float32)
            self.prev_actions = tf.placeholder(shape=[None],dtype=tf.int32)
            self.timestep = tf.placeholder(shape=[None,1],dtype=tf.float32)
            self.prev_actions_onehot = tf.one_hot(self.prev_actions,a_size,dtype=tf.float32)

            hidden = tf.concat([self.prev_rewards,self.prev_actions_onehot,self.timestep],1)
            
            #Recurrent network for temporal dependencies
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(48,state_is_tuple=True)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            rnn_in = tf.expand_dims(hidden, [0])
            step_size = tf.shape(self.prev_rewards)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 48])
            
            self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
            self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32)
                        
            #Output layers for policy and value estimations
            self.policy = slim.fully_connected(rnn_out,a_size,
                activation_fn=tf.nn.softmax,
                weights_initializer=normalized_columns_initializer(0.01),
                biases_initializer=None)
            self.value = slim.fully_connected(rnn_out,1,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(1.0),
                biases_initializer=None)
            
            #Only the worker network need ops for loss functions and gradient updating.
            if scope != 'global':
                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)
                
                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

                #Loss functions
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy + 1e-7))
                self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs + 1e-7)*self.advantages)
                self.loss = 0.5 *self.value_loss + self.policy_loss - self.entropy * 0.05

                #Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,50.0)
                
                #Apply local gradients to global network
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))
                
class Worker():
    def __init__(self,game,name,a_size,trainer,model_path,global_episodes):
        self.name = "worker_" + str(name)
        self.number = name        
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))

        #Create the local copy of the network and the tensorflow op to copy global paramters to local network
        self.local_AC = AC_Network(a_size,self.name,trainer)
        self.update_local_ops = update_target_graph('global',self.name)        
        self.env = game
        
    def train(self,rollout,sess,gamma,bootstrap_value):
        rollout = np.array(rollout)
        actions = rollout[:,0]
        rewards = rollout[:,1]
        timesteps = rollout[:,2]
        prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()
        values = rollout[:,4]
        
        self.pr = prev_rewards
        self.pa = prev_actions
        # Here we take the rewards and values from the rollout, and use them to 
        # generate the advantage and discounted returns. 
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v:discounted_rewards,
            self.local_AC.prev_rewards:np.vstack(prev_rewards),
            self.local_AC.prev_actions:prev_actions,
            self.local_AC.actions:actions,
            self.local_AC.timestep:np.vstack(timesteps),
            self.local_AC.advantages:advantages,
            self.local_AC.state_in[0]:rnn_state[0],
            self.local_AC.state_in[1]:rnn_state[1]}
        v_l,p_l,e_l,g_n,v_n,_ = sess.run([self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n
        
    def work(self,gamma,sess,coord,saver,train):
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():                 
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = [0,0]
                episode_step_count = 0
                d = False
                r = 0
                a = 0
                t = 0
                self.env.reset()
                rnn_state = self.local_AC.state_init
                
                while d == False:
                    #Take an action using probabilities from policy network output.
                    a_dist,v,rnn_state_new = sess.run([self.local_AC.policy,self.local_AC.value,self.local_AC.state_out], 
                        feed_dict={
                        self.local_AC.prev_rewards:[[r]],
                        self.local_AC.timestep:[[t]],
                        self.local_AC.prev_actions:[a],
                        self.local_AC.state_in[0]:rnn_state[0],
                        self.local_AC.state_in[1]:rnn_state[1]})
                    a = np.random.choice(a_dist[0],p=a_dist[0])
                    a = np.argmax(a_dist == a)
                    
                    rnn_state = rnn_state_new
                    r,d,t = self.env.pullArm(a)                        
                    episode_buffer.append([a,r,t,d,v[0,0]])
                    episode_values.append(v[0,0])
                    episode_frames.append(set_image_bandit(episode_reward,self.env.bandit,a,t))
                    episode_reward[a] += r
                    total_steps += 1
                    episode_step_count += 1
                                            
                self.episode_rewards.append(np.sum(episode_reward))
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))
                
                # Update the network using the experience buffer at the end of the episode.
                if len(episode_buffer) != 0 and train == True:
                    v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)
            
                    
                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 50 == 0 and episode_count != 0:
                    if episode_count % 500 == 0 and self.name == 'worker_0' and train == True:
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        print("Saved Model")

                    if episode_count % 100 == 0 and self.name == 'worker_0':
                        self.images = np.array(episode_frames)
                        make_gif(self.images,'./frames/image'+str(episode_count)+'.gif',
                            duration=len(self.images)*0.1,true_image=True,salience=False)

                    mean_reward = np.mean(self.episode_rewards[-50:])
                    mean_length = np.mean(self.episode_lengths[-50:])
                    mean_value = np.mean(self.episode_mean_values[-50:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    if train == True:
                        summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                        summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                        summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                        summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                        summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
                
                
gamma = .8 # discount rate for advantage estimation and reward discounting
a_size = 2 # Agent can move Left, Right, or Fire
load_model = True
train = False
model_path = './model_meta'


tf.reset_default_graph()

if not os.path.exists(model_path):
    os.makedirs(model_path)
    
if not os.path.exists('./frames'):
    os.makedirs('./frames')

    
with tf.device("/cpu:0"): 
    global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
    trainer = tf.train.AdamOptimizer(learning_rate=1e-3)
    master_network = AC_Network(a_size,'global',None) # Generate global network
    #num_workers = multiprocessing.cpu_count() # Set workers ot number of available CPU threads
    num_workers = 1
    workers = []
    # Create worker classes
    for i in range(num_workers):
        workers.append(Worker(dependent_bandit('uniform'),i,a_size,trainer,model_path,global_episodes))
    saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    if load_model == True:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
        
    worker_threads = []
    for worker in workers:
        worker_work = lambda: worker.work(gamma,sess,coord,saver,train)
        thread = threading.Thread(target=(worker_work))
        thread.start()
        worker_threads.append(thread)
    coord.join(worker_threads)


TypeError: Expected int32, got list containing Tensors of type '_Message' instead.