In [1]:
import tensorflow as tf
import random
import gym
import numpy as np
import skimage
from skimage import color, exposure, transform
import threading

  from ._conv import register_converters as _register_converters


In [2]:
IMG_WIDTH = 84
IMG_HEIGHT = 84
CNT_FRAMES = 4
GLOBAL_SCOPE = 'global'
VALUE_MODIFIER = 0.5*1e0
POLICY_MODIFIER = 1*1e0
ENTROPY_MODIFIER = 1e-1#2.5e-5#0.0005
MAX_STEPS = 50
DISCOUNT = 0.99
ENV_NAME = 'BreakoutDeterministic-v4'
#ENV_NAME = 'PongDeterministic-v4'
MAX_EP_LENGTH = 700
LEARNING_RATE = 1e-4
CLIP_VALUE = 10.0
DECAY = 0.99

In [3]:
def process_frame(x_t, img_rows, img_cols):
    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t,(img_rows, img_cols), mode='constant')
    x_t = skimage.exposure.rescale_intensity(x_t,out_range=(0,255))
    x_t = x_t.reshape((1, img_rows, img_cols, 1))
    x_t /= 255.0
    return x_t

def update_target_graph(from_scope,to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

    op_holder = []
    for from_var,to_var in zip(from_vars,to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

In [4]:
class EnvWrapper:
    def __init__(self, env_name):
        self.env = gym.make(env_name)
        self.action_space = self.env.action_space
    def reset(self):
        s = self.env.reset()
        s = process_frame(s, IMG_WIDTH, IMG_HEIGHT)
        s = np.stack([s for i in range(CNT_FRAMES)], axis=3)
        s = s.reshape(1, s.shape[1], s.shape[2], s.shape[3])
        self.s = np.copy(s)
        return s
    def step(self, a):
        s1, r, d, _ = self.env.step(a)
        s1 = process_frame(s1, IMG_WIDTH, IMG_HEIGHT)
        s = np.append(s1, self.s[:, :, :, :CNT_FRAMES-1], axis=3)
        self.s = np.copy(s)
        return s, r, d, _

In [5]:
last_iter = 0

class Agent:
    def __init__(self, env, scope_name, optimizer):
        self.env = env
        self.scope_name = scope_name
        self.action_size = self.env.action_space.n
        self.optimizer = optimizer
        
        self.__build_model()
    def __build_model(self):
        print('building model')
        with tf.variable_scope(self.scope_name):
            #weights_initializer = tf.truncated_normal_initializer(stddev=0.02)
            weights_initializer = tf.contrib.layers.xavier_initializer_conv2d()
            bias_initializer = tf.zeros_initializer()
            self.X = tf.placeholder(shape=[None, IMG_WIDTH, IMG_HEIGHT, CNT_FRAMES], dtype=tf.float32, name='input')
            conv1 = tf.contrib.layers.conv2d(self.X, 32, 3, stride=2, activation_fn=tf.nn.relu, padding='SAME', \
                                            weights_initializer=weights_initializer, biases_initializer = bias_initializer,\
                                            scope='first_conv')
            mp1 = tf.contrib.layers.max_pool2d(conv1, 2, scope='first_mp')
            conv2 = tf.contrib.layers.conv2d(mp1, 32, 3, stride=2, activation_fn=tf.nn.relu, padding='SAME', \
                                            weights_initializer=weights_initializer, biases_initializer = bias_initializer,\
                                            scope='second_conv')
            mp2 = tf.contrib.layers.max_pool2d(conv2, 2, scope='second_mp')
            conv3 = tf.contrib.layers.conv2d(mp2, 64, 3, stride=2, activation_fn=tf.nn.relu, padding='SAME', \
                                             weights_initializer=weights_initializer, biases_initializer = bias_initializer,\
                                            scope='third_conv')
            flattened = tf.contrib.layers.flatten(conv3, scope='flatten')
            embedding = tf.contrib.layers.fully_connected(flattened, 512, activation_fn=tf.nn.relu, weights_initializer=tf.random_normal_initializer(stddev=0.02), biases_initializer=bias_initializer,\
                                                         scope='fc_embed')
            
            #normalization = tf.layers.batch_normalization(embedding)
                        
            self.policy = tf.contrib.layers.fully_connected(embedding, self.action_size, activation_fn=tf.nn.softmax, weights_initializer=tf.random_normal_initializer(stddev=0.5), biases_initializer=None,\
                                                           scope='fc_policy')
            self.value = tf.contrib.layers.fully_connected(\
                                                           embedding, \
                                                           1, \
                                                           activation_fn=None, \
                                                           weights_initializer=tf.random_normal_initializer(stddev=.25), \
                                                           biases_initializer=None,\
                                                          scope='fc_value')
            
            if self.scope_name != GLOBAL_SCOPE:
                print('building agent:', self.scope_name)
                self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name='actions')
                self.actions_oh = tf.one_hot(self.actions, depth=self.action_size, dtype=tf.float32, name='actions_oh')
                self.target_values = tf.placeholder(shape=[None], dtype=tf.float32, name='target_vals')
                self.advantages = tf.placeholder(shape=[None], dtype=tf.float32, name='advantages')
                #print('adv shape', self.advantages.shape)
                #self.advantages = tf.subtract(tf.stop_gradient(self.value), self.target_values, name='advantage')

                MIN_POLICY = 1e-8
                MAX_POLICY = 1.0 - MIN_POLICY
                
                self.log_policy = tf.log(tf.clip_by_value(self.policy, MIN_POLICY, MAX_POLICY), name='log_policy')

                self.log_policy_for_action = tf.reduce_sum(tf.multiply(self.log_policy, self.actions_oh), axis=1, name='log_policy_for_action')
                self.value_loss = tf.reduce_mean(tf.square(self.value - self.target_values), name='value_loss')
                self.value_loss = self.value_loss * VALUE_MODIFIER
                #self.value_loss = self.value_loss - self.value_loss
                self.policy_loss = -tf.reduce_mean(tf.multiply(self.log_policy_for_action, self.advantages), name='policy_loss')
                self.policy_loss = self.policy_loss * POLICY_MODIFIER
                #entropija je E[-log(X)] = sum(p(x) * log(x))
                self.entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                       initializer=tf.constant_initializer(ENTROPY_MODIFIER), trainable=False)
                self.entropy_loss = -tf.reduce_mean(self.policy * -self.log_policy, name='entropy_loss')
                self.entropy_loss = self.entropy_loss * self.entropy_beta
                #self.entropy_loss = self.entropy_loss - self.entropy_loss
                self.loss = self.value_loss + \
                            self.policy_loss + \
                            self.entropy_loss
                #get locals
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope_name)
                #update locals
                grads = tf.gradients(self.loss, local_vars)
                grads = [tf.clip_by_average_norm(grad, CLIP_VALUE) for grad in grads]
                #grads, grad_norms = tf.clip_by_global_norm(grads, CLIP_VALUE)
                self.update_ops = update_target_graph(GLOBAL_SCOPE, self.scope_name)
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, GLOBAL_SCOPE)
                capped_gvs = [(grad, var) for grad, var in zip(grads, global_vars)]
                self.global_update = self.optimizer.apply_gradients(capped_gvs)
    
    def predict(self, sess, state):
        policy = sess.run((self.policy), \
                                            feed_dict={\
                                                       self.X:state\
                                                      }\
                                           )
        policy = policy.flatten()
        #print('cur policy', policy)
        
        global last_iter
        
        if global_counter - 10000 > last_iter and self.scope_name == 'local0':
            if hasattr(self, 'k5cnt') == False:
                self.k5cnt = 0
            self.k5cnt += 1
            last_iter = global_counter
            FINAL_ENTROPY_VALUE = 1*1e-3
            scale = np.clip(self.k5cnt / 400, 0.0, 1.0)
            entropy_beta = ENTROPY_MODIFIER - scale * (ENTROPY_MODIFIER - FINAL_ENTROPY_VALUE)
            sess.run(self.entropy_beta.assign(entropy_beta))
            print('policy', policy, 'at iter', global_counter)
        
        prediction = np.random.choice(self.action_size, p=policy)
        #prediction = np.argmax(policy)
        #print('prediction', prediction)
        return prediction
            
    def act(self, sess, state):
        prediction = self.predict(sess, state)
        a = prediction
        next_state,r,d,_ = self.env.step(a)
        return state, a, r, d, next_state
    
    def get_value(self, sess, state):
        return sess.run(\
                        self.value, \
                        feed_dict={ \
                                   self.X: state\
                                  } \
                       )
    
    def train(self, sess, states, actions, target_values, advantages):
        gu, value_loss, policy_loss, entropy_loss = \
            sess.run((self.global_update, self.value_loss, self.policy_loss, self.entropy_loss), \
                     feed_dict={
                         self.X: states,
                         self.actions: actions,
                         self.target_values: target_values,
                         self.advantages: advantages
                     })
        return value_loss / len(states), policy_loss / len(states), entropy_loss / len(states)
    
    def update_to_global(self, sess):
        if self.scope_name != GLOBAL_SCOPE:
            sess.run(self.update_ops)

In [6]:
import time

global_counter = 0

start_time = time.time()

class Worker:
    def __init__(self, agent):
        self.agent = agent
        self.summary_writer = tf.summary.FileWriter(self.agent.scope_name)
    def work(self, sess, optimizer, thread_lock):
        
        global global_counter
        global start_time
        
        print('worker starting agent:', self.agent.scope_name)
        done = True
        s = None
        episode_reward = 0
        timestep = 0
        episode_counter = 0
        value_losses = []
        policy_losses = []
        entropy_losses = []
        last_rewards = []
        last_frames = []
        last_values = []
        last_advantages = []
        
        elapsed_time = time.time() - start_time
        
        with sess.as_default(), sess.graph.as_default():
            while True:#global_counter <= MAX_ITERATIONS and elapsed_time <= MAX_LEARNING_TIME:
                if done or timestep > MAX_EP_LENGTH:
                    self.agent.update_to_global(sess)
                    last_rewards.append(episode_reward)
                    last_frames.append(timestep)
                    if episode_counter > 0 and episode_counter % 5 == 0:
                        #print('for agent:', self.agent.scope_name)
                        #print('at episode', episode_counter, 'episode reward is', episode_reward)
                        if len(value_losses) > 0:
                            summary = tf.Summary()
                            
                            summary.value.add(tag='Performance/Reward', simple_value=float(sum(last_rewards) / len(last_rewards)))
                            summary.value.add(tag='Performance/Length', simple_value=float(sum(last_frames) / len(last_frames)))
                            summary.value.add(tag='Performance/Values mean', simple_value=float(sum(last_values) / len(last_values)))
                            summary.value.add(tag='Performance/Advantage mean', simple_value=float(sum(last_advantages) / len(last_advantages)))
                            summary.value.add(tag='Losses/Value Loss', simple_value=float(sum(value_losses) / len(value_losses)))
                            summary.value.add(tag='Losses/Policy Loss', simple_value=float(sum(policy_losses) / len(policy_losses)))
                            summary.value.add(tag='Losses/Entropy', simple_value=float(sum(entropy_losses) / len(entropy_losses)))
                            
                            self.summary_writer.add_summary(summary, episode_counter)

                            self.summary_writer.flush()
                            
                            last_rewards = []
                            last_frames = []
                            value_losses = []
                            policy_losses = []
                            entropy_losses = []
                            last_values = []
                            last_advantages = []
                    s = self.agent.env.reset()
                    done = False
                    episode_reward = 0
                    timestep = 0
                    episode_counter += 1
                    
                states = []
                actions = []
                rewards = []
                values = []
                target_values = []
                
                has_rewards = False
                
                while len(states) < MAX_STEPS and not done:
                    s, a, r, d, ns = self.agent.act(sess, s)
                    with thread_lock:
                        global_counter += 1
                    episode_reward += r
                    timestep += 1
                    r = np.clip(r, -1.0, 1.0)
                    states.append(s)
                    actions.append(a)
                    rewards.append(r)
                    done = d
                    val = np.copy(self.agent.get_value(sess, s)[0])
                    #print('val', val)
                    last_values.append(val)
                    values.append(val)
                    
                    s = ns
                    
                    #has_rewards = has_rewards or r != 0.0
                
                target_value = 0
                
                if not done:
                    target_value = values[-1]
                
                for reward in reversed(rewards):
                    target_value = reward + DISCOUNT * target_value
                    target_values.append(target_value)
                target_values.reverse()
                
                #advantages = np.array(target_values) - np.array(values)
                
                #print('values shape', np.array(values).shape)
                #print('target values shape', np.array(target_values).shape)
                advantages = np.array(target_values).flatten() - np.array(values).flatten()
                advantages = advantages.flatten()
                
                #print('adv before app', advantages.shape)
                
                states = np.vstack(states)
                #print('states shape', states.shape)
                actions = np.array(actions).flatten()
                #print('actions shape', actions.shape)
                target_values = np.array(target_values).flatten()
                
                 #np.vstack(advantages).ravel()
                
                value_loss, policy_loss, entropy_loss = \
                    self.agent.train(sess, states, actions, target_values, advantages)
                
                last_advantages += advantages.tolist()
                
                #if has_rewards:
                    #print('rewarded round')
                    #print('target values', target_values)
                    #print('values', np.array(values).flatten())
                    #print('advantages', advantages)
                    #print('value loss', value_loss)
                    #print('policy loss', policy_loss)
                    #print('entropy loss', entropy_loss)
                
                #if self.agent.scope_name == 'local0':
                #    print('values', values)
                #    print('target values', target_values)
                #for i in range(len(rewards)-1):
                #    idx = len(rewards) - i - 1
                #    target_values[idx-1] = rewards[idx-1] + DISCOUNT * target_values[idx]
                               
                #sleep_time = random.uniform(1e-3, 1e-2)
                #time.sleep(sleep_time)
                
                value_losses.append(value_loss)
                policy_losses.append(policy_loss)
                entropy_losses.append(entropy_loss)
                
                elapsed_time = time.time() - start_time

In [7]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists


In [8]:
import time

worker_threads = []

env_global = EnvWrapper(ENV_NAME)
#global_agent = Agent(env_global, GLOBAL_SCOPE, tf.train.AdamOptimizer())
global_agent = Agent(env_global, GLOBAL_SCOPE, tf.train.GradientDescentOptimizer(LEARNING_RATE))

#config = tf.ConfigProto(device_count = {'GPU': 0})
config = tf.ConfigProto()
config.gpu_options.allow_growth=True

sess = tf.Session(config=config)

def global_saving_thread(agent, sess):
    
    global global_counter
    
    MAX_MODELS = 3
    cnt_model = 0
    
    with sess.as_default(), sess.graph.as_default():
    
        saver = tf.train.Saver()

        elapsed_time = time.time() - start_time
        
        #save model every 15 minutes
        while True:#global_counter <= MAX_ITERATIONS and elapsed_time <= MAX_LEARNING_TIME:
            print("Current model save name:", 'model_' + str(cnt_model % MAX_MODELS))
            save_path = saver.save(sess, "models/model_" + str(cnt_model % MAX_MODELS) + ".ckpt")
            print("Current global iteration", global_counter)
            cnt_model += 1
            time.sleep(30 * 60)
        print("Learning time was", int(elapsed_time/60/60), "hours", int((elapsed_time - int(elapsed_time/60/60)*60*60)/60), "minutes")

building model


In [None]:
cnt_threads = 16
thread_lock = threading.Lock()

def worker_fun(worker, sess, optimizer, thread_lock):
    worker.work(sess, optimizer, thread_lock)

for i in range(cnt_threads):
    env = EnvWrapper(ENV_NAME)
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    #optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE, decay=DECAY)
    worker = Worker(Agent(env, 'local' + str(i), optimizer))
    t = threading.Thread(target=worker_fun, args=(worker, sess, optimizer, thread_lock))
    worker_threads.append(t)
    time.sleep(0.05)

sess.run(tf.global_variables_initializer())
for t in worker_threads:
    t.start()
    time.sleep(0.05)
    
global_t = threading.Thread(target=global_saving_thread, args=(global_agent, sess))

worker_threads.append(global_t)
global_t.start()

for t in worker_threads:
    t.join()

building model
building agent: local0
building model
building agent: local1
building model
building agent: local2
building model
building agent: local3
building model
building agent: local4
building model
building agent: local5
building model
building agent: local6
building model
building agent: local7
building model
building agent: local8
building model
building agent: local9
building model
building agent: local10
building model
building agent: local11
building model
building agent: local12
building model
building agent: local13
building model
building agent: local14
building model
building agent: local15
worker starting agent: local0
worker starting agent: local1
worker starting agent: local2
worker starting agent: local3
worker starting agent: local4
worker starting agent: local5
worker starting agent: local6
worker starting agent: local7
worker starting agent: local8
worker starting agent: local9
worker starting agent: local10
worker starting agent: local11
worker starting agent: l

policy [0.2643048  0.23238775 0.22081283 0.28249463] at iter 1041095
policy [0.3016033  0.26528054 0.20782523 0.22529095] at iter 1051103
policy [0.13130863 0.1360975  0.04154382 0.6910501 ] at iter 1061101
policy [0.20751882 0.27143618 0.2129609  0.3080841 ] at iter 1071122
policy [0.17131491 0.2129034  0.06309167 0.55268997] at iter 1081133
policy [0.24114366 0.23493108 0.22481431 0.29911098] at iter 1091137
Current model save name: model_2
Current global iteration 1092886
policy [0.32399756 0.17190434 0.17202617 0.33207193] at iter 1101156
policy [0.3146037  0.16559021 0.14911169 0.37069446] at iter 1111158
policy [0.29630408 0.18222395 0.30816117 0.21331085] at iter 1121180
policy [0.22285864 0.2122029  0.3100904  0.25484806] at iter 1131191
policy [0.29418933 0.23991694 0.18826623 0.27762744] at iter 1141201
policy [0.26689228 0.19389355 0.16465679 0.37455738] at iter 1151213
policy [0.13435881 0.20483594 0.07629653 0.5845087 ] at iter 1161219
policy [0.27796003 0.2775821  0.22230

policy [0.21963833 0.32074258 0.28320375 0.17641532] at iter 2202334
policy [0.3223465  0.19176885 0.14572945 0.34015524] at iter 2212343
policy [0.26172763 0.16162075 0.28135332 0.29529825] at iter 2222352
policy [0.31448725 0.24911888 0.15308261 0.28331125] at iter 2232355
policy [0.2817858  0.2253271  0.25067    0.24221712] at iter 2242372
policy [0.2776546  0.2388627  0.24971803 0.23376474] at iter 2252385
policy [0.3356199  0.32230353 0.15181448 0.19026206] at iter 2262387
policy [0.20300893 0.19226499 0.543892   0.06083404] at iter 2272397
policy [0.15176904 0.20853233 0.54580605 0.09389262] at iter 2282403
policy [0.25232565 0.19082406 0.09614018 0.4607101 ] at iter 2292414
policy [0.2240662  0.29851297 0.19211964 0.28530118] at iter 2302415
policy [0.14808565 0.3675535  0.1718062  0.3125547 ] at iter 2312424
policy [0.18612602 0.33779338 0.18947804 0.2866025 ] at iter 2322446
policy [0.26671365 0.26771483 0.2701639  0.19540763] at iter 2332456
policy [0.259924   0.27661556 0.28

policy [0.235518   0.33915177 0.21601303 0.20931725] at iter 3373551
policy [0.20122048 0.33656064 0.20883968 0.25337914] at iter 3383580
policy [0.22153498 0.26122382 0.26129913 0.25594208] at iter 3393578
policy [0.2073384  0.20263559 0.2898577  0.3001683 ] at iter 3403586
policy [0.24223559 0.21008947 0.31600234 0.23167257] at iter 3413587
policy [0.25798503 0.3118573  0.11389364 0.316264  ] at iter 3423593
policy [4.4727451e-03 3.6396538e-03 1.2476406e-04 9.9176288e-01] at iter 3433604
policy [0.2882089  0.19159044 0.42900023 0.09120043] at iter 3443610
policy [0.4009804  0.27760452 0.14738424 0.17403081] at iter 3453634
policy [0.19637387 0.1500008  0.60532194 0.04830341] at iter 3463641
policy [0.27702817 0.31781814 0.18432896 0.22082473] at iter 3473658
policy [0.3301209  0.3171778  0.18375653 0.16894482] at iter 3483684
policy [0.29652843 0.2836874  0.20944838 0.21033578] at iter 3493688
policy [0.28906256 0.311619   0.16874962 0.2305688 ] at iter 3503698
policy [0.3188414  0.1

policy [0.25228584 0.27075416 0.24957441 0.22738563] at iter 4544962
policy [0.14163841 0.18083365 0.66110474 0.01642318] at iter 4554975
policy [0.30952966 0.24824366 0.19309415 0.24913256] at iter 4564976
policy [0.2579615  0.25673118 0.27241755 0.21288976] at iter 4575000
policy [0.28533217 0.25666115 0.20139042 0.25661626] at iter 4585017
policy [0.276887   0.18012728 0.08293599 0.46004972] at iter 4595030
policy [0.18270598 0.24602115 0.00520671 0.56606615] at iter 4605039
policy [0.2935226  0.2953147  0.19901566 0.21214706] at iter 4615047
policy [0.2621297  0.30497155 0.27975    0.15314879] at iter 4625060
policy [0.20395795 0.15001066 0.5673463  0.07868515] at iter 4635070
policy [0.26614568 0.24548678 0.22450414 0.26386335] at iter 4645070
policy [0.2742106  0.20702483 0.3417167  0.17704779] at iter 4655074
policy [0.3526274  0.24033451 0.08665182 0.32038623] at iter 4665076
policy [0.14032124 0.13841665 0.6987846  0.0224775 ] at iter 4675086
policy [0.04694423 0.06109127 0.88

policy [0.0870929  0.10189239 0.799566   0.01144878] at iter 5716136
policy [0.2643518  0.26321208 0.20805007 0.26438612] at iter 5726153
policy [0.24069536 0.310605   0.29895926 0.14974043] at iter 5736163
policy [0.27144334 0.2497933  0.22599873 0.25276464] at iter 5746174
policy [0.2597582  0.26683268 0.18828711 0.28512198] at iter 5756178
policy [0.25260067 0.32220286 0.12477573 0.3004207 ] at iter 5766189
policy [0.2530055  0.25900504 0.3529124  0.13507703] at iter 5776202
policy [0.2698273  0.20418021 0.23188247 0.29411   ] at iter 5786210
policy [0.27160725 0.23754051 0.26724488 0.22360736] at iter 5796211
policy [0.24846448 0.28664285 0.24841456 0.21647805] at iter 5806218
policy [0.2140484  0.19798866 0.07414355 0.51381934] at iter 5816229
policy [0.28878805 0.24433921 0.05226065 0.41461203] at iter 5826239
policy [0.28227642 0.2757642  0.26136476 0.18059461] at iter 5836244
policy [0.31540284 0.24517383 0.35410994 0.08531334] at iter 5846254
policy [0.20600274 0.11266625 0.06

policy [0.27958068 0.25471097 0.21760917 0.24809921] at iter 6887212
policy [0.26832902 0.26267123 0.22709237 0.24190737] at iter 6897221
policy [0.21553351 0.15485151 0.18427815 0.44533685] at iter 6907235
policy [0.20200719 0.26005724 0.18139012 0.35654542] at iter 6917237
policy [0.2861957  0.3403114  0.11467238 0.25882056] at iter 6927245
policy [0.19506438 0.21937557 0.12715112 0.45840892] at iter 6937254
policy [0.20549905 0.32885    0.28219685 0.18345414] at iter 6947257
policy [0.2393314  0.1959027  0.2698155  0.29495034] at iter 6957257
policy [0.24258015 0.24439906 0.30027622 0.21274455] at iter 6967259
policy [0.22324224 0.24559312 0.23483725 0.2963274 ] at iter 6977272
policy [0.28340116 0.31817165 0.33285934 0.06556778] at iter 6987280
policy [0.2785279  0.2682682  0.15082477 0.3023791 ] at iter 6997294
policy [0.01120743 0.02005539 0.9675703  0.00116698] at iter 7007298
policy [0.30346978 0.28601888 0.20544893 0.20506245] at iter 7017307
policy [0.29246217 0.20085548 0.19

policy [0.16602853 0.26099414 0.1749555  0.39802176] at iter 8068282
policy [0.2725098  0.25259754 0.11254975 0.36234292] at iter 8078287
policy [0.2326083  0.26376545 0.19892679 0.30469942] at iter 8088293
policy [0.29836455 0.24122837 0.36839616 0.09201093] at iter 8098302
policy [0.31691682 0.23668036 0.18465655 0.26174635] at iter 8108304
policy [0.36398223 0.16857693 0.24030194 0.22713895] at iter 8118311
policy [0.23272589 0.26020488 0.16145445 0.3456147 ] at iter 8128322
policy [0.20422629 0.27547598 0.37543976 0.14485796] at iter 8138334
policy [0.22547084 0.1504739  0.5566901  0.06736519] at iter 8148345
policy [0.45101944 0.11043234 0.24455899 0.19398917] at iter 8158357
Current model save name: model_2
policy [0.33181196 0.23531371 0.16407344 0.26880085] at iter 8168370
Current global iteration 8168453
policy [0.2216301  0.27672955 0.4224031  0.07923726] at iter 8178376
policy [0.21527943 0.31519789 0.12224488 0.34727785] at iter 8188382
policy [0.33051023 0.14644839 0.18842

policy [0.21911119 0.27971512 0.20117696 0.29999676] at iter 9239393
policy [0.2561938  0.20561333 0.15655868 0.38163418] at iter 9249399
policy [0.2958617  0.23613268 0.1845715  0.2834341 ] at iter 9259410
policy [0.25275323 0.24336107 0.29373136 0.21015428] at iter 9269413
policy [0.33126098 0.21399444 0.19289495 0.26184964] at iter 9279436
policy [0.2844817  0.28281587 0.17146553 0.2612369 ] at iter 9289435
policy [0.3099615  0.2650217  0.10320823 0.3218086 ] at iter 9299440
policy [0.10119088 0.09150535 0.80161566 0.00568813] at iter 9309443
policy [0.24656183 0.3404011  0.04302779 0.3700093 ] at iter 9319462
policy [0.11691845 0.09211183 0.01804062 0.7729291 ] at iter 9329483
policy [0.29508987 0.24522837 0.23182258 0.22785918] at iter 9339488
policy [0.19437082 0.27103192 0.20404777 0.33054948] at iter 9349504
policy [0.2436892  0.24767293 0.2630578  0.24558009] at iter 9359512
policy [0.02714427 0.04461259 0.9195784  0.00866481] at iter 9369522
policy [8.1540663e-03 1.0822051e-0

policy [0.27215073 0.17161475 0.41475496 0.14147954] at iter 10400426
policy [0.24019559 0.23828019 0.3802295  0.14129472] at iter 10410444
policy [0.27985248 0.33574975 0.26198986 0.12240789] at iter 10420451
policy [0.27324313 0.3735024  0.1263419  0.22691251] at iter 10430461
policy [0.31759584 0.36470723 0.11751399 0.2001829 ] at iter 10440467
policy [0.19608577 0.18611479 0.04468732 0.57311213] at iter 10450488
policy [0.21984544 0.2296674  0.33198464 0.21850245] at iter 10460489
policy [7.7228188e-03 8.4727779e-03 2.1494408e-04 9.8358953e-01] at iter 10470496
policy [0.23887864 0.38013366 0.24880868 0.13217902] at iter 10480506
policy [0.27170128 0.36673835 0.20961002 0.15195039] at iter 10490521
policy [0.25536692 0.26255974 0.16045275 0.32162052] at iter 10500524
policy [0.22565928 0.26692063 0.2324783  0.27494174] at iter 10510528
policy [0.14801285 0.16183072 0.00928142 0.68087506] at iter 10520535
policy [0.23006225 0.20200929 0.527507   0.0404215 ] at iter 10530543
policy [

In [None]:
elapsed_time = time.time() - start_time

In [None]:
print("Learning time was", int(elapsed_time/60/60), "hours", int((elapsed_time - int(elapsed_time/60/60)*60*60)/60), "minutes")

In [None]:
def test_agent_fun(test_agent):
    test_env = EnvWrapper(ENV_NAME)
    #test_agent = Agent(test_env, 'tester', optimizer)
    test_agent.update_to_global(sess)

    done = False
    state = test_env.reset()

    reward = 0

    while not done:
        policy = sess.run((test_agent.policy), \
                                                feed_dict={\
                                                           test_agent.X:state\
                                                          }\
                                               )
        policy = policy.flatten()
        #print('cur policy', policy)
        #prediction = np.argmax(policy)
        prediction = np.random.choice(test_env.action_space.n, p=policy)
        if random.random() < 0.05:
            prediction = env.action_space.sample()

        ns, r, d, _ = test_env.step(prediction)
        test_env.env.render()
        state = ns
        reward += (r)
        done = d
    test_env.env.close()
    print('final reward is', reward)

In [None]:
test_env = EnvWrapper(ENV_NAME)
tester_agent = global_agent

In [None]:
test_agent_fun(tester_agent)