In [1]:
import tensorflow as tf
import random
import gym
import numpy as np
import skimage
from skimage import color, exposure, transform
import threading

  from ._conv import register_converters as _register_converters


In [2]:
# --- Observation shape -------------------------------------------------------
# Size each frame is resized to, and the number of frames stacked along the
# channel axis of the network input placeholder.
IMG_WIDTH = 84
IMG_HEIGHT = 84
CNT_FRAMES = 4
# Variable scope name of the shared (global) network; worker scopes copy from it.
GLOBAL_SCOPE = 'global'
# --- Loss weights ------------------------------------------------------------
# Coefficients of the value, policy and entropy terms in the combined loss.
VALUE_MODIFIER = 0.25
POLICY_MODIFIER = 1
ENTROPY_MODIFIER = 2.5 * 1e-4#0.0005
# Maximum number of environment steps collected per rollout before a training update.
MAX_STEPS = 10
# Discount factor applied to future rewards.
DISCOUNT = 0.99
# Gym environment id used for training and evaluation.
ENV_NAME = 'BreakoutDeterministic-v4'
#ENV_NAME = 'PongDeterministic-v4'
# A worker resets its environment once an episode has run longer than this many steps.
MAX_EP_LENGTH = 500
# Learning rate handed to the optimizers.
LEARNING_RATE = 2.5*1e-4
# Global-norm threshold used when clipping gradients.
CLIP_VALUE = 1
# Only referenced by the commented-out RMSProp optimizer in the launch cell.
DECAY = 0.99

In [3]:
def process_frame(x_t, img_rows, img_cols):
    """Turn a raw RGB frame into a normalised single-channel network input.

    The frame is converted to grayscale, resized to ``(img_rows, img_cols)``,
    contrast-stretched to the 0-255 range and finally divided by 255.

    Args:
        x_t: RGB image array as returned by the environment.
        img_rows: Target size of the first image axis.
        img_cols: Target size of the second image axis.

    Returns:
        Array of shape ``(1, img_rows, img_cols, 1)``.
    """
    gray = skimage.color.rgb2gray(x_t)
    resized = skimage.transform.resize(gray, (img_rows, img_cols), mode='constant')
    stretched = skimage.exposure.rescale_intensity(resized, out_range=(0, 255))
    # Add a leading batch axis and a trailing channel axis.
    batched = stretched.reshape((1, img_rows, img_cols, 1))
    batched /= 255.0
    return batched

def update_target_graph(from_scope,to_scope):
    """Build the ops that copy trainable variables from one scope to another.

    Variables of the two scopes are paired positionally, in the order returned
    by ``tf.get_collection``.

    Args:
        from_scope: Name of the scope whose variables are read.
        to_scope: Name of the scope whose variables are overwritten.

    Returns:
        List of assign ops, one per paired variable.
    """
    trainable = tf.GraphKeys.TRAINABLE_VARIABLES
    sources = tf.get_collection(trainable, from_scope)
    destinations = tf.get_collection(trainable, to_scope)
    return [dst.assign(src) for src, dst in zip(sources, destinations)]

def normalized_columns_initializer(std=1.0):
    """Create an initializer whose weight columns all have L2 norm ``std``.

    Args:
        std: Norm every column (taken along axis 0) is scaled to.

    Returns:
        A function ``(shape, dtype=None, partition_info=None)`` that draws a
        float32 Gaussian sample of the given shape, rescales its columns and
        wraps the result in ``tf.constant``. ``dtype`` and ``partition_info``
        are accepted but not used.
    """
    def _initializer(shape, dtype=None, partition_info=None):
        sample = np.random.randn(*shape).astype(np.float32)
        column_norms = np.sqrt(np.square(sample).sum(axis=0, keepdims=True))
        sample *= std / column_norms
        return tf.constant(sample)
    return _initializer

In [4]:
class EnvWrapper:
    """Gym environment wrapper that yields stacks of preprocessed frames.

    Observations have ``CNT_FRAMES`` channels on the last axis; the most recent
    frame is stored in channel 0.
    """

    def __init__(self, env_name):
        self.env = gym.make(env_name)
        self.action_space = self.env.action_space

    def reset(self):
        """Reset the environment and return a stack filled with the first frame."""
        frame = process_frame(self.env.reset(), IMG_WIDTH, IMG_HEIGHT)
        stacked = np.stack([frame] * CNT_FRAMES, axis=3)
        # Collapse the extra axis introduced by stacking.
        _, dim1, dim2, dim3 = stacked.shape[:4]
        stacked = stacked.reshape(1, dim1, dim2, dim3)
        self.s = np.copy(stacked)
        return stacked

    def step(self, a):
        """Apply action ``a`` and return ``(stacked_state, reward, done, info)``."""
        raw_frame, r, d, _ = self.env.step(a)
        newest = process_frame(raw_frame, IMG_WIDTH, IMG_HEIGHT)
        # Newest frame goes first; the oldest stored frame is dropped.
        kept = self.s[:, :, :, :CNT_FRAMES-1]
        stacked = np.append(newest, kept, axis=3)
        self.s = np.copy(stacked)
        return stacked, r, d, _

In [5]:
# Value of `global_counter` at which `Agent.predict` last printed the policy.
# Shared by all agents so the debug print appears about once per 5000 global steps.
last_iter = 0

class Agent:
    """Actor-critic network (policy and value heads) living in one variable scope.

    An agent whose scope is ``GLOBAL_SCOPE`` only holds the shared weights.
    Any other agent additionally builds the loss, computes gradients with
    respect to its own (local) variables and applies them to the variables of
    ``GLOBAL_SCOPE`` through ``optimizer``.

    Args:
        env: Environment wrapper exposing ``action_space.n`` and ``step``.
        scope_name: TensorFlow variable scope for this agent's variables.
        optimizer: TensorFlow optimizer used for the global update.
    """

    def __init__(self, env, scope_name, optimizer):
        self.env = env
        self.scope_name = scope_name
        self.action_size = self.env.action_space.n
        self.optimizer = optimizer

        self.__build_model()

    def __build_model(self):
        """Create the network and, for non-global agents, the training ops."""
        print('building model')
        with tf.variable_scope(self.scope_name):
            weights_initializer = tf.truncated_normal_initializer(stddev=0.02)
            bias_initializer = tf.zeros_initializer()
            self.X = tf.placeholder(shape=[None, IMG_WIDTH, IMG_HEIGHT, CNT_FRAMES], dtype=tf.float32)

            # Four identical 3x3, stride-2 convolution layers with 32 filters each.
            net = self.X
            for _ in range(4):
                net = tf.contrib.layers.conv2d(net, 32, 3, stride=2, activation_fn=tf.nn.relu, padding='VALID',
                                               weights_initializer=weights_initializer,
                                               biases_initializer=bias_initializer)
            flattened = tf.contrib.layers.flatten(net)
            embedding = tf.contrib.layers.fully_connected(flattened, 512, activation_fn=tf.nn.relu, weights_initializer=tf.random_normal_initializer(stddev=0.02), biases_initializer=bias_initializer)
            embedding2 = tf.contrib.layers.fully_connected(embedding, 128, activation_fn=tf.nn.relu, weights_initializer=tf.random_normal_initializer(stddev=0.02), biases_initializer=bias_initializer)

            # Policy head: softmax over actions. Value head: one linear unit, shape [batch, 1].
            self.policy = tf.contrib.layers.fully_connected(embedding2, self.action_size, activation_fn=tf.nn.softmax, weights_initializer=tf.random_normal_initializer(stddev=0.5), biases_initializer=None)
            self.value = tf.contrib.layers.fully_connected(embedding2, 1, activation_fn=None, weights_initializer=normalized_columns_initializer(1.), biases_initializer=None)

            if self.scope_name != GLOBAL_SCOPE:
                print('building agent:', self.scope_name)
                self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
                self.actions_oh = tf.one_hot(self.actions, depth=self.action_size, dtype=tf.float32)
                self.target_values = tf.placeholder(shape=[None], dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

                # Clip probabilities away from 0 and 1 so the logarithm stays finite.
                MIN_POLICY = 1e-8
                MAX_POLICY = 1.0 - MIN_POLICY

                self.log_policy = tf.log(tf.clip_by_value(self.policy, MIN_POLICY, MAX_POLICY))

                self.log_policy_for_action = tf.reduce_sum(self.log_policy * self.actions_oh, axis=1)
                # Flatten the [batch, 1] value output to [batch] before comparing it with
                # the [batch] targets; otherwise the subtraction broadcasts to
                # [batch, batch] and every prediction is regressed onto every target.
                value_flat = tf.reshape(self.value, [-1])
                self.value_loss = tf.reduce_mean(tf.squared_difference(value_flat, self.target_values))
                self.policy_loss = -tf.reduce_mean(self.log_policy_for_action * self.advantages)
                # Entropy is E[-log p(x)] = -sum(p(x) * log p(x)); this term is the mean of
                # p * log p, so minimising it pushes the entropy up.
                self.entropy_loss = -tf.reduce_mean(self.policy * -self.log_policy)
                self.loss = VALUE_MODIFIER * self.value_loss + \
                            POLICY_MODIFIER * self.policy_loss + \
                            ENTROPY_MODIFIER * self.entropy_loss

                # Gradients are taken with respect to this agent's own variables ...
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope_name)
                grads = tf.gradients(self.loss, local_vars)
                grads, grad_norms = tf.clip_by_global_norm(grads, CLIP_VALUE)
                # ... while `update_ops` copies the global weights into this scope ...
                self.update_ops = update_target_graph(GLOBAL_SCOPE, self.scope_name)
                # ... and the clipped gradients are applied to the global variables,
                # paired positionally with the local ones.
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, GLOBAL_SCOPE)
                capped_gvs = list(zip(grads, global_vars))
                self.global_update = self.optimizer.apply_gradients(capped_gvs)

    def predict(self, sess, state):
        """Sample an action index from the policy for a single state.

        Reads the module-level ``global_counter`` (defined in a later cell) and
        prints the current policy whenever more than 5000 global steps have
        passed since the previous print.
        """
        policy = sess.run(self.policy, feed_dict={self.X: state})
        policy = policy.flatten()

        global last_iter

        if global_counter - 5000 > last_iter:
            last_iter = global_counter
            print('policy', policy, 'at iter', global_counter)

        prediction = np.random.choice(self.action_size, p=policy)
        return prediction

    def act(self, sess, state):
        """Sample an action for ``state`` and apply it to the environment.

        Returns:
            ``(state, action, reward, done, next_state)``. ``state`` is the
            argument passed in, unchanged; the caller is responsible for
            continuing from ``next_state``.
        """
        a = self.predict(sess, state)
        next_state,r,d,_ = self.env.step(a)
        return state, a, r, d, next_state

    def get_value(self, sess, state):
        """Return the value head output for ``state`` (one row per input state)."""
        return sess.run(self.value, feed_dict={self.X: state})

    def train(self, sess, states, actions, target_values, advantages):
        """Run one gradient update on the global network.

        Args:
            sess: TensorFlow session.
            states: Batch of stacked observations.
            actions: Action index taken in each state.
            target_values: Regression target for the value head, one per state.
            advantages: Advantage estimate weighting each log-probability.

        Returns:
            ``(value_loss, policy_loss, entropy_loss)`` for the batch. The
            losses are already batch means (``tf.reduce_mean``), so they are
            returned as-is rather than being divided by the batch size again.
        """
        _, value_loss, policy_loss, entropy_loss = \
            sess.run((self.global_update, self.value_loss, self.policy_loss, self.entropy_loss),
                     feed_dict={
                         self.X: states,
                         self.actions: actions,
                         self.target_values: target_values,
                         self.advantages: advantages
                     })
        return value_loss, policy_loss, entropy_loss

    def update_to_global(self, sess):
        """Overwrite this agent's weights with the global ones (no-op for the global agent)."""
        if self.scope_name != GLOBAL_SCOPE:
            sess.run(self.update_ops)

In [6]:
import time

# Number of environment steps taken so far, summed over all worker threads.
# Workers increment it while holding the lock passed to `Worker.work`.
global_counter = 0

# Wall-clock timestamp taken when this cell runs; other cells subtract it from
# the current time to report how long training has been running.
start_time = time.time()

class Worker:
    """Runs one agent in a loop: collect a short rollout, then train on it.

    Args:
        agent: Local agent providing ``act``, ``get_value``, ``train``,
            ``update_to_global``, ``env`` and ``scope_name``.
    """

    def __init__(self, agent):
        self.agent = agent
        # Summaries are written to a directory named after the agent's scope.
        self.summary_writer = tf.summary.FileWriter(self.agent.scope_name)

    @staticmethod
    def _discounted_returns(rewards, bootstrap_value, discount):
        """Return the discounted return of every step, in forward time order.

        ``returns[t] = rewards[t] + discount * returns[t + 1]``, with
        ``bootstrap_value`` standing in for the return after the last step.
        """
        returns = [0.0] * len(rewards)
        running = bootstrap_value
        for idx in reversed(range(len(rewards))):
            running = rewards[idx] + discount * running
            returns[idx] = running
        return returns

    def _write_summary(self, step, series):
        """Write the mean of every ``(tag, values)`` pair in ``series`` as a scalar summary."""
        summary = tf.Summary()
        for tag, values in series:
            summary.value.add(tag=tag, simple_value=float(sum(values) / len(values)))
        self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def work(self, sess, optimizer, thread_lock):
        """Train forever; this method only returns by raising.

        Args:
            sess: TensorFlow session shared by all threads.
            optimizer: Unused; kept so existing callers keep working.
            thread_lock: Lock guarding the module-level ``global_counter``.
        """
        global global_counter

        print('worker starting agent:', self.agent.scope_name)
        done = True
        s = None
        episode_reward = 0
        timestep = 0
        episode_counter = 0
        value_losses, policy_losses, entropy_losses = [], [], []
        last_rewards, last_frames, last_values, last_advantages = [], [], [], []

        with sess.as_default(), sess.graph.as_default():
            while True:
                if done or timestep > MAX_EP_LENGTH:
                    # Skip the bookkeeping on the very first pass: no episode has been
                    # played yet, so there is nothing real to record.
                    if episode_counter > 0:
                        last_rewards.append(episode_reward)
                        last_frames.append(timestep)
                    if episode_counter > 0 and episode_counter % 5 == 0 and len(value_losses) > 0:
                        self._write_summary(episode_counter, [
                            ('Performance/Reward', last_rewards),
                            ('Performance/Length', last_frames),
                            ('Performance/Values mean', last_values),
                            ('Performance/Advantage mean', last_advantages),
                            ('Losses/Value Loss', value_losses),
                            ('Losses/Policy Loss', policy_losses),
                            ('Losses/Entropy', entropy_losses),
                        ])
                        value_losses, policy_losses, entropy_losses = [], [], []
                        last_rewards, last_frames, last_values, last_advantages = [], [], [], []
                    s = self.agent.env.reset()
                    done = False
                    episode_reward = 0
                    timestep = 0
                    episode_counter += 1

                # Pull the latest global weights before every rollout so gradients are
                # computed with parameters that include the previous updates.
                self.agent.update_to_global(sess)

                states = []
                actions = []
                rewards = []
                values = []

                while len(states) < MAX_STEPS and not done:
                    s, a, r, done, ns = self.agent.act(sess, s)
                    with thread_lock:
                        global_counter += 1
                    episode_reward += r
                    timestep += 1
                    # The value head yields one row with one unit for a single state.
                    val = float(self.agent.get_value(sess, s)[0, 0])
                    states.append(s)
                    actions.append(a)
                    rewards.append(np.clip(r, -1.0, 1.0))
                    values.append(val)
                    last_values.append(val)
                    # Continue from the state the environment is actually in.
                    s = ns

                # Bootstrap from the value of the state after the rollout unless the
                # episode has ended there.
                bootstrap = 0.0
                if not done:
                    bootstrap = float(self.agent.get_value(sess, s)[0, 0])

                returns = self._discounted_returns(rewards, bootstrap, DISCOUNT)
                advantages = [ret - val for ret, val in zip(returns, values)]
                last_advantages.extend(advantages)

                # All arrays are in forward time order, aligned with `states`.
                value_loss, policy_loss, entropy_loss = self.agent.train(
                    sess,
                    np.vstack(states),
                    np.asarray(actions).ravel(),
                    np.asarray(returns).ravel(),
                    np.asarray(advantages).ravel(),
                )

                value_losses.append(value_loss)
                policy_losses.append(policy_loss)
                entropy_losses.append(entropy_loss)

In [7]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists


In [8]:
import time

# Collects every thread started by the launch cell so they can be joined later.
worker_threads = []

# The global agent only holds the shared weights: for GLOBAL_SCOPE no loss or
# training ops are built, so the optimizer passed here is stored but not applied.
env_global = EnvWrapper(ENV_NAME)
#global_agent = Agent(env_global, GLOBAL_SCOPE, tf.train.AdamOptimizer())
global_agent = Agent(env_global, GLOBAL_SCOPE, tf.train.GradientDescentOptimizer(LEARNING_RATE))

# device_count={'GPU': 0} asks TensorFlow to use no GPU devices.
# NOTE(review): with no GPU in use, allow_growth presumably has no effect - confirm.
config = tf.ConfigProto(device_count = {'GPU': 0})
config.gpu_options.allow_growth=True

# Single session shared by all worker threads, the saver thread and evaluation.
sess = tf.Session(config=config)

def global_saving_thread(agent, sess, save_interval=15 * 60, stop_event=None):
    """Periodically checkpoint the session's variables to ``models/``.

    Args:
        agent: Unused; kept so existing callers keep working.
        sess: TensorFlow session whose variables are saved.
        save_interval: Seconds to wait between two checkpoints.
        stop_event: Optional ``threading.Event``. When given, the loop ends once
            the event is set and the elapsed learning time is printed. When
            omitted the function loops forever, as before.
    """
    # Checkpoint file names cycle through this many slots.
    MAX_MODELS = 1
    cnt_model = 0

    with sess.as_default(), sess.graph.as_default():

        saver = tf.train.Saver()

        while stop_event is None or not stop_event.is_set():
            slot = cnt_model % MAX_MODELS
            print("Current model save name:", 'model_' + str(slot))
            saver.save(sess, "models/model_" + str(slot) + ".ckpt")
            print("Current global iteration", global_counter)
            cnt_model += 1
            if stop_event is None:
                time.sleep(save_interval)
            else:
                # Returns early when the event is set, so shutdown is not delayed
                # by a full save interval.
                stop_event.wait(save_interval)

    # Measured after the loop so the report reflects the real running time.
    elapsed_time = time.time() - start_time
    hours = int(elapsed_time/60/60)
    minutes = int((elapsed_time - hours*60*60)/60)
    print("Learning time was", hours, "hours", minutes, "minutes")

building model


In [None]:
# Number of worker threads, each with its own environment and local agent.
cnt_threads = 20
# Lock handed to every worker; it guards the shared step counter.
thread_lock = threading.Lock()

def worker_fun(worker, sess, optimizer, thread_lock):
    """Thread entry point: forwards its arguments to ``worker.work``."""
    worker.work(sess, optimizer, thread_lock)

# Build all local agents first so every variable exists before initialisation.
# The loop variables (env, optimizer, worker, t, i) stay defined as module-level
# names after the loop finishes.
for i in range(cnt_threads):
    env = EnvWrapper(ENV_NAME)
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    #optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE, decay=DECAY)
    worker = Worker(Agent(env, 'local' + str(i), optimizer))
    t = threading.Thread(target=worker_fun, args=(worker, sess, optimizer, thread_lock))
    worker_threads.append(t)
    time.sleep(0.15)

# Initialise global, local and optimizer variables once, then start the workers.
sess.run(tf.global_variables_initializer())
for t in worker_threads:
    t.start()
    time.sleep(0.15)
    
# Background thread that periodically checkpoints the model.
global_t = threading.Thread(target=global_saving_thread, args=(global_agent, sess))

worker_threads.append(global_t)
global_t.start()

# Blocks until every thread has finished.
for t in worker_threads:
    t.join()

building model
building agent: local0
building model
building agent: local1
building model
building agent: local2
building model
building agent: local3
building model
building agent: local4
building model
building agent: local5
building model
building agent: local6
building model
building agent: local7
building model
building agent: local8
building model
building agent: local9
building model
building agent: local10
building model
building agent: local11
building model
building agent: local12
building model
building agent: local13
building model
building agent: local14
building model
building agent: local15
building model
building agent: local16
building model
building agent: local17
building model
building agent: local18
building model
building agent: local19
worker starting agent: local0
worker starting agent: local1
worker starting agent: local2
worker starting agent: local3
worker starting agent: local4
worker starting agent: local5
worker starting agent: local6
worker starting agen

policy [0.15657529 0.08044509 0.34672964 0.41625   ] at iter 495256
policy [0.15268593 0.07261635 0.3291268  0.44557098] at iter 500259
policy [0.16681333 0.08628661 0.33166608 0.41523397] at iter 505263
policy [0.16926526 0.09389261 0.3379984  0.39884377] at iter 510266
policy [0.16209677 0.0967112  0.3487445  0.39244756] at iter 515267
policy [0.15289026 0.09529331 0.37391016 0.3779063 ] at iter 520270
policy [0.16100667 0.11066786 0.37257448 0.355751  ] at iter 525273
policy [0.16405666 0.11267477 0.3643177  0.35895088] at iter 530275
policy [0.15641999 0.10200611 0.3628719  0.37870198] at iter 535277
policy [0.14826791 0.09057293 0.377398   0.38376114] at iter 540279
policy [0.14613912 0.08811335 0.379785   0.38596258] at iter 545280
policy [0.16421874 0.0976996  0.35590357 0.38217804] at iter 550282
policy [0.16818206 0.10161626 0.35376486 0.3764368 ] at iter 555284
policy [0.16009659 0.08837586 0.34709635 0.40443122] at iter 560285
policy [0.16150904 0.08892628 0.3415978  0.40796

policy [0.15859158 0.10127153 0.2903766  0.44976026] at iter 1085561
policy [0.17760988 0.09358468 0.2825505  0.44625497] at iter 1090571
policy [0.1962799  0.09800624 0.2809186  0.42479527] at iter 1095572
policy [0.18926774 0.0943426  0.31287274 0.40351695] at iter 1100573
policy [0.17701508 0.1003195  0.33059454 0.39207092] at iter 1105574
policy [0.19020727 0.11077015 0.31204066 0.38698193] at iter 1110575
policy [0.1890851  0.10999441 0.3134802  0.3874403 ] at iter 1115581
policy [0.19106826 0.1154066  0.3200422  0.37348288] at iter 1120585
policy [0.1923377  0.12497652 0.31574017 0.3669456 ] at iter 1125589
policy [0.19842733 0.13103014 0.31373495 0.3568076 ] at iter 1130592
policy [0.20479298 0.13381436 0.31090596 0.35048673] at iter 1135595
policy [0.20880572 0.15463743 0.30423823 0.33231863] at iter 1140605
policy [0.2082017  0.15630278 0.3045028  0.3309927 ] at iter 1145606
policy [0.20003445 0.1461394  0.3109261  0.3429    ] at iter 1150608
policy [0.19718829 0.14111207 0.30

In [None]:
# Seconds elapsed since `start_time` was recorded.
elapsed_time = time.time() - start_time

In [None]:
# Split the elapsed seconds into whole hours and the remaining whole minutes.
hours = int(elapsed_time/60/60)
minutes = int((elapsed_time - hours*60*60)/60)
print("Learning time was", hours, "hours", minutes, "minutes")

In [None]:
def test_agent_fun(test_agent):
    test_env = EnvWrapper(ENV_NAME)
    #test_agent = Agent(test_env, 'tester', optimizer)
    test_agent.update_to_global(sess)

    done = False
    state = test_env.reset()

    reward = 0

    while not done:
        policy = sess.run((test_agent.policy), \
                                                feed_dict={\
                                                           test_agent.X:state\
                                                          }\
                                               )
        policy = policy.flatten()
        #print('cur policy', policy)
        #prediction = np.argmax(policy)
        prediction = np.random.choice(test_env.action_space.n, p=policy)
        if random.random() < 0.05:
            prediction = env.action_space.sample()

        ns, r, d, _ = test_env.step(prediction)
        test_env.env.render()
        state = ns
        reward += (r)
        done = d
    test_env.env.close()
    print('final reward is', reward)

In [None]:
# NOTE(review): test_env is not passed to test_agent_fun, which builds its own
# environment - this instance looks unused; confirm before removing.
test_env = EnvWrapper(ENV_NAME)
# Evaluate the shared (global) network directly.
tester_agent = global_agent

In [None]:
# Play one rendered evaluation episode and print its total reward.
test_agent_fun(tester_agent)