In [1]:
import tensorflow as tf
import random
import gym
import numpy as np
import skimage
from skimage import color, exposure, transform
import threading

  from ._conv import register_converters as _register_converters


In [2]:
# Target size (pixels) frames are resized to; also the spatial dims of the network input placeholder.
IMG_WIDTH = 84
IMG_HEIGHT = 84
# Channel count of the network input placeholder (frames are converted to grayscale).
CNT_CHANNELS = 1
# Variable-scope name of the shared network; agents built under any other scope also get loss/update ops.
GLOBAL_SCOPE = 'global'
# Weights of the value, policy and entropy terms in the combined training loss.
VALUE_MODIFIER = 0.5
POLICY_MODIFIER = 1.0
ENTROPY_MODIFIER = 0.075
# Maximum number of environment steps a worker collects before each training update.
MAX_STEPS = 30
# Discount factor applied to future rewards.
DISCOUNT = 0.99
# Alternative environment, currently disabled.
#ENV_NAME = 'BreakoutDeterministic-v4'
ENV_NAME = 'PongDeterministic-v4'
# Workers (and the checkpoint thread) keep running while the shared step counter is <= this value.
MAX_ITERATIONS = 1000000
# A worker resets its episode once the episode's step count exceeds this value (checked between rollouts).
MAX_EP_LENGTH = 500

In [3]:
def process_frame(x_t, img_rows, img_cols):
    """Convert a raw RGB frame into a normalised grayscale batch of one.

    Args:
        x_t: RGB image array as returned by the environment.
        img_rows: Number of rows of the resized frame.
        img_cols: Number of columns of the resized frame.

    Returns:
        Array of shape ``(1, img_rows, img_cols, 1)``: the frame is converted
        to grayscale, resized, intensity-stretched to 0..255 and finally
        divided by 255.
    """
    target_shape = (img_rows, img_cols)
    gray = skimage.color.rgb2gray(x_t)
    resized = skimage.transform.resize(gray, target_shape, mode='constant')
    stretched = skimage.exposure.rescale_intensity(resized, out_range=(0,255))
    # Add leading batch and trailing channel axes, then scale in place.
    batch = stretched.reshape((1, img_rows, img_cols, 1))
    batch /= 255.0
    return batch

def update_target_graph(from_scope,to_scope):
    """Build ops that copy trainable variables from one scope into another.

    Variables are paired positionally in the order the two collections return
    them; if the collections differ in length the extra variables are ignored.

    Args:
        from_scope: Name of the scope whose variables are the source.
        to_scope: Name of the scope whose variables get overwritten.

    Returns:
        List of assign ops, one per variable pair.
    """
    sources = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    destinations = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [dst.assign(src) for src, dst in zip(sources, destinations)]

In [4]:
class EnvWrapper:
    """Thin gym wrapper whose observations are already preprocessed frames.

    Every observation is passed through ``process_frame`` with the
    ``IMG_WIDTH`` / ``IMG_HEIGHT`` constants before being handed to the caller.
    """

    def __init__(self, env_name):
        """Create the underlying gym environment and expose its action space."""
        self.env = gym.make(env_name)
        self.action_space = self.env.action_space

    def reset(self):
        """Reset the environment and return the preprocessed first frame."""
        return process_frame(self.env.reset(), IMG_WIDTH, IMG_HEIGHT)

    def step(self, a):
        """Apply action ``a``; return ``(processed_frame, reward, done, info)``."""
        raw_frame, reward, terminal, info = self.env.step(a)
        return process_frame(raw_frame, IMG_WIDTH, IMG_HEIGHT), reward, terminal, info

In [5]:
class Agent:
    """Actor-critic network (conv -> dense -> LSTM -> policy/value heads) bound to an environment.

    The network is built inside the variable scope ``scope_name``. An agent
    whose scope is ``GLOBAL_SCOPE`` only holds weights. Any other agent also
    builds the loss, ops that copy the global weights into its own scope, and
    an op that applies its (clipped) gradients to the global weights.
    """

    def __init__(self, env, scope_name, optimizer):
        """Store the collaborators and build the graph.

        Args:
            env: Environment exposing ``action_space.n`` and ``step(action)``.
            scope_name: Variable scope the network is created in.
            optimizer: Optimizer used to apply gradients to the global
                variables (unused when ``scope_name`` is ``GLOBAL_SCOPE``).
        """
        self.env = env
        self.scope_name = scope_name
        self.action_size = self.env.action_space.n
        self.optimizer = optimizer

        self.__build_model()

    def __build_model(self):
        """Create placeholders, network and (for non-global agents) training ops."""
        print('building model')
        with tf.variable_scope(self.scope_name):
            self.X = tf.placeholder(shape=[None, IMG_WIDTH, IMG_HEIGHT, CNT_CHANNELS], dtype=tf.float32)
            conv1 = tf.contrib.layers.conv2d(self.X, 16, 8, stride=4, padding='VALID')
            conv2 = tf.contrib.layers.conv2d(conv1, 32, 8, stride=2, padding='VALID')
            flattened = tf.contrib.layers.flatten(conv2)
            embedding = tf.contrib.layers.fully_connected(flattened, 256, activation_fn=tf.nn.elu)

            # The whole fed batch is treated as ONE sequence: batch size 1,
            # sequence length = number of frames fed.
            step_size = tf.shape(self.X)[:1]
            rnn_in = tf.expand_dims(embedding, axis=0)

            lstm = tf.contrib.rnn.BasicLSTMCell(256)
            # No initial_state is passed, so the recurrent state is not carried
            # over between separate sess.run calls.
            output, state = tf.nn.dynamic_rnn(lstm, rnn_in, sequence_length=step_size, dtype=tf.float32)
            output = tf.reshape(output, (-1, 256))

            self.policy = tf.contrib.layers.fully_connected(
                output, self.action_size, activation_fn=tf.nn.softmax,
                weights_initializer=tf.random_normal_initializer(0.0, 0.01))
            self.value = tf.contrib.layers.fully_connected(
                output, 1, activation_fn=None,
                weights_initializer=tf.random_normal_initializer(0.0, 0.5))

            if self.scope_name != GLOBAL_SCOPE:
                print('building agent:', self.scope_name)
                self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
                self.actions_oh = tf.one_hot(self.actions, depth=self.action_size, dtype=tf.float32)
                self.target_values = tf.placeholder(shape=[None], dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

                # Keep probabilities strictly positive so the logarithms below
                # never produce -inf / NaN when the softmax saturates.
                safe_policy = tf.clip_by_value(self.policy, 1e-10, 1.0)

                self.log_likelihood = tf.log(tf.reduce_sum(safe_policy * self.actions_oh, axis=1))
                # self.value has shape [batch, 1] while target_values is
                # [batch]; without flattening, the difference broadcasts to
                # [batch, batch] and the loss mixes unrelated time steps.
                flat_value = tf.reshape(self.value, [-1])
                self.value_loss = tf.reduce_mean(tf.squared_difference(flat_value, self.target_values))
                self.policy_loss = -tf.reduce_mean(self.log_likelihood * self.advantages)
                # Entropy is E[-log p(x)] = -sum(p(x) * log p(x)). The mean here
                # runs over both batch and actions; the loss is the negated
                # entropy so that minimising it encourages exploration.
                self.entropy_loss = -tf.reduce_mean(self.policy * -tf.log(safe_policy))
                self.loss = VALUE_MODIFIER * self.value_loss + \
                            POLICY_MODIFIER * self.policy_loss + \
                            ENTROPY_MODIFIER * self.entropy_loss

                # Gradients are taken w.r.t. this agent's own variables ...
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope_name)
                grads = tf.gradients(self.loss, local_vars)
                self.update_ops = update_target_graph(GLOBAL_SCOPE, self.scope_name)
                # ... but applied (clipped element-wise) to the matching global variables.
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, GLOBAL_SCOPE)
                capped_gvs = [(tf.clip_by_value(grad, -40., 40.), var) for grad, var in zip(grads, global_vars)]
                self.global_update = self.optimizer.apply_gradients(capped_gvs)

    def predict(self, sess, state):
        """Sample an action index from the policy evaluated on ``state``.

        ``state`` is expected to hold a single frame: the flattened policy
        output is used directly as the sampling distribution.
        """
        policy = sess.run(self.policy, feed_dict={self.X: state}).flatten()
        return np.random.choice(self.action_size, p=policy)

    def act(self, sess, state):
        """Sample an action for ``state`` and execute it in the environment.

        Returns:
            Tuple ``(state, action, reward, done, next_state)`` where ``state``
            is the input observation, unchanged.
        """
        a = self.predict(sess, state)
        next_state, r, d, _ = self.env.step(a)
        return state, a, r, d, next_state

    def get_value(self, sess, state):
        """Return the value head's output for ``state`` (one row per fed frame)."""
        return sess.run(self.value, feed_dict={self.X: state})

    def train(self, sess, states, actions, target_values, advantages):
        """Run one update of the global network from a rollout.

        Args:
            sess: Session to run the ops in.
            states: Stacked observations of the rollout.
            actions: Action index taken at each step.
            target_values: Regression target for the value head at each step.
            advantages: Advantage estimate weighting the log-likelihood at each step.

        Returns:
            Tuple ``(value_loss, policy_loss, entropy_loss)`` for this batch.
        """
        _, value_loss, policy_loss, entropy_loss = sess.run(
            (self.global_update, self.value_loss, self.policy_loss, self.entropy_loss),
            feed_dict={
                self.X: states,
                self.actions: actions,
                self.target_values: target_values,
                self.advantages: advantages,
            })
        return value_loss, policy_loss, entropy_loss

    def update_to_global(self, sess):
        """Overwrite this agent's weights with the global ones (no-op for the global agent)."""
        if self.scope_name != GLOBAL_SCOPE:
            sess.run(self.update_ops)

In [6]:
# Total number of environment steps taken by all workers; incremented under the shared lock.
global_counter = 0


class Worker:
    """Runs one agent in its own environment and trains the global network.

    The worker repeatedly syncs the agent with the global weights, collects a
    rollout of at most ``MAX_STEPS`` steps, computes discounted returns and
    advantages, and pushes one gradient update through ``agent.train``.
    """

    def __init__(self, agent):
        self.agent = agent
        self.summary_writer = tf.summary.FileWriter(self.agent.scope_name)

    @staticmethod
    def _discounted_returns(rewards, bootstrap_value, discount):
        """Return n-step discounted returns aligned with ``rewards`` (forward time order).

        ``returns[t] = rewards[t] + discount * returns[t + 1]`` with the value
        after the last step taken to be ``bootstrap_value``.
        """
        returns = [0.0] * len(rewards)
        running = bootstrap_value
        for idx in reversed(range(len(rewards))):
            running = rewards[idx] + discount * running
            returns[idx] = running
        return returns

    def _write_summary(self, step, stats):
        """Write the mean of every ``(tag, series)`` pair as one summary at ``step``."""
        summary = tf.Summary()
        for tag, series in stats:
            summary.value.add(tag=tag, simple_value=float(sum(series) / len(series)))
        self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def work(self, sess, optimizer, thread_lock):
        """Collect rollouts and train until ``global_counter`` exceeds ``MAX_ITERATIONS``.

        Args:
            sess: Session shared by all workers.
            optimizer: Unused; kept so existing callers keep working.
            thread_lock: Lock guarding updates of ``global_counter``.
        """
        global global_counter

        print('worker starting agent:', self.agent.scope_name)
        done = True
        s = None
        episode_reward = 0
        timestep = 0
        episode_counter = 0
        value_losses, policy_losses, entropy_losses = [], [], []
        last_rewards, last_frames, last_values, last_advantages = [], [], [], []
        # Series averaged into each summary; cleared in place after every write.
        stats = (
            ('Performance/Reward', last_rewards),
            ('Performance/Length', last_frames),
            ('Performance/Values mean', last_values),
            ('Performance/Advantage mean', last_advantages),
            ('Losses/Value Loss', value_losses),
            ('Losses/Policy Loss', policy_losses),
            ('Losses/Entropy', entropy_losses),
        )

        with sess.as_default(), sess.graph.as_default():
            while global_counter <= MAX_ITERATIONS:
                self.agent.update_to_global(sess)
                if done or timestep > MAX_EP_LENGTH:
                    # Skip bookkeeping on the very first pass, when no episode
                    # has been played yet; otherwise a phantom zero-reward
                    # episode would skew the logged averages.
                    if s is not None:
                        last_rewards.append(episode_reward)
                        last_frames.append(timestep)
                    if episode_counter > 0 and episode_counter % 5 == 0 and len(value_losses) > 0:
                        self._write_summary(episode_counter, stats)
                        for _, series in stats:
                            del series[:]
                    s = self.agent.env.reset()
                    done = False
                    episode_reward = 0
                    timestep = 0
                    episode_counter += 1

                states, actions, rewards, values = [], [], [], []
                while len(states) < MAX_STEPS and not done:
                    s, a, r, done, ns = self.agent.act(sess, s)
                    # Value estimate of the state the action was taken in.
                    val = float(np.ravel(self.agent.get_value(sess, s))[0])
                    states.append(s)
                    actions.append(a)
                    rewards.append(r)
                    values.append(val)
                    last_values.append(val)
                    with thread_lock:
                        global_counter += 1
                        steps_so_far = global_counter
                    if steps_so_far % 10000 == 0 and self.agent.scope_name == "local0":
                        print('agent local0 at iteration', steps_so_far)
                    episode_reward += r
                    timestep += 1
                    # Advance to the observation produced by the action; without
                    # this the agent would keep acting on the first frame.
                    s = ns

                # Bootstrap from the value of the state following the rollout,
                # unless the episode ended there.
                bootstrap = 0.0
                if not done:
                    bootstrap = float(np.ravel(self.agent.get_value(sess, s))[0])

                target_values = self._discounted_returns(rewards, bootstrap, DISCOUNT)
                advantages = [ret - val for ret, val in zip(target_values, values)]
                last_advantages.extend(advantages)

                value_loss, policy_loss, entropy_loss = self.agent.train(
                    sess,
                    np.vstack(states),
                    np.asarray(actions).ravel(),
                    np.asarray(target_values, dtype=np.float32),
                    np.asarray(advantages, dtype=np.float32))

                value_losses.append(value_loss)
                policy_losses.append(policy_loss)
                entropy_losses.append(entropy_loss)
                

In [7]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists


In [8]:
import time

# Threads created by later cells are collected here so they can all be joined.
worker_threads = []

# The global agent only holds the shared weights: under GLOBAL_SCOPE no loss or
# update ops are built, so the optimizer passed here is never applied.
env_global = EnvWrapper(ENV_NAME)
global_agent = Agent(env_global, GLOBAL_SCOPE, tf.train.AdamOptimizer())
# Single session shared by every worker thread and the checkpoint thread.
sess = tf.Session()

def global_saving_thread(agent, sess):
    """Periodically checkpoint the session while training is in progress.

    A checkpoint is written every 15 minutes into ``models/``, rotating over
    ``MAX_MODELS`` file names (``model_0`` .. ``model_4``) so that only the
    most recent few are kept. The thread stops once ``global_counter``
    exceeds ``MAX_ITERATIONS``.

    Args:
        agent: Unused; kept so existing callers keep working.
        sess: Session whose variables are saved.
    """
    MAX_MODELS = 5
    SAVE_INTERVAL_SECONDS = 15 * 60
    cnt_model = 0

    with sess.as_default(), sess.graph.as_default():

        saver = tf.train.Saver()

        while global_counter <= MAX_ITERATIONS:
            model_name = 'model_' + str(cnt_model % MAX_MODELS)
            print("Current model save name:", model_name)
            saver.save(sess, "models/" + model_name + ".ckpt")
            # Advance the slot; without this every save overwrote model_0.
            cnt_model += 1

            # Wait in short slices so the thread exits soon after training
            # finishes instead of blocking the joining thread for a full interval.
            wake_at = time.time() + SAVE_INTERVAL_SECONDS
            while time.time() < wake_at and global_counter <= MAX_ITERATIONS:
                time.sleep(1)

building model


In [None]:
# One optimizer instance is shared by every local agent created below.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

# Number of worker threads (each gets its own environment and local agent).
cnt_threads = 20
# Passed to every worker; guards increments of the shared step counter.
thread_lock = threading.Lock()

def worker_fun(worker, sess, optimizer, thread_lock):
    """Thread entry point: run ``worker``'s training loop with the shared resources."""
    worker.work(sess=sess, optimizer=optimizer, thread_lock=thread_lock)

# Build every local agent (scopes 'local0', 'local1', ...) before any thread
# starts, so the whole graph exists when variables are initialised below.
for i in range(cnt_threads):
    env = EnvWrapper(ENV_NAME)
    worker = Worker(Agent(env, 'local' + str(i), optimizer))
    t = threading.Thread(target=worker_fun, args=(worker, sess, optimizer, thread_lock))
    worker_threads.append(t)
    time.sleep(0.5)

# Initialise all variables once, after graph construction and before training starts.
sess.run(tf.global_variables_initializer())
# Start the workers one by one with a short pause between them.
for t in worker_threads:
    t.start()
    time.sleep(0.5)
    
# The checkpoint thread is started separately (it is appended only after the
# start loop above) and then joined together with the workers.
global_t = threading.Thread(target=global_saving_thread, args=(global_agent, sess))

worker_threads.append(global_t)
global_t.start()

# Block this cell until every worker and the checkpoint thread have finished.
for t in worker_threads:
    t.join()

building model
building agent: local0
building model
building agent: local1
building model
building agent: local2
building model
building agent: local3
building model
building agent: local4
building model
building agent: local5
building model
building agent: local6
building model
building agent: local7
building model
building agent: local8
building model
building agent: local9
building model
building agent: local10
building model
building agent: local11
building model
building agent: local12
building model
building agent: local13
building model
building agent: local14
building model
building agent: local15
building model
building agent: local16
building model
building agent: local17
building model
building agent: local18
building model
building agent: local19
worker starting agent: local0
worker starting agent: local1
worker starting agent: local2
worker starting agent: local3
worker starting agent: local4
worker starting agent: local5
worker starting agent: local6
worker starting agen