# Generative Adversarial Imitation Learning

In [1]:
import numpy as np
import tensorflow as tf

from mlagents.envs import UnityEnvironment
from mlagents.trainers.demo_loader import load_demonstration

## Load Data

In [2]:
def make_data(brain_infos):
    states, actions = [], []
    for idx, brain_info in enumerate(brain_infos):
        if idx > len(brain_infos) - 2:
            break
        current_brain_info = brain_infos[idx]
        next_brain_info = brain_infos[idx + 1]
        states.append(current_brain_info.vector_observations[0])
        actions.append(next_brain_info.previous_vector_actions[0])
    return np.array(states), np.array(actions)

def load_data(location):
    bp, infos, num_steps = load_demonstration(location)
    infos = infos[:max_steps+1]
    states, actions = make_data(infos)
    return bp, states, actions

In [3]:
max_steps = 300
expert_demo = './demos/ExpertBall.demo'
policy_demo = './demos/HeuristicBall.demo'

In [4]:
e_bp, e_states, e_actions = load_data(expert_demo)
p_bp, p_states, p_actions = load_data(policy_demo)

## Define Discrimiator

In [5]:
class Discriminator(object):
    def __init__(self, s_size, a_size, h_size, lr):
        self.h_size = h_size
        self.make_inputs(s_size, a_size)
        self.make_network()
        self.make_loss(lr)
        
    def make_inputs(self, s_size, a_size):
        self.state_in_expert = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        self.action_in_expert = tf.placeholder(shape=[None, a_size], dtype=tf.float32)
        self.state_in_policy = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        self.action_in_policy = tf.placeholder(shape=[None, a_size], dtype=tf.float32)
        
    def make_discriminator(self, state_in, action_in, reuse):
        with tf.variable_scope("discriminator"):
            concat_input = tf.concat([state_in, action_in], axis=1)
            
            hidden_1 = tf.layers.dense(
                concat_input, self.h_size, activation=tf.nn.elu,
                name="d_hidden_1", reuse=reuse)
            
            hidden_2 = tf.layers.dense(
                hidden_1, self.h_size, activation=tf.nn.elu, 
                name="d_hidden_2", reuse=reuse)
            
            d_value = tf.layers.dense(hidden_2, 1, activation=tf.nn.sigmoid, 
                                name="d_value", reuse=reuse)
            return d_value
        
    def make_loss(self, learning_rate):
        self.de = tf.reduce_mean(self.d_expert)
        self.dp = tf.reduce_mean(self.d_policy)
        self.d_loss = -tf.reduce_mean(tf.log(self.d_expert + 1e-10) + tf.log(1.0 - self.d_policy + 1e-10))
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update_batch = optimizer.minimize(self.d_loss)
        
    def make_network(self):
        self.d_expert = self.make_discriminator(self.state_in_expert, self.action_in_expert, False)
        self.d_policy = self.make_discriminator(self.state_in_policy, self.action_in_policy, True)

## Train Model

In [6]:
batch_size = 32
num_epoch = 100
s_size = e_bp.vector_observation_space_size
a_size = e_bp.vector_action_space_size[0]
h_size = 64
lr = 1e-4

In [7]:
tf.reset_default_graph()

disc = Discriminator(s_size, a_size, h_size, lr)

init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init)

In [8]:
def random_ordering(array):
    s = np.arange(len(array))
    np.random.shuffle(s)
    return s

def shuffle_buffer(states, actions):
    ordering = random_ordering(states)
    shuffle_states = states[ordering]
    shuffle_actions = actions[ordering]
    return shuffle_states, shuffle_actions

def get_batch(index, batch_size, a_size, states, actions):
    batch_states = states[index*batch_size:(index+1)*batch_size]
    batch_actions = actions[index*batch_size:(index+1)*batch_size]
    batch_actions = np.reshape(batch_actions, [-1, a_size])
    return batch_states, batch_actions

In [9]:
for i in range(num_epoch):
    e_states, e_actions = shuffle_buffer(e_states, e_actions)    
    p_states, p_actions = shuffle_buffer(p_states, p_actions)
    p_batch_reward, e_batch_reward = [], []

    for j in range(len(p_states)//batch_size):
        e_batch_states, e_batch_actions = get_batch(j, batch_size, a_size, e_states, e_actions)
        p_batch_states, p_batch_actions = get_batch(j, batch_size, a_size, p_states, p_actions)
        
        feed_dict = {disc.state_in_expert: e_batch_states, 
                     disc.state_in_policy: p_batch_states,
                     disc.action_in_expert: e_batch_actions, 
                     disc.action_in_policy: p_batch_actions}
        
        run_list = [disc.de, disc.dp, disc.d_loss, disc.update_batch]
        d_e, d_p, loss, _ = sess.run(run_list, feed_dict=feed_dict)
        e_batch_reward.append(d_e)
        p_batch_reward.append(d_p)
    print("Epoch: {}".format(i), 
          "Expert Reward: {:.3f}".format(np.mean(e_batch_reward)), 
          "Policy Reward: {:.3f}".format(np.mean(p_batch_reward)), 
          "Model Loss: {:.3f}".format(loss))

Epoch: 0 Expert Reward: 0.554 Policy Reward: 0.621 Model Loss: 1.665
Epoch: 1 Expert Reward: 0.544 Policy Reward: 0.578 Model Loss: 1.504
Epoch: 2 Expert Reward: 0.536 Policy Reward: 0.532 Model Loss: 1.437
Epoch: 3 Expert Reward: 0.529 Policy Reward: 0.488 Model Loss: 1.309
Epoch: 4 Expert Reward: 0.524 Policy Reward: 0.448 Model Loss: 1.213
Epoch: 5 Expert Reward: 0.522 Policy Reward: 0.412 Model Loss: 1.114
Epoch: 6 Expert Reward: 0.523 Policy Reward: 0.380 Model Loss: 1.110
Epoch: 7 Expert Reward: 0.525 Policy Reward: 0.359 Model Loss: 1.072
Epoch: 8 Expert Reward: 0.529 Policy Reward: 0.338 Model Loss: 1.045
Epoch: 9 Expert Reward: 0.534 Policy Reward: 0.318 Model Loss: 0.983
Epoch: 10 Expert Reward: 0.540 Policy Reward: 0.305 Model Loss: 1.017
Epoch: 11 Expert Reward: 0.545 Policy Reward: 0.290 Model Loss: 0.982
Epoch: 12 Expert Reward: 0.550 Policy Reward: 0.281 Model Loss: 0.927
Epoch: 13 Expert Reward: 0.558 Policy Reward: 0.270 Model Loss: 0.907
Epoch: 14 Expert Reward: 0.564