# Generative Adversarial Imitation Learning

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import tensorflow as tf

from unityagents import UnityEnvironment

%matplotlib inline

## Generate Random Policy

### Load Environment

In [2]:
env_name = "Wall" # Name of the Unity environment binary to launch
train_mode = True # Whether to run the environment in training or inference mode

In [3]:
env = UnityEnvironment(file_name=env_name)

# Examine environment parameters
print(str(env))

INFO:unityagents:
'Academy' started successfully!


Unity Academy name: Academy
        Number of brains: 1
        Reset Parameters :
		min_wall_height -> 5.5
		max_wall_height -> 6.0
Unity brain name: Brain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 16
        Action space type: discrete
        Action space size (per agent): 6
        Memory space size (per agent): 0
        Action descriptions: , , , , , 


In [4]:
brain_info = env.brains['Brain']
s_size = brain_info.state_space_size
a_size = brain_info.action_space_size

### Collect Training Data

In [5]:
num_steps = 1000

In [6]:
p_states = np.zeros([0, s_size])
p_actions = np.zeros([0, 1])

In [7]:
brain_dict = env.reset(train_mode=True)
num_agents = len(brain_dict['Brain'].agents)
for step in range(num_steps):
    new_action = np.random.randint(0, a_size, size=[num_agents, 1])
    info = env.step(new_action)['Brain']
    p_actions = np.append(p_actions, new_action, axis=0)
    p_states = np.append(p_states, info.states, axis=0)

## Load Expert Data

In [9]:
data_path = './wall_imitation'

save_data = pickle.load(open(data_path+"/data.p", "rb"))
e_states = save_data["states"][0:num_agents*num_steps]
e_actions = save_data["actions"][0:num_agents*num_steps]

In [10]:
p_states = (p_states - np.mean(p_states)) / np.std(p_states)
e_states = (e_states - np.mean(e_states)) / np.std(e_states)

## Discrimiator

In [42]:
class Discriminator(object):
    def __init__(self, s_size, a_size, h_size, lr):
        self.state_in_expert = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        self.action_in_expert = tf.placeholder(shape=[None], dtype=tf.int32)
        self.action_expert = tf.one_hot(self.action_in_expert, a_size)
        
        self.state_in_policy = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        self.action_in_policy = tf.placeholder(shape=[None], dtype=tf.int32)
        self.action_policy = tf.one_hot(self.action_in_policy, a_size)
        self.s_size = s_size
        self.h_size = h_size
        self.lr = lr
        self.update()
        
    def get_d(self, state_in, action_in, reuse):
        with tf.variable_scope("discriminator"):
            concat_input = tf.concat([state_in, action_in], axis=1)
            hidden_1 = tf.layers.dense(concat_input, self.h_size, activation=tf.nn.tanh, use_bias=False, name="d_hidden_1", reuse=reuse)
            hidden_2 = tf.layers.dense(hidden_1, self.h_size, activation=tf.nn.tanh, use_bias=False, name="d_hidden_2", reuse=reuse)
            d = tf.layers.dense(hidden_2, 1, activation=tf.nn.sigmoid, use_bias=False, name="d_out", reuse=reuse)
            return d
        
    def update(self):
        self.d_expert = self.get_d(self.state_in_expert, self.action_expert, False)
        self.d_policy = self.get_d(self.state_in_policy, self.action_policy, True)
        self.de = tf.reduce_mean(self.d_expert)
        self.dp = tf.reduce_mean(self.d_policy)
        self.d_loss = -tf.reduce_mean(tf.log(self.d_expert + 1e-10) + tf.log(1 - self.d_policy + 1e-10))
        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        self.update_batch = optimizer.minimize(self.d_loss)

In [43]:
batch_size = 32
num_epoch = 10
s_size = 16
a_size = 5
h_size = 32
lr = 1e-4

In [44]:
tf.reset_default_graph()

disc = Discriminator(s_size, a_size, h_size, lr)

init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init)

In [45]:
def random_ordering(array):
    s = np.arange(len(array))
    np.random.shuffle(s)
    return s

In [46]:
for i in range(num_epoch):
    e_s = random_ordering(e_states)
    e_shuffle_states = e_states[e_s]
    e_shuffle_actions = e_actions[e_s]
    e_batch_reward = []
    
    p_s = random_ordering(p_states)
    p_shuffle_states = p_states[p_s]
    p_shuffle_actions = p_actions[p_s]
    p_batch_reward = []

    for j in range(len(p_states)//batch_size):
        e_batch_states = e_shuffle_states[j*batch_size:(j+1)*batch_size]
        e_batch_actions = e_shuffle_actions[j*batch_size:(j+1)*batch_size]
        e_batch_actions = np.reshape(e_batch_actions, [-1])
        
        p_batch_states = p_shuffle_states[j*batch_size:(j+1)*batch_size]
        p_batch_actions = p_shuffle_actions[j*batch_size:(j+1)*batch_size]
        p_batch_actions = np.reshape(p_batch_actions, [-1])
        
        fd = {disc.state_in_expert: e_batch_states, disc.state_in_policy: p_batch_states,
              disc.action_in_expert: e_batch_actions, disc.action_in_policy: p_batch_actions}
        d_e, d_p, loss, _ = sess.run([disc.de, disc.dp, disc.d_loss, disc.update_batch], feed_dict=fd)
        e_batch_reward.append(d_e)
        p_batch_reward.append(d_p)
    print(np.mean(e_batch_reward), np.mean(p_batch_reward), loss)

0.675583 0.331891 0.655162
0.851769 0.147071 0.15693
0.96025 0.0396672 0.0444466
0.985637 0.0143886 0.130539
0.992664 0.00730666 0.00832548
0.995493 0.00449424 0.00393074
0.99703 0.0029626 0.00323037
0.998022 0.00198373 0.00258053
0.998582 0.00139701 0.00116604
0.998936 0.000997169 0.000661084
