# Unity ML-Agents
## Imitation Learning (Behavioral Cloning)

### 1. Load dependencies

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import tensorflow as tf

from unityagents import UnityEnvironment

%matplotlib inline

### 2. Set environment parameters

In [None]:
# Notebook-level configuration for the Unity environment.
env_name = "WallBC" # Name of the Unity environment binary; passed as file_name to UnityEnvironment
train_mode = True # True selects training mode, False selects inference mode

### 3. Start the environment

In [None]:
# Launch the Unity binary named by `env_name` and keep a handle to it.
env = UnityEnvironment(file_name=env_name)

# Display the environment's string summary (print() applies str() itself).
print(env)

In [None]:
# Reset with train_mode=False and keep the per-brain info objects that come back.
brain_dict = env.reset(train_mode=False)
E, I = brain_dict['BrainE'], brain_dict['BrainI']

# State and action dimensions are read from BrainE's entry in env.brains;
# they size the data buffers and the network built in later cells.
brain_info = env.brains['BrainE']
s_size, a_size = brain_info.state_space_size, brain_info.action_space_size

In [None]:
# Directory that holds the pickled demonstration data ("data.p").
data_path = 'wall_human'

### 4A. Collect Training Data

In [None]:
# Number of environment steps to record during data collection.
num_steps = 2000

In [None]:
# Zero-row buffers: one column per state feature, a single column for actions.
states = np.empty((0, s_size))
actions = np.empty((0, 1))

# Record the states currently held by BrainI as the first rows of the buffer.
states = np.concatenate((states, I.states), axis=0)

In [None]:
# Step the environment `num_steps` times, logging BrainI's previous actions
# after every step and its resulting states after every step but the last.
for step in range(num_steps):
    brain_dict = env.step(0)
    E, I = brain_dict['BrainE'], brain_dict['BrainI']
    actions = np.concatenate((actions, I.previous_actions), axis=0)
    # The states were already seeded once before this loop, so the final
    # step's states are left out to match the number of action appends.
    if step + 1 < num_steps:
        states = np.concatenate((states, I.states), axis=0)

In [None]:
# Create the output directory the first time data is saved.
if not os.path.exists(data_path):
    os.makedirs(data_path)

# Bundle both arrays under the keys the loading cell reads back.
save_data = {"states": states, "actions": actions}

# A context manager guarantees the file is flushed and closed, even if
# pickling raises part-way through.
with open(os.path.join(data_path, "data.p"), "wb") as data_file:
    pickle.dump(save_data, data_file)

### 4B. Load Data

In [None]:
# NOTE: unpickling can execute arbitrary code; only load data files that
# you recorded yourself or otherwise trust.
# The context manager closes the file handle as soon as loading finishes.
with open(os.path.join(data_path, "data.p"), "rb") as data_file:
    save_data = pickle.load(data_file)

states = save_data["states"]
actions = save_data["actions"]

### 5. Train NN

In [None]:
class ImitationNN(object):
    """Feed-forward policy network trained by behavioral cloning.

    Four ELU hidden layers of width ``h_size`` feed a linear layer that emits
    one logit per discrete action. The loss is the summed cross-entropy
    between the softmax of those logits and the demonstrated action.

    Args:
        state_size: Length of each state vector fed to ``state``.
        action_size: Number of discrete actions (width of ``logits``).
        h_size: Number of units in each hidden layer.
        lr: Learning rate passed to the Adam optimizer.

    Graph handles exposed as attributes:
        state: float32 placeholder of shape ``[None, state_size]``.
        training: scalar bool placeholder that defaults to False. Feed True
            alongside ``update`` to make the dropout layer active; when it is
            not fed, dropout is a pass-through.
        logits / action_probs: unnormalized scores and their softmax.
        sample_action: one action index per row, sampled from ``logits``.
        action: int32 placeholder of shape ``[None]`` with demonstrated actions.
        loss: summed cross-entropy over the batch.
        action_percent: fraction of rows whose argmax equals ``action``.
        update: Adam step minimizing ``loss``.
    """

    def __init__(self, state_size, action_size, h_size, lr):
        self.state = tf.placeholder(shape=[None, state_size], dtype=tf.float32)
        # tf.layers.dropout only drops units when `training` is true, and its
        # default is False. Expose the switch so dropout can be enabled for
        # training runs while staying off for inference by default.
        self.training = tf.placeholder_with_default(False, shape=[])

        self.h_1 = tf.layers.dense(self.state, h_size, activation=tf.nn.elu)
        self.h_2 = tf.layers.dense(self.h_1, h_size, activation=tf.nn.elu)
        self.h_3 = tf.layers.dense(self.h_2, h_size, activation=tf.nn.elu)
        self.h_4 = tf.layers.dense(self.h_3, h_size, activation=tf.nn.elu)
        self.h_4d = tf.layers.dropout(self.h_4, 0.5, training=self.training)
        self.logits = tf.layers.dense(self.h_4d, action_size, activation=None)
        self.action_probs = tf.nn.softmax(self.logits)
        self.sample_action = tf.multinomial(self.logits, 1)

        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        self.action_oh = tf.one_hot(self.action, action_size)
        # Cross-entropy summed over the batch; the 1e-10 offset keeps log()
        # finite when a probability underflows to zero.
        self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10)*self.action_oh)

        # Accuracy of the greedy (argmax) action against the demonstration.
        self.action_percent = tf.reduce_mean(tf.cast(
            tf.equal(tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32),
                     self.action), tf.float32))

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)

In [None]:
# Training schedule: passes over the data, samples per gradient step, and
# evaluation episodes run after each pass.
num_epoch = 100
batch_size = 128
test_episodes = 50

# Clear the default graph, then build the policy network in it.
tf.reset_default_graph()
network = ImitationNN(state_size=s_size, action_size=a_size, h_size=128, lr=1e-4)

In [None]:
# Op that initializes all global variables; it is executed by the training cell.
init = tf.global_variables_initializer()
# Session used to run training and action sampling.
sess = tf.InteractiveSession()

In [None]:
# Per-epoch history: mean training loss, mean training accuracy, and mean
# evaluation reward.
losses = []
percentages = []
all_rewards = []

# Only full mini-batches are consumed below. With fewer samples than one
# batch no update would ever run and the epoch means would be NaN, so fail
# loudly instead of silently "training" on nothing.
num_batches = len(states) // batch_size
if num_batches == 0:
    raise ValueError("Need at least {} samples (batch_size) to train, got {}"
                     .format(batch_size, len(states)))

sess.run(init)
for i in range(num_epoch):
    # Draw a fresh random ordering of the dataset for this epoch.
    s = np.arange(len(states))
    np.random.shuffle(s)
    shuffle_states = states[s]
    shuffle_actions = actions[s]

    # --- Supervised update on the demonstration data ---
    batch_losses = []
    batch_percentages = []
    for j in range(num_batches):
        batch = slice(j * batch_size, (j + 1) * batch_size)
        # Actions are stored as a column; the network expects a flat vector.
        feed_dict = {network.state: shuffle_states[batch],
                     network.action: np.reshape(shuffle_actions[batch], -1)}
        loss, percent, _ = sess.run(
            [network.loss, network.action_percent, network.update],
            feed_dict=feed_dict)
        batch_losses.append(loss)
        batch_percentages.append(percent)

    # --- Roll out the current policy in the environment ---
    test_rewards = []
    for j in range(test_episodes):
        # Use the notebook-level `train_mode` setting rather than a
        # hard-coded literal, so the configuration cell controls evaluation.
        brain_dict = env.reset(train_mode=train_mode)
        E = brain_dict['BrainE']
        rewards = 0
        # Only the first agent of BrainE is tracked for termination/reward.
        while not E.local_done[0]:
            action = sess.run(network.sample_action, feed_dict={network.state: E.states})
            E = env.step(action[0][0])['BrainE']
            rewards += E.rewards[0]
        test_rewards.append(rewards)

    # Compute each epoch summary once and reuse it for logging and history.
    epoch_loss = np.mean(batch_losses)
    epoch_percent = np.mean(batch_percentages)
    epoch_reward = np.mean(test_rewards)
    print("Epoch Loss: {} -- Epoch Percent: {} -- Test Rewards: {} -- {}"
          .format(epoch_loss, epoch_percent, epoch_reward, i))
    losses.append(epoch_loss)
    percentages.append(epoch_percent)
    all_rewards.append(epoch_reward)
plt.plot(all_rewards)

### 6. Close the environment when finished
When we are finished using an environment, we can close it with the function below.

In [None]:
# Close the Unity environment now that we are finished with it.
env.close()