# Unity ML Agents
## Imitation Learning (Parallel Behavioral Cloning)

### 1. Load dependencies

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import tensorflow as tf

from unityagents import UnityEnvironment

%matplotlib inline

### 2. Set environment parameters

In [None]:
# Training-mode flag. NOTE(review): no visible cell reads this variable; the
# resets below pass their own train_mode values -- confirm whether it is needed.
train_mode = True

# Passed as `file_name` to UnityEnvironment when the environment is started.
env_name = "race_i"

### 3. Start the environment

In [None]:
# Start the Unity environment from the binary named by `env_name`.
env = UnityEnvironment(file_name=env_name)

# Print the environment's string representation to inspect its parameters.
print(env)

In [None]:
# Reset once (with train_mode=False) to obtain the initial info for both brains.
# 'BrainE' is the brain whose states are fed to the network in the training cell;
# 'BrainP' is the brain whose states and previous actions are recorded as expert data.
brain_dict = env.reset(train_mode=False)
E, P = brain_dict['BrainE'], brain_dict['BrainP']

# Network input/output sizes are taken from BrainE's parameters.
brain_info = env.brains['BrainE']
a_size = brain_info.action_space_size
# Input width: per-observation state size times the number of stacked states.
s_size = brain_info.stacked_states * brain_info.state_space_size

### 5. Define and train the imitation network

In [None]:
class ImitationNN(object):
    """Fully connected behavioral-cloning network (TensorFlow 1.x graph API).

    The graph consists of four ELU dense layers of width ``h_size``, a dropout
    layer, and a linear output layer with ``action_size`` units.

    Args:
        state_size: Width of the flattened state vector fed to ``self.state``.
        action_size: Number of output units (discrete actions, or continuous
            action dimensions).
        h_size: Number of units in each hidden layer.
        lr: Learning rate for the Adam optimizer.
        action_type: ``"discrete"`` selects the softmax / cross-entropy head;
            any other value selects the regression (squared-error) head.

    Attributes exposed for training and inference:
        state: Input placeholder, shape ``[None, state_size]``.
        true_action: Target placeholder; int32 ``[None]`` for discrete actions,
            float32 ``[None, action_size]`` otherwise.
        sample_action: Action op; a multinomial sample from the logits for
            discrete actions, the raw output layer otherwise.
        loss: Summed (not averaged) loss over the batch.
        update: Adam minimization step for ``loss``.
        action_percent: (discrete only) fraction of the batch where the argmax
            prediction equals ``true_action``.
    """

    def __init__(self, state_size, action_size, h_size, lr, action_type):
        self.state = tf.placeholder(shape=[None, state_size], dtype=tf.float32)
        self.h_1 = tf.layers.dense(self.state, h_size, activation=tf.nn.elu)
        self.h_2 = tf.layers.dense(self.h_1, h_size, activation=tf.nn.elu)
        self.h_3 = tf.layers.dense(self.h_2, h_size, activation=tf.nn.elu)
        self.h_4 = tf.layers.dense(self.h_3, h_size, activation=tf.nn.elu)
        # NOTE(review): tf.layers.dropout defaults to training=False, so as
        # written this layer passes its input through unchanged. Enabling it
        # would need a `training` flag fed at update time -- confirm intent.
        self.h_4d = tf.layers.dropout(self.h_4, 0.5)
        self.logits = tf.layers.dense(self.h_4d, action_size, activation=None)

        if action_type == "discrete":
            self.action_probs = tf.nn.softmax(self.logits)
            self.sample_action = tf.multinomial(self.logits, 1)
            self.true_action = tf.placeholder(shape=[None], dtype=tf.int32)
            self.action_oh = tf.one_hot(self.true_action, action_size)
            # Cross-entropy against the one-hot target; the epsilon keeps
            # log() finite when a probability underflows to zero.
            self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) * self.action_oh)

            # Accuracy of the greedy prediction. This previously compared
            # against the undefined attribute `self.action`, which raised
            # AttributeError while building the discrete graph.
            predicted_action = tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32)
            self.action_percent = tf.reduce_mean(
                tf.cast(tf.equal(predicted_action, self.true_action), tf.float32))
        else:
            self.sample_action = self.logits
            self.true_action = tf.placeholder(shape=[None, action_size], dtype=tf.float32)
            self.loss = tf.reduce_sum(tf.squared_difference(self.true_action, self.sample_action))

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)

In [None]:
# Start from an empty default graph so re-running this cell does not stack
# a second copy of the network on top of the first.
tf.reset_default_graph()

network = ImitationNN(
    state_size=s_size,
    action_size=a_size,
    h_size=128,
    lr=1e-4,
    action_type=brain_info.action_space_type,
)

# Number of environment steps taken by the training loop.
num_steps = 2000
# Number of (state, action) samples per gradient update.
batch_size = 64
# NOTE(review): test_episodes and test_frequency are not read by any visible cell.
test_episodes = 1
test_frequency = 10
# Passed as `train_mode` when the environment is reset before training.
fast_testing = False

In [None]:
# Open the session used by the training cell for both inference and updates.
sess = tf.InteractiveSession()
# Variable-initialization op; it is only created here and is run by the
# training cell via sess.run(init).
init = tf.global_variables_initializer()

In [None]:
# Online behavioral cloning: at every environment step the network ('BrainE')
# acts, the expert's ('BrainP') state/action pair is recorded, and the network
# is trained on shuffled mini-batches drawn from everything recorded so far.
losses = []
percentages = []
all_rewards = []

# Upper bound on gradient updates performed after each environment step.
max_batches_per_step = 25

sess.run(init)
test_rewards = []
brain_dict = env.reset(train_mode=fast_testing)
E = brain_dict['BrainE']
P = brain_dict['BrainP']

is_discrete = brain_info.action_space_type == "discrete"
# Discrete actions are stored as one index per row; continuous actions need
# one column per action dimension to match the network's target placeholder.
# NOTE(review): assumes previous_actions has shape (n_agents, action_width)
# -- confirm against the unityagents version in use.
action_width = 1 if is_discrete else a_size

expert_states = np.zeros([0, s_size])
expert_actions = np.zeros([0, action_width])
rewards = 0
for i in range(num_steps):
    agent_action = sess.run(network.sample_action, feed_dict={network.state: E.states})
    brains_1 = env.step(agent_action[0])
    E_1 = brains_1['BrainE']
    P_1 = brains_1['BrainP']
    # Record the expert's pre-step state together with the action reported
    # for that step, so row k of both arrays describes the same transition.
    # The states used to be appended once before the loop and again after
    # training, which duplicated the first state and shifted every later
    # action one step away from its state.
    expert_states = np.append(expert_states, P.states, axis=0)
    expert_actions = np.append(expert_actions, P_1.previous_actions, axis=0)
    rewards += E_1.rewards[0]
    if len(expert_actions) > 1:
        # Shuffle states and actions with the same permutation.
        s = np.arange(len(expert_states))
        np.random.shuffle(s)
        shuffle_states = expert_states[s]
        shuffle_actions = expert_actions[s]
        batch_losses = []
        for j in range(min(len(expert_states) // batch_size, max_batches_per_step)):
            batch_states = shuffle_states[j * batch_size:(j + 1) * batch_size]
            batch_actions = shuffle_actions[j * batch_size:(j + 1) * batch_size]
            if is_discrete:
                # The discrete target placeholder is rank 1, so flatten (batch, 1).
                feed_dict = {network.state: batch_states,
                             network.true_action: np.reshape(batch_actions, -1)}
            else:
                feed_dict = {network.state: batch_states,
                             network.true_action: batch_actions}
            loss, _ = sess.run([network.loss, network.update], feed_dict=feed_dict)
            batch_losses.append(loss)
        # Until a full batch has been collected no update runs; skip the entry
        # instead of logging the NaN that np.mean([]) would produce.
        if batch_losses:
            losses.append(np.mean(batch_losses))
    E = E_1
    P = P_1

### 6. Close the environment when finished
When we are finished using an environment, we can close it with the function below.

In [None]:
# Close the Unity environment started earlier in this notebook.
env.close()