# Unity ML Agents
## Proximal Policy Optimization (PPO)
Contains an implementation of PPO as described [here](https://arxiv.org/abs/1707.06347).

In [None]:
import numpy as np
import os
import tensorflow as tf
import re

from ppo.history import *
from ppo.models import *
from ppo.trainer import Trainer
from unityagents import *

### Hyperparameters

In [None]:
### General parameters
max_steps = 5e5  # Set maximum number of steps to run environment.
run_path = "ppo-hunter13"  # The sub-directory name for model and summary statistics
load_model = False  # Whether to load a saved model.
train_model = True  # Whether to train the model.
summary_freq = 1000  # Frequency at which to save training statistics.
save_freq = 5000  # Frequency at which to save model.
env_name = "hunter"  # Name of the training environment file.
curriculum_file = None  # Passed to UnityEnvironment as `curriculum`; None leaves it unset.

### Algorithm-specific parameters for tuning
gamma = 0.997  # Reward discount rate.
lambd = 0.95  # Lambda parameter for GAE.
time_horizon = 2048  # How many steps to collect per agent before adding to buffer.
beta = 1e-3  # Strength of entropy regularization
num_epoch = 5  # Number of gradient descent steps per batch of experiences.
epsilon = 0.2  # Acceptable threshold around ratio of old and new policy probabilities.
buffer_size = 10240  # How large the experience buffer should be before gradient descent.
learning_rate = 5e-5  # Model learning rate.
hidden_units = 64  # Number of units in hidden layer.
batch_size = 1024  # How many experiences per gradient descent update step.

### Logging dictionary for hyperparameters
# Every key is spelled exactly like the variable it records so that logged
# hyperparameters can be matched back to the settings above.
hyperparameter_dict = {
    'max_steps': max_steps,
    'run_path': run_path,
    'env_name': env_name,
    'curriculum_file': curriculum_file,
    'gamma': gamma,
    'lambd': lambd,
    'time_horizon': time_horizon,
    'beta': beta,
    'num_epoch': num_epoch,
    'epsilon': epsilon,
    'buffer_size': buffer_size,
    'learning_rate': learning_rate,
    'hidden_units': hidden_units,
    'batch_size': batch_size,
}

### Load the environment

In [None]:
# Start the Unity environment named by `env_name` and print its string
# representation so the loaded configuration is visible in the notebook output.
env = UnityEnvironment(
    file_name=env_name,
    curriculum=curriculum_file,
    worker_id=2,
)
print(env)


### Train the Agent(s)

In [None]:
tf.reset_default_graph()

# graphs = {}
# for brain in env.external_brain_names:
#     graphs[brain] =  tf.Graph()

# Treat the literal string "None" the same as an unset curriculum file.
if curriculum_file == "None":
    curriculum_file = None

#curriculum is disabled
# def get_progress(brain):
#     if curriculum_file is not None:
#         if env._curriculum.measure_type == "progress":
#             return steps / max_steps
#         elif env._curriculum.measure_type == "reward":
#             return last_reward
#         else:
#             return None
#     else:
#         return None

# Create the Tensorflow model graph: one model per external brain, each built
# inside its own variable scope. The scope name is the brain name with every
# run of non-alphanumeric characters replaced by '-'.
models = {}

for brain in env.external_brain_names:
    with tf.variable_scope(re.sub('[^0-9a-zA-Z]+', '-', brain)):
        models[brain] = create_agent_model(env.brains[brain], lr=learning_rate,
                                           h_size=hidden_units, epsilon=epsilon,
                                           beta=beta, max_step=max_steps)


# is_continuous = (env.brains[brain_name].action_space_type == "continuous")
# use_observations = (env.brains[brain_name].number_observations > 0)
# use_states = (env.brains[brain_name].state_space_size > 0)

# One shared model directory for the run, one summary directory per brain.
model_path = './models/{}'.format(run_path)
summary_paths = {}
for brain in env.external_brain_names:
    summary_paths[brain] = './summaries/{}'.format(run_path+'_'+brain)
    if not os.path.exists(summary_paths[brain]):
        os.makedirs(summary_paths[brain])

if not os.path.exists(model_path):
    os.makedirs(model_path)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# The environment is closed in `finally` so that an error (or an interrupt)
# during training does not leave it open.
try:
    with tf.Session() as sess:
        # Instantiate model parameters
        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(model_path)
            if ckpt is None:
                # get_checkpoint_state returns None when the directory holds no
                # valid checkpoint; fail with a clear message instead of an
                # AttributeError on `ckpt.model_checkpoint_path`.
                raise FileNotFoundError(
                    'No checkpoint found in {}'.format(model_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(init)

        # Per-brain bookkeeping, seeded from the values stored in each model.
        steps = {}
        last_rewards = {}
        summary_writers = {}
        for brain in env.external_brain_names:
            steps[brain], last_rewards[brain] = sess.run(
                [models[brain].global_step, models[brain].last_reward])
            summary_writers[brain] = tf.summary.FileWriter(summary_paths[brain])
#             if train_model:
#                 trainers[brain].write_text(summary_writers[brain], 'Hyperparameters', hyperparameter_dict, steps)

#         info = env.reset(train_mode=train_model, progress=get_progress())
        info = env.reset(train_mode=train_model)

        # One trainer per brain. The three boolean arguments are derived from
        # the brain parameters: continuous action space, has observations,
        # has a state vector.
        trainers = {}
        for brain in env.external_brain_names:
            brain_parameters = env.brains[brain]
            trainers[brain] = Trainer(
                models[brain], sess, info[brain],
                (brain_parameters.action_space_type == "continuous"),
                (brain_parameters.number_observations > 0),
                (brain_parameters.state_space_size > 0),
                train_model)

        # Run until the brain with the fewest steps has exceeded max_steps.
        while min([steps[b] for b in env.external_brain_names]) <= max_steps:
            if env.global_done:
#                 info = env.reset(train_mode=train_model, progress=get_progress())
                info = env.reset(train_mode=train_model)

            # Decide and take an action
            take_action_actions = {}
            take_action_outputs = {}
            for brain in env.external_brain_names:
                take_action_actions[brain], take_action_outputs[brain] = trainers[brain].take_action(
                    info[brain], env, brain, steps[brain])

            try:
                new_info = env.step(take_action_actions)
            except Exception:
                # Show the actions that the environment rejected, then let the
                # original error propagate.
                print(take_action_actions)
                raise

            for brain in env.external_brain_names:
                trainers[brain].add_experiences(
                    info[brain], new_info[brain], take_action_actions[brain], take_action_outputs[brain])

            info = new_info
            for brain in env.external_brain_names:
                trainers[brain].process_experiences(info[brain], time_horizon, gamma, lambd)
                if len(trainers[brain].training_buffer['actions']) > buffer_size and train_model:
                    # Perform gradient descent with experience buffer
                    trainers[brain].update_model(batch_size, num_epoch)
                if steps[brain] % summary_freq == 0 and steps[brain] != 0 and train_model:
                    # Write training statistics to tensorboard.
                    trainers[brain].write_summary(summary_writers[brain], brain, steps[brain], env._curriculum.lesson_number)
                if steps[brain] % save_freq == 0 and steps[brain] != 0 and train_model:
                    # Save Tensorflow model
                    # This does not need to be for each brain
                    save_model(sess, model_path=model_path, steps=steps[brain], saver=saver)
                steps[brain] += 1
                sess.run(models[brain].increment_step)
                if len(trainers[brain].stats['cumulative_reward']) > 0:
                    mean_reward = np.mean(trainers[brain].stats['cumulative_reward'])
                    sess.run(models[brain].update_reward, feed_dict={models[brain].new_reward: mean_reward})
                    # Keep the per-brain reward record in sync with the model
                    # rather than writing to an unused global.
                    last_rewards[brain] = sess.run(models[brain].last_reward)

        for brain in env.external_brain_names:
            # Final save Tensorflow model
            if steps[brain] != 0 and train_model:
                save_model(sess, model_path=model_path, steps=steps[brain], saver=saver)
finally:
    env.close()

# Export the graph, listing the three output nodes of every brain under the
# brain's variable scope.
nodes = []
for brain in env.external_brain_names:
    scope = (re.sub('[^0-9a-zA-Z]+', '-', brain)) + '/'
    nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]

export_graph(model_path, env_name, target_nodes=','.join(nodes))

### Export the trained Tensorflow graph
Once the model has been trained and saved, we can export it as a .bytes file which Unity can embed.

In [None]:
# Build the comma-separated list of graph nodes to export: for each external
# brain, the three output nodes prefixed with the brain's variable scope (the
# brain name with runs of non-alphanumeric characters replaced by '-').
output_names = ("action", "value_estimate", "action_probs")
nodes = [
    re.sub('[^0-9a-zA-Z]+', '-', brain) + '/' + output_name
    for brain in env.external_brain_names
    for output_name in output_names
]

export_graph(model_path, env_name, target_nodes=','.join(nodes))