# Warfighter PPO

API Reference: https://github.com/Unity-Technologies/ml-agents/blob/release_20_docs/docs/Python-LLAPI.md

In [None]:
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.environment import ActionTuple
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
import numpy as np
import matplotlib.pyplot as plt

## Startup & Config

In [None]:
# Time scale forwarded to the Unity engine configuration channel.
editor_timescale = 20.0

# Number of environment steps taken by the random-action loop.
n_timesteps = 10_000

In [None]:
# Side channel through which engine settings (here: the time scale) are sent.
config_channel = EngineConfigurationChannel()

# Connect to the Unity Environment (Press the Play button after running this block)
env = UnityEnvironment(seed=1, side_channels=[config_channel])
config_channel.set_configuration_parameters(time_scale=editor_timescale)
env.reset()

# Report the connection constants exposed by the environment object.
print(f"API Version: [{env.API_VERSION}]")
print(f"Base Env Port: [{env.BASE_ENVIRONMENT_PORT}]")
print(f"Default Editor Port: [{env.DEFAULT_EDITOR_PORT}]")

## Random Actions

In [None]:
# Drive the environment with random continuous actions and record the reward
# reported for every agent whose episode ends.

# Use the first behavior registered by the environment.
group_name = list(env.behavior_specs)[0]
print("Group name: [" + group_name + "]")

behavior_spec = env.behavior_specs[group_name]
print(behavior_spec)
print()

# Width of the continuous action vector; constant for the whole rollout.
n_continuous = behavior_spec.action_spec.continuous_size

# Flat list of terminal rewards, one entry per finished agent.
rewards = []

for _ in range(n_timesteps):
    decision_steps, terminal_steps = env.get_steps(group_name)

    if len(terminal_steps) > 0:
        # terminal_steps.reward is an array with one entry per terminated
        # agent; extend() keeps `rewards` a flat sequence of scalars so it can
        # be plotted directly even when several agents finish at once.
        rewards.extend(terminal_steps.reward)
        env.reset()
        decision_steps, _ = env.get_steps(group_name)

    # One standard-normal action row per agent currently requesting a decision.
    n_agents = len(decision_steps.agent_id)
    action = ActionTuple(continuous=np.random.randn(n_agents, n_continuous))
    env.set_actions(group_name, action)
    env.step()

print("Done!")

In [None]:
# Plot the collected `rewards` list against its index.
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.plot(rewards)
plt.show()

## PPO

In [None]:
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper
from stable_baselines3 import PPO

# Wrap the connected Unity environment in the gym-style interface that the
# PPO cells consume.
sb3_env = UnityToGymWrapper(
    env,
    uint8_visual=False,
    flatten_branched=False,
    allow_multiple_obs=False,
)

In [None]:
# Build a PPO agent with an MLP policy on the wrapped Unity environment.
model = PPO(
    policy='MlpPolicy',
    env=sb3_env,
    n_steps=1024,      # environment steps collected per policy update
    batch_size=64,     # minibatch size for each gradient step
    n_epochs=4,        # optimisation passes over each collected rollout
    gamma=0.999,       # discount factor
    gae_lambda=0.98,   # GAE bias/variance trade-off
    ent_coef=0.01,     # entropy bonus coefficient
    verbose=1,
)

# `total_timesteps` is documented as an int, so pass an integer rather than
# the float literal 1e5.
model.learn(total_timesteps=100_000)
model.save("warfighter-ppo-1e5")

In [None]:
# Reload the saved policy, run it deterministically for 1000 steps and plot
# the reward received on the final step of each finished episode.
model = PPO.load("warfighter-ppo-1e5", env=sb3_env)
rewards = []

observation = sb3_env.reset()
for _ in range(1000):
    action, _states = model.predict(observation, deterministic=True)
    # The gym wrapper returns the classic 4-tuple (obs, reward, done, info);
    # the fourth element is the info dict, not a truncation flag.
    observation, reward, done, info = sb3_env.step(action)

    if done:
        # NOTE(review): this records only the last step's reward, not the
        # summed episode return -- confirm that is what the plot should show.
        rewards.append(reward)
        observation = sb3_env.reset()

plt.plot(rewards)
plt.ylabel('Reward')
plt.xlabel('Episode')
plt.show()

## Close Connection

In [None]:
# Close the connection to the Unity environment.
env.close()