In [None]:
import warnings
# Install a global "ignore" filter so Python warnings are not displayed by the
# cells that follow. NOTE(review): this also hides deprecation warnings from
# the RL libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing

In [None]:
# True when torch.multiprocessing is configured to start workers with "fork".
is_fork = "fork" == multiprocessing.get_start_method()

# Pick the compute device: the accelerator at index 0 only when CUDA is
# available and the start method is not "fork"; in every other case use the CPU.
if not torch.cuda.is_available() or is_fork:
    device = torch.device("cpu")
else:
    device = torch.device(0)

## Environment Preparation

#### Load the Unity environment using `mlagents_envs`

In [None]:
import os

from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.environment import UnityEnvironment
from torchrl.envs import UnityMLAgentsEnv

# Path to the built Unity player executable.
# The original machine-specific location is kept as the default so existing
# runs behave exactly as before; set the WAREHOUSE_BOT_ENV_PATH environment
# variable to point at a build elsewhere without editing this cell.
env_path = os.environ.get("WAREHOUSE_BOT_ENV_PATH") or (
    "C:/Users/Pawel/Documents/Unity_Project/warehouse-bot-training/"
    "environment_builds/test_env/Warehouse_Bot.exe"
)

# Side channel registered with the environment below and then used to send
# engine configuration parameters to it.
channel = EngineConfigurationChannel()

# Start the Unity build and connect to it.
unity_env = UnityEnvironment(
    file_name=env_path,
    side_channels=[channel],
    # Extra command-line arguments passed to the executable
    # (presumably to run it headless - confirm against the Unity player docs).
    additional_args=["-batchmode", "-nographics"],
)

# Request a time scale of 10 through the engine configuration channel.
channel.set_configuration_parameters(time_scale=10)

#### Transform environment from `mlagents` to `gymnasium`

In [None]:
import gymnasium as gym

In [None]:
# Print the installed gymnasium version string.
print(gym.__version__)

In [None]:
# UnityGymWrapper comes from the project-local module env_gymnasium_wrapper.
from env_gymnasium_wrapper import UnityGymWrapper

# Wrap the running Unity environment; per this section's heading the wrapper is
# meant to expose it through the gymnasium interface (implementation not shown
# here - confirm against env_gymnasium_wrapper).
gymnasium_env = UnityGymWrapper(unity_env)

#### Creating stable_baselines3 model

In [None]:
# from stable_baselines3.common.env_util import make_vec_env
# from stable_baselines3.common.vec_env import SubprocVecEnv

# env = make_vec_env(gymnasium_env, n_envs=8, vec_env_cls=SubprocVecEnv)
# model = PPO("MlpPolicy", env, device="cpu")
# model.learn(total_timesteps=25_000)

PPO hyperparameters from the `mlagents-learn` config file:

```yaml
behaviors:
  Dlivery_Bot_2:
    trainer_type: ppo
    hyperparameters:
      batch_size: 512
      buffer_size: 2560
      learning_rate: 0.0003
      beta: 0.005
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: linear
    network_settings:
      normalize: True
      hidden_units: 256
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.99
        strength: 1.0
    keep_checkpoints: 5
    checkpoint_interval: 100000
    max_steps: 4000000
    time_horizon: 1024
    summary_freq: 10000
    # threaded: False
```

In [None]:
from stable_baselines3 import PPO

# Build a PPO agent whose settings mirror the mlagents-learn config shown in
# the markdown cell above. Each trailing comment names the config key whose
# value the argument copies.
# NOTE(review): `learning_rate_schedule: linear`, `normalize: True`,
# `hidden_units: 256` and `num_layers: 2` from that config are not passed
# here - confirm whether the library defaults are intended.
model = PPO(
    "MlpPolicy",
    gymnasium_env,
    verbose=1,
    learning_rate=3e-4,  # learning_rate: 0.0003
    n_steps=2560,        # buffer_size: 2560
    batch_size=512,      # batch_size: 512
    n_epochs=3,          # num_epoch: 3
    clip_range=0.2,      # epsilon: 0.2
    gamma=0.99,          # reward_signals.extrinsic.gamma: 0.99
    gae_lambda=0.95,     # lambd: 0.95 (was 0.96, which did not match the config)
    seed=0,              # fixed seed for repeatable runs
    ent_coef=0.005,      # beta: 0.005
    vf_coef=0.5,         # no counterpart in the config above
)

# Alternative: continue from a previously saved model instead of a fresh one.
# model = PPO.load('./saved_models/warehouse-raycasts_0', gymnasium_env)

In [None]:
# Train the PPO model for a total of 500,000 environment timesteps.
model.learn(total_timesteps=500_000)

In [None]:
# Save the trained model under ./saved_models with the name "warehouse-raycasts_1".
model.save('./saved_models/warehouse-raycasts_1')