In [1]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing

In [2]:
# Record whether worker processes would be started with the "fork" method.
start_method = multiprocessing.get_start_method()
is_fork = start_method == "fork"

# Use GPU index 0 only when CUDA is available and the start method is not
# "fork"; in every other case fall back to the CPU.
# NOTE(review): presumably the fork check is there because CUDA cannot be
# used safely in forked subprocesses -- confirm.
if not torch.cuda.is_available() or is_fork:
    device = torch.device("cpu")
else:
    device = torch.device(0)

## Environment Preparation

#### Load the Unity environment using `mlagents_envs`

In [3]:
import os

from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.environment import UnityEnvironment

# Location of the Unity build to launch. The original machine-specific path is
# kept as the default so existing runs behave exactly as before, but it can now
# be overridden without editing the notebook by setting the WAREHOUSE_ENV_PATH
# environment variable before starting the kernel.
DEFAULT_ENV_PATH = "C:/Users/Pawel/Documents/Unity_Project/warehouse-bot-training/environment_builds/warehouse_stage1_complex_pos_neg/Warehouse_Bot.exe"
env_path = os.environ.get("WAREHOUSE_ENV_PATH", DEFAULT_ENV_PATH)

# Value sent to the engine as `time_scale` through the configuration channel.
TIME_SCALE = 50

# Side channel used to push engine configuration to the Unity process.
channel = EngineConfigurationChannel()

# Start the Unity executable; the extra arguments are forwarded to it on the
# command line.
unity_env = UnityEnvironment(
    file_name=env_path,
    side_channels=[channel],
    additional_args=["-batchmode", "-nographics"],
)
channel.set_configuration_parameters(time_scale=TIME_SCALE)

#### Transform environment from `mlagents` to `gymnasium`

In [4]:
import gymnasium as gym

In [5]:
# Print the installed gymnasium version so the notebook output records it.
print(gym.__version__)

1.0.0


In [6]:
from env_gymnasium_wrapper import UnityGymWrapper

# Wrap the Unity environment created earlier in the project-local
# UnityGymWrapper so it can be handed to gymnasium-based tooling.
gymnasium_env = UnityGymWrapper(unity_env)

#### Creating stable_baselines3 model

##### Building own network

PPO hyperparameters from the `mlagents-learn` config file:

```yaml
behaviors:
  Dlivery_Bot_2:
    trainer_type: ppo
    hyperparameters:
      batch_size: 512
      buffer_size: 2560
      learning_rate: 0.0003
      beta: 0.005
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: linear
    network_settings:
      normalize: True
      hidden_units: 256
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.99
        strength: 1.0
    keep_checkpoints: 5
    checkpoint_interval: 100000
    max_steps: 4000000
    time_horizon: 1024
    summary_freq: 10000
    # threaded: False
```

#### Policy Config

In [8]:
from stable_baselines3 import PPO
import torch.nn as nn

# Reference only: a from-scratch PPO configuration, left commented out because
# this notebook resumes from a saved checkpoint instead. Its net_arch and ReLU
# activation match the policy printed by the next cell.
#
# model = PPO("MlpPolicy", gymnasium_env, verbose=1,
#             learning_rate=3e-4,
#             n_steps=10240,
#             batch_size=512,
#             n_epochs=8,
#             clip_range=0.2,
#             gamma=0.995,
#             gae_lambda=0.96,
#             seed=0,
#             ent_coef=0.005,
#             vf_coef=0.5,
#             policy_kwargs={
#               "net_arch": [dict(pi=[52, 24], vf=[52, 16])],
#               "activation_fn": nn.ReLU
#             },
#             tensorboard_log = './logs/stage1'
# )

# Checkpoint to resume from and the directory TensorBoard logs are written to.
CHECKPOINT_PATH = './saved_models/baselines/stage1/warehouse_stage1_complex_pos_neg_3.0_1m'
TENSORBOARD_DIR = './logs/stage1'

# Restore the saved agent, attach the wrapped environment to it, then point
# its TensorBoard logging at the stage-1 log directory.
model = PPO.load(CHECKPOINT_PATH, env=gymnasium_env)
model.tensorboard_log = TENSORBOARD_DIR

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [9]:
print(model.policy)

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=100, out_features=52, bias=True)
      (1): ReLU()
      (2): Linear(in_features=52, out_features=24, bias=True)
      (3): ReLU()
    )
    (value_net): Sequential(
      (0): Linear(in_features=100, out_features=52, bias=True)
      (1): ReLU()
      (2): Linear(in_features=52, out_features=16, bias=True)
      (3): ReLU()
    )
  )
  (action_net): Linear(in_features=24, out_features=3, bias=True)
  (value_net): Linear(in_features=16, out_features=1, bias=True)
)


In [10]:
# Continue training the loaded checkpoint. reset_num_timesteps=False keeps the
# timestep counter running rather than restarting it (the log below begins at
# about 1.01M steps); TensorBoard output is written under the "complex_3" run.
model.learn(total_timesteps=2_000_000, tb_log_name="complex_3", reset_num_timesteps=False)

Logging to ./logs/stage1\complex_3_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 62.1     |
|    ep_rew_mean     | -39.7    |
| time/              |          |
|    fps             | 425      |
|    iterations      | 1        |
|    time_elapsed    | 24       |
|    total_timesteps | 1013760  |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 64.9        |
|    ep_rew_mean          | -19.6       |
| time/                   |             |
|    fps                  | 415         |
|    iterations           | 2           |
|    time_elapsed         | 49          |
|    total_timesteps      | 1024000     |
| train/                  |             |
|    approx_kl            | 0.002785012 |
|    clip_fraction        | 0.0211      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.483      |
|    explained_variance   | 0.739  

<stable_baselines3.ppo.ppo.PPO at 0x1e2c8c4b7c0>

In [11]:
# Save a checkpoint after this training leg; the file name labels it as
# version 3.1 at roughly 3M total timesteps.
model.save('./saved_models/baselines/stage1/warehouse_stage1_complex_pos_neg_3.1_3m.zip')

In [12]:
# Train further on the same model without resetting the timestep counter (the
# log below begins at about 3.02M steps), logging to the same "complex_3" run.
model.learn(total_timesteps=1_000_000, tb_log_name="complex_3", reset_num_timesteps=False)

Logging to ./logs/stage1\complex_3_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 69.3     |
|    ep_rew_mean     | 58       |
| time/              |          |
|    fps             | 413      |
|    iterations      | 1        |
|    time_elapsed    | 24       |
|    total_timesteps | 3020800  |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 72.9        |
|    ep_rew_mean          | 55.5        |
| time/                   |             |
|    fps                  | 417         |
|    iterations           | 2           |
|    time_elapsed         | 49          |
|    total_timesteps      | 3031040     |
| train/                  |             |
|    approx_kl            | 0.004042534 |
|    clip_fraction        | 0.0292      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.334      |
|    explained_variance   | 0.788  

<stable_baselines3.ppo.ppo.PPO at 0x1e2c8c4b7c0>

In [13]:
# Save the final checkpoint of this session; the file name labels it as
# version 3.2 at roughly 4M total timesteps.
model.save('./saved_models/baselines/stage1/warehouse_stage1_complex_pos_neg_3.2_4m.zip')