In [None]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing

In [None]:
# Select the torch device for this session.
# CUDA device 0 is used only when CUDA is available AND the multiprocessing
# start method is not "fork"; in every other case we fall back to the CPU.
is_fork = multiprocessing.get_start_method() == "fork"

if torch.cuda.is_available() and not is_fork:
    device = torch.device(0)
else:
    device = torch.device("cpu")

## Environment Preparation

#### Load Unity environment using `mlagents_envs`

In [None]:
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.environment import UnityEnvironment

import os

# Side channel used to push engine settings (here: the time scale) to the Unity player.
channel = EngineConfigurationChannel()

# The build location is machine-specific. Set the WAREHOUSE_ENV_PATH environment
# variable to point at another build; when it is unset, the original path is used
# so existing setups behave exactly as before.
env_path = os.environ.get(
    "WAREHOUSE_ENV_PATH",
    "C:/Users/Pawel/Documents/Unity_Project/warehouse-bot-training/environment_builds/warehouse_step1_full/Warehouse_Bot.exe",
)

# Start the Unity build, passing the -batchmode / -nographics player arguments
# and registering the configuration side channel.
unity_env = UnityEnvironment(
    file_name=env_path,
    side_channels=[channel],
    additional_args=["-batchmode", "-nographics"],
)

# Request a simulation time scale of 20 through the side channel.
channel.set_configuration_parameters(time_scale=20)

#### Transform environment from `mlagents` to `gymnasium`

In [None]:
import gymnasium as gym

In [None]:
# Print the installed gymnasium version.
print(gym.__version__)

In [None]:
from env_gymnasium_wrapper import UnityGymWrapper

# Wrap the Unity environment with UnityGymWrapper (imported from env_gymnasium_wrapper);
# the wrapped environment is what gets passed to the PPO model further down.
gymnasium_env = UnityGymWrapper(unity_env)

#### Creating stable_baselines3 model

##### Building own network

In [None]:
# NOTE: This entire cell is disabled (commented out); nothing here is executed.
# It appears to be a draft of a custom actor-critic policy kept for reference.
# NOTE(review): as written it would not run if uncommented -- `feature_dim`,
# `last_layer_dim_pi` and `last_layer_dim_vf` are used below but never defined
# in this cell. Confirm whether this draft is still needed or can be removed.

# import torch.nn as nn
# from stable_baselines3.common.policies import ActorCriticPolicy
# # from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
# from stable_baselines3 import PPO

# class CustomActorCriticPolicy(ActorCriticPolicy):
#     def __init__(self, observation_space, action_space, lr_schedule, *args, **kwargs):
#         super(CustomActorCriticPolicy, self).__init__(observation_space, action_space, lr_schedule, *args, **kwargs)

#         print(self.features_extractor.features_dim, action_space)
        
#         # Define a custom shared feature extractor
#         self.shared_net = nn.Sequential(
#             nn.Linear(self.features_extractor.features_dim, 256),
#             nn.ReLU(),
#             nn.Linear(256, 128),
#             nn.ReLU()
#         )
        
#         self.policy_net = nn.Sequential(
#             nn.Linear(feature_dim, last_layer_dim_pi), nn.ReLU()
#         )
#         # Value network
#         self.value_net = nn.Sequential(
#             nn.Linear(feature_dim, last_layer_dim_vf), nn.ReLU()
#         )

#         print(f"After override, value_net: {self.value_net}")  # Debugging to check the size again

#     def forward(self, obs, deterministic=False):
#         features = self.extract_features(obs)
#         print(f"Extracted features shape: {features.shape}")  # Check the extracted feature shape

#         features = self.shared_net(features)
#         print(f"After shared_net, features shape: {features.shape}")  # Should be (batch_size, 128)

#         action_logits = self.policy_net(features)

#         print(f"Value net input shape (before passing to value_net): {features.shape}")  # Must be (batch_size, 128)
#         value = self.value_net(features)  # Should be fine if features.shape[1] == 128

#         return action_logits, value

In [None]:
# NOTE: This entire cell is disabled (commented out); nothing here is executed.
# It sketches a CustomNetwork with separate policy and value MLPs, plus a
# CustomActorCriticPolicy whose _build_mlp_extractor would install that network.
# Neither class is referenced by the active cells of this notebook.
# Based on https://stable-baselines3.readthedocs.io/en/v1.0/guide/custom_policy.html

# from typing import Callable, Dict, List, Optional, Tuple, Type, Union

# import torch as th
# from torch import nn

# from stable_baselines3.common.policies import ActorCriticPolicy

# class CustomNetwork(nn.Module):
#     """
#     Custom network for policy and value function.
#     It receives as input the features extracted by the feature extractor.

#     :param feature_dim: dimension of the features extracted with the features_extractor (e.g. features from a CNN)
#     :param last_layer_dim_pi: (int) number of units for the last layer of the policy network
#     :param last_layer_dim_vf: (int) number of units for the last layer of the value network
#     """

#     def __init__(
#         self,
#         feature_dim: int,
#         last_layer_dim_pi: int = 64,
#         last_layer_dim_vf: int = 64,
#     ):
#         super(CustomNetwork, self).__init__()

#         # IMPORTANT:
#         # Save output dimensions, used to create the distributions
#         self.latent_dim_pi = last_layer_dim_pi
#         self.latent_dim_vf = last_layer_dim_vf

#         # Policy network
#         self.policy_net = nn.Sequential(
#             nn.Linear(feature_dim, 128),
#             nn.ReLU(),
#             nn.Linear(128, last_layer_dim_pi),
#             nn.ReLU()
#         )
#         # Value network
#         self.value_net = nn.Sequential(
#             nn.Linear(feature_dim, 64),
#             nn.ReLU(),
#             nn.Linear(64, last_layer_dim_vf),
#             nn.ReLU()
#         )

#     def forward(self, features: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
#         """
#         :return: (th.Tensor, th.Tensor) latent_policy, latent_value of the specified network.
#             If all layers are shared, then ``latent_policy == latent_value``
#         """
#         return self.policy_net(features), self.value_net(features)


# class CustomActorCriticPolicy(ActorCriticPolicy):
#     def __init__(
#         self,
#         observation_space: gym.spaces.Space,
#         action_space: gym.spaces.Space,
#         lr_schedule: Callable[[float], float],
#         net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
#         activation_fn: Type[nn.Module] = nn.Tanh,
#         *args,
#         **kwargs,
#     ):

#         super(CustomActorCriticPolicy, self).__init__(
#             observation_space,
#             action_space,
#             lr_schedule,
#             net_arch,
#             activation_fn,
#             # Pass remaining arguments to base class
#             *args,
#             **kwargs,
#         )
#         # Disable orthogonal initialization
#         self.ortho_init = False

#     def _build_mlp_extractor(self) -> None:
#         self.mlp_extractor = CustomNetwork(self.features_dim)

PPO hyperparameters from the `mlagents-learn` config file:

```yaml
behaviors:
  Dlivery_Bot_2:
    trainer_type: ppo
    hyperparameters:
      batch_size: 512
      buffer_size: 2560
      learning_rate: 0.0003
      beta: 0.005
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: linear
    network_settings:
      normalize: True
      hidden_units: 256
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.99
        strength: 1.0
    keep_checkpoints: 5
    checkpoint_interval: 100000
    max_steps: 4000000
    time_horizon: 1024
    summary_freq: 10000
    # threaded: False
```

#### Policy Config

In [None]:
from stable_baselines3 import PPO
import torch.nn as nn

# Reference configuration for building a fresh PPO model from scratch.
# Disabled: the active code below resumes from a saved checkpoint instead.
# model = PPO("MlpPolicy", gymnasium_env, verbose=1,
#             learning_rate=3e-4,
#             n_steps=10240,
#             batch_size=512,
#             n_epochs=8,
#             clip_range=0.2,
#             gamma=0.995,
#             gae_lambda=0.96,
#             seed=0,
#             ent_coef=0.005,
#             vf_coef=0.5,
#             policy_kwargs={
#               "net_arch": [dict(pi=[128, 64], vf=[64, 32])],
#               "activation_fn": nn.ReLU
#             }
# )


# Restore the previously saved model and attach the wrapped Unity environment to it.
model = PPO.load(
    path="./saved_models/warehouse_step1_full_2_2",
    env=gymnasium_env,
)

In [None]:
# Print the policy object of the loaded model for inspection.
print(model.policy)

In [None]:
# Continue training the loaded model for 500,000 timesteps.
model.learn(total_timesteps=500_000)

In [None]:
# Save under a new name (..._2_3) so the checkpoint loaded earlier (..._2_2) is not overwritten.
model.save('./saved_models/warehouse_step1_full_2_3')