In [None]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing

In [None]:
# Pick the torch device for this session: the first accelerator (index 0) when
# CUDA is available and the multiprocessing start method is not "fork",
# otherwise the CPU.
# NOTE(review): presumably the fork check is there because CUDA cannot be used
# from forked worker processes — confirm. `device` is not passed to anything in
# the cells shown here.
is_fork = multiprocessing.get_start_method() == "fork"

if not torch.cuda.is_available() or is_fork:
    device = torch.device("cpu")
else:
    device = torch.device(0)

## Environment Preparation

#### Load the Unity environment using `mlagents_envs`

In [None]:
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.environment import UnityEnvironment

# Side channel used to push engine settings (here: the time scale) to the
# Unity player.
channel = EngineConfigurationChannel()

# Location of the built Unity player executable for the stage-2 "find" task.
env_path = "D:/_Thesis/warehouse-bot-training/environment_builds/warehouse_stage2_find/Warehouse_Bot.exe"

# Launch the player with the "-batchmode" and "-nographics" command-line flags
# and attach the configuration channel to it.
unity_env = UnityEnvironment(
    file_name=env_path,
    side_channels=[channel],
    additional_args=["-batchmode", "-nographics"],
)

# Request a simulation time scale of 50 through the side channel.
channel.set_configuration_parameters(time_scale=50)

#### Transform environment from `mlagents` to `gymnasium`

In [None]:
# Record the installed gymnasium version in the notebook output so that runs
# can be reproduced later.
import gymnasium as gym
print(gym.__version__)

In [None]:
# Project-local adapter that wraps the ML-Agents `UnityEnvironment` so it can be
# used as a gymnasium environment.
from env_camera_raycasts_gymnasium_wrapper import UnityCameraRaycastsGymWrapper

# NOTE(review): presumably exposes a Dict observation space with "image" and
# "vector" entries, since CustomCombinedExtractor below reads those keys —
# verify against the wrapper implementation.
gymnasium_env = UnityCameraRaycastsGymWrapper(unity_env)

#### Creating stable_baselines3 model

##### Building a custom network

PPO hyperparameters from the `mlagents-learn` config file, shown here for reference only (this notebook does not load the file):

```yaml
behaviors:
  Dlivery_Bot_2:
    trainer_type: ppo
    hyperparameters:
      batch_size: 512
      buffer_size: 2560
      learning_rate: 0.0003
      beta: 0.005
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: linear
    network_settings:
      normalize: True
      hidden_units: 256
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.99
        strength: 1.0
    keep_checkpoints: 5
    checkpoint_interval: 100000
    max_steps: 4000000
    time_horizon: 1024
    summary_freq: 10000
    # threaded: False
```

#### Policy Config

In [None]:
import torch as th
import torch.nn as nn
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomCombinedExtractor(BaseFeaturesExtractor):
    """Features extractor for Dict observations holding an image and a vector.

    The ``"image"`` entry goes through a small CNN (five stride-2 convolutions,
    global average pooling and two linear layers) that yields 64 features.
    The ``"vector"`` entry goes through a single linear layer with ReLU that
    yields 32 features. Both results are concatenated and projected to
    ``features_dim`` by a final linear layer followed by ReLU.

    :param observation_space: Dict space with an ``"image"`` entry laid out
        channel-first ``(C, H, W)`` and a 1-D ``"vector"`` entry.
    :param features_dim: Size of the feature vector returned by ``forward``.
    """

    def __init__(self, observation_space: gym.spaces.Dict, features_dim: int = 256):
        super().__init__(observation_space, features_dim)

        image_space = observation_space.spaces["image"]
        vector_space = observation_space.spaces["vector"]

        # Batched input shapes: [batch, channels, height, width] for the image
        # and [batch, length] for the vector.
        # The network was written for 3x144x256 images; the channel count is
        # read from the space instead of being hard-coded.
        in_channels = image_space.shape[0]
        self.image_net = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            # Global average pooling collapses the spatial dimensions to 1x1.
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, 64),
        )

        # Probe the CNN output width with one dummy forward pass. A zero tensor
        # of the declared shape is used rather than `space.sample()`, so the
        # observation space's random generator is left untouched.
        with th.no_grad():
            dummy_image = th.zeros(1, *image_space.shape)
            cnn_output_size = self.image_net(dummy_image).shape[1]

        # The vector length comes straight from the declared space shape.
        vector_input_size = vector_space.shape[0]
        self.vector_net = nn.Sequential(
            nn.Linear(vector_input_size, 32),
            nn.ReLU(),
        )

        # Probe the output width of the vector branch the same way.
        with th.no_grad():
            dummy_vector = th.zeros(1, vector_input_size)
            vector_network_output_size = self.vector_net(dummy_vector).shape[1]

        # Fusion layer applied to the concatenated image and vector features.
        self.linear = nn.Sequential(
            nn.Linear(cnn_output_size + vector_network_output_size, features_dim),
            nn.ReLU(),
        )

    def forward(self, observations):
        """Encode a batch of observations into ``features_dim`` features.

        :param observations: Mapping with batched ``"image"`` and ``"vector"``
            tensors.
        :return: Tensor of shape ``(batch, features_dim)``.
        """
        # NOTE(review): Stable-Baselines3 may already rescale image
        # observations to [0, 1] before they reach the extractor (see
        # `normalize_images`); if it does for this environment, this division
        # scales the image a second time — confirm against the wrapper's
        # observation space.
        image = observations["image"].float() / 255.0
        vector = observations["vector"].float()

        image_features = self.image_net(image)
        vector_features = self.vector_net(vector)

        combined = th.cat([image_features, vector_features], dim=1)
        return self.linear(combined)


In [None]:
from stable_baselines3 import PPO
import torch.nn as nn

# Use the custom Dict-observation extractor inside the policy and ask it for a
# 128-dimensional feature vector.
policy_kwargs = {
    "features_extractor_class": CustomCombinedExtractor,
    "features_extractor_kwargs": {"features_dim": 128},
}

# Earlier long-rollout configuration, kept for reference: identical to the one
# below except for n_steps=10240 and n_epochs=8.
#
# NOTE(review): gamma, gae_lambda and n_epochs differ from the mlagents-learn
# config quoted earlier in the notebook (0.99 / 0.95 / 3) — confirm that this
# is intentional.
ppo_hyperparams = {
    "verbose": 1,
    "learning_rate": 3e-4,
    "n_steps": 512,
    "batch_size": 512,
    "n_epochs": 2,
    "clip_range": 0.2,
    "gamma": 0.995,
    "gae_lambda": 0.96,
    "seed": 0,
    "ent_coef": 0.005,
    "vf_coef": 0.5,
    "tensorboard_log": './logs/stage2',
}

model = PPO(
    "MultiInputPolicy",
    gymnasium_env,
    policy_kwargs=policy_kwargs,
    **ppo_hyperparams,
)

# The TensorBoard directory is already set through the constructor above:
# model.tensorboard_log = './logs/stage2'

In [None]:
# Print the features-extractor module attached to the policy, to check that the
# custom architecture is the one in use.
print(model.policy.features_extractor)

In [None]:
# Train for 100,000 environment timesteps and log to TensorBoard under the run
# name "find_warehouse_1_spike".
# NOTE(review): reset_num_timesteps=False presumably makes re-running this cell
# continue the timestep counter and the same TensorBoard run, not start a new
# one — confirm that this is also what is wanted on the first call with a
# freshly created model.
model.learn(total_timesteps=100_000, tb_log_name="find_warehouse_1_spike", reset_num_timesteps=False)