In [None]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing

In [None]:
# Pick the compute device: GPU 0 only when CUDA is available and the
# multiprocessing start method is not "fork"; the CPU in every other case.
is_fork = "fork" == multiprocessing.get_start_method()

if torch.cuda.is_available() and not is_fork:
    device = torch.device(0)
else:
    device = torch.device("cpu")

## Environment Preparation

#### Load Unity environment using `mlagents_envs`

#### Transform environment from `mlagents` to `gymnasium`

In [None]:
import gymnasium as gym
# Show which gymnasium version is installed in this kernel.
print(gym.__version__)

In [None]:
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.environment import UnityEnvironment

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, VecTransposeImage
from env_camera_gymnasium_wrapper import UnityCameraGymWrapper
from env_camera_raycasts_gymnasium_wrapper import UnityCameraRaycastsGymWrapper

# NOTE(review): absolute, machine-specific path to the Unity build; consider
# reading it from a config cell or environment variable so the notebook can
# run on other machines.
env_path = "D:/_Thesis/warehouse-bot-training/environment_builds/warehouse_stage2_find_potential_dist/Warehouse_Bot.exe"


def make_env():
    """Launch the Unity build at ``env_path`` and wrap it as a monitored gymnasium env.

    The Unity process is started with an ``EngineConfigurationChannel`` side
    channel, its time scale is set to 1, and the environment is wrapped in
    ``UnityCameraRaycastsGymWrapper`` followed by ``Monitor``. The resulting
    observation space is printed before the environment is returned.

    Returns:
        The ``Monitor``-wrapped environment.

    Raises:
        Exception: Any error raised while configuring or wrapping the
            environment is re-raised after the Unity process has been closed,
            so a failed setup does not leave an orphaned Unity instance behind.
    """
    channel = EngineConfigurationChannel()

    unity_env = UnityEnvironment(
        file_name=env_path,
        side_channels=[channel],
        # additional_args=["-batchmode", "-nographics"]
    )

    try:
        channel.set_configuration_parameters(time_scale=1)

        gymnasium_env = UnityCameraRaycastsGymWrapper(unity_env)
        gymnasium_env = Monitor(gymnasium_env)

        print(gymnasium_env.observation_space)
    except Exception:
        # The Unity process is already running at this point; shut it down
        # before propagating the error so it is not leaked.
        unity_env.close()
        raise

    return gymnasium_env

In [None]:
# Single-environment vectorised wrapper with running normalisation of both
# observations and rewards, each clipped to +/-10.
env = VecNormalize(
    DummyVecEnv([make_env]),
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.,
    clip_reward=10.,
)

### Creating stable_baselines3 model

#### Policy Config

In [None]:
import torch as th
import torch.nn as nn
import gymnasium as gym
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class Swish(nn.Module):
    """Swish activation: the input multiplied element-wise by its own sigmoid."""

    def forward(self, x):
        """Return ``x * sigmoid(x)`` for the given tensor."""
        gate = torch.sigmoid(x)
        return x * gate
      
class CustomCombinedExtractor(BaseFeaturesExtractor):
    """Features extractor that fuses an image observation with a vector observation.

    The ``"image"`` entry of the observation dict is passed through a small
    CNN and the ``"vector"`` entry through a two-layer MLP; both outputs are
    concatenated and projected down to ``features_dim`` values.

    Args:
        observation_space: Dict observation space, forwarded to
            ``BaseFeaturesExtractor``.
        image_channels: Number of channels of the image observation.
        vector_obs_size: Length of the vector observation.
        features_dim: Size of the feature vector returned by ``forward``.
        image_height: Image height used to size the layer that follows the
            CNN. Defaults to 36, the value that was previously hard-coded.
        image_width: Image width used to size the layer that follows the
            CNN. Defaults to 64, the value that was previously hard-coded.
    """

    def __init__(
        self,
        observation_space: gym.spaces.Dict,
        image_channels=3,
        vector_obs_size=128,
        features_dim=64,
        image_height=36,
        image_width=64,
    ):
        super().__init__(observation_space, features_dim)

        # Shapes of image and vector inputs: [<batch size>, <bands, height, width>], [<batch size>, <length>]

        # Visual branch
        self.visual_net = nn.Sequential(
            nn.Conv2d(image_channels, 16, kernel_size=5, stride=4, padding=0),
            nn.LeakyReLU(0.01),
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.01),
            nn.Flatten()
        )

        # Compute the flattened visual output size by pushing a dummy image of
        # the configured size through the CNN (no gradients needed).
        dummy_input = th.zeros(1, image_channels, image_height, image_width)
        with th.no_grad():
            visual_out_size = self.visual_net(dummy_input).shape[1]

        # Vector branch (raycast)
        self.vector_net = nn.Sequential(
            nn.Linear(vector_obs_size, 64),
            Swish(),
            nn.Linear(64, 64),
            Swish()
        )

        # Combined MLP after concatenating visual + vector
        self.mlp = nn.Sequential(
            nn.Linear(visual_out_size + 64, 64),
            nn.ReLU(),
            nn.Linear(64, features_dim)
        )

    def forward(self, observations):
        """Encode a batch of dict observations into ``features_dim`` features.

        Args:
            observations: Mapping with an ``"image"`` tensor and a ``"vector"``
                tensor, both batched along dimension 0.

        Returns:
            Tensor produced by the combined MLP for each batch element.
        """
        image = observations["image"].float()
        vector = observations["vector"]

        image_features = self.visual_net(image)
        vector_features = self.vector_net(vector)

        # Concatenate along the feature dimension before the final projection.
        combined = th.cat([image_features, vector_features], dim=1)
        return self.mlp(combined)


#### Decaying Entropy Coefficient

In [None]:
from stable_baselines3.common.callbacks import BaseCallback

class DecayingEntropyCalback(BaseCallback):
    """Callback that linearly decays the model's entropy coefficient.

    On every step the coefficient is interpolated between ``initial_value``
    and ``final_value`` according to ``num_timesteps / max_steps`` and written
    to ``self.model.ent_coef``. Once ``max_steps`` timesteps have elapsed the
    coefficient stays at ``final_value``.

    Args:
        initial_value: Entropy coefficient at timestep 0.
        final_value: Entropy coefficient from ``max_steps`` onwards.
        max_steps: Number of timesteps over which the decay happens; must be
            strictly positive.
        verbose: Verbosity level forwarded to ``BaseCallback``.

    Raises:
        ValueError: If ``max_steps`` is not strictly positive.
    """

    def __init__(self, initial_value=0.1, final_value=0.01, max_steps=100_000, verbose=0):
        super().__init__(verbose)
        # Fail fast: zero would divide by zero on the first step and a negative
        # value would push the coefficient beyond ``initial_value``.
        if max_steps <= 0:
            raise ValueError(f"max_steps must be a positive number, got {max_steps!r}")
        self.initial_value = initial_value
        self.final_value = final_value
        self.max_steps = max_steps

    def _on_step(self) -> bool:
        """Update ``model.ent_coef`` for the current timestep; never stops training."""
        # Fraction of the decay completed so far, capped at 1.0.
        progress = min(1.0, self.num_timesteps / self.max_steps)
        current_ent_coef = self.initial_value * (1.0 - progress) + self.final_value * progress
        self.model.ent_coef = current_ent_coef
        return True

#### Training Algorithm

In [None]:
from stable_baselines3 import PPO

# Policy configuration: custom image+vector features extractor followed by
# separate two-layer heads for the policy (pi) and the value function (vf).
policy_kwargs = dict(
    features_extractor_class=CustomCombinedExtractor,
    features_extractor_kwargs=dict(image_channels=3, vector_obs_size=80, features_dim=32),
    # Plain dict form; the older list-of-dict form ``[dict(pi=..., vf=...)]``
    # is deprecated in stable_baselines3 and describes the same network.
    net_arch=dict(pi=[32, 16], vf=[32, 16]),
    activation_fn=nn.ReLU
)

# PPO on the dict observation space ("MultiInputPolicy"). ``ent_coef`` is only
# the starting value: a callback that assigns ``model.ent_coef`` during
# training (such as DecayingEntropyCalback) overrides it.
model = PPO("MultiInputPolicy",
            env, verbose=1,
            learning_rate=1e-3,
            n_steps=2560,
            batch_size=512,
            n_epochs=10,
            clip_range=0.2,
            gamma=0.99,
            gae_lambda=0.95,
            seed=0,
            ent_coef=0.1,
            vf_coef=0.005,
            policy_kwargs=policy_kwargs,
            tensorboard_log = './logs/stage2',
            # stats_window_size=1
)

In [None]:
# model.policy

In [None]:
def get_n_params(model):
    """Return the total number of scalar parameters in ``model``.

    Args:
        model: Any object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        Sum of the element counts of all parameter tensors; 0 when the model
        has no parameters.
    """
    # ``numel()`` is the product of a tensor's dimensions, which replaces the
    # manual nested loop (whose local ``nn`` shadowed the torch.nn alias).
    return sum(param.numel() for param in model.parameters())

In [None]:
# Report the total parameter count of the PPO policy network.
print(get_n_params(model.policy))

In [None]:
# Entropy-coefficient schedule: from 0.02 down to 0.005 over the first
# 100k timesteps.
decayingEntropyCallback = DecayingEntropyCalback(
    initial_value=0.02,
    final_value=0.005,
    max_steps=100_000,
)

# Train for 300k timesteps without resetting the model's timestep counter,
# logging to TensorBoard under the given run name.
model.learn(
    total_timesteps=300_000,
    tb_log_name="find_2_potential_dist_2",
    reset_num_timesteps=False,
    callback=decayingEntropyCallback,
)