In [1]:
import warnings
warnings.filterwarnings("ignore")

import torch as th
from torch import multiprocessing

In [2]:
# Detect whether worker processes are started with the "fork" method.
is_fork = multiprocessing.get_start_method() == "fork"

# Use the first CUDA device only when CUDA is available and the start method
# is not "fork"; in every other case stay on the CPU.
if th.cuda.is_available() and not is_fork:
    device = th.device(0)
else:
    device = th.device("cpu")

## Environment Preparation

#### Transform environment from `mlagents` to `gymnasium`

In [3]:
import gymnasium as gym
# Show which gymnasium version is installed in this kernel.
print(gym.__version__)

1.1.1


In [4]:
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.environment import UnityEnvironment
from gymnasium.wrappers import NormalizeObservation, NormalizeReward

from src.environments.env_camera_raycasts_gymnasium_wrapper import UnityCameraRaycastsGymWrapper

# Absolute, machine-specific path to the Unity build executable that make_env()
# launches; it has to be adjusted when the notebook runs on another machine.
env_path = "D:/_Thesis/warehouse-bot-training/environment_builds/stage2/find_camera_raycasts_16x5/Warehouse_Bot.exe"
def make_env(file_name=None, time_scale=1, no_graphics=True):
    """Launch the Unity build and wrap it as a gymnasium environment.

    Args:
        file_name: Path to the Unity executable. When omitted, the
            module-level ``env_path`` is used.
        time_scale: Simulation time scale sent to Unity through the engine
            configuration side channel.
        no_graphics: Forwarded to ``UnityEnvironment``.
            NOTE(review): the ML-Agents documentation advises using
            no-graphics mode only when training does not rely on visual
            observations, yet this environment exposes an ``image``
            observation. Confirm the frames are actually rendered.

    Returns:
        The ``UnityCameraRaycastsGymWrapper`` around the running Unity
        environment. Its observation space is printed as a side effect.
    """
    if file_name is None:
        file_name = env_path

    channel = EngineConfigurationChannel()

    unity_env = UnityEnvironment(
        file_name=file_name,
        side_channels=[channel],
        no_graphics=no_graphics,
    )

    try:
        channel.set_configuration_parameters(time_scale=time_scale)
        gymnasium_env = UnityCameraRaycastsGymWrapper(unity_env)
    except BaseException:
        # Setup failed after the Unity process was started: shut it down so
        # it is not left running in the background, then re-raise.
        unity_env.close()
        raise

    # Reward normalization is currently disabled:
    # gymnasium_env = NormalizeReward(gymnasium_env)

    print(gymnasium_env.observation_space)

    return gymnasium_env

#### Policy Config

In [5]:
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Categorical

# Actor-Critic Network for only vector observations
class ActorCritic(nn.Module):
    """Actor-critic network for flat vector observations and discrete actions.

    A shared two-layer tanh MLP feeds two linear heads: a policy head that
    produces one logit per action and a value head that produces one value
    per observation.

    Args:
        obs_dim: Length of the observation vector.
        act_dim: Number of discrete actions (size of the logits output).
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh()
        )
        self.policy_head = nn.Linear(64, act_dim)
        self.value_head = nn.Linear(64, 1)

    def forward(self, x):
        """Return ``(logits, value)``; ``value`` keeps its trailing size-1 dim."""
        features = self.shared(x)
        return self.policy_head(features), self.value_head(features)

    def get_action(self, obs):
        """Sample an action from the categorical policy.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``. ``value`` has every
            size-1 dimension squeezed away, so a batch of one observation
            yields a 0-d value tensor. This is left unchanged for callers
            that may rely on it.
        """
        logits, value = self.forward(obs)
        dist = Categorical(logits=logits)
        action = dist.sample()
        return action, dist.log_prob(action), dist.entropy(), value.squeeze()

    def evaluate_actions(self, obs, actions):
        """Score previously taken actions under the current policy.

        Returns:
            Tuple ``(log_probs, entropy, values)`` with one entry per
            observation in the batch.
        """
        logits, values = self.forward(obs)
        dist = Categorical(logits=logits)
        log_probs = dist.log_prob(actions)
        entropy = dist.entropy()
        # Drop only the trailing value dimension. A bare squeeze() would also
        # remove the batch dimension for a batch of one, leaving `values`
        # 0-d while `log_probs` and `entropy` keep shape (1,).
        return log_probs, entropy, values.squeeze(-1)

class Swish(nn.Module):
    """Swish activation: each element is multiplied by its own sigmoid."""

    def forward(self, x):
        gate = th.sigmoid(x)
        return x * gate

# Actor-Critic Network for multimodal observations (image plus vector)
class ActorCriticMultimodal(nn.Module):
    """Actor-critic network for multimodal observations (image plus vector).

    The image goes through a small CNN followed by an MLP, the vector goes
    through its own MLP, and the two feature vectors are concatenated and
    passed through a shared trunk that feeds a policy head (action logits)
    and a value head.

    Args:
        act_dim: Number of discrete actions (size of the logits output).
        visual_size: ``(bands, height, width)`` of the image observation.
        vector_obs_size: Length of the vector observation.
    """

    def __init__(self, act_dim, visual_size=(3, 36, 64), vector_obs_size=128):
        super().__init__()
        # Unpacking also rejects a visual_size that is not exactly 3 entries.
        bands, height, width = visual_size

        # Shapes of image and vector inputs: [<batch size>, <bands, height, width>], [<batch size>, <length>]

        visual_out_size = 64
        vector_out_size = 32

        # Visual Encoder
        self.visual_encoder_cnn = nn.Sequential(
            nn.Conv2d(bands, 16, kernel_size=5, stride=4, padding=0),
            nn.LeakyReLU(0.01),
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.01),
            nn.Conv2d(32, 32, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.01),
            nn.Flatten(),
        )

        # Compute flattened visual output size from dummy input
        dummy_input = th.zeros(1, bands, height, width)
        with th.no_grad():
            visual_encoder_cnn_out_size = self.visual_encoder_cnn(dummy_input).shape[1]

        self.visual_encoder_mlp = nn.Sequential(
            nn.Linear(visual_encoder_cnn_out_size, 64),
            Swish(),
            nn.Linear(64, visual_out_size),
            Swish()
        )

        # Vector Encoder
        self.vector_encoder = nn.Sequential(
            nn.Linear(vector_obs_size, 32),
            Swish(),
            nn.Linear(32, vector_out_size),
            Swish()
        )

        # Concatenation Network
        self.shared = nn.Sequential(
            nn.Linear(visual_out_size + vector_out_size, 64),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh()
        )
        self.policy_head = nn.Linear(32, act_dim)
        self.value_head = nn.Linear(32, 1)

    def forward(self, observations):
        """Return ``(logits, value)`` for a batch of observations.

        Args:
            observations: Mapping with an ``"image"`` tensor shaped
                ``[batch, bands, height, width]`` and a ``"vector"`` tensor
                shaped ``[batch, length]``.
        """
        # NOTE(review): pixels are cast to float but not rescaled here. If
        # the wrapper delivers raw 0-255 values, consider normalizing them
        # upstream. Confirm.
        image = observations["image"].float()
        # Cast the vector as well, so a non-float32 input (e.g. float64 from
        # a NumPy conversion) does not fail in the first Linear layer.
        vector = observations["vector"].float()

        image_features = self.visual_encoder_cnn(image)
        image_features = self.visual_encoder_mlp(image_features)
        vector_features = self.vector_encoder(vector)

        combined = th.cat([image_features, vector_features], dim=1)
        x = self.shared(combined)
        return self.policy_head(x), self.value_head(x)

    def get_action(self, obs):
        """Sample an action from the categorical policy.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``. ``value`` has every
            size-1 dimension squeezed away, so a batch of one observation
            yields a 0-d value tensor. This is left unchanged for callers
            that may rely on it.
        """
        logits, value = self.forward(obs)
        dist = Categorical(logits=logits)
        action = dist.sample()
        return action, dist.log_prob(action), dist.entropy(), value.squeeze()

    def evaluate_actions(self, obs, actions):
        """Score previously taken actions under the current policy.

        Returns:
            Tuple ``(log_probs, entropy, values)`` with one entry per
            observation in the batch.
        """
        logits, values = self.forward(obs)
        dist = Categorical(logits=logits)
        log_probs = dist.log_prob(actions)
        entropy = dist.entropy()
        # Drop only the trailing value dimension. A bare squeeze() would also
        # remove the batch dimension for a batch of one, leaving `values`
        # 0-d while `log_probs` and `entropy` keep shape (1,).
        return log_probs, entropy, values.squeeze(-1)

In [6]:
def count_parameters(model):
    """
    Count parameters in each block of the network and total parameters.

    Args:
        model: PyTorch model

    Returns:
        dict: Maps the name of every direct child module to its number of
        parameter elements, plus a 'total' entry with the number of
        parameter elements of the whole model.
    """
    block_params = {
        name: sum(p.numel() for p in module.parameters())
        for name, module in model.named_children()
    }

    # Take the total from the model itself instead of summing the blocks.
    # That also covers parameters registered directly on `model` (including
    # a model without any child modules) and counts a parameter shared by
    # several blocks only once.
    block_params['total'] = sum(p.numel() for p in model.parameters())
    return block_params


#### Training Algorithm

In [11]:
# Hyper-parameters and compute device handed to PPOAgent as a single dict.
# 'val_loss_coef' evaluates to 0.005; 'device' is CUDA whenever it is available.
settings = dict(
    gamma=0.99,
    lam=0.95,
    clip_eps=0.2,
    ppo_epochs=2,
    batch_size=64,
    update_timesteps=512,
    lr=1e-4,
    val_loss_coef=0.5 / 100,
    ent_loss_coef=0.001,
    device=th.device("cuda") if th.cuda.is_available() else th.device("cpu"),
)

In [8]:
# Create environment: make_env() launches the Unity build and returns the
# wrapped environment, which the cells below use as `env`.
# NOTE(review): no visible cell closes `env`; presumably the wrapper exposes
# close(). Confirm, and call it before re-running this cell.
env = make_env()

Dict('image': Box(0, 255, (3, 36, 64), uint8), 'vector': Box(0.0, 255.0, (80,), float32))


In [13]:
from src.algorithms.PPO_algorithm import PPOAgent

# Number of discrete actions the policy head has to produce.
act_dim = env.action_space.n

# Take the input sizes from the environment's observation space instead of
# repeating them by hand, so the network always matches the loaded build.
model_net = ActorCriticMultimodal(
    act_dim,
    visual_size=env.observation_space["image"].shape,
    vector_obs_size=env.observation_space["vector"].shape[0],
)
param_counts = count_parameters(model_net)
print(param_counts)

# The agent receives the network together with the hyper-parameter dict.
agent = PPOAgent(model_net, settings)

{'visual_encoder_cnn': 15104, 'visual_encoder_mlp': 20608, 'vector_encoder': 3648, 'shared': 8288, 'policy_head': 99, 'value_head': 33, 'total': 47780}


In [14]:
# Start training the agent on the Unity environment.
# NOTE(review): 25 is presumably the number of training iterations; confirm
# against PPOAgent.train.
agent.train(env, 25)

Iteration 0 completed. Episodes: 8 | Mean Return: -83.5684 | Std Return: 6.3631 | Mean steps: 70.7500 | Std steps: 9.8203 | Mean losses: total: 0.004701, policy: 0.000814, value: 0.004985, entropy: 0.001098
Iteration 1 completed. Episodes: 8 | Mean Return: -19.4160 | Std Return: 92.6767 | Mean steps: 67.8750 | Std steps: 35.2187 | Mean losses: total: 0.007368, policy: 0.003493, value: 0.004974, entropy: 0.001098
Iteration 2 completed. Episodes: 7 | Mean Return: -45.9314 | Std Return: 61.1068 | Mean steps: 84.5714 | Std steps: 37.3396 | Mean losses: total: -0.008490, policy: -0.013262, value: 0.005870, entropy: 0.001099
Iteration 3 completed. Episodes: 7 | Mean Return: -61.0048 | Std Return: 66.1800 | Mean steps: 84.8571 | Std steps: 34.7868 | Mean losses: total: -0.005278, policy: -0.009408, value: 0.005227, entropy: 0.001098
Iteration 4 completed. Episodes: 8 | Mean Return: -35.1805 | Std Return: 80.5388 | Mean steps: 79.2500 | Std steps: 28.6564 | Mean losses: total: 0.003565, policy

KeyboardInterrupt: 