In [1]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing
from torch import nn
from tensordict.nn import TensorDictModule

from collections import defaultdict

In [2]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing

In [3]:
is_fork = multiprocessing.get_start_method() == "fork"

device = (
    torch.device(0)
    if torch.cuda.is_available() and not is_fork
    else torch.device("cpu")
)

## Environment Preparation

#### Load unity environment using `mlagents_envs`

In [4]:
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.environment import UnityEnvironment

channel = EngineConfigurationChannel()
env_path = "C:/Users/Pawel/Documents/Unity_Project/warehouse-bot-training/environment_builds/test_env_simplified/Warehouse_Bot.exe"

from torchrl.envs import UnityMLAgentsEnv

unity_env = UnityEnvironment(
  file_name=env_path,
  side_channels=[channel],
  additional_args=["-batchmode", "-nographics"]
)
channel.set_configuration_parameters(time_scale=10)

  unity_communicator_version = StrictVersion(unity_com_ver)


#### Transform environment from `mlagents` to `gymnasium`

In [5]:
import gymnasium as gym

In [6]:
print(gym.__version__)

1.0.0


In [7]:
import numpy as np
from gymnasium import spaces
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import ActionTuple

class UnityGymWrapper(gym.Env):
    def __init__(self, unity_env, seed=None):
        super().__init__()
        self.unity_env = unity_env
        self.unity_env.reset()
        self.behavior_name = list(self.unity_env.behavior_specs.keys())[0]
        self.spec = self.unity_env.behavior_specs[self.behavior_name]   
        
        # Define observation space (assuming visual input)
        obs_shape = self.spec.observation_specs[0].shape
        self.observation_space = spaces.Box(low=0, high=255, shape=obs_shape, dtype=np.uint8) # ???
        
        # Define action space
        if self.spec.action_spec.is_discrete():
            self.action_space = spaces.Discrete(self.spec.action_spec.discrete_branches[0])

        
    def reset(self, seed=None, options=None):
        self.unity_env.reset()
        decision_steps, _ = self.unity_env.get_steps(self.behavior_name)
        obs = decision_steps.obs[0]  # Assuming single-agent scenario
        return obs, {}

    def step(self, action):
        action_tuple = ActionTuple()
        
        if self.spec.action_spec.is_discrete():
            action_tuple.add_discrete(np.array(action).reshape(1, -1))
        
        self.unity_env.set_action_for_agent(self.behavior_name, 0, action_tuple)
        self.unity_env.step()
        
        decision_steps, terminal_steps = self.unity_env.get_steps(self.behavior_name)

        if 0 in terminal_steps:
            obs = terminal_steps.obs[0]
            reward = terminal_steps.reward[0]
            
            # terminated - Natural episode ending.
            terminated = not terminal_steps.interrupted[0]
            
            # truncated - "Whether the truncation condition outside the scope of the MDP is satisfied. Typically, this is a timelimit"
            # interrupted - "The episode ended due to max steps or external termination, not because the episode ended naturally (failed/succeeded)."
            truncated = terminal_steps.interrupted[0]
            
            # terminated and truncated are mutually exclusive
        else:
            obs = decision_steps.obs[0]
            reward = decision_steps.reward[0]
            terminated = False
            truncated = False
        
        return obs, reward, terminated, truncated, {}

    def render(self, mode='human'):
        pass  # Unity renders its own environment
    
    def close(self):
        self.unity_env.close()

In [8]:
gymnasium_env = UnityGymWrapper(unity_env)

#### Creating stable_baselines3 model

In [9]:
# from stable_baselines3.common.env_util import make_vec_env
# from stable_baselines3.common.vec_env import SubprocVecEnv

# env = make_vec_env(gymnasium_env, n_envs=8, vec_env_cls=SubprocVecEnv)
# model = PPO("MlpPolicy", env, device="cpu")
# model.learn(total_timesteps=25_000)

In [10]:
from stable_baselines3 import PPO

model = PPO("MlpPolicy", gymnasium_env, verbose=1,
            learning_rate=3e-4,
            n_steps=10240,
            batch_size=256,
            n_epochs=6,
            clip_range=0.2,
            gamma=0.99,
            gae_lambda=0.96,
            seed=0,
            ent_coef=0.005,
            vf_coef=0.5,
            
            
)
model.learn(total_timesteps=250_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 245      |
|    ep_rew_mean     | -110     |
| time/              |          |
|    fps             | 498      |
|    iterations      | 1        |
|    time_elapsed    | 20       |
|    total_timesteps | 10240    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 276          |
|    ep_rew_mean          | -117         |
| time/                   |              |
|    fps                  | 490          |
|    iterations           | 2            |
|    time_elapsed         | 41           |
|    total_timesteps      | 20480        |
| train/                  |              |
|    approx_kl            | 0.0040261084 |
|    clip_fraction        | 0.0285       |
|    clip_range           | 0.2          |
|    en

<stable_baselines3.ppo.ppo.PPO at 0x1f4d92b1c90>

In [11]:
model.save('./saved_models/test')