In [1]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing
from torch import nn
from tensordict.nn import TensorDictModule

from collections import defaultdict

In [2]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing

In [3]:
is_fork = multiprocessing.get_start_method() == "fork"

device = (
    torch.device(0)
    if torch.cuda.is_available() and not is_fork
    else torch.device("cpu")
)

## Environment Preparation

#### Load unity environment using `mlagents_envs`

In [4]:
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.environment import UnityEnvironment

channel = EngineConfigurationChannel()
env_path = "C:/Users/Pawel/Documents/Unity_Project/warehouse-bot-training/environment_builds/test_env_simplified/Warehouse_Bot.exe"

from torchrl.envs import UnityMLAgentsEnv

unity_env = UnityEnvironment(
  file_name=env_path,
  side_channels=[channel],
  # additional_args=["-batchmode", "-nographics"]
)
channel.set_configuration_parameters(time_scale=3)

  unity_communicator_version = StrictVersion(unity_com_ver)


#### Transform environment to Gym format from `mlagents_envs` package

In [5]:
# from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper
# gym_env = UnityToGymWrapper(unity_env, allow_multiple_obs=True)

In [6]:
# # Flattening observations (removing Tuple(Box) -> Box)
# from gym.wrappers import FlattenObservation

# gym_env = FlattenObservation(gym_env)

#### Transform environment from `mlagents` to `gymnasium`

In [7]:
import gymnasium as gym

In [8]:
print(gym.__version__)

1.0.0


In [9]:
import numpy as np
from gymnasium import spaces
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import ActionTuple

class UnityGymWrapper(gym.Env):
    def __init__(self, unity_env, seed=None):
        super().__init__()
        self.unity_env = unity_env
        self.unity_env.reset()
        self.behavior_name = list(self.unity_env.behavior_specs.keys())[0]
        self.spec = self.unity_env.behavior_specs[self.behavior_name]   
        
        # Define observation space (assuming visual input)
        obs_shape = self.spec.observation_specs[0].shape
        self.observation_space = spaces.Box(low=0, high=255, shape=obs_shape, dtype=np.uint8) # ???
        
        # Define action space
        # if self.spec.action_spec.is_continuous():
            # self.action_space = spaces.Box(
            #     low=self.spec.action_spec.continuous_action_spec[0],
            #     high=self.spec.action_spec.continuous_action_spec[1],
            #     shape=(self.spec.action_spec.continuous_size,),
            #     dtype=np.float32
            # )
        if self.spec.action_spec.is_discrete():
            self.action_space = spaces.Discrete(self.spec.action_spec.discrete_branches[0])

        
    def reset(self, seed=None, options=None):
        self.unity_env.reset()
        decision_steps, _ = self.unity_env.get_steps(self.behavior_name)
        obs = decision_steps.obs[0]  # Assuming single-agent scenario
        return obs, {}

    def step(self, action):
        action_tuple = ActionTuple()
        # if self.spec.action_spec.is_continuous():
        #     action_tuple.add_continuous(np.array(action).reshape(1, -1))
        # else:
        #     action_tuple.add_discrete(np.array(action).reshape(1, -1))
        
        if self.spec.action_spec.is_discrete():
            action_tuple.add_discrete(np.array(action).reshape(1, -1))
        
        # print(action_tuple, np.array(action).reshape(1, -1))
        self.unity_env.set_action_for_agent(self.behavior_name, 0, action_tuple)
        self.unity_env.step()
        
        decision_steps, terminal_steps = self.unity_env.get_steps(self.behavior_name)

        if 0 in terminal_steps:
            obs = terminal_steps.obs[0]
            reward = terminal_steps.reward[0]
            
            # terminated - Natural episode ending.
            terminated = not terminal_steps.interrupted[0]
            
            # truncated - "Whether the truncation condition outside the scope of the MDP is satisfied. Typically, this is a timelimit"
            # interrupted - "The episode ended due to max steps or external termination, not because the episode ended naturally (failed/succeeded)."
            truncated = terminal_steps.interrupted[0]
            
            # terminated and truncated are mutually exclusive
        else:
            obs = decision_steps.obs[0]
            reward = decision_steps.reward[0]
            terminated = False
            truncated = False
        
        return obs, reward, terminated, truncated, {}

    def render(self, mode='human'):
        pass  # Unity renders its own environment
    
    def close(self):
        self.unity_env.close()

In [10]:
gymnasium_env = UnityGymWrapper(unity_env)

In [11]:
# gymnasium_env.step(0)

#### Creating stable_baselines3 model

In [12]:
from stable_baselines3 import PPO

model = PPO("MlpPolicy", gymnasium_env, verbose=1)
model.learn(total_timesteps=50_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 272      |
|    ep_rew_mean     | -126     |
| time/              |          |
|    fps             | 145      |
|    iterations      | 1        |
|    time_elapsed    | 14       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 254         |
|    ep_rew_mean          | -113        |
| time/                   |             |
|    fps                  | 144         |
|    iterations           | 2           |
|    time_elapsed         | 28          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004109394 |
|    clip_fraction        | 0.0107      |
|    clip_range           | 0.2         |
|    entropy_loss   

<stable_baselines3.ppo.ppo.PPO at 0x2a74674ee00>

In [16]:
model.learn(total_timesteps=50_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 309      |
|    ep_rew_mean     | -95.1    |
| time/              |          |
|    fps             | 152      |
|    iterations      | 1        |
|    time_elapsed    | 13       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 246         |
|    ep_rew_mean          | -82.5       |
| time/                   |             |
|    fps                  | 147         |
|    iterations           | 2           |
|    time_elapsed         | 27          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008337107 |
|    clip_fraction        | 0.0761      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.01       |
|    explained_variance   | 0.784       |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x2a74674ee00>

In [13]:
# vec_env = model.get_env()

In [14]:
# vec_env.reset()

In [15]:

# vec_env = model.get_env()
# obs = vec_env.reset()
# for i in range(1000):
#     action, _states = model.predict(obs, deterministic=True)
#     obs, reward, done, info = vec_env.step(action)
#     vec_env.render()
#     # VecEnv resets automatically
#     # if done:
#     #   obs = env.reset()