In [1]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing

In [2]:
# CUDA cannot be re-initialised in forked subprocesses, so the GPU is only
# selected when the multiprocessing start method is something other than "fork".
is_fork = multiprocessing.get_start_method() == "fork"

if torch.cuda.is_available() and not is_fork:
    device = torch.device(0)
else:
    device = torch.device("cpu")

## Environment Preparation

#### Transform environment from `mlagents` to `gymnasium`

In [3]:
import gymnasium as gym
# Record the installed gymnasium version in the notebook output for reproducibility.
print(gym.__version__)

1.1.1


In [4]:
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.environment import UnityEnvironment

from env_camera_gymnasium_wrapper import UnityCameraGymWrapper
from env_camera_raycasts_gymnasium_wrapper import UnityCameraRaycastsGymWrapper

import os

# Location of the Unity build launched by make_env().
# The default is a machine-specific absolute path; set the WAREHOUSE_BOT_ENV_PATH
# environment variable to point at the build on another machine without editing
# the notebook. An unset or empty variable falls back to the default.
env_path = os.environ.get("WAREHOUSE_BOT_ENV_PATH") or (
    "D:/_Thesis/warehouse-bot-training/environment_builds/warehouse_stage2_find_potential_dist/Warehouse_Bot.exe"
)
def make_env(file_name=None, time_scale=1):
    """Launch the Unity build and wrap it as a Gymnasium-style environment.

    Args:
        file_name: Path to the Unity executable. Defaults to the module-level
            ``env_path`` when ``None``.
        time_scale: Value forwarded to
            ``EngineConfigurationChannel.set_configuration_parameters``.

    Returns:
        The ``UnityCameraRaycastsGymWrapper`` built around the launched
        ``UnityEnvironment``. Its observation space is printed as a side effect.

    Raises:
        Whatever the Unity launch, the configuration call or the wrapper
        constructor raises. If the failure happens after the Unity process has
        been started, the process is closed before the exception propagates so
        that a retry from the notebook does not hit a still-running instance.
    """
    channel = EngineConfigurationChannel()

    unity_env = UnityEnvironment(
        file_name=env_path if file_name is None else file_name,
        side_channels=[channel],
    )

    try:
        channel.set_configuration_parameters(time_scale=time_scale)
        gymnasium_env = UnityCameraRaycastsGymWrapper(unity_env)
    except Exception:
        # Do not leak the Unity process when setup fails half-way.
        unity_env.close()
        raise

    print(gymnasium_env.observation_space)

    return gymnasium_env

In [5]:
# env = make_env()

#### Policy Config

#### Training Algorithm

In [19]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import namedtuple
from torch.distributions import Categorical

# Hyperparameters
# Gymnasium environment id; only referenced by the commented-out `gym.make(env_id)` call.
env_id = "CartPole-v1"
gamma = 0.99  # discount factor passed to RolloutBuffer.compute_gae
lam = 0.95  # GAE lambda passed to RolloutBuffer.compute_gae
clip_eps = 0.2  # probability ratio is clamped to [1 - clip_eps, 1 + clip_eps] in PPOAgent.update
ppo_epochs = 10  # number of passes over each rollout in PPOAgent.update
batch_size = 64  # minibatch size used in PPOAgent.update
update_timesteps = 2048  # environment steps collected per iteration in PPOAgent.train
lr = 3e-4  # learning rate of the Adam optimizer created in PPOAgent.__init__

# NOTE(review): this rebinds the `device` chosen in an earlier cell and, unlike
# that cell, does not take the multiprocessing start method into account.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Actor-Critic Network
class ActorCritic(nn.Module):
    """Actor-critic network with a shared MLP trunk and two linear heads.

    The trunk is two 64-unit ``Tanh`` layers. ``policy_head`` produces one
    logit per discrete action and ``value_head`` produces a single state value.

    Args:
        obs_dim: Size of the flat observation vector.
        act_dim: Number of discrete actions.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh()
        )
        self.policy_head = nn.Linear(64, act_dim)
        self.value_head = nn.Linear(64, 1)

    def forward(self, x):
        """Return ``(logits, value)`` for ``x``; ``value`` keeps its trailing size-1 dim."""
        x = self.shared(x)
        return self.policy_head(x), self.value_head(x)

    def get_action(self, obs):
        """Sample an action from the categorical policy.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``. For an unbatched
            observation of shape ``(obs_dim,)`` every element is a 0-dim tensor.
        """
        logits, value = self.forward(obs)
        dist = Categorical(logits=logits)
        action = dist.sample()
        # squeeze(-1) drops only the value head's size-1 output dim; a bare
        # squeeze() would also collapse a batch dimension of size 1.
        return action, dist.log_prob(action), dist.entropy(), value.squeeze(-1)

    def evaluate_actions(self, obs, actions):
        """Score given actions under the current policy.

        Args:
            obs: Batch of observations, shape ``(batch, obs_dim)``.
            actions: Batch of action indices, shape ``(batch,)``.

        Returns:
            Tuple ``(log_probs, entropy, values)``, each of shape ``(batch,)``,
            including when ``batch == 1``.
        """
        logits, values = self.forward(obs)
        dist = Categorical(logits=logits)
        log_probs = dist.log_prob(actions)
        entropy = dist.entropy()
        # Keep the batch dimension even for a single-sample minibatch.
        return log_probs, entropy, values.squeeze(-1)

# Rollout Buffer
class RolloutBuffer:
    """Stores one rollout of transitions and turns it into PPO training tensors.

    Args:
        device: Optional torch device for the tensors returned by
            ``compute_gae``. When ``None`` (the default) the module-level
            ``device`` variable is looked up at call time, which matches the
            previous behaviour.
    """

    _KEYS = ('obs', 'acts', 'logps', 'rews', 'vals', 'dones')

    def __init__(self, device=None):
        # Note: the parameter shadows the notebook-level `device` only inside __init__.
        self.device = device
        self.buffer = {key: [] for key in self._KEYS}

    def add(self, obs, act, logp, rew, val, done):
        """Append a single transition (all values as plain Python/NumPy objects)."""
        self.buffer['obs'].append(obs)
        self.buffer['acts'].append(act)
        self.buffer['logps'].append(logp)
        self.buffer['rews'].append(rew)
        self.buffer['vals'].append(val)
        self.buffer['dones'].append(done)

    def compute_gae(self, gamma=0.99, lam=0.95, last_val=0.0):
        """Compute GAE advantages and returns, then empty the buffer.

        Args:
            gamma: Discount factor.
            lam: GAE lambda.
            last_val: Value estimate of the state following the last stored
                transition. It is masked out when that transition is done.

        Returns:
            Tuple ``(obs, acts, logps, returns, advantages)`` of tensors on the
            target device. ``acts`` is int64, the others float32, and
            ``returns = advantages + values``.
        """
        # Fall back to the module-level `device` when none was given at construction.
        target = self.device if self.device is not None else device

        rews = self.buffer['rews']
        dones = self.buffer['dones']
        vals = [float(v) for v in self.buffer['vals']] + [float(last_val)]

        # The backward recursion runs on Python floats and fills a preallocated
        # list: no per-step tensor ops on the device and no O(n^2) list.insert(0, ...).
        n_steps = len(rews)
        adv = [0.0] * n_steps
        gae = 0.0
        for t in reversed(range(n_steps)):
            not_done = 1.0 - float(dones[t])
            delta = float(rews[t]) + gamma * vals[t + 1] * not_done - vals[t]
            gae = delta + gamma * lam * not_done * gae
            adv[t] = gae

        advantages = torch.tensor(adv, dtype=torch.float32, device=target)
        values = torch.tensor(vals[:-1], dtype=torch.float32, device=target)
        returns = advantages + values

        # Stack observations through NumPy first: building a tensor directly
        # from a list of ndarrays is a known slow path in PyTorch.
        obs = torch.as_tensor(
            np.asarray(self.buffer['obs'], dtype=np.float32), device=target
        )
        acts = torch.tensor(self.buffer['acts'], dtype=torch.int64, device=target)
        logps = torch.tensor(self.buffer['logps'], dtype=torch.float32, device=target)

        # Clear buffer
        for key in self.buffer:
            self.buffer[key].clear()

        return obs, acts, logps, returns, advantages

# PPO Agent
class PPOAgent:
    """PPO agent with a clipped surrogate objective for discrete actions.

    Relies on the notebook-level hyperparameters (``lr``, ``ppo_epochs``,
    ``batch_size``, ``clip_eps``, ``update_timesteps``, ``gamma``, ``lam``)
    and on the notebook-level ``device``.

    Args:
        obs_dim: Size of the flat observation vector.
        act_dim: Number of discrete actions.
    """

    def __init__(self, obs_dim, act_dim):
        self.model = ActorCritic(obs_dim, act_dim).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def update(self, obs, acts, old_logps, returns, advantages):
        """Run ``ppo_epochs`` passes of minibatch PPO updates over one rollout.

        Args:
            obs: Observations, shape ``(N, obs_dim)``.
            acts: Actions taken, shape ``(N,)``.
            old_logps: Log-probabilities of ``acts`` under the rollout policy.
            returns: Value targets, shape ``(N,)``.
            advantages: Advantage estimates, shape ``(N,)``; normalised here.
        """
        n_samples = len(obs)
        if n_samples == 0:
            return

        # std() of a single element is NaN, so only normalise real batches.
        if n_samples > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        for _ in range(ppo_epochs):
            idxs = np.random.permutation(n_samples)
            for start in range(0, n_samples, batch_size):
                mb_idx = idxs[start:start + batch_size]

                mb_obs = obs[mb_idx]
                mb_acts = acts[mb_idx]
                mb_old_logps = old_logps[mb_idx]
                mb_returns = returns[mb_idx]
                mb_advantages = advantages[mb_idx]

                logps, entropy, values = self.model.evaluate_actions(mb_obs, mb_acts)
                ratios = torch.exp(logps - mb_old_logps)

                # Clipped surrogate: take the pessimistic of the raw and clipped terms.
                surr1 = ratios * mb_advantages
                surr2 = torch.clamp(ratios, 1 - clip_eps, 1 + clip_eps) * mb_advantages
                policy_loss = -torch.min(surr1, surr2).mean()

                value_loss = ((values - mb_returns)**2).mean()
                entropy_bonus = entropy.mean()

                loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_bonus

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

    def train(self, env, iterations):
        """Alternate between collecting ``update_timesteps`` steps and updating.

        Args:
            env: Gymnasium-style environment whose ``reset`` returns
                ``(obs, info)`` and whose ``step`` returns the 5-tuple
                ``(obs, reward, terminated, truncated, info)``.
            iterations: Number of collect/update cycles.
        """
        for iteration in range(iterations):
            obs, _ = env.reset()
            buffer = RolloutBuffer()
            ep_return = 0
            ep_returns = []

            for _ in range(update_timesteps):
                obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device)
                # Rollout only needs numbers; skip building autograd graphs.
                with torch.no_grad():
                    action, logp, _, value = self.model.get_action(obs_tensor)

                next_obs, reward, terminated, truncated, _ = env.step(action.item())
                # NOTE(review): truncation is treated like termination here, so
                # time-limit cut-offs are not bootstrapped.
                done = terminated or truncated

                buffer.add(obs, action.item(), logp.item(), reward, value.item(), done)
                ep_return += reward
                obs = next_obs

                if done:
                    ep_returns.append(ep_return)
                    ep_return = 0
                    obs, _ = env.reset()

            # The rollout is cut at a fixed length, so the last episode is
            # usually unfinished. Bootstrap from the value of the state we
            # stopped in; compute_gae masks it out when the last step was done.
            with torch.no_grad():
                last_obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device)
                _, last_value = self.model(last_obs_tensor)
            last_val = last_value.item()

            # Print done stats
            dones_tensor = torch.tensor(buffer.buffer['dones'], dtype=torch.bool)
            unique_vals, counts = torch.unique(dones_tensor, return_counts=True)
            for val, count in zip(unique_vals, counts):
                print(f"{val.item()}: {count.item()}")

            # Training step (batch tensors get their own names so the
            # environment observation `obs` is not shadowed).
            b_obs, b_acts, b_logps, b_returns, b_advantages = buffer.compute_gae(
                gamma, lam, last_val
            )
            self.update(b_obs, b_acts, b_logps, b_returns, b_advantages)

            # Stats per real episode
            ep_returns_np = np.array(ep_returns)
            mean_return = ep_returns_np.mean() if len(ep_returns_np) > 0 else 0.0
            std_return = ep_returns_np.std(ddof=0) if len(ep_returns_np) > 0 else 0.0

            print(f"Iteration {iteration} | Episodes: {len(ep_returns)} | "
                  f"Mean Return: {mean_return:.2f} | Std Return: {std_return:.2f}")

In [None]:
# Create the training environment
# env = gym.make(env_id)
# Launches the Unity build via make_env(), which also prints the observation space.
env = make_env()

Box(0, 255, (80,), uint8)


In [None]:
# Size the network from the environment's spaces: first dimension of the
# observation shape and number of actions (reads `.n`, so this assumes a
# Discrete action space).
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n

agent = PPOAgent(obs_dim, act_dim)

# Run 5 collect-and-update iterations; each one prints done counts and episode stats.
agent.train(env, 5)

False: 2024
True: 24
Iteration 0 | Episodes: 24 | Mean Return: -99.64 | Std Return: 33.16


In [None]:
# obs, _ = env.reset()
# # print(obs)
# for i in range(500):
#     action = 1
#     obs, reward, terminated, truncated, info = env.step(action)
#     print(reward, terminated, truncated)
#     if (terminated):
#         break