In [17]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import multiprocessing
from torch import nn
from tensordict.nn import TensorDictModule

from collections import defaultdict

In [18]:
import warnings
warnings.filterwarnings("ignore")

from torch import multiprocessing

In [19]:
is_fork = multiprocessing.get_start_method() == "fork"

device = (
    torch.device(0)
    if torch.cuda.is_available() and not is_fork
    else torch.device("cpu")
)

## Environment Preparation

#### Load unity environment using `mlagents_envs`

In [20]:
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.environment import UnityEnvironment

channel = EngineConfigurationChannel()
env_path = "C:/Users/Pawel/Documents/Unity_Project/warehouse-bot-training/environment_builds/test_env_simplified/Warehouse_Bot.exe"

from torchrl.envs import UnityMLAgentsEnv

unity_env = UnityEnvironment(
  file_name=env_path,
  side_channels=[channel],
  # additional_args=["-batchmode", "-nographics"]
)
channel.set_configuration_parameters(time_scale=3)

#### Transform environment from `mlagents` to `gymnasium`

In [21]:
import gymnasium as gym

In [22]:
print(gym.__version__)

1.0.0


In [23]:
import numpy as np
from gymnasium import spaces
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import ActionTuple

class UnityGymWrapper(gym.Env):
    def __init__(self, unity_env, seed=None):
        super().__init__()
        self.unity_env = unity_env
        self.unity_env.reset()
        self.behavior_name = list(self.unity_env.behavior_specs.keys())[0]
        self.spec = self.unity_env.behavior_specs[self.behavior_name]   
        
        # Define observation space (assuming visual input)
        obs_shape = self.spec.observation_specs[0].shape
        self.observation_space = spaces.Box(low=0, high=255, shape=obs_shape, dtype=np.uint8) # ???
        
        # Define action space
        # if self.spec.action_spec.is_continuous():
            # self.action_space = spaces.Box(
            #     low=self.spec.action_spec.continuous_action_spec[0],
            #     high=self.spec.action_spec.continuous_action_spec[1],
            #     shape=(self.spec.action_spec.continuous_size,),
            #     dtype=np.float32
            # )
        if self.spec.action_spec.is_discrete():
            self.action_space = spaces.Discrete(self.spec.action_spec.discrete_branches[0])

        
    def reset(self, seed=None, options=None):
        self.unity_env.reset()
        decision_steps, _ = self.unity_env.get_steps(self.behavior_name)
        obs = decision_steps.obs[0]  # Assuming single-agent scenario
        return obs, {}

    def step(self, action):
        action_tuple = ActionTuple()
        # if self.spec.action_spec.is_continuous():
        #     action_tuple.add_continuous(np.array(action).reshape(1, -1))
        # else:
        #     action_tuple.add_discrete(np.array(action).reshape(1, -1))
        
        if self.spec.action_spec.is_discrete():
            action_tuple.add_discrete(np.array(action).reshape(1, -1))
        
        # print(action_tuple, np.array(action).reshape(1, -1))
        self.unity_env.set_action_for_agent(self.behavior_name, 0, action_tuple)
        self.unity_env.step()
        
        decision_steps, terminal_steps = self.unity_env.get_steps(self.behavior_name)

        if 0 in terminal_steps:
            obs = terminal_steps.obs[0]
            reward = terminal_steps.reward[0]
            
            # terminated - Natural episode ending.
            terminated = not terminal_steps.interrupted[0]
            
            # truncated - "Whether the truncation condition outside the scope of the MDP is satisfied. Typically, this is a timelimit"
            # interrupted - "The episode ended due to max steps or external termination, not because the episode ended naturally (failed/succeeded)."
            truncated = terminal_steps.interrupted[0]
            
            # terminated and truncated are mutually exclusive
        else:
            obs = decision_steps.obs[0]
            reward = decision_steps.reward[0]
            terminated = False
            truncated = False
        
        return obs, reward, terminated, truncated, {}

    def render(self, mode='human'):
        pass  # Unity renders its own environment
    
    def close(self):
        self.unity_env.close()

In [24]:
gymnasium_env = UnityGymWrapper(unity_env)

#### Define functions

#### Hyperparameters

In [25]:
GAMMA = 0.99
LAMBDA = 0.95
CLIP_EPS = 0.2
LR = 3e-4

EPOCHS = 10
BATCH_SIZE = 64
# ROLLOUT_SIZE = 2048

BUFFER_SIZE = 10240
TOTAL_STEPS = 102400

In [26]:
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, state):
        action_probs = self.actor(state)
        value = self.critic(state)
        return action_probs, value

In [27]:
def compute_advantages(rewards, values, dones):
    advantages = []
    gae = 0
    next_value = values[-1]
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + GAMMA * next_value * (1 - dones[t]) - values[t]
        gae = delta + GAMMA * LAMBDA * (1 - dones[t]) * gae
        advantages.insert(0, gae)
        next_value = values[t]
    returns = np.array(advantages) + values[:-1]
    return torch.tensor(advantages, dtype=torch.float32, device=device), \
           torch.tensor(returns, dtype=torch.float32, device=device)

In [45]:
def ppo_update(model, optimizer, states, actions, old_log_probs, returns, advantages):
    for _ in range(EPOCHS):
        indices = np.random.permutation(len(states))
        print(indices)
        for i in range(0, len(states), BATCH_SIZE):
            batch_idx = indices[i:i + BATCH_SIZE]
            print(batch_idx)
            
            state_batch = states[batch_idx]
            action_batch = actions[batch_idx]
            old_log_prob_batch = old_log_probs[batch_idx]
            return_batch = returns[batch_idx]
            advantage_batch = advantages[batch_idx]

            action_probs, values = model(state_batch)
            dist = Categorical(action_probs)
            new_log_probs = dist.log_prob(action_batch)
            entropy = dist.entropy().mean()
            
            ratio = torch.exp(new_log_probs - old_log_prob_batch) # ???
            clipped_ratio = torch.clamp(ratio, 1 - CLIP_EPS, 1 + CLIP_EPS)
            policy_loss = -torch.min(ratio * advantage_batch, clipped_ratio * advantage_batch).mean()
            value_loss = nn.MSELoss()(values.squeeze(), return_batch)
            loss = policy_loss + 0.5 * value_loss - 0.01 * entropy
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [46]:
def train_bufffer(env, model, optimizer):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device)
    
    for i in range(BUFFER_SIZE):
        states, actions, rewards, dones, log_probs, values = [], [], [], [], [], []
        
        # Choose Action
        action_probs, value = model(state)
        dist = Categorical(action_probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        
        # Gather trajectories
        next_state, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated
        
        # Fill buffer
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        log_probs.append(log_prob)
        values.append(value.item())
        
        if done:
            state, _ = env.reset()
        else:
            state = torch.tensor(next_state, dtype=torch.float32, device=device)
    
        # No sense - rewrite
        # states = torch.stack(states)
        # actions = torch.tensor(actions, dtype=torch.int64, device=device)
        # log_probs = torch.stack(log_probs)
        # values.append(0)  # Bootstrap last value
        
        # Compute advantage
        advantages, returns = compute_advantages(rewards, values, dones)
        print(f"advantages: {advantages}, returns:{returns}")
        
        # Update policy
        ppo_update(model, optimizer, states, actions, log_probs, returns, advantages)
        
        print(f"Mean Reward: {sum(rewards) / len(rewards)}")

    env.close()
    torch.save(model.state_dict(), "ppo_cartpole.pth")

In [47]:
state_dim = gymnasium_env.observation_space.shape[0]
action_dim = gymnasium_env.action_space.n
model = ActorCritic(state_dim, action_dim).to(device)

optimizer = optim.Adam(model.parameters(), lr=LR)

train_bufffer(gymnasium_env, model, optimizer)

advantages: tensor([-0.2029]), returns:tensor([])
[0]
[0]


TypeError: only integer scalar arrays can be converted to a scalar index