In [1]:
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam
import numpy as np
import gym
from gym.spaces import Discrete, Box
import random

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper

In [2]:
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
    # Build a feedforward neural network.
    layers = []
    for j in range(len(sizes)-1):
        act = activation if j < len(sizes)-2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
    return nn.Sequential(*layers)

In [3]:
def reward_to_go(rews):
    n = len(rews)
    rtgs = np.zeros_like(rews)
    for i in reversed(range(n)):
        rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0)
    return rtgs

In [4]:
env_path = "C:/Users/Pawel/Documents/Unity_Project/warehouse-bot-training/environment_builds/test_env"

In [5]:
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

channel = EngineConfigurationChannel()


unity_env = UnityEnvironment(file_name=env_path, side_channels=[channel])
channel.set_configuration_parameters(time_scale = 5.0)
env = UnityToGymWrapper(unity_env, allow_multiple_obs=True)

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
# import torch
# from torch.optim import Adam
# from torch.distributions.categorical import Categorical
# from mlagents_envs.environment import UnityEnvironment
# from gym_unity.envs import UnityToGymWrapper
# import numpy as np

# # Set the device to GPU
# device = "cuda" if torch.cuda.is_available() else "cpu"

# unity_env = UnityEnvironment(file_name=env_path, no_graphics=False)
# env = UnityToGymWrapper(unity_env, allow_multiple_obs=True)

def train(env, hidden_sizes=[32], lr=1e-2, epochs=50, batch_size=1000, exp_factor = 0.05, render=False):
    obs_dim = np.array(env.reset()).shape[1]
    n_acts = env.action_space.n
    print(n_acts, obs_dim)

    # Define the policy network and move it to GPU
    logits_net = mlp(sizes=[obs_dim] + hidden_sizes + [n_acts]).to(device)

    def get_policy(obs):
        logits = logits_net(obs)
        return Categorical(logits=logits)

    def get_action(obs):
        draw = random.random()
        
        if draw <= exp_factor:
            # Return random action
            action = random.randrange(n_acts)
            return action
        
        return get_policy(obs).sample().item() 

    def compute_loss(obs, act, weights):
        logp = get_policy(obs).log_prob(act)
        return -(logp * weights).mean()

    optimizer = Adam(logits_net.parameters(), lr=lr)

    def train_one_epoch(env):
        batch_obs = []
        batch_acts = []
        batch_weights = []
        batch_rets = []
        batch_lens = []

        obs = env.reset()
        done = False
        ep_rews = []
        finished_rendering_this_epoch = False

        while True:
            batch_obs.append(obs.copy())
            act = get_action(torch.as_tensor(obs, dtype=torch.float32).to(device))
            obs, rew, done, _ = env.step(act)

            batch_acts.append(act)
            ep_rews.append(rew)

            if done:
                ep_ret, ep_len = sum(ep_rews), len(ep_rews)
                batch_rets.append(ep_ret)
                batch_lens.append(ep_len)
                batch_weights += list(reward_to_go(ep_rews))
                obs = env.reset()
                done, ep_rews = False, []

                finished_rendering_this_epoch = True
                if len(batch_obs) > batch_size:
                    break

        optimizer.zero_grad()
        batch_loss = compute_loss(
            obs=torch.as_tensor(batch_obs, dtype=torch.float32).to(device),
            act=torch.as_tensor(batch_acts, dtype=torch.int32).to(device),
            weights=torch.as_tensor(batch_weights, dtype=torch.float32).to(device)
        )
        batch_loss.backward()
        optimizer.step()
        return batch_loss, batch_rets, batch_lens

    for i in range(epochs):
        batch_loss, batch_rets, batch_lens = train_one_epoch(env)
        print(f'epoch: {i:3d} \t loss: {batch_loss:.3f} \t return: {np.mean(batch_rets):.3f} \t ep_len: {np.mean(batch_lens):.3f}')


In [9]:
train(env, hidden_sizes=[256, 256, 128], lr=1e-3, batch_size=4000, exp_factor=0.1)

3 500
epoch:   0 	 loss: -106.058 	 return: -131.871 	 ep_len: 299.929
epoch:   1 	 loss: -28.674 	 return: -73.050 	 ep_len: 501.000
epoch:   2 	 loss: -23.322 	 return: -85.050 	 ep_len: 501.000
epoch:   3 	 loss: -18.777 	 return: -89.350 	 ep_len: 501.000
epoch:   4 	 loss: -17.710 	 return: -91.050 	 ep_len: 501.000
epoch:   5 	 loss: -20.040 	 return: -93.050 	 ep_len: 501.000
epoch:   6 	 loss: -17.335 	 return: -93.050 	 ep_len: 501.000
epoch:   7 	 loss: -21.982 	 return: -92.300 	 ep_len: 501.000
epoch:   8 	 loss: -23.719 	 return: -92.700 	 ep_len: 501.000
epoch:   9 	 loss: -24.172 	 return: -93.400 	 ep_len: 501.000


KeyboardInterrupt: 