In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from mlagents_envs.environment import UnityEnvironment

In [2]:
# Select the compute device: a CUDA GPU when PyTorch can see one, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Also report the name of GPU 0 so the run records which card was used.
    print(device, torch.cuda.get_device_name(0))
else:
    print(device)

cuda NVIDIA GeForce RTX 3060


### Connect to Unity to examine behavior names and the state and action design in this training environment

In [3]:
# Open the connection to Unity on base port 5004.
# NOTE(review): file_name=None gives no executable path — presumably this
# attaches to a running Unity Editor session; confirm against the mlagents_envs docs.
env = UnityEnvironment(file_name= None, base_port=5004)

In [4]:
# Reset the environment, then list every behavior name it reports together
# with that behavior's spec (observation shapes and action layout).
env.reset()
behaviorNames = list(env.behavior_specs)
print(behaviorNames)
for behaviorName in behaviorNames:
    behavior_spec = env.behavior_specs[behaviorName]
    print(behaviorName, behavior_spec)

['PushBlock?team=0']
PushBlock?team=0 BehaviorSpec(observation_shapes=[(105,), (105,)], action_spec=ActionSpec(continuous_size=0, discrete_branches=(7,)))


In [5]:
# Close this inspection session; the notebook reconnects later with the same base_port.
env.close()

### DQN

In [6]:
# Problem dimensions, matching the BehaviorSpec printed by the inspection cell.
N_STATES  = 210  # two observation vectors of 105 values each, concatenated (105+105)
N_ACTIONS = 7  # 1 discrete branch with 7 values: move forward/backward, rotate R/L, move R/L (7th value presumably "no action" — TODO confirm)
N_AGENTS = 3  # number of agents whose transitions are collected per step

# Width of both hidden layers of the Q-network.
hidden_units = 256 # from the YAML file (presumably the ML-Agents trainer config — confirm)

LEARNING_RATE = 0.0003  # step size passed to the Adam optimizer
MEMORY_CAPACITY = 500 # number of rows in the replay memory (10000 noted as an alternative value)
BATCH_SIZE = 128  # NOTE(review): not used in the visible cells; presumably the training mini-batch size

In [7]:
class Net(nn.Module):
    """Fully connected Q-network with two hidden ReLU layers.

    Maps a batch of state vectors to one output value per discrete action.

    Args:
        n_states: Size of the input state vector. Defaults to the notebook
            constant ``N_STATES``.
        n_actions: Number of output values (one per discrete action).
            Defaults to the notebook constant ``N_ACTIONS``.
        n_hidden: Width of both hidden layers. Defaults to the notebook
            constant ``hidden_units``.

    Calling ``Net()`` without arguments builds exactly the network the
    notebook used before; the parameters only make the class reusable for
    other observation/action sizes.
    """

    def __init__(self, n_states=None, n_actions=None, n_hidden=None):
        super().__init__()
        # Resolve defaults lazily so the notebook constants are only required
        # when the caller does not pass explicit sizes.
        n_states = N_STATES if n_states is None else n_states
        n_actions = N_ACTIONS if n_actions is None else n_actions
        n_hidden = hidden_units if n_hidden is None else n_hidden

        # Attribute names are kept so existing state_dict keys stay valid.
        self.layer1 = nn.Linear(n_states, n_hidden)
        self.layer2 = nn.Linear(n_hidden, n_hidden)
        self.out = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        """Return the per-action outputs for a batch ``x`` of shape (batch, n_states)."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # No activation on the output layer: the values are unbounded estimates.
        return self.out(x)

In [8]:
# Online Q-network, moved to the selected device. This is the network whose
# parameters the optimizer updates and which is queried to choose actions.
eval_net = Net().to(device)

In [9]:
# Mean-squared error as the loss; Adam updates only eval_net's parameters.
loss_func = nn.MSELoss()
optimizer = optim.Adam(params=eval_net.parameters(), lr=LEARNING_RATE)

In [10]:
# Target network: same architecture as eval_net, on the same device.
target_net = Net().to(device)
# Start from eval_net's weights instead of an independent random
# initialisation, so both networks agree until the first explicit update.
target_net.load_state_dict(eval_net.state_dict())

### Replay buffer

In [23]:
# Replay memory: one row per transition, laid out as
# [s (N_STATES values) | a | r | s_ (N_STATES values)].
MEMORY = np.zeros(shape=(MEMORY_CAPACITY, 2 * N_STATES + 2))
print(MEMORY.shape)

(500, 422)


### Test interaction with Unity to collect transitions into the replay memory

In [11]:
# Reconnect to Unity on base port 5004 for the step-by-step interaction test.
# NOTE(review): file_name=None gives no executable path — presumably this
# attaches to a running Unity Editor session; confirm against the mlagents_envs docs.
env = UnityEnvironment(file_name= None, base_port=5004)

In [12]:
# Reset the environment and work with the first behavior it reports.
env.reset()
behaviorNames = list(env.behavior_specs)
behaviorName = behaviorNames[0]
print(behaviorName)

PushBlock?team=0


In [13]:
MemoryIdx = 0
DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
# Merge the two observation arrays (vector observation and perception) of
# every agent into a single state row per agent.
s1, s2 = (torch.FloatTensor(DecisionSteps.obs[k]) for k in (0, 1))
s = torch.cat([s1, s2], dim=1).to(device)
print(s.shape)

torch.Size([3, 210])


In [14]:
# Forward pass through the online network: one row per agent, one column per
# discrete action (the recorded output shows shape [3, 7]).
action = eval_net(s)
print(action, action.shape)

tensor([[ 0.0460, -0.0088,  0.0599,  0.0435, -0.0347,  0.0683, -0.0125],
        [ 0.0748, -0.0147,  0.0506,  0.0532, -0.0379,  0.0772, -0.0175],
        [ 0.0623, -0.0146,  0.0442,  0.0519, -0.0418,  0.0971, -0.0041]],
       device='cuda:0', grad_fn=<AddmmBackward>) torch.Size([3, 7])


In [15]:
# Greedy choice per agent: the column index of the largest network output in
# each row, kept as a column vector so the result has shape (n_agents, 1).
MaxIdxOfEachAgent = action.argmax(dim=1, keepdim=True)
ActionIdxArray = MaxIdxOfEachAgent.detach().cpu().numpy()
print(ActionIdxArray, ActionIdxArray.shape)

[[5]
 [5]
 [5]] (3, 1)


In [16]:
# Send the chosen action indices to Unity: one row per agent, one column for
# the single discrete branch.
env.set_actions(behaviorName, ActionIdxArray)

In [17]:
# Advance the Unity simulation after the actions have been set.
env.step()

In [18]:
# Get the next state and reward after the step. The result comes as a pair:
# DecisionSteps and TerminalSteps (presumably agents awaiting a new action and
# agents whose episode ended, respectively — confirm in the ML-Agents docs).
DecisionSteps, TerminalSteps = env.get_steps(behaviorName)

In [19]:
# Show which agent ids are present in the decision steps and in the terminal steps.
print(DecisionSteps.agent_id, TerminalSteps.agent_id)

[0 1 2] []


In [20]:
# Next state: merge the two observation arrays (vector observation and
# perception) of every agent, and read the rewards reported with this step.
s1, s2 = (torch.FloatTensor(DecisionSteps.obs[k]) for k in (0, 1))
s_ = torch.cat([s1, s2], dim=1).to(device)
r = DecisionSteps.reward
print(s_.shape, r.shape)

torch.Size([3, 210]) (3,)


In [21]:
# Build one transition for the first agent (row 0), laid out as
# [state | action index | reward | next state] = 210 + 1 + 1 + 210 values.
transition = np.hstack((s[0].cpu().numpy(), ActionIdxArray[0], r[0], s_[0].cpu().numpy()))
print(transition.shape)

(422,)


In [24]:
# Store one transition row per agent in MEMORY, beginning at row 100.
MemoryIdx = 100
for agentIdx in range(N_AGENTS):
    # Row layout: [state | action index | reward | next state].
    transition = np.hstack((
        s[agentIdx].cpu().numpy(),
        ActionIdxArray[agentIdx],
        r[agentIdx],
        s_[agentIdx].cpu().numpy(),
    ))
    MEMORY[MemoryIdx] = transition
    MemoryIdx += 1

In [25]:
# Close the test session; the next section reconnects with the same base_port.
env.close()

Interact with Unity to fill the replay memory.<br />
When the agents' decision period is greater than 1, some agents may have no decision step on a given simulation step. We collect data only when all agents have decision steps both before and after the step, i.e., `len(DecisionSteps) == N_AGENTS` and `len(NextDecisionSteps) == N_AGENTS`.

In [26]:
# Number of transitions the collection loop below gathers.
# NOTE(review): MEMORY was already allocated with MEMORY_CAPACITY rows in the
# replay-buffer cell; if this value is changed, MEMORY must be re-created too.
MEMORY_CAPACITY = 500

In [27]:
# Reconnect to Unity on base port 5004 for the memory-filling run.
# NOTE(review): file_name=None gives no executable path — presumably this
# attaches to a running Unity Editor session; confirm against the mlagents_envs docs.
env = UnityEnvironment(file_name= None, base_port=5004)

In [28]:
# Reset the environment and work with the first behavior it reports.
env.reset()
behaviorNames = list(env.behavior_specs)
behaviorName = behaviorNames[0]
print(behaviorName)

PushBlock?team=0


In [29]:
# Fill the replay memory by acting greedily with eval_net.
#
# Each iteration sends one action per requesting agent and advances Unity by
# one step. When all N_AGENTS agents have decision steps both before and after
# that step, one [s | a | r | s_] row per agent is written to MEMORY.
#
# The outcome of an action is looked up per agent id rather than by row
# position. If the agent is present in the terminal steps returned after the
# step, its reward and final observation are taken from there; otherwise they
# come from its next decision step.
# NOTE(review): per the ML-Agents low-level API an agent that ended its
# episode can also appear in the decision steps, already carrying the first
# observation of its next episode, which is why terminal data takes priority.
# NOTE(review): MEMORY has no "done" column, so terminal transitions are
# stored but not marked as terminal.
MemoryIdx = 0
env.reset()
DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
while MemoryIdx < MEMORY_CAPACITY:
    if len(DecisionSteps) == 0:
        print("Step", MemoryIdx, ": no decision steps, reset!")
        env.reset()
        DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
        continue

    # Current state: both observation arrays of every requesting agent, side by side.
    s1 = torch.FloatTensor(DecisionSteps.obs[0])
    s2 = torch.FloatTensor(DecisionSteps.obs[1])
    s = torch.cat((s1, s2), 1).to(device)

    # Greedy action selection. No gradients are needed while collecting data,
    # so skip building the autograd graph.
    with torch.no_grad():
        action = eval_net(s)
    MaxIdxOfEachAgent = action.argmax(dim=1, keepdim=True)
    ActionIdxArray = MaxIdxOfEachAgent.cpu().numpy()
    env.set_actions(behaviorName, ActionIdxArray)
    env.step()
    NextDecisionSteps, NextTerminalSteps = env.get_steps(behaviorName)

    # Every agent that acted must have an outcome (terminal or next decision
    # step); otherwise this step is skipped like any other incomplete step.
    AllAgentsResolved = all(
        (agent_id in NextTerminalSteps) or (agent_id in NextDecisionSteps)
        for agent_id in DecisionSteps.agent_id
    )
    if (len(DecisionSteps) != N_AGENTS or len(NextDecisionSteps) != N_AGENTS
            or not AllAgentsResolved):
        print(MemoryIdx, "not all agents having decision steps",
              DecisionSteps.agent_id, "next: ", NextDecisionSteps.agent_id)
    else:
        # Collect reward and next state per agent id, in the row order of `s`.
        NextStateRows, Rewards = [], []
        for agent_id in DecisionSteps.agent_id:
            if agent_id in NextTerminalSteps:
                outcome = NextTerminalSteps[agent_id]
            else:
                outcome = NextDecisionSteps[agent_id]
            NextStateRows.append(np.concatenate((outcome.obs[0], outcome.obs[1])))
            Rewards.append(outcome.reward)
        NextStateRows = np.stack(NextStateRows).astype(np.float32)
        StateRows = s.cpu().numpy()

        # Keep s_ and r as the batch-level tensor/array that other cells also use.
        s_ = torch.FloatTensor(NextStateRows).to(device)
        r = np.asarray(Rewards, dtype=np.float32)

        for i in range(N_AGENTS):
            transition = np.hstack((StateRows[i], ActionIdxArray[i],
                                    r[i], NextStateRows[i]))
            MEMORY[MemoryIdx, :] = transition
            MemoryIdx += 1
            if MemoryIdx == MEMORY_CAPACITY:
                break
    DecisionSteps, TerminalSteps = NextDecisionSteps, NextTerminalSteps

In [30]:
# Close the Unity environment now that the collection loop has finished.
env.close()