In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from mlagents_envs.environment import UnityEnvironment

In [2]:
# Use CUDA when it is available, otherwise fall back to the CPU, and report
# which device was selected (plus the GPU model name on CUDA).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(device, torch.cuda.get_device_name(0))
else:
    print(device)

cuda NVIDIA GeForce RTX 3060


### DQN

In [3]:
# Observation / action layout.
N_STATES = 105 + 105   # two observation vectors are concatenated into one state
N_ACTIONS = 6          # 1 branch with 6 values: move forward/backward, rotate R/L, move R/L
N_AGENTS = 3

hidden_units = 256     # from the YAML file

# Optimiser and replay-memory settings.
LEARNING_RATE = 3e-4
MEMORY_CAPACITY = 500  # 10000

In [4]:
class Net(nn.Module):
    """Two-hidden-layer MLP mapping a state vector to one Q-value per action.

    Args:
        n_states: Size of the input state vector. When omitted, the
            module-level ``N_STATES`` is used.
        n_hidden: Width of both hidden layers. When omitted, the module-level
            ``hidden_units`` is used.
        n_actions: Number of output values. When omitted, the module-level
            ``N_ACTIONS`` is used.
    """

    def __init__(self, n_states=None, n_hidden=None, n_actions=None):
        super(Net, self).__init__()
        # Fall back to the notebook-wide configuration so that a bare ``Net()``
        # builds exactly the same architecture as before.
        n_states = N_STATES if n_states is None else n_states
        n_hidden = hidden_units if n_hidden is None else n_hidden
        n_actions = N_ACTIONS if n_actions is None else n_actions

        self.layer1 = nn.Linear(n_states, n_hidden)
        self.layer2 = nn.Linear(n_hidden, n_hidden)
        self.out = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        """Return the output-layer values for ``x`` after two ReLU hidden layers."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.out(x)

In [5]:
# Online Q-network: it selects the actions sent to Unity and is the network
# whose parameters the optimizer updates.
eval_net = Net().to(device)

In [6]:
# Mean-squared error between predicted and target Q-values, minimised with
# Adam over the online network's parameters only.
loss_func = nn.MSELoss()
optimizer = optim.Adam(params=eval_net.parameters(), lr=LEARNING_RATE)

In [7]:
# Target Q-network used for the bootstrap term of the TD target.
# It has to start as an exact copy of the online network: two independently
# initialised networks would make the first targets unrelated to q_eval.
target_net = Net().to(device)
target_net.load_state_dict(eval_net.state_dict())
target_net.eval()  # inference only; it is never passed to the optimizer

In [8]:
# Replay memory: one row per transition, laid out as [s | a | r | s_].
MEMORY = np.zeros(shape=(MEMORY_CAPACITY, 2 * N_STATES + 2))
print(MEMORY.shape)

(500, 422)


### Interact with Unity to fill the replay memory

In [9]:
# NOTE(review): file_name=None presumably attaches to a running Unity Editor
# (rather than launching a built executable) on base_port — confirm against
# the installed mlagents_envs version.
env = UnityEnvironment(file_name= None, base_port=5004)

In [10]:
# Reset the environment, then use the first registered behavior name for all
# later get_steps / set_actions calls.
env.reset()
behaviorNames = [name for name in env.behavior_specs.keys()]
behaviorName = behaviorNames[0]
print(behaviorName)

PushBlock?team=0


In [11]:
def Interact_with_Unity_one_step (DecisionSteps):
    """Pick the greedy action for every agent in ``DecisionSteps`` and step Unity once.

    Args:
        DecisionSteps: Decision-step batch obtained from ``env.get_steps``.
            Its first two observation arrays are concatenated along axis 1 to
            form the network input.

    Returns:
        tuple: ``(s, ActionIdxArray)``. ``s`` is the concatenated state tensor
        on ``device`` (one row per agent); ``ActionIdxArray`` is a NumPy array
        with one column holding, per agent, the index of the largest network
        output.
    """
    s1 = torch.FloatTensor(DecisionSteps.obs[0])
    s2 = torch.FloatTensor(DecisionSteps.obs[1])
    s = torch.cat((s1, s2), 1).to(device)
    # Action selection is pure inference, so no autograd graph is needed.
    with torch.no_grad():
        q_values = eval_net(s)
    MaxIdxOfEachAgent = q_values.max(dim=1, keepdim=True)[1]
    ActionIdxArray = MaxIdxOfEachAgent.cpu().numpy()
    # Network indices are 0-based while the values sent to Unity start at 1
    # (presumably Unity's value 0 is a "do nothing" action — confirm).
    env.set_actions(behaviorName, ActionIdxArray+1)
    env.step()
    return s, ActionIdxArray

In [12]:
# Fill the replay memory by running the current policy in Unity.
#
# Each stored row is [s | a | r | s_]. The outcome of an action is looked up
# per agent id rather than by row position. When an agent finished its
# episode on this step, its reward and last observation are taken from the
# terminal step: the decision step it may also appear in already belongs to
# the next episode.
# NOTE(review): the memory layout has no "done" column, so the learner still
# bootstraps from terminal next-states.
MemoryIdx = 0
DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
while MemoryIdx < MEMORY_CAPACITY:
    if len(DecisionSteps) == 0:
        print("Step", MemoryIdx, ": no decision steps, reset!")
        env.reset()
        DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
        continue

    # Interact with Unity for one step, but collect data only when all agents
    # have decision steps both before and after the step.
    s, ActionIdxArray = Interact_with_Unity_one_step (DecisionSteps)
    NextDecisionSteps, NextTerminalSteps = env.get_steps(behaviorName)

    if len(DecisionSteps) != N_AGENTS or len(NextDecisionSteps) != N_AGENTS:
        print(MemoryIdx, "not all agents having decision steps", \
              DecisionSteps.agent_id, "next: ", NextDecisionSteps.agent_id)
    else:
        states = s.cpu().numpy()  # one device-to-host copy for all agents
        for row, agent_id in enumerate(DecisionSteps.agent_id):
            # Prefer the terminal step: it carries the end-of-episode reward.
            if agent_id in NextTerminalSteps:
                outcome = NextTerminalSteps[agent_id]
            elif agent_id in NextDecisionSteps:
                outcome = NextDecisionSteps[agent_id]
            else:
                continue  # no outcome reported for this agent on this step

            next_state = np.concatenate((outcome.obs[0], outcome.obs[1]))
            transition = np.hstack((states[row], ActionIdxArray[row], outcome.reward, next_state))
            MEMORY[MemoryIdx, :] = transition
            MemoryIdx += 1
            if MemoryIdx == MEMORY_CAPACITY:
                break
    DecisionSteps, TerminalSteps = NextDecisionSteps, NextTerminalSteps

### Start learning once the memory is filled

In [13]:
# Number of transitions sampled from MEMORY for one learning step.
BATCH_SIZE = 5

In [14]:
# Draw BATCH_SIZE random row indices and pull the matching transitions.
sample_index = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
b_memory = MEMORY[sample_index]
print(b_memory.shape)

(5, 422)


In [15]:
# The first N_STATES columns of each row hold the state s.
b_s = torch.as_tensor(b_memory[:, 0:N_STATES], dtype=torch.float32, device=device)
print(b_s.shape)

torch.Size([5, 210])


In [16]:
# Column N_STATES holds the action index; keep it as an integer column vector
# so it can be used as a gather() index.
b_a = torch.as_tensor(b_memory[:, N_STATES:N_STATES+1].astype(int), dtype=torch.long, device=device)
print(b_a.shape)

torch.Size([5, 1])


In [17]:
# Column N_STATES+1 is the reward; the last N_STATES columns are the next state s_.
b_s_ = torch.as_tensor(b_memory[:, -N_STATES:], dtype=torch.float32, device=device)
b_r = torch.as_tensor(b_memory[:, N_STATES+1:N_STATES+2], dtype=torch.float32, device=device)
print(b_r.shape)
print(b_s_.shape)

torch.Size([5, 1])
torch.Size([5, 210])


In [18]:
# Forward pass of the online network on the sampled states: one row of
# per-action values for each sample.
tmp = eval_net(b_s)
print(tmp.shape)

torch.Size([5, 6])


In [19]:
# For each sample, pick the Q-value of the action that was actually taken
# (b_a) — this is a lookup by index, not a maximum over actions.
q_eval = tmp.gather(1, b_a)
print(q_eval.shape)

torch.Size([5, 1])


In [20]:
# Same as the two previous cells in one step: feed the states through the
# online network and keep only the Q-value of the action that was taken.
q_eval = torch.gather(eval_net(b_s), dim=1, index=b_a)
print(q_eval.shape)

torch.Size([5, 1])


In [21]:
# Next-state Q-values from the target network; no gradient must flow into it.
with torch.no_grad():
    q_next = target_net(b_s_)
print(q_next.shape)

torch.Size([5, 6])


In [22]:
# Shape of the per-sample maximum over actions (before reshaping to a column).
q_next.max(dim=1)[0].shape

torch.Size([5])

In [23]:
# Discount factor applied to the bootstrapped next-state value in the TD target.
GAMMA = 0.9

In [24]:
# TD target: reward plus the discounted best next-state value, kept as a
# column vector so it lines up with q_eval.
best_next_q = q_next.max(dim=1, keepdim=True)[0]
q_target = b_r + GAMMA * best_next_q
print(q_target.shape)

torch.Size([5, 1])


In [25]:
# Mean-squared error between the predicted and the target Q-values.
loss = loss_func(input=q_eval, target=q_target)
print(loss)

tensor(0.0043, device='cuda:0', grad_fn=<MseLossBackward>)


In [26]:
# One gradient step on the online network.
optimizer.zero_grad()  # clear gradients left over from any earlier backward pass
loss.backward()        # back-propagate the loss
optimizer.step()       # apply the parameter update

In [27]:
# Close the Unity environment once the notebook is done with it.
env.close()