<a href="https://colab.research.google.com/github/TienLungSun/RL-Unity-ML-Agent/blob/main/8.%20DQN%20to%20learn%20Push%20block%20(2)%20(MLAgent_10).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from mlagents_envs.environment import UnityEnvironment

In [None]:
# Pick the compute device once: the GPU when PyTorch can see one, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Also report the name of GPU 0 so the run is easy to identify.
    print(device, torch.cuda.get_device_name(0))
else:
    print(device)

cuda NVIDIA GeForce RTX 3060


### DQN

In [None]:
N_STATES  = 210  # state size per agent: two 105-value observations concatenated (105+105)
N_ACTIONS = 7  # 1 discrete branch with 7 values: move forward/backward, rotate R/L, move R/L
N_AGENTS = 3  # number of agents expected to report a decision step at the same time

hidden_units = 256 # width of each hidden layer, taken from the YAML config file

LEARNING_RATE = 0.0003  # step size handed to the Adam optimizer
MEMORY_CAPACITY = 500 # rows in the replay memory (10000 is noted as an alternative value)

In [None]:
class Net(nn.Module):
    """Fully connected Q-network: a state vector in, one Q-value per action out.

    Architecture: two hidden linear layers with ReLU activations followed by a
    linear output layer (no activation on the output).

    Args:
        n_states: Size of the input state vector. Defaults to the notebook
            constant ``N_STATES`` when omitted.
        n_actions: Number of output Q-values. Defaults to ``N_ACTIONS``.
        n_hidden: Width of both hidden layers. Defaults to ``hidden_units``.

    Calling ``Net()`` with no arguments builds exactly the network defined by
    the notebook constants, so existing call sites keep working.
    """

    def __init__(self, n_states=None, n_actions=None, n_hidden=None):
        super().__init__()
        # Resolve the defaults at construction time so the class itself does
        # not require the notebook constants to exist when sizes are given.
        if n_states is None:
            n_states = N_STATES
        if n_hidden is None:
            n_hidden = hidden_units
        if n_actions is None:
            n_actions = N_ACTIONS
        # Attribute names are kept unchanged so state_dict keys stay stable.
        self.layer1 = nn.Linear(n_states, n_hidden)
        self.layer2 = nn.Linear(n_hidden, n_hidden)
        self.out = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        """Return the Q-values for a batch of states ``x`` (last dim = n_states)."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.out(x)

In [None]:
# Online Q-network: the one that picks actions and receives gradient updates.
eval_net = Net()
eval_net.to(device)  # nn.Module.to moves the parameters in place

In [None]:
# Adam updates only eval_net's parameters; the TD error is scored with mean-squared error.
optimizer = optim.Adam(params=eval_net.parameters(), lr=LEARNING_RATE)
loss_func = nn.MSELoss(reduction="mean")

In [None]:
# Target network: supplies the bootstrap value max_a' Q(s', a') for the TD target.
target_net = Net().to(device)
# Start the target network as an exact copy of eval_net. Without this the two
# networks hold independent random weights, so the TD target would be computed
# by a network unrelated to the one being trained.
target_net.load_state_dict(eval_net.state_dict())

In [None]:
# Replay memory, one transition per row laid out as
# [ s (N_STATES values) | a | r | s_ (N_STATES values) ].
MEMORY = np.zeros(shape=(MEMORY_CAPACITY, 2 * N_STATES + 2))
print(MEMORY.shape)

(500, 422)


### Interact with Unity to fill the memory

In [None]:
# Open the connection to Unity on base port 5004. No executable path is given
# (file_name=None); presumably this attaches to a running Unity Editor session
# -- confirm against the ML-Agents documentation for the installed release.
env = UnityEnvironment(file_name= None, base_port=5004)

In [None]:
# Reset the environment, then take the first registered behavior name.
env.reset()
behaviorNames = [*env.behavior_specs.keys()]
behaviorName = behaviorNames[0]
print(behaviorName)

PushBlock?team=0


In [None]:
def Interact_with_Unity_one_step (DecisionSteps):
    """Choose a greedy action for every agent, send it to Unity and step once.

    Args:
        DecisionSteps: decision-step batch for ``behaviorName``; ``obs[0]`` and
            ``obs[1]`` are read and concatenated per agent to form the state.

    Returns:
        s: state tensor on ``device``, one row per agent in ``DecisionSteps``.
        ActionIdxArray: integer numpy array with one column, holding the index
            of the largest Q-value for each agent. The same value is sent to
            Unity and is what should be stored in the replay memory.
    """
    obs_first = torch.FloatTensor(DecisionSteps.obs[0])
    obs_second = torch.FloatTensor(DecisionSteps.obs[1])
    s = torch.cat((obs_first, obs_second), 1).to(device)
    # Action selection is inference only, so skip building an autograd graph.
    with torch.no_grad():
        q_values = eval_net(s)
    # keepdim=True keeps the (n_agents, 1) layout of one discrete branch per agent.
    ActionIdxArray = q_values.max(dim=1, keepdim=True)[1].cpu().numpy()
    # Send the index unchanged: eval_net has one output per branch value, so the
    # argmax already lies in the valid range 0..N_ACTIONS-1. Adding 1 (as was
    # done before) made the largest index fall outside the branch and made
    # action 0 unreachable.
    env.set_actions(behaviorName, ActionIdxArray)
    env.step()
    return s, ActionIdxArray

In [None]:
# Fill the replay memory by stepping the Unity environment until MEMORY holds
# MEMORY_CAPACITY transitions.
MemoryIdx = 0
DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
while (MemoryIdx < MEMORY_CAPACITY):
    if(len(DecisionSteps)==0):
        print("Step", MemoryIdx, ": no decision steps, reset!")
        env.reset()
        DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
        continue

    # Interact with Unity for one step; data is collected only when every
    # agent has a decision step both before and after that step.
    s, ActionIdxArray = Interact_with_Unity_one_step (DecisionSteps)

    NextDecisionSteps, NextTerminalSteps = env.get_steps(behaviorName)
    if(len(DecisionSteps) != N_AGENTS or len(NextDecisionSteps) != N_AGENTS):
        print(MemoryIdx, "not all agents having decision steps", \
              DecisionSteps.agent_id, "next: ", NextDecisionSteps.agent_id)
    else:
        states = s.cpu().numpy()
        # Pair each agent's state with ITS OWN outcome by agent id rather than
        # by row position, which is not guaranteed to match between batches.
        for row, agent_id in enumerate(DecisionSteps.agent_id):
            if agent_id in NextTerminalSteps:
                # The episode ended on this step: the reward earned by the
                # action and the final observation are in the terminal step.
                # The decision step for the same agent (if any) already belongs
                # to the next episode.
                outcome = NextTerminalSteps[agent_id]
            elif agent_id in NextDecisionSteps:
                outcome = NextDecisionSteps[agent_id]
            else:
                # No follow-up data for this agent; nothing to store.
                continue
            s_ = np.hstack((outcome.obs[0], outcome.obs[1]))
            # Row layout: [ s | a | r | s_ ].
            transition = np.hstack((states[row], ActionIdxArray[row], outcome.reward, s_))
            MEMORY[MemoryIdx, :] = transition
            MemoryIdx += 1
            if(MemoryIdx == MEMORY_CAPACITY):
                break
    DecisionSteps, TerminalSteps = NextDecisionSteps, NextTerminalSteps

###  Start to learn when memory is filled. 

In [None]:
BATCH_SIZE = 5  # number of transitions drawn from MEMORY for one learning step

In [None]:
# Draw BATCH_SIZE random row indices into the replay memory and pull those rows.
sample_index = np.random.choice(a=MEMORY_CAPACITY, size=BATCH_SIZE)
b_memory = MEMORY[sample_index]
print(sample_index, b_memory.shape)

[146 362  40 433  63] (5, 422)


In [None]:
# Columns [0, N_STATES) of each sampled row hold the state s.
b_s = torch.tensor(b_memory[:, 0:N_STATES], dtype=torch.float32).to(device)
print(b_s.shape)

torch.Size([5, 210])


In [None]:
# Column N_STATES holds the action index; keep it 2-D (batch, 1) for gather().
b_a = torch.tensor(b_memory[:, N_STATES:N_STATES + 1].astype(int), dtype=torch.long).to(device)
print(b_a.shape)

torch.Size([5, 1])


In [None]:
# Column N_STATES+1 is the reward; the last N_STATES columns are the next state s_.
b_r = torch.tensor(b_memory[:, N_STATES + 1:N_STATES + 2], dtype=torch.float32).to(device)
b_s_ = torch.tensor(b_memory[:, -N_STATES:], dtype=torch.float32).to(device)
print(b_r.shape)
print(b_s_.shape)

torch.Size([5, 1])
torch.Size([5, 210])


In [None]:
tmp = eval_net(b_s) # tmp = Q(s, .) for all N_ACTIONS (7) actions, one row per sampled transition
print(tmp, tmp.shape)

tensor([[-0.0022, -0.0360,  0.1145,  0.0114, -0.0873, -0.0044, -0.0500],
        [-0.0015, -0.0141,  0.1546,  0.0118, -0.0757,  0.0061, -0.0323],
        [ 0.0173, -0.0545,  0.1260, -0.0225, -0.0863, -0.0427, -0.0432],
        [ 0.0898,  0.0115,  0.0661, -0.0442, -0.0605, -0.0430, -0.0348],
        [ 0.0667, -0.0224,  0.1412, -0.0283, -0.1050, -0.0296, -0.0045]],
       device='cuda:0', grad_fn=<AddmmBackward>) torch.Size([5, 7])


In [None]:
# Pick, for every row, the Q-value of the action that was actually taken
# (this is NOT the row maximum). torch.gather reads values along `dim`
# at the positions given by `index`.
tmp1 = tmp.gather(1, b_a)
print(b_a, '\n', tmp1, '\n', tmp1.shape)

tensor([[2],
        [2],
        [2],
        [0],
        [2]], device='cuda:0') 
 tensor([[0.1145],
        [0.1546],
        [0.1260],
        [0.0898],
        [0.1412]], device='cuda:0', grad_fn=<GatherBackward>) 
 torch.Size([5, 1])


In [None]:
# Feed s through eval_net and keep Q(s, a) for the stored action a of each row.
q_eval = torch.gather(eval_net(b_s), dim=1, index=b_a)
print(q_eval)

tensor([[0.1145],
        [0.1546],
        [0.1260],
        [0.0898],
        [0.1412]], device='cuda:0', grad_fn=<GatherBackward>)


In [None]:
# Q-values of the next states from the target network; no gradient is tracked
# because the target side of the TD error must not be back-propagated through.
with torch.no_grad():
    q_next = target_net(b_s_)
print(q_next, '\n', q_next.shape)

tensor([[ 7.5441e-02, -9.4403e-03, -1.2026e-01,  5.2563e-03, -6.0626e-04,
          7.1022e-03, -3.1734e-02],
        [ 7.6038e-02, -1.1588e-02, -1.1274e-01,  1.1500e-02, -7.0926e-03,
         -2.6147e-05, -3.2972e-02],
        [ 5.0594e-02, -1.4637e-02, -7.9552e-02,  5.2660e-02, -3.5022e-02,
         -4.3848e-02, -6.6712e-02],
        [ 1.1675e-01,  2.8246e-02, -4.1408e-02, -1.9622e-02, -5.3549e-03,
          1.0145e-02, -1.8638e-02],
        [ 8.8568e-02,  3.2290e-02, -1.2802e-01,  5.6021e-02, -9.0759e-02,
          3.9527e-02, -4.6172e-02]], device='cuda:0') 
 torch.Size([5, 7])


In [None]:
# Row-wise maximum over the action dimension: returns (values, indices).
torch.max(q_next, dim=1)

torch.return_types.max(
values=tensor([0.0754, 0.0760, 0.0527, 0.1167, 0.0886], device='cuda:0'),
indices=tensor([0, 0, 3, 0, 0], device='cuda:0'))

In [None]:
# Only the maximum values, i.e. max over a' of Q_target(s', a') for each row.
q_next.max(dim=1).values

tensor([0.0754, 0.0760, 0.0527, 0.1167, 0.0886], device='cuda:0')

In [None]:
GAMMA = 0.9  # discount factor applied to the next-state value in the TD target

In [None]:
# TD target r + GAMMA * max_a' Q_target(s', a'); keepdim keeps the (batch, 1)
# column layout so it lines up with b_r and q_eval.
q_target = b_r + GAMMA * q_next.max(dim=1, keepdim=True).values
print(q_target.shape)

torch.Size([5, 1])


In [None]:
# Mean-squared TD error between the predicted Q(s, a) and the TD target.
loss = loss_func(input=q_eval, target=q_target)
print(loss)

tensor(0.0041, device='cuda:0', grad_fn=<MseLossBackward>)


In [None]:
# One gradient step on eval_net (the optimizer was built over its parameters).
# Clear gradients left from any earlier backward pass before accumulating new ones.
optimizer.zero_grad()
# Back-propagate the TD loss.
loss.backward()
# Apply the Adam update.
optimizer.step()

In [None]:
# Close the Unity environment connection opened earlier in the notebook.
env.close()