<a href="https://colab.research.google.com/github/TienLungSun/Intelligent-Robot/blob/main/LearnPPO-AC/2.%20NN%20with%20policy%20interacts%20with%20Wall%20Jump(MLAgent_10).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np

from mlagents_envs.environment import UnityEnvironment

In [None]:
# Pick the compute device: use CUDA when it is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    # Also report the name of GPU 0.
    print(device, torch.cuda.get_device_name(0))
else:
    print(device)

cuda GeForce GTX 1660 SUPER


### Connect to Unity to examine two types of behavior names and their state and action design 

In [None]:
# Open the connection to Unity on port 5004. No executable is given
# (file_name=None), so this presumably attaches to a running Unity Editor
# session -- confirm against the ML-Agents Python API docs.
env = UnityEnvironment(file_name= None, base_port=5004)

In [None]:
# After the first reset only one behavior name is registered.
env.reset()
behaviorNames = list(env.behavior_specs)
print(behaviorNames)
# Print every behavior together with its observation / action specification.
for behaviorName, behavior_spec in env.behavior_specs.items():
    print(behaviorName, behavior_spec)

['SmallWallJump?team=0']
SmallWallJump?team=0 BehaviorSpec(observation_shapes=[(210,), (210,), (24,)], action_spec=ActionSpec(continuous_size=0, discrete_branches=(3, 3, 3, 2)))


In [None]:
# After the second reset both behavior names are visible.
env.reset()
behaviorNames = list(env.behavior_specs)
print(behaviorNames)
# Print every behavior together with its observation / action specification.
for behaviorName, behavior_spec in env.behavior_specs.items():
    print(behaviorName, behavior_spec)

['SmallWallJump?team=0', 'BigWallJump?team=0']
SmallWallJump?team=0 BehaviorSpec(observation_shapes=[(210,), (210,), (24,)], action_spec=ActionSpec(continuous_size=0, discrete_branches=(3, 3, 3, 2)))
BigWallJump?team=0 BehaviorSpec(observation_shapes=[(210,), (210,), (24,)], action_spec=ActionSpec(continuous_size=0, discrete_branches=(3, 3, 3, 2)))


In [None]:
# Close the Unity connection so that a later cell can open a new one on the same port.
env.close()

### Define and generate NN 

In [None]:
# Problem dimensions used by the network and the conversion code below.
N_STATES = 210 + 210 + 24  # the three observation vectors concatenated (= 444)
N_ACTIONS = 4              # one network output per discrete action branch
N_AGENTS = 3               # number of agents handled per behavior

In [None]:
# test: build a tensor of ones with shape (1, N_ACTIONS)
a = torch.ones((1, N_ACTIONS))
print(a, a.size())

tensor([[1., 1., 1., 1.]]) torch.Size([1, 4])


In [None]:
# test: a trainable parameter (requires_grad=True) filled with zeros
b = nn.Parameter(torch.zeros(1, N_ACTIONS))
print(b, b.size())

Parameter containing:
tensor([[0., 0., 0., 0.]], requires_grad=True) torch.Size([1, 4])


In [None]:
# test: the log-std parameter, one zero per action branch -> shape (1, N_ACTIONS)
log_std = nn.Parameter(torch.zeros((1, N_ACTIONS)))
print(log_std.size())

torch.Size([1, 4])


In [None]:
# exp(0) = 1, so the standard deviation starts at 1 for every branch.
c = torch.exp(log_std)
print(c, c.size())

tensor([[1., 1., 1., 1.]], grad_fn=<ExpBackward>) torch.Size([1, 4])


In [None]:
# Stand-in for mu = actor(state): a zero mean for each action branch.
mu = torch.zeros(1, N_ACTIONS, dtype=torch.float32)
print(mu, mu.size())

tensor([[0., 0., 0., 0.]]) torch.Size([1, 4])


In [None]:
# Broadcast the (1, N_ACTIONS) standard deviation to the shape of mu.
std = torch.exp(log_std).expand(mu.size())

In [None]:
def init_weights(m):
    """Initialise an ``nn.Linear`` layer in place; leave other modules alone.

    Weights are drawn from N(0, 0.1) and every bias is set to 0.1. Intended
    to be passed to ``nn.Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0.0, std=0.1)
    nn.init.constant_(m.bias, 0.1)

In [None]:
class Net(nn.Module):
    """Actor network mapping a batch of states to a Normal distribution.

    The mean comes from a three-layer MLP with LayerNorm after the first two
    linear layers. The standard deviation is a state-independent learnable
    parameter stored as ``log_std``.

    Args:
        n_states: Size of the input state vector. ``None`` (default) uses the
            module-level ``N_STATES``, read when the network is constructed.
        n_actions: Number of outputs (one per action branch). ``None``
            (default) uses the module-level ``N_ACTIONS``.
        hidden_size: Width of the two hidden layers (default 128).
    """

    def __init__(self, n_states=None, n_actions=None, hidden_size=128):
        super().__init__()
        # Resolve the defaults at construction time so that ``Net()`` keeps
        # following the notebook-level constants.
        if n_states is None:
            n_states = N_STATES
        if n_actions is None:
            n_actions = N_ACTIONS

        self.actor = nn.Sequential(
            nn.Linear(n_states, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.Linear(hidden_size, n_actions),
        )
        # log(std) starts at 0, i.e. std = 1 for every output.
        self.log_std = nn.Parameter(torch.zeros(1, n_actions))
        # Re-initialise all Linear layers with init_weights.
        self.apply(init_weights)

    def forward(self, x):
        """Return ``Normal(mu, std)`` where ``mu = actor(x)``.

        ``std`` is ``exp(log_std)`` broadcast to the shape of ``mu``.
        """
        mu = self.actor(x)
        std = self.log_std.exp().expand_as(mu)
        return Normal(mu, std)

### Generate two NNs: one learns to jump over the small wall and the other learns to jump over the big wall

In [None]:
# Two independent networks, one per behavior, both moved to `device`.
SmallWallNet, BigWallNet = (Net().to(device) for _ in range(2))

### Use Small wall NN to interact with Unity scene for one step (Big wall NN is the same)

In [None]:
# Re-open the connection to Unity on port 5004. No executable is given
# (file_name=None), so this presumably attaches to a running Unity Editor
# session -- confirm against the ML-Agents Python API docs.
env = UnityEnvironment(file_name= None, base_port=5004)

In [None]:
# Reset once and work with the first registered behavior only.
env.reset()
behaviorNames = list(env.behavior_specs)
behaviorName = behaviorNames[0]
print(behaviorName)

SmallWallJump?team=0


In [None]:
# Fetch the current step data of this behavior, split into decision steps and terminal steps.
DecisionSteps, TerminalSteps = env.get_steps(behaviorName)

In [None]:
# Show the agent ids in the decision steps and in the terminal steps.
print(DecisionSteps.agent_id, TerminalSteps.agent_id)

[0 1 2] []


In [None]:
# Shapes of the three observation arrays of the decision steps.
print(*(obs.shape for obs in DecisionSteps.obs[:3]))

(3, 210) (3, 210) (3, 24)


In [None]:
# test: practice tensor merge
a = torch.tensor([[1, 2, 3],[1, 2, 3], [1, 2, 3]])
b = torch.tensor([[4, 5, 6],[4, 5, 6], [4, 5, 6]])
c = torch.tensor([[7], [8], [9]])
print(a.shape, b.shape, c.shape)

d = torch.cat((a, b, c), 1)
print(d, d.shape)

torch.Size([3, 3]) torch.Size([3, 3]) torch.Size([3, 1])
tensor([[1, 2, 3, 4, 5, 6, 7],
        [1, 2, 3, 4, 5, 6, 8],
        [1, 2, 3, 4, 5, 6, 9]]) torch.Size([3, 7])


In [None]:
# merge vector observatin, perception 
s1 = torch.FloatTensor(DecisionSteps.obs[0])
s2 = torch.FloatTensor(DecisionSteps.obs[1])
s3 = torch.FloatTensor(DecisionSteps.obs[2])

states = torch.cat((s1, s2, s3), 1).to(device)
print(states.shape)

torch.Size([3, 444])


In [None]:
# send state to NN
dist = SmallWallNet(states)
print(dist)

Normal(loc: torch.Size([3, 4]), scale: torch.Size([3, 4]))


In [None]:
# Draw one continuous value per agent and action branch from the distribution.
# These are not action indices yet; they are converted further below.
actions = dist.sample()
print(actions)

tensor([[ 0.4810,  0.4763,  2.5353, -1.3833],
        [ 1.3642, -0.9851,  1.8801, -0.5936],
        [ 0.5316, -0.8967,  2.5598,  0.1636]], device='cuda:0')


#### Convert sampled action values to action indices <br />
dirToGoForwardAction = act[0]  => 0, 1, 2<br />
rotateDirAction = act[1]       => 0, 1, 2<br />
dirToGoSideAction = act[2]     => 0, 1, 2<br />
jumpAction = act[3]            => 0, 1<br />

In [None]:
# find the cumulative density function evaluated at the sampled value
actionCFD = dist.cdf(actions)
print(actionCFD)

tensor([[0.1705, 0.7793, 0.6945, 0.3938],
        [0.3669, 0.2046, 0.4522, 0.7293],
        [0.2507, 0.1648, 0.7298, 0.9201]], device='cuda:0',
       grad_fn=<MulBackward0>)


In [None]:
# print agent's action branch and the value sampled by NN
for agentID in range(N_AGENTS):
    for idx, value in enumerate(actions[agentID]):
        cfd = float(actionCFD[agentID][idx])
        print("Agent", agentID, "=> Action branch", idx, ",value ", value, ", cfd", cfd)

Agent 0 => Action branch 0 ,value  tensor(0.4810, device='cuda:0') , cfd 0.17048412561416626
Agent 0 => Action branch 1 ,value  tensor(0.4763, device='cuda:0') , cfd 0.779264509677887
Agent 0 => Action branch 2 ,value  tensor(2.5353, device='cuda:0') , cfd 0.6944706439971924
Agent 0 => Action branch 3 ,value  tensor(-1.3833, device='cuda:0') , cfd 0.39379215240478516
Agent 1 => Action branch 0 ,value  tensor(1.3642, device='cuda:0') , cfd 0.3669050335884094
Agent 1 => Action branch 1 ,value  tensor(-0.9851, device='cuda:0') , cfd 0.2046242654323578
Agent 1 => Action branch 2 ,value  tensor(1.8801, device='cuda:0') , cfd 0.4522150754928589
Agent 1 => Action branch 3 ,value  tensor(-0.5936, device='cuda:0') , cfd 0.7292628884315491
Agent 2 => Action branch 0 ,value  tensor(0.5316, device='cuda:0') , cfd 0.2507057189941406
Agent 2 => Action branch 1 ,value  tensor(-0.8967, device='cuda:0') , cfd 0.16483044624328613
Agent 2 => Action branch 2 ,value  tensor(2.5598, device='cuda:0') , cfd 0

In [None]:
# convert sampled value to index
actionIdx = []
for agentID in range(N_AGENTS):
    agentActionIdx = []
    for idx, value in enumerate(actions[agentID]):
        cfd = float(actionCFD[agentID][idx])
        if(idx <=2): #branch 0, 1, 2 => 3 action indices
            if(cfd<0.33333):
                agentActionIdx.append(0)
            elif(cfd<0.66666):
                agentActionIdx.append(1)
            else:
                agentActionIdx.append(2)
        else: #branch 3 => 2 action indices
            if(cfd<0.5):
                agentActionIdx.append(0)
            else:
                agentActionIdx.append(1)
    actionIdx.append(agentActionIdx)
print(actionIdx)

[[0, 2, 2, 0], [1, 0, 1, 1], [0, 0, 2, 1]]


In [None]:
# Turn the nested list of indices into a NumPy array of shape (agents, branches).
actionIdx = np.asarray(actionIdx)
print(actionIdx.shape)

(3, 4)


In [None]:
# send action indices to Unity
env.set_actions(behaviorName, actionIdx)

In [None]:
# Advance the Unity simulation using the actions that were just set.
env.step()

In [None]:
# Fetch the step data again after the step and show which agents are in the
# decision steps and which are in the terminal steps.
DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
print(DecisionSteps.agent_id, TerminalSteps.agent_id)

[] [0 1 2]


In [None]:
# Close the Unity connection before the multi-step experiment opens a new one.
env.close()

### Define a function to convert sampled value to index

In [None]:
def GenerateActionIndex(DecisionAgentNo, actions, distribution=None):
    """Convert sampled continuous values into discrete action indices.

    Each sampled value is mapped to its cumulative probability under the
    distribution it was drawn from, and that probability is binned:

    * branches 0, 1, 2 (three choices): ``< 0.33333`` -> 0,
      ``< 0.66666`` -> 1, otherwise 2
    * branch 3 and beyond (two choices): ``< 0.5`` -> 0, otherwise 1

    Args:
        DecisionAgentNo: Number of agents (leading rows of ``actions``) to
            convert.
        actions: 2-D tensor of sampled values, one row per agent and one
            column per action branch.
        distribution: The distribution ``actions`` was sampled from; it must
            provide ``cdf``. When omitted, the module-level ``dist`` is used,
            which is what this function always did before the parameter
            existed.

    Returns:
        Integer ``np.ndarray`` of shape ``(DecisionAgentNo, n_branches)``.

    Raises:
        IndexError: If ``DecisionAgentNo`` exceeds the number of rows.
    """
    if distribution is None:
        # Backward compatibility: two-argument callers rely on the global
        # `dist` produced by the most recent forward pass.
        distribution = dist

    # One device-to-host transfer instead of one float() per element. float64
    # matches the precision of the former float(...) comparisons.
    cdf = distribution.cdf(actions).detach().cpu().numpy().astype(np.float64)
    if DecisionAgentNo > cdf.shape[0]:
        raise IndexError(
            "DecisionAgentNo=%d exceeds the %d rows of actions"
            % (DecisionAgentNo, cdf.shape[0])
        )
    cdf = cdf[:DecisionAgentNo]

    # Written as "not (cdf < t)" so that a NaN falls into the last bin,
    # exactly like the original if/elif/else chain.
    three_way = (~(cdf < 0.33333)).astype(int) + (~(cdf < 0.66666)).astype(int)
    two_way = (~(cdf < 0.5)).astype(int)

    # Columns 0-2 are three-way branches; every later column is two-way.
    is_three_way = np.arange(cdf.shape[1]) <= 2
    return np.where(is_three_way, three_way, two_way)

# Two NNs (Small wall and Big wall) interact with Unity for N steps

### Set Decision Period = 1 in the Decision Requester component; otherwise terminated agents will not recover

In [None]:
# Number of agents per behavior (same value as defined earlier in the notebook).
N_AGENTS = 3

In [None]:
# Open a fresh connection to Unity on port 5004 for the multi-step run
# (file_name=None: no executable is given).
env = UnityEnvironment(file_name= None, base_port=5004)

In [None]:
#reset twice to get behaviors of this training environment
env.reset()
behaviorNames = list(env.behavior_specs.keys())
env.reset()
behaviorNames = list(env.behavior_specs.keys())
print(behaviorNames)

['SmallWallJump?team=0', 'BigWallJump?team=0']


In [None]:
# Run both networks against Unity for 500 simulation steps.
#
# Each frame: (1) read the step data of every behavior, (2) set actions for
# every behavior that has agents waiting for a decision, (3) advance the
# simulation exactly once. Stepping inside the per-behavior loop would advance
# Unity before the other behavior's actions were set, so those agents would
# act on default actions. A behavior with no pending decisions is skipped
# instead of resetting the whole environment.
for frame in range(500):
    for idx, behaviorName in enumerate(behaviorNames):
        DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
        if len(TerminalSteps) > 0:  # some agent terminated
            print(behaviorName, "Step", frame, ", decision agents ",
                  DecisionSteps.agent_id, ", terminated agents ", TerminalSteps.agent_id)
        if len(DecisionSteps) == 0:
            # Nothing to decide for this behavior in this frame.
            print(behaviorName, "step", frame, ": no decision steps, skipping")
            continue

        # Concatenate the three observation arrays into one state per agent.
        s1 = torch.FloatTensor(DecisionSteps.obs[0])
        s2 = torch.FloatTensor(DecisionSteps.obs[1])
        s3 = torch.FloatTensor(DecisionSteps.obs[2])
        state = torch.cat((s1, s2, s3), 1).to(device)

        # The first behavior uses the small-wall network, the other the big-wall one.
        # Keep the module-level name `dist`: GenerateActionIndex looks it up.
        if idx == 0:
            dist = SmallWallNet(state)
        else:
            dist = BigWallNet(state)
        action = dist.sample()
        actionIdx = GenerateActionIndex(len(DecisionSteps), action)
        env.set_actions(behaviorName, actionIdx)

    # Advance the simulation once per frame, after all behaviors have been handled.
    env.step()

SmallWallJump?team=0 Step 0 , decision agents  [1 2] , terminated agents  [0 1 2]
BigWallJump?team=0 Step 25 , decision agents  [] , terminated agents  [1]
BigWallJump?team=0 step 25 : no decision steps, reset!
SmallWallJump?team=0 Step 26 , decision agents  [] , terminated agents  [2]
SmallWallJump?team=0 step 26 : no decision steps, reset!
BigWallJump?team=0 Step 26 , decision agents  [1] , terminated agents  [0 2]
SmallWallJump?team=0 Step 56 , decision agents  [] , terminated agents  [0]
SmallWallJump?team=0 step 56 : no decision steps, reset!
SmallWallJump?team=0 step 57 : no decision steps, reset!
BigWallJump?team=0 Step 57 , decision agents  [] , terminated agents  [0 1 2]
BigWallJump?team=0 step 57 : no decision steps, reset!
SmallWallJump?team=0 Step 58 , decision agents  [1] , terminated agents  [0 2]
SmallWallJump?team=0 step 70 : no decision steps, reset!
SmallWallJump?team=0 step 71 : no decision steps, reset!
SmallWallJump?team=0 step 72 : no decision steps, reset!
BigWal

In [None]:
# Close the Unity connection at the end of the experiment.
env.close()