In [1]:
import torch
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import deque  
from mlagents_envs.environment import UnityEnvironment

### Create DDPG agent!

In [2]:
N_STATES  = 210  # 105+105
N_ACTIONS = 1  # 1 branch with 7 values, move forward/backward, rotate R/L, move R/L 
N_AGENTS = 3

In [3]:
BUFFER_SIZE = int(1e3) #int(1e5)        # replay buffer size
BATCH_SIZE =  1   #128              # minibatch size
GAMMA = 0.99                  # discount factor
TAU = 1e-3                    # for soft update of target parameters
LR_ACTOR = 1e-4               # learning rate of the actor 
LR_CRITIC = 1e-4              # learning rate of the critic
WEIGHT_DECAY = 0.0            # L2 weight decay

In [4]:
if(torch.cuda.is_available()):
    device = torch.device("cuda")
    print(device, torch.cuda.get_device_name(0))
else: 
    device= torch.device("cpu")
    print(device)

cuda GeForce GTX 1660 SUPER


In [5]:
# load NN, reply buffer, and Agent class
%run "9. DDPG_NN_and_MemoryBuffer.ipynb"   
%run "9. DDPG_Agent.ipynb"       

In [6]:
agent = Agent(state_size=N_STATES, action_size=N_ACTIONS, num_agents=N_AGENTS, random_seed=0)

# Step by step to understand the traing loop

In [7]:
env = UnityEnvironment(file_name= None, base_port=5004)

In [8]:
env.reset()
behaviorNames = list(env.behavior_specs.keys())
behaviorName = behaviorNames[0]
print(behaviorName)

PushBlock?team=0


In [9]:
# get initial state  
DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
s1 = DecisionSteps.obs[0]
s2 = DecisionSteps.obs[1]
states = np.concatenate((s1, s2), 1)
print(states.shape)

(3, 210)


In [10]:
# reset the training agent for new episode
agent.reset()

In [11]:
# set the initial episode score to zero.
agent_scores = np.zeros(N_AGENTS)

=========================================================== <br>
actions = agent.act(states) will call Agent class's act method <br>
Step by step to run agent.act(states) in Agent class

In [12]:
#we create an empty action matrix
acts = np.zeros((N_AGENTS, N_ACTIONS))
print(acts.shape)

(3, 1)


In [13]:
#To set NN in test mode instead of train mode
agent.actor_local.eval()

Actor(
  (fc1): Linear(in_features=210, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=1, bias=True)
)

In [14]:
# test: obtain Q(s,a) for agent 1
s = states[0,:]
s = torch.from_numpy(s).float().to(device)
agent.actor_local(s)

tensor([-0.0215], device='cuda:0', grad_fn=<TanhBackward>)

In [15]:
#again, we don't want to calculate gradients because we are only testing!
with torch.no_grad():
    for agentIdx in range(N_AGENTS):

        #forward propagate states in actor local NN to get actions!
        s = states[agentIdx,:]
        s = torch.from_numpy(s).float().to(device)
        acts[agentIdx,:] = agent.actor_local(s).cpu().data.numpy()

In [16]:
#This is to set the model back to training mode, but no training is done here yet!
agent.actor_local.train()
print()




In [17]:
#adding noise for exploration!
noise = OUNoise((N_AGENTS, N_ACTIONS), 0)
print(noise)

<__main__.OUNoise object at 0x000002458C8026A0>


In [18]:
acts += noise.sample()
print(acts)

[[ 0.08913055]
 [ 0.05316845]
 [-0.09125599]]


In [19]:
acts = np.clip(acts, -1, 1)
print(acts)

[[ 0.08913055]
 [ 0.05316845]
 [-0.09125599]]


=========================================================== <br>
finish agent.act(states) in Agent class, go back to training loop

In [20]:
# define a function to convert action value [-1, 1] to index 0~6
def GenerateActionIndex(acts):
    result = []
    for AgentIdx in range(N_AGENTS):
        value = acts[AgentIdx][0]
        interval = 2/7
        lower = -1
        for i in range(7):  #action index 0~6
            upper = lower + (i+1)*interval
            if(value >= lower and value <= upper):
                result.append([i])
                break
            else:
                lower = upper
    return result

In [21]:
ActionIdxArray = np.array(GenerateActionIndex(acts))
print(ActionIdxArray)

[[2]
 [2]
 [2]]


In [22]:
# send the actions to Unity
env.set_actions(behaviorName, ActionIdxArray)
env.step()

In [23]:
# get next states
NextDecisionSteps, NextTerminalSteps = env.get_steps(behaviorName)
s1 = NextDecisionSteps.obs[0]
s2 = NextDecisionSteps.obs[1]
next_states = np.concatenate((s1, s2), 1)
print(next_states.shape)

(3, 210)


In [24]:
rewards = NextDecisionSteps.reward
print(rewards)

[-0.001 -0.001 -0.001]


In [25]:
dones = np.array([[0]]*N_AGENTS ) 
print(dones)

[[0]
 [0]
 [0]]


In [26]:
print(NextDecisionSteps.agent_id, NextTerminalSteps.agent_id)

[0 1 2] []


In [27]:
for AgentID in  NextTerminalSteps.agent_id:
    dones[AgentID] = 1
print(dones)

[[0]
 [0]
 [0]]


=========================================================== <br>
agent.step(states, actions, rewards, next_states, dones) <br>
Send (S, A, R, S') info to the training agent for replay buffer (memory) and network updates <br>
Step by step to run agent.step(states, actions, rewards, next_states, dones)

In [28]:
# Save experience / reward
"""7. Store transitions into replay buffer D"""
for agentIdx in range(N_AGENTS):
    agent.memory.add(states[agentIdx,:], ActionIdxArray[agentIdx,:], \
                     rewards[agentIdx], next_states[agentIdx,:],\
                     dones[agentIdx])

In [29]:
len(agent.memory)
# 3 agents, so one step will collect 3 transitions

3

In [30]:
# Learn, if enough samples are available in memory

#Grab a batch of transitions from memory 
# (states, actions, rewards, next_states, dones)
experiences = agent.memory.sample() 

=========================================================== <br>
call agent.learn(experiences) from agent.step(...) <br>
Step by step to run agent.learn(experiences)

In [31]:
states, actions, rewards, next_states, dones = experiences
print(states.shape, actions.shape, rewards.shape, next_states.shape, dones.shape)

torch.Size([1, 210]) torch.Size([1, 1]) torch.Size([1, 1]) torch.Size([1, 210]) torch.Size([1, 1])


In [32]:
actions_next = agent.actor_target(next_states)
print(actions_next)

tensor([[-0.0202]], device='cuda:0', grad_fn=<TanhBackward>)


In [33]:
Q_targets_next = agent.critic_target(next_states, actions_next)
print(Q_targets_next)

tensor([[0.0106]], device='cuda:0', grad_fn=<AddmmBackward>)


In [34]:
# Compute Q targets for current states (y_i)
Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))
print(Q_targets)

tensor([[0.0095]], device='cuda:0', grad_fn=<AddBackward0>)


In [35]:
# Compute critic loss
Q_expected = agent.critic_local(states, actions)

In [36]:
critic_loss = F.mse_loss(Q_expected, Q_targets)
print(critic_loss)

tensor(4.5395e-06, device='cuda:0', grad_fn=<MseLossBackward>)


In [37]:
agent.critic_optimizer.step()

In [38]:
# ---------------------------- update actor ---------------------------- #
# Compute actor loss
actions_pred = agent.actor_local(states)
print(actions_pred)
actor_loss = -agent.critic_local(states, actions_pred).mean()
print(actor_loss)

tensor([[-0.0209]], device='cuda:0', grad_fn=<TanhBackward>)
tensor(-0.0113, device='cuda:0', grad_fn=<NegBackward>)


In [39]:
# Minimize the loss
agent.actor_optimizer.zero_grad()
actor_loss.backward()
agent.actor_optimizer.step()

In [40]:
# ----------------------- update target networks ----------------------- #
agent.soft_update(agent.critic_local, agent.critic_target, TAU)
agent.soft_update(agent.actor_local, agent.actor_target, TAU)                     

=========================================================== <br>
finish agent.step(states, actions, rewards, next_states, dones) <br>
go back to training loop

In [41]:
env.close()