Ref: RL-Adventure https://github.com/higgsfield/RL-Adventure-2/blob/master/3.ppo.ipynb 

In [1]:
from mlagents_envs.environment import UnityEnvironment
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

In [3]:
if(torch.cuda.is_available()):
    device = torch.device("cuda")
    print(device, torch.cuda.get_device_name(0))
else:
    device= torch.device("cpu")
    print(device)

cpu


# Define NN for PPO AC

In [4]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)

In [5]:
class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()
        
        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
        
        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        )
        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)
        
        self.apply(init_weights)
        
    def forward(self, x):
        value = self.critic(x)
        mu    = self.actor(x)
        std   = self.log_std.exp().expand_as(mu)
        dist  = Normal(mu, std)
        return dist, value

# Connect to Unity
We handle a Unity scene that contains only one training world

In [52]:
env = UnityEnvironment(file_name= None, base_port=5004) 

In [53]:
env.reset()

In [54]:
behaviorNames = list(env.behavior_specs.keys())
behaviorName = behaviorNames[0]
behavior_spec = env.behavior_specs[behaviorName]
num_inputs  = behavior_spec.observation_shapes[0][0]
num_outputs = behavior_spec.action_shape
print(num_inputs, num_outputs)

12 2


# Instantiate a PPO-AC neural network

In [55]:
hidden_size      = 256
lr               = 3e-4

In [56]:
model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

# while frame_idx < max_frames and not early_stop:

In [57]:
# After N iterations, len(...) =N
log_probs = []
values    = []
states    = [] # states[i].shape=[No. of Agents, state no.]
actions   = [] # actions[i].shape = [No. of Agents, action no.]
rewards   = [] # rewards[i].shape=[No. of Agents, 1]
masks     = [] # masks[i].shape=[No. of Agents, 1]
entropy = 0

# for _ in range(num_steps)

In [58]:
step_result = env.get_steps(behaviorName) 
DecisionSteps = step_result[0] 
TerminalSteps = step_result[1]

In [59]:
state = DecisionSteps.obs[0]
print(state)

[[ 4.2919607 -7.8069067 28.800442   1.7667229  1.9213159  2.4526217
   4.         4.         4.         4.         3.4204714  3.111842 ]]


In [60]:
state = torch.FloatTensor(state).to(device)
print(state.shape)

torch.Size([1, 12])


In [61]:
dist, value = model(state)
print(dist)
print(value)

Normal(loc: torch.Size([1, 2]), scale: torch.Size([1, 2]))
tensor([[-1.3420]], grad_fn=<AddmmBackward>)


In [62]:
action = dist.sample()
print(action, action.shape)

tensor([[ 2.5054, -1.2700]]) torch.Size([1, 2])


In [63]:
env.set_actions(behaviorName, np.array(action.cpu()))
env.step()

In [64]:
step_result = env.get_steps(behaviorName) 
DecisionSteps = step_result[0] 
TerminalSteps = step_result[1]

if(len(TerminalSteps) >0): # if we have terminal step, collect (s, r) from terminal step
    next_state = TerminalSteps.obs[0]
    reward = TerminalSteps.reward
    mask=0
elif(len(DecisionSteps) >0): #otherwise collect (s, r) from decision steps
    next_state = DecisionSteps.obs[0]
    reward = DecisionSteps.reward
    mask=1
    
print(next_state.shape)
print(reward)
print(mask)

(1, 12)
[-0.11507208]
1


In [65]:
log_prob = dist.log_prob(action)
print(log_prob)

tensor([[-1.3377, -5.2006]], grad_fn=<SubBackward0>)


In [66]:
dist.entropy().mean()

tensor(1.4189, grad_fn=<MeanBackward0>)

In [67]:
entropy += dist.entropy().mean()
print(entropy)

tensor(1.4189, grad_fn=<AddBackward0>)


In [68]:
print(log_probs)
log_probs.append(log_prob)
print(log_probs)

[]
[tensor([[-1.3377, -5.2006]], grad_fn=<SubBackward0>)]


In [69]:
print(values)
values.append(value)
print(values)

[]
[tensor([[-1.3420]], grad_fn=<AddmmBackward>)]


In [70]:
print(rewards)
rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
print(rewards, rewards[0].shape)

[]
[tensor([[-0.1151]])] torch.Size([1, 1])


In [71]:
print(masks)
masks.append(torch.FloatTensor(mask).unsqueeze(1).to(device))
print(masks, masks[0].shape)

[]
[tensor([[0.]])] torch.Size([1, 1])


In [72]:
print(states)
states.append(state)
print(states, states[0].shape)

[]
[tensor([[ 4.2920, -7.8069, 28.8004,  1.7667,  1.9213,  2.4526,  4.0000,  4.0000,
          4.0000,  4.0000,  3.4205,  3.1118]])] torch.Size([1, 12])


In [73]:
print(actions)
actions.append(action)
print(actions, actions[0].shape)

[]
[tensor([[ 2.5054, -1.2700]])] torch.Size([1, 2])


In [74]:
state = next_state

# End one interation. Run loop for N times

In [75]:
num_steps = 20
for _ in range(1, num_steps):
    state = torch.FloatTensor(state).to(device)
    dist, value = model(state)

    action = dist.sample()
    env.set_actions(behaviorName, np.array(action.cpu()))
    env.step()
    
    step_result = env.get_steps(behaviorName) 
    DecisionSteps = step_result[0] 
    TerminalSteps = step_result[1]
    if(len(TerminalSteps) >0): # if we have terminal step, collect (s, r) from terminal step
        next_state = TerminalSteps.obs[0]
        reward = TerminalSteps.reward
        mask=0
    elif(len(DecisionSteps) >0): #otherwise collect (s, r) from decision steps
        next_state = DecisionSteps.obs[0]
        reward = DecisionSteps.reward
        mask=1
        
    log_prob = dist.log_prob(action)
    entropy += dist.entropy().mean()

    log_probs.append(log_prob)
    values.append(value)
    rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
    masks.append(torch.FloatTensor(mask).unsqueeze(1).to(device))

    states.append(state)
    actions.append(action)

    state = next_state

In [76]:
print(len(log_probs), log_probs[0].shape)
print(len(values), values[0].shape)
print(len(rewards), rewards[0].shape)
print(len(masks), masks[0].shape)
print(len(states), states[0].shape)
print(len(actions), actions[0].shape)

20 torch.Size([1, 2])
20 torch.Size([1, 1])
20 torch.Size([1, 1])
20 torch.Size([1, 1])
20 torch.Size([1, 12])
20 torch.Size([1, 2])


In [77]:
next_state = torch.FloatTensor(next_state).to(device)

In [78]:
_, next_value = model(next_state)

# returns = compute_gae(next_value, rewards, masks, values)

def compute_gae(...):

    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns

In [79]:
gamma=0.99
tau=0.95

In [80]:
values1 = values + [next_value]
print(len(values), values[0].shape)
print(len(values1), values[1].shape)

20 torch.Size([1, 1])
21 torch.Size([1, 1])


In [81]:
gae = 0
returns = []

In [82]:
for step in reversed(range(len(rewards))):
    print(step, end = ", ")

19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 

for step in reversed(range(len(rewards))):

In [83]:
step = 19
delta = rewards[step] + gamma * values1[step + 1] * masks[step] - values1[step]
print(delta)

tensor([[0.9543]], grad_fn=<SubBackward0>)


In [84]:
gae = delta + gamma * tau * masks[step] * gae
print(gae)

tensor([[0.9543]], grad_fn=<AddBackward0>)


In [85]:
returns.insert(0, gae + values1[step])
print(len(returns), returns[0].shape)

1 torch.Size([1, 1])


### finish one loop, now run for loop

In [86]:
values1 = values + [next_value]
gae = 0
returns = []
for step in reversed(range(len(rewards))):
    delta = rewards[step] + gamma * values1[step + 1] * masks[step] - values1[step]
    gae = delta + gamma * tau * masks[step] * gae
    returns.insert(0, gae + values1[step])

In [87]:
print(len(returns), returns[0].shape)

20 torch.Size([1, 1])


# Finish GAE

In [88]:
returns = torch.cat(returns).detach()
print(returns.shape)

torch.Size([20, 1])


In [89]:
print(len(log_probs), log_probs[0].shape)
log_probs = torch.cat(log_probs).detach()
print(len(log_probs), log_probs[0].shape)

20 torch.Size([1, 2])
20 torch.Size([2])


In [90]:
print(len(values), values[0].shape)
values=torch.cat(values).detach()
print(len(values), values[0].shape)

20 torch.Size([1, 1])
20 torch.Size([1])


In [91]:
print(len(states), states[0].shape)
states = torch.cat(states)
print(len(states), states[0].shape)

20 torch.Size([1, 12])
20 torch.Size([12])


In [92]:
print(len(actions), actions[0].shape)
actions = torch.cat(actions)
print(len(actions), actions[0].shape)

20 torch.Size([1, 2])
20 torch.Size([2])


In [93]:
advantage = returns - values
print(advantage.shape)

torch.Size([20, 1])


# Enter  ppo_update(...)

def ppo_update(...):

    for _ in range(ppo_epochs):
        
        for ... in ppo_iter(...):
        
            dist, value = model(state)
            entropy = dist.entropy().mean()
            new_log_probs = dist.log_prob(action)

            ratio = (new_log_probs - old_log_probs).exp()
            surr1 = ratio * advantage
            surr2 = torch.clamp(...) * advantage

            actor_loss  = - torch.min(surr1, surr2).mean()
            critic_loss = (return_ - value).pow(2).mean()

            loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

def ppo_iter( ):
    
    batch_size = states.size(0)
    for _ in range(batch_size // mini_batch_size):
        rand_ids = np.random.randint(0, batch_size, mini_batch_size)
        yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

In [94]:
states.shape

torch.Size([20, 12])

In [95]:
states.size(0)

20

In [96]:
batch_size = states.size(0)

In [97]:
mini_batch_size  = 5

In [98]:
batch_size // mini_batch_size

4

In [99]:
for _ in range(batch_size // mini_batch_size):
    print("*", end = ",")

*,*,*,*,

In [100]:
np.random.randint(0, batch_size, mini_batch_size) # 5 numbers between 0~39

array([ 3,  1, 15,  3,  0])

In [101]:
for i in range(10):
    rand_ids = np.random.randint(0, batch_size, mini_batch_size)
    print(rand_ids, end = ", ")

[15 16  3 10 11], [ 5 18  5 14  6], [8 2 9 0 6], [ 6 15  4  1  3], [19  4  0 14  9], [ 6  6 12  1 13], [ 5 11  7  7 10], [ 2  5  3 19 13], [15 14 18 15  0], [ 7 11 11 13  3], 

In [102]:
print(states.shape)
print(rand_ids)
print(states[rand_ids, :].shape)

torch.Size([20, 12])
[ 7 11 11 13  3]
torch.Size([5, 12])


In [103]:
print(actions.shape)
print(actions[rand_ids, :].shape)

torch.Size([20, 2])
torch.Size([5, 2])


In [104]:
print(log_probs.shape)
print(log_probs[rand_ids, :].shape)

torch.Size([20, 2])
torch.Size([5, 2])


In [105]:
print(returns.shape)
print(returns[rand_ids, :].shape)

torch.Size([20, 1])
torch.Size([5, 1])


In [106]:
print(advantage.shape)
print(advantage[rand_ids, :].shape)

torch.Size([20, 1])
torch.Size([5, 1])


In [107]:
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    batch_size = states.size(0)
    for _ in range(batch_size // mini_batch_size):
        rand_ids = np.random.randint(0, batch_size, mini_batch_size)
        yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

In [108]:
for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    break

In [109]:
dist, value = model(state)
print(dist)
print(value.shape)

Normal(loc: torch.Size([5, 2]), scale: torch.Size([5, 2]))
torch.Size([5, 1])


In [110]:
entropy = dist.entropy().mean()
print(entropy)

tensor(1.4189, grad_fn=<MeanBackward0>)


In [111]:
new_log_probs = dist.log_prob(action)
print(new_log_probs.shape)

torch.Size([5, 2])


In [112]:
clip_param=0.2

In [113]:
ratio = (new_log_probs - old_log_probs).exp()
surr1 = ratio * advantage
surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

In [114]:
actor_loss  = - torch.min(surr1, surr2).mean()
critic_loss = (return_ - value).pow(2).mean()
loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

In [115]:
optimizer.zero_grad()
loss.backward()
optimizer.step()

# Test performance 

In [118]:
Max_test_frames = 50

In [120]:
def test_env():
    env.reset()
    step_result = env.get_steps(behaviorName) 
    DecisionSteps = step_result[0] 
    state = DecisionSteps.obs[0]        
    total_reward = 0
    frame_count = 0
    stop_test = False
    while (not stop_test):  
        frame_count = frame_count + 1
        if(frame_count > Max_test_frames):
            stop_test = True
        else:
            state = torch.FloatTensor(state).to(device)
            dist, _ = model(state)
            action = dist.sample()
            env.set_actions(behaviorName, np.array(action.cpu()))
            env.step()
            step_result = env.get_steps(behaviorName) 
            DecisionSteps = step_result[0]
            TerminalSteps = step_result[1]
            if(len(TerminalSteps) >0): # if reach terminal step, then stop
                reward = DecisionSteps.reward
                total_reward += reward
                stop_test = True
            else:
                next_state = DecisionSteps.obs[0]
                reward = DecisionSteps.reward
                total_reward += reward
                state = next_state
    return total_reward

In [121]:
test_env()

array([-13.389808], dtype=float32)

In [122]:
[test_env() for _ in range(2)]

[array([-18.51781], dtype=float32), array([-30.503582], dtype=float32)]

In [123]:
np.mean([test_env() for _ in range(10)])

-15.021767

In [124]:
env.close()