Reference: RL-Adventure https://github.com/higgsfield/RL-Adventure-2/blob/master/3.ppo.ipynb

In [1]:
import torch
# Choose the compute device once; later cells move the model and tensors onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Also report the name of GPU 0 so the log shows which card is in use.
    print(device, torch.cuda.get_device_name(0))
else:
    print(device)

cpu


# Define the actor-critic (AC) neural network for PPO

In [2]:
import torch.nn as nn

def init_weights(m):
    """Initialise a single module in place; intended for ``nn.Module.apply``.

    Only ``nn.Linear`` layers are touched: weights are drawn from
    N(0, 0.1**2) and the bias, when the layer has one, is set to 0.1.
    Every other module type is left exactly as it was constructed.

    Args:
        m: the module handed in by ``apply``.
    """
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        # A layer built with bias=False has ``bias is None``; skip it instead of crashing.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.1)

In [3]:
from torch.distributions import Normal

class ActorCritic(nn.Module):
    """Actor-critic network with separate actor and critic MLPs for continuous actions.

    Both heads use the same layout (Linear -> LayerNorm -> Tanh, twice) but do
    not share parameters. The critic ends in a single linear output; the actor
    ends in a linear layer followed by Tanh, so the action mean lies in [-1, 1].

    Args:
        num_inputs: size of the observation vector.
        num_outputs: number of action dimensions.
        hidden_size1: width of the first hidden layer of each head.
        hidden_size2: width of the second hidden layer of each head.
        std: initial value of the learnable ``log_std`` parameter. Despite the
            name it is a log standard deviation: the default 0.0 yields a
            standard deviation of exp(0) = 1.
    """

    def __init__(self, num_inputs, num_outputs, hidden_size1, hidden_size2, std=0.0):
        super().__init__()

        def hidden_stack():
            # Fresh (unshared) hidden layers; called once per head.
            return [
                nn.Linear(num_inputs, hidden_size1),
                nn.LayerNorm(hidden_size1),
                nn.Tanh(),
                nn.Linear(hidden_size1, hidden_size2),
                nn.LayerNorm(hidden_size2),
                nn.Tanh(),
            ]

        # State-value head: one scalar per input row.
        self.critic = nn.Sequential(*hidden_stack(), nn.Linear(hidden_size2, 1))

        # Policy head: the trailing Tanh bounds the action mean.
        self.actor = nn.Sequential(
            *hidden_stack(),
            nn.Linear(hidden_size2, num_outputs),
            nn.Tanh(),
        )

        # State-independent log standard deviation, one entry per action dimension.
        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)

        # Re-initialise the sub-modules with the notebook's init_weights function.
        self.apply(init_weights)

    def forward(self, x):
        """Return ``(dist, value)`` for observations ``x``.

        ``dist`` is a ``Normal`` whose mean is the actor output and whose scale
        is ``exp(log_std)`` broadcast to the mean's shape; ``value`` is the
        critic output.
        """
        mean = self.actor(x)
        scale = torch.exp(self.log_std).expand_as(mean)
        return Normal(mean, scale), self.critic(x)

# PPO

In [4]:
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    """Compute per-step returns using Generalized Advantage Estimation.

    Walks the rollout backwards, accumulating
        delta_t = r_t + gamma * V_{t+1} * mask_t - V_t
        gae_t   = delta_t + gamma * tau * mask_t * gae_{t+1}
    and stores ``gae_t + V_t`` for each step.

    Args:
        next_value: value estimate for the state after the last step (bootstrap).
        rewards: list of per-step rewards.
        masks: list of per-step continuation masks; a 0 stops both the bootstrap
            and the accumulated advantage from crossing that step.
        values: list of per-step value estimates. Not modified.
        gamma: discount factor.
        tau: GAE smoothing coefficient.

    Returns:
        A list with one return per entry of ``rewards``, in time order
        (empty when ``rewards`` is empty).
    """
    # Builds a new list, so the caller's ``values`` is left untouched.
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        # Append now and reverse once at the end: O(n) instead of repeated insert(0, ...).
        returns.append(gae + values[step])
    returns.reverse()
    return returns

In [5]:
import numpy as np 

def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    """Yield shuffled mini-batches covering the rollout without replacement.

    The rollout indices are shuffled once per call and cut into
    ``batch_size // mini_batch_size`` disjoint slices, so within one pass no
    transition is repeated. If ``batch_size`` is not a multiple of
    ``mini_batch_size`` the leftover indices are skipped for this pass; which
    ones are skipped changes on every call.

    Args:
        mini_batch_size: number of rows per mini-batch.
        states, actions, log_probs, returns, advantage: 2-D tensors whose first
            dimension indexes the same rollout rows.

    Yields:
        Tuples ``(states, actions, log_probs, returns, advantage)`` restricted
        to the rows of one mini-batch. Nothing is yielded when the rollout has
        fewer rows than ``mini_batch_size``.
    """
    batch_size = states.size(0)
    # One permutation per pass; uses NumPy's global RNG.
    shuffled_ids = np.random.permutation(batch_size)
    for batch_no in range(batch_size // mini_batch_size):
        first = batch_no * mini_batch_size
        ids = shuffled_ids[first:first + mini_batch_size]
        yield states[ids, :], actions[ids, :], log_probs[ids, :], returns[ids, :], advantage[ids, :]

In [6]:
def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2):
    """Run the clipped-surrogate PPO optimisation over one rollout.

    Uses the notebook-level ``model`` and ``optimizer`` objects (they are not
    parameters of this function).

    Args:
        ppo_epochs: number of passes over the rollout.
        mini_batch_size: rows per mini-batch, forwarded to ``ppo_iter``.
        states, actions, log_probs, returns, advantages: rollout tensors,
            forwarded to ``ppo_iter``; ``log_probs`` are the log-probabilities
            recorded when the actions were sampled.
        clip_param: the probability ratio is clamped to
            ``[1 - clip_param, 1 + clip_param]``.

    Returns:
        The total loss of the last mini-batch processed, as a Python float.

    Raises:
        ValueError: if no mini-batch was processed at all.
    """
    loss = None
    for _ in range(ppo_epochs):
        for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
            dist, value = model(state)
            entropy = dist.entropy().mean()
            new_log_probs = dist.log_prob(action)

            # Probability ratio between the current policy and the one that collected the data.
            ratio = (new_log_probs - old_log_probs).exp()
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

            # Pessimistic (element-wise minimum) surrogate, negated for gradient descent.
            actor_loss  = - torch.min(surr1, surr2).mean()
            critic_loss = (return_ - value).pow(2).mean()

            # Value loss weighted by 0.5, entropy bonus weighted by 0.001.
            loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    if loss is None:
        # Without this guard the function died with an opaque UnboundLocalError.
        raise ValueError(
            "ppo_update performed no optimisation step: ppo_epochs is < 1 or the "
            "rollout has fewer rows than mini_batch_size"
        )
    return float(loss)

# Test NN performance

In [7]:
def test_env(Max_test_frames=100):
    """Run one evaluation episode with the current policy and return its total reward.

    Uses the notebook-level ``env``, ``behaviorName``, ``model`` and ``device``.
    The environment is reset first, so any episode in progress is discarded.
    Actions are sampled from the policy distribution. The episode stops at the
    first terminal step or after ``Max_test_frames`` environment steps,
    whichever comes first.

    Args:
        Max_test_frames: upper bound on the number of environment steps.

    Returns:
        The sum of the ``reward`` fields read from the environment after each
        step; 0 when no step was taken.
        NOTE(review): ``reward`` is presumably a per-agent NumPy array, which
        would make the result an array rather than a scalar — confirm.
    """
    env.reset()
    step_result = env.get_steps(behaviorName)
    DecisionSteps = step_result[0]
    state = DecisionSteps.obs[0]
    total_reward = 0
    frame_count = 0
    while True:
        frame_count = frame_count + 1
        if frame_count > Max_test_frames:
            break  # frame budget exhausted
        state = torch.FloatTensor(state).to(device)
        # Evaluation never back-propagates, so skip building the autograd graph.
        with torch.no_grad():
            dist, _ = model(state)
            action = dist.sample()
        env.set_actions(behaviorName, np.array(action.cpu()))
        env.step()
        step_result = env.get_steps(behaviorName)
        DecisionSteps = step_result[0]
        TerminalSteps = step_result[1]
        if len(TerminalSteps) > 0:
            # Episode ended: count the terminal reward and stop.
            total_reward += TerminalSteps.reward
            break
        total_reward += DecisionSteps.reward
        state = DecisionSteps.obs[0]
    return total_reward

# Connect to Unity

In [8]:
from mlagents_envs.environment import UnityEnvironment

### 確認 Unity CarAgent 的 Behavior Type = Default

In [9]:
# Open the connection to Unity on port 5004.
# NOTE(review): file_name=None presumably attaches to a running Unity Editor
# instead of launching a built player — confirm against the ML-Agents docs.
env = UnityEnvironment(file_name= None, base_port=5004) 

In [10]:
# Reset the environment before the next cell reads env.behavior_specs.
env.reset()

# Instantiate AC PPO NN

In [11]:
# Collect every behavior name the environment exposes and use the first one
# for the rest of the notebook.
behaviorNames = list(env.behavior_specs.keys())
behaviorName = behaviorNames[0]
# The spec supplies the observation and action shapes used to size the network.
behavior_spec = env.behavior_specs[behaviorName]
print(behaviorName, "\n", behavior_spec)

My Behavior?team=0 
 BehaviorSpec(observation_shapes=[(19,)], action_type=<ActionType.CONTINUOUS: 1>, action_shape=2)


In [12]:
# Network input size: first dimension of the first observation shape.
num_inputs  = behavior_spec.observation_shapes[0][0]
# Network output size: the behavior's action_shape.
num_outputs = behavior_spec.action_shape
print(num_inputs, num_outputs)

19 2


In [13]:
# Network hyper-parameters: widths of the two hidden layers (used by both the
# actor and the critic) and the learning rate handed to the Adam optimizer.
hidden_size1      =128
hidden_size2      = 64
lr               = 3e-4

In [14]:
import torch.optim as optim
# Build the actor-critic network on the selected device and let Adam optimize
# all of its parameters.
model = ActorCritic(num_inputs, num_outputs, hidden_size1, hidden_size2).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

# 訓練前先啟動 tensor board server

In [15]:
# Show the directory this notebook is running in; the TensorBoard command in
# the next cell is meant to be started from this directory.
# os.getcwd() is used instead of `!echo %CD%` because %CD% only expands in the
# Windows command shell.
import os
print("The current directory is", os.getcwd())

The current directory is C:\Users\admin\Google 雲端硬碟\0. 教學 IE 351, 562 VR1 Unity 3D


In [None]:
# 在 python terminal window, 到這個 ipython notebook 的目錄下輸入:
# tensorboard --logdir=runs

In [None]:
# 網頁 localhost:6006 連到 TB server

In [16]:
from torch.utils.tensorboard import SummaryWriter
# TensorBoard writer used by the training loop's add_scalar calls. No log
# directory is passed, so the library's default location is used.
writer = SummaryWriter()

# Train 

In [17]:
# Training hyper-parameters.
num_steps        = 20  # environment steps collected per rollout, i.e. per PPO update
mini_batch_size  = 5  # rows per mini-batch inside ppo_update
ppo_epochs       = 4  # optimisation passes over each rollout
threshold_reward = 9.98 # training stops early once the test reward exceeds this (9.5 was an earlier setting)

In [18]:
frame_idx  = 0
max_frames = 5000   # 15000 for a longer run
env.reset()
early_stop = False
__printDetails = False  # set to True for verbose per-step tracing

# Each pass of the outer loop collects one rollout of `num_steps` transitions
# and then performs one PPO update on it.
while frame_idx < max_frames and not early_stop:
    if __printDetails:
        print("Frame = ", frame_idx, end=", ")
    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    # Frame index at which the periodic evaluation fell due during this rollout.
    # test_env() resets the Unity environment, so running it between two steps
    # would make the next stored transition start from a stale state; the
    # evaluation is therefore postponed until the rollout is complete.
    test_due_frame = None

    step_result = env.get_steps(behaviorName)
    DecisionSteps = step_result[0]
    state = DecisionSteps.obs[0]
    if __printDetails:
        print("step", end = ":")
    for step in range(num_steps):
        if __printDetails and (step + 1) % 5 == 0:
            print(step + 1, end = ", ")
        state = torch.FloatTensor(state).to(device)
        # Rollout statistics are detached before the update anyway, so collect
        # them without building autograd graphs.
        with torch.no_grad():
            dist, value = model(state)
            action = dist.sample()
            log_prob = dist.log_prob(action)
        if torch.isnan(action).any():  # sampling produced NaN
            print("Error: distribution=", dist, "%.2f, %.2f" % (float(action[0][0]), float(action[0][1])))
        env.set_actions(behaviorName, np.array(action.cpu()))
        env.step()

        step_result = env.get_steps(behaviorName)
        DecisionSteps = step_result[0]
        TerminalSteps = step_result[1]
        # NOTE: if neither collection has entries, next_state / reward / mask
        # keep whatever the previous step assigned.
        if len(TerminalSteps) > 0:  # episode ended: take (s, r) from the terminal step
            next_state = TerminalSteps.obs[0]
            reward = TerminalSteps.reward
            if __printDetails:
                print("Reach goal, r= %.2f" % reward)
            mask = [0.0]  # stops bootstrapping across the episode boundary
        elif len(DecisionSteps) > 0:  # otherwise take (s, r) from the decision step
            next_state = DecisionSteps.obs[0]
            reward = DecisionSteps.reward
            if __printDetails and reward >= 5:
                print("Hit obstacle!, r=", reward)
            mask = [1.0]

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(mask).unsqueeze(1).to(device))

        states.append(state)
        actions.append(action)

        frame_idx += 1

        if frame_idx % 100 == 0:  # an evaluation is due every 100 frames
            test_due_frame = frame_idx

        if len(TerminalSteps) > 0:
            # The episode ended: read the first state of the new episode from the environment.
            if __printDetails:
                print("Run simulation one step to get initial state")
            step_result = env.get_steps(behaviorName)
            DecisionSteps = step_result[0]
            state = DecisionSteps.obs[0]
        else:
            state = next_state

    if test_due_frame is not None:
        # Evaluate the policy that collected this rollout (it has not been
        # updated yet): average total reward over 10 test episodes.
        print("Frame ", test_due_frame, ", test NN", end=", ")

        test_reward = np.mean([test_env(Max_test_frames=200) for _ in range(10)])

        print("Avg reward = %.2f " % test_reward)
        writer.add_scalar("Reward", test_reward, test_due_frame)
        if test_reward > threshold_reward:
            early_stop = True

    # Bootstrap value of the state that followed the last collected step.
    next_state = torch.FloatTensor(next_state).to(device)
    with torch.no_grad():
        _, next_value = model(next_state)
    returns = compute_gae(next_value, rewards, masks, values)

    returns   = torch.cat(returns).detach()
    log_probs = torch.cat(log_probs).detach()
    values    = torch.cat(values).detach()
    states    = torch.cat(states)
    actions   = torch.cat(actions)
    advantage = returns - values

    loss = ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)
    print("PPO update, loss = %.2f" % loss)
    writer.add_scalar("Loss", loss, frame_idx)
writer.flush()

PPO update, loss = 13.60
PPO update, loss = 2.98
PPO update, loss = 2.53
PPO update, loss = 6.37
Frame  100 , test NN, Avg reward = 9.95 
PPO update, loss = 1.92
PPO update, loss = 2.75
PPO update, loss = 1.77
PPO update, loss = 2.04
PPO update, loss = 0.73
Frame  200 , test NN, Avg reward = 9.95 
PPO update, loss = 0.35
PPO update, loss = 2.19
PPO update, loss = 2.03
PPO update, loss = 0.72
PPO update, loss = 0.94
Frame  300 , test NN, Avg reward = 9.95 
PPO update, loss = 0.88
PPO update, loss = 0.13
PPO update, loss = 0.16
PPO update, loss = 0.05
PPO update, loss = 0.29
Frame  400 , test NN, Avg reward = 9.96 
PPO update, loss = 0.08
PPO update, loss = 0.10
PPO update, loss = -0.43
PPO update, loss = -0.40
PPO update, loss = -0.06
Frame  500 , test NN, Avg reward = 9.97 
PPO update, loss = -0.47
PPO update, loss = -0.62
PPO update, loss = -0.46
PPO update, loss = -0.05
PPO update, loss = 0.04
Frame  600 , test NN, Avg reward = 8.97 
PPO update, loss = 0.03
PPO update, loss = 2.98
PP

### 如果要中斷與 Unity 的連結:

In [19]:
# Close the connection to Unity.
env.close()

### 如果想要結束目前的 TB writer, 重啟一個 TB writer

In [27]:
# Close the current TensorBoard writer.
writer.close()

In [28]:
# Start a fresh TensorBoard writer; again no log directory is passed.
writer = SummaryWriter()

### env.close後如果需要重新 connect Unity 訓練可用下面捷徑

In [None]:
# Shortcut: reconnect to Unity after env.close() (same arguments as the first
# connection cell) and reset the environment.
env = UnityEnvironment(file_name= None, base_port=5004) 
env.reset()

### 如果訓練表現不好, 需要重新生一個 NN, 可用下面捷徑

In [None]:
# Shortcut: discard the current network and start again from freshly
# initialised weights, with a new optimizer bound to the new parameters.
model = ActorCritic(num_inputs, num_outputs, hidden_size1, hidden_size2).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

### 如果訓練效果不錯, 想要保留目前 NN (Save model)

In [None]:
from datetime import datetime
# Local wall-clock time, used to give every checkpoint a unique file name.
datetime_dt = datetime.today()
datetime_str = "{:%Y_%m_%d_%H_%M_%S}".format(datetime_dt)
# File name pattern: <timestamp>_<frame_idx>.pkl
fname = "%s_%s.pkl" % (datetime_str, frame_idx)
# Only the weights (state_dict) are stored, not the model object itself.
torch.save(model.state_dict(), fname)
print("Model is saved: ", fname)

### 想要 load previously saved model

In [None]:
# Rebuild the network with the same architecture, then restore the saved weights.
model = ActorCritic(num_inputs, num_outputs, hidden_size1, hidden_size2).to(device)
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a CUDA machine can still be loaded on a CPU-only one.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load("2020_11_23_15_40_28_800.pkl", map_location=device))
# A new optimizer is needed because the parameters belong to a new model object.
optimizer = optim.Adam(model.parameters(), lr=lr)

### 看看目前 model 的控制表現 (Examine NN performance)

In [None]:
# Run 20 evaluation episodes (at most 200 frames each) and print each episode's
# total reward on one comma-separated line.
for i in range(20):
    r = test_env(Max_test_frames=200)
    print(r, end =",")