In [1]:
# How to run:
# For the standard version: run all cells in this notebook.
# For the hardcore version:
# - Change the environment name to BipedalWalkerHardcore-v3 and switch to the hardcore hyperparameters.
# - Run all cells in this notebook.

# Code reference: 
# First version of SAC in BipedalWalker-v2: 
# https://github.com/Rafael1s/Deep-Reinforcement-Learning-Algorithms/tree/master/BipedalWalker-Soft-Actor-Critic 
# SAC in BipedalWalker-v3:
# https://github.com/CoderAT13/BipedalWalkerHardcore-SAC

# Tuning reference:
# rl-baseline-zoo: https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/sac.yml

# Paper reference:
# First paper of SAC: https://arxiv.org/pdf/1801.01290.pdf
# Second paper of SAC(with auto-tuned alpha): https://arxiv.org/pdf/1812.05905.pdf 

!apt update
!apt install xvfb -y
!pip install 'swig'
!pip install 'pyglet==1.5.27'
!pip install 'gym[box2d]==0.20.0'
!pip install 'pyvirtualdisplay==3.0'

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

from torch.optim import Adam
import os

import random
import numpy as np

import gym
import torch.optim as optim

import matplotlib.pyplot as plt
import sys
from pyvirtualdisplay import Display
from IPython import display as disp
%matplotlib inline

# Plotting / recording cadence, both measured in episodes.
plot_interval = 10  # refresh the reward plot every 10 episodes
video_every = 100  # recording is slow to render, so only record every 100th episode

# Start an invisible 600x600 virtual display.
display = Display(size=(600, 600), visible=0)
display.start()

[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 20.0 kB/114[0m                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 37.3 kB/114[0m[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 66.3 kB/114[0m[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [C[0m                                                                               Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease
[33m                                                                               0% [Waiting for headers] [Waiting for headers] [Waiting for headers][0m                                                                    Hit:

In [2]:
# Code reference: 
# https://github.com/Rafael1s/Deep-Reinforcement-Learning-Algorithms/tree/master/BipedalWalker-Soft-Actor-Critic 
# Clamp range applied to the policy's predicted log standard deviation.
LOG_SIG_MIN, LOG_SIG_MAX = -20, 2
# Small constant added inside log() in the tanh log-probability correction.
epsilon = 1e-6

def soft_update(target, source, tau):
    """Blend `source` parameters into `target` in place (Polyak averaging).

    Each target parameter becomes ``target * (1 - tau) + source * tau``.
    Parameters are paired in the order returned by ``.parameters()``.
    """
    for tgt, src in zip(target.parameters(), source.parameters()):
        blended = tgt.data * (1.0 - tau) + src.data * tau
        tgt.data.copy_(blended)

def hard_update(target, source):
    """Overwrite every parameter of `target` with the matching parameter of `source`.

    Parameters are paired in the order returned by ``.parameters()`` and
    copied in place, so `target` keeps its own tensors.
    """
    pairs = zip(target.parameters(), source.parameters())
    for tgt, src in pairs:
        tgt.data.copy_(src.data)

In [3]:
# Code reference: 
# https://github.com/Rafael1s/Deep-Reinforcement-Learning-Algorithms/tree/master/BipedalWalker-Soft-Actor-Critic 

# Initialize Policy weights
def weights_init_(m):
    """Initialise an ``nn.Linear`` layer in place; leave any other module untouched.

    Weights get Xavier-uniform initialisation with gain 1 and the bias is
    set to zero. Intended to be passed to ``nn.Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight, gain=1)
    nn.init.constant_(m.bias, 0)

# Double Q-function trick:
class QNetwork(nn.Module):
    """Twin Q-function critic: two independent MLPs evaluated on the same input.

    Each head maps the concatenated (state, action) vector through two
    ReLU hidden layers of width `hidden_dim` to a single scalar.
    """

    def __init__(self, num_inputs, num_actions, hidden_dim):
        super().__init__()
        joint_dim = num_inputs + num_actions

        # First head (Q1): linear1 -> linear2 -> linear3
        self.linear1 = nn.Linear(joint_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        # Second head (Q2): linear4 -> linear5 -> linear6
        self.linear4 = nn.Linear(joint_dim, hidden_dim)
        self.linear5 = nn.Linear(hidden_dim, hidden_dim)
        self.linear6 = nn.Linear(hidden_dim, 1)

        # Re-initialise every Linear layer with the shared initialiser.
        self.apply(weights_init_)

    def forward(self, state, action):
        """Return the pair (Q1, Q2) for a batch of states and actions.

        `state` and `action` are concatenated along dimension 1.
        """
        joint = torch.cat((state, action), dim=1)

        q1 = self.linear3(F.relu(self.linear2(F.relu(self.linear1(joint)))))
        q2 = self.linear6(F.relu(self.linear5(F.relu(self.linear4(joint)))))

        return q1, q2

# Policy with reparameterization trick:
class GaussianPolicy(nn.Module):
    """Tanh-squashed diagonal Gaussian policy sampled with the reparameterization trick.

    Args:
        num_inputs: size of the state vector.
        num_actions: size of the action vector.
        hidden_dim: width of the two ReLU hidden layers.
        action_space: optional object with `high` and `low` arrays. When given,
            squashed actions are rescaled from (-1, 1) to that range;
            when None, scale 1 and bias 0 are used.
    """

    def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None):
        super(GaussianPolicy, self).__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)

        self.mean_linear = nn.Linear(hidden_dim, num_actions)
        self.log_std_linear = nn.Linear(hidden_dim, num_actions)

        self.apply(weights_init_)

        # action rescaling
        if action_space is None:
            action_scale = torch.tensor(1.)
            action_bias = torch.tensor(0.)
        else:
            action_scale = torch.FloatTensor(
                (action_space.high - action_space.low) / 2.)
            action_bias = torch.FloatTensor(
                (action_space.high + action_space.low) / 2.)

        # Registered as buffers so every nn.Module device/dtype move
        # (.to(...), .cuda(), .cpu()) carries them along with the weights.
        # persistent=False keeps them out of state_dict, so saved checkpoints
        # keep the same keys as before.
        self.register_buffer('action_scale', action_scale, persistent=False)
        self.register_buffer('action_bias', action_bias, persistent=False)

    def forward(self, state):
        """Return (mean, log_std) of the pre-squash Gaussian for `state`.

        log_std is clamped to [LOG_SIG_MIN, LOG_SIG_MAX].
        """
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        mean = self.mean_linear(x)
        log_std = self.log_std_linear(x)
        log_std = torch.clamp(log_std, min=LOG_SIG_MIN, max=LOG_SIG_MAX)
        return mean, log_std

    def sample(self, state):
        """Draw an action for each state in the batch.

        Returns:
            action: tanh-squashed reparameterized sample, rescaled by
                action_scale / action_bias.
            log_prob: log-probability of `action`, summed over dimension 1
                with keepdim=True.
            mean: the squashed and rescaled distribution mean, i.e. the
                deterministic action.
        """
        mean, log_std = self.forward(state)
        std = log_std.exp()
        normal = Normal(mean, std)
        x_t = normal.rsample()  # for reparameterization trick (mean + std * N(0,1))
        y_t = torch.tanh(x_t)
        action = y_t * self.action_scale + self.action_bias
        log_prob = normal.log_prob(x_t)
        # Enforcing Action Bound: change-of-variables correction for the
        # tanh squashing and rescaling; epsilon keeps the log argument positive.
        log_prob = log_prob - torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon)
        log_prob = log_prob.sum(1, keepdim=True)
        mean = torch.tanh(mean) * self.action_scale + self.action_bias
        return action, log_prob, mean

In [4]:
# Code reference: 
# https://github.com/Rafael1s/Deep-Reinforcement-Learning-Algorithms/tree/master/BipedalWalker-Soft-Actor-Critic 

# Replay buffer
class ReplayMemory:
    """Fixed-capacity ring buffer of (state, action, reward, next_state, done) tuples."""

    def __init__(self, capacity, seed):
        # Seeds Python's global `random` module, which `sample` draws from.
        random.seed(seed)
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # index the next transition will be written to

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest once the buffer is full."""
        transition = (state, action, reward, next_state, done)
        still_growing = len(self.buffer) < self.capacity
        if still_growing:
            # Grow by one slot; it is filled by the assignment below.
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` transitions drawn without replacement.

        The result is a 5-tuple of stacked numpy arrays:
        (state, action, reward, next_state, done).
        """
        picked = random.sample(self.buffer, batch_size)
        columns = zip(*picked)
        state, action, reward, next_state, done = (np.stack(col) for col in columns)
        return state, action, reward, next_state, done

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [5]:
# Code reference: 
# https://github.com/Rafael1s/Deep-Reinforcement-Learning-Algorithms/tree/master/BipedalWalker-Soft-Actor-Critic 
# https://github.com/CoderAT13/BipedalWalkerHardcore-SAC

# SAC agent
class SAC(object):
    """Soft Actor-Critic agent: twin-Q critic, target critic, Gaussian policy, learned temperature.

    Args:
        num_inputs: size of the observation vector fed to the networks.
        action_space: action space object; its `.shape` sizes the networks and
            the target entropy, and it is passed to GaussianPolicy for rescaling.
        device: torch device (or device string) that holds the networks and batches.
        hidden_size: hidden-layer width for both the critic and the policy.
        lr: learning rate shared by the critic, policy and temperature optimisers.
        gamma: discount factor used in the Bellman target.
        tau: interpolation factor for the soft target-critic update.
        alpha: initial entropy temperature; replaced by exp(log_alpha) after
            the first call to `update_parameters`.
    """

    def __init__(self, num_inputs, action_space,
                 device, hidden_size, lr, gamma, tau, alpha):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha

        self.device = device

        self.critic = QNetwork(num_inputs, action_space.shape[0], hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=lr)

        # The target critic starts as an exact copy of the online critic.
        self.critic_target = QNetwork(num_inputs, action_space.shape[0], hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = -dim(A) as given in the paper
        self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
        # The temperature is optimised in log space; exp(0) = 1 is its starting point.
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=lr)
        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=lr)

    def select_action(self, state, eval=False):
        """Return one action for a single (unbatched) state as a numpy array.

        With `eval` false the action is sampled from the policy (exploration
        noise); with `eval` true the policy's deterministic mean action is used.
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        # Acting needs no gradients, so skip building the autograd graph.
        with torch.no_grad():
            sampled_action, _, mean_action = self.policy.sample(state)
        action = mean_action if eval else sampled_action
        return action.cpu().numpy()[0]

    def update_parameters(self, memory, batch_size):
        """Run one SAC gradient step on a batch sampled from `memory`.

        Updates, in order: the policy, the twin critic, the entropy
        temperature, and finally the target critic (soft update).
        The fifth array returned by `memory.sample` is used as the
        continuation mask (0 cuts off bootstrapping).
        """
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        # Rewards and masks become column vectors, matching the critic output shape.
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # Soft Bellman target; no gradients flow into the target critic or policy here.
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        # Both heads share one optimiser, so their losses are summed and
        # back-propagated once.
        qf_loss = qf1_loss + qf2_loss

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        # NOTE: keep the policy step ahead of the critic step. policy_loss was
        # built through the critic's current weights, so its backward pass must
        # run before critic_optim.step() modifies those weights in place.
        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        # zero_grad() also discards the critic gradients left by policy_loss.backward().
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Auto-tune alpha
        alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()

        # Detached: alpha acts as a constant in the critic target and policy
        # loss; only alpha_loss should produce gradients for log_alpha.
        self.alpha = self.log_alpha.exp().detach()

        soft_update(self.critic_target, self.critic, self.tau)

In [6]:
# Standard version hyperparameters. For the hardcore version, comment out lines 2-15 of this cell:
gamma = 0.99  # discount factor
batch_size = 256  # transitions sampled per gradient update
lr = 5.3e-4  # learning rate
hidden_size = 400  # hidden-layer width of the critic and policy networks
tau = 0.02  # soft-update factor for the target critic
alpha = 0.2  # initial entropy temperature, suggested by author
start_steps = 10000  # environment steps taken with random actions before using the policy
update_start_steps = 1e4  # replay-buffer size required before gradient updates start
reward_scale = 10  # reward multiplier; tuned by experiment, highlighted as important in the paper
test_ep = 10  # length of the alternating noisy / deterministic episode blocks
max_timesteps = 2000  # hard cap on steps per episode
capacity = 300000  # replay-buffer size
seed = 42  # random seed
iteration = 1100  # total number of episodes

# For the hardcore version, uncomment lines 20-33 of this cell:
# These hyperparameters were used by the second version of the SAC code.

# gamma=0.99
# batch_size=256
# lr=5e-4
# hidden_size=400
# tau=0.005
# alpha=0.2
# start_steps=10000
# update_start_steps=1e4
# reward_scale = 5
# test_ep = 10
# capacity = 1000000
# seed = 42
# iteration = 2200         # total episodes; 2200 reaches a reward over 300 a few times.
# max_timesteps = 2000     # required by the training loop; same cap as the standard version

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [8]:
%%capture
def _record_this_episode(ep_id):
    """Return True for episodes whose id is a multiple of `video_every`."""
    return ep_id % video_every == 0

# env = gym.make("BipedalWalkerHardcore-v3") # only attempt this when your agent has solved BipedalWalker-v3
env = gym.make("BipedalWalker-v3")
# Monitor writes videos to ./video; force=True is passed so an existing directory is reused.
env = gym.wrappers.Monitor(
    env,
    "./video",
    video_callable=_record_this_episode,
    force=True,
)

# Seed the environment, its action-space sampler, torch and numpy with the same seed.
env.seed(seed)
env.action_space.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

In [9]:
# Length of the first axis of the observation and action spaces.
state_dim, action_dim = (
    space.shape[0] for space in (env.observation_space, env.action_space)
)

In [None]:
# Summarise the problem size and the compute device before training starts.
env_summary = 'The environment has {} observations and the agent can take {} actions'
device_summary = 'The device is: {}'
print(env_summary.format(state_dim, action_dim))
print(device_summary.format(device))

In [None]:
# Code reference: 
# https://github.com/Rafael1s/Deep-Reinforcement-Learning-Algorithms/tree/master/BipedalWalker-Soft-Actor-Critic 
# https://github.com/CoderAT13/BipedalWalkerHardcore-SAC

agent = SAC(state_dim, env.action_space, device, hidden_size, lr, gamma, tau, alpha)
replay_buffer = ReplayMemory(capacity, seed)
total_steps = 0
# NOTE(review): log_f is never closed in the visible cells; close it once
# training (and anything else that logs) has finished.
log_f = open("agent-log.txt","w+")
reward_list = []
plot_data = []
# i: episode index
# ep_r: unscaled reward accumulated over the current episode
# ep_s: number of environment steps taken in the current episode
for i in range(iteration):

    ep_r = 0
    ep_s = 0
    done = False
    state = env.reset()

    # Episodes alternate in blocks of `test_ep`: a block of stochastic
    # (sampled) actions, then a block of deterministic (mean) actions.
    use_eval = i % (test_ep*2) >= test_ep

    while not done:

        if total_steps < start_steps:
            # Warm-up phase: fill the buffer with uniformly sampled actions.
            action = env.action_space.sample()
        else:
            action = agent.select_action(state, use_eval)

        # step the environment
        next_state, reward, done, info = env.step(action)
        ep_r += reward

        # apply reward scaling (only the stored reward is scaled, not ep_r)
        reward = reward * reward_scale

        ep_s += 1
        total_steps += 1

        # Continuation mask for bootstrapping: 0 only when the episode truly
        # terminated. An episode cut short by gym's TimeLimit wrapper is flagged
        # in info['TimeLimit.truncated'] and keeps mask 1. Reading the flag,
        # rather than comparing ep_s with a hard-coded 1600, also covers
        # BipedalWalkerHardcore-v3 (2000-step limit) and a genuine termination
        # that happens exactly on the last allowed step.
        truncated = info.get('TimeLimit.truncated', False)
        mask = 1.0 if truncated else float(not done)

        # add into replay_buffer
        replay_buffer.push(state, action, reward, next_state, mask)

        state = next_state

        if ep_s > max_timesteps:
            break

    # update parameters: one gradient step per environment step of this
    # episode, once the buffer holds enough transitions
    if len(replay_buffer) >= update_start_steps:
        for _ in range(ep_s):
            agent.update_parameters(replay_buffer, batch_size)

    # do NOT change this logging code - it is used for automated marking!
    log_f.write('episode: {}, reward: {}\n'.format(i, ep_r))
    log_f.flush()

    reward_list.append(ep_r)

    # plot graph: mean and std of the episode rewards collected since the last plot
    if i % plot_interval == 0:
        plot_data.append([i, np.array(reward_list).mean(), np.array(reward_list).std()])
        reward_list = []
        # plt.rcParams['figure.dpi'] = 100
        plt.plot([x[0] for x in plot_data], [x[1] for x in plot_data], '-', color='tab:grey')
        plt.fill_between([x[0] for x in plot_data], [x[1]-x[2] for x in plot_data], [x[1]+x[2] for x in plot_data], alpha=0.2, color='tab:grey')
        plt.xlabel('Episode number')
        plt.ylabel('Episode reward')
        plt.show()
        # wait=True defers clearing until the next output is ready.
        disp.clear_output(wait=True)