# Soft Actor Critic

   
With this algorithm we want to solve the problem of how to get robust and stable learning in continuous action space environments.

We can also use algorithms like DDPG or TD3 for continuous action environments, and TD3 in particular works really well, it's on par with the SAC algorithm (they were developed concurrently by separate groups), so they are comparable in terms of quality. DDPG falls short and doesn't do quite as well.

The basic idea of SAC is to use a Maximum Entropy Framework (entropy just means disorder in this case). It adds a term to the cost function that encourages exploration, and it does so in a way that is robust to random seeds for the environment, as well as to episode-to-episode variations and starting conditions. It maximizes not just the total reward over time, but also the stochasticity (the randomness, the entropy) of how the agent behaves.

The Actor Network is the Policy. In DDPG or TD3, the network outputs the action directly; here we output the Mean and Standard Deviation of a Normal Distribution, which we then sample to get the actions for our agent.

The Critic Network takes a State and Action as Input and "judges" the action taken by the actor.

The Value Network assigns a value to the states.

SAC learns rather slowly. The entropy weighting comes from scaling the reward: if the scale factor grows, the signal to exploit grows; if the reward scale decreases, the tendency to explore increases. This is the only real parameter we have to play with for the performance of the agent.

In [1]:
import numpy as np
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.normal import Normal

In [2]:
class ReplayBuffer:
    """Fixed-size ring buffer of (state, action, reward, next_state, done) transitions.

    Once ``max_size`` transitions have been stored, new transitions overwrite
    the oldest ones.
    """

    def __init__(self, max_size, input_shape, n_actions):
        """Allocate the storage arrays.

        Args:
            max_size: maximum number of transitions kept in memory.
            input_shape: shape (tuple) of a single observation.
            n_actions: number of components of a single action.
        """
        self.size = max_size

        self.state_memory = np.zeros((self.size, *input_shape))
        self.next_state_memory = np.zeros((self.size, *input_shape))
        self.action_memory = np.zeros((self.size, n_actions))
        self.reward_memory = np.zeros(self.size)
        # The `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `bool` yields the same dtype on every NumPy version.
        self.terminal_memory = np.zeros(self.size, dtype=bool)

        # Total number of transitions ever stored (not capped at `size`).
        self.counter = 0

    def store_transition(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest slot when the buffer is full."""
        idx = self.counter % self.size

        self.state_memory[idx] = state
        self.next_state_memory[idx] = next_state
        self.action_memory[idx] = action
        self.reward_memory[idx] = reward
        self.terminal_memory[idx] = done

        self.counter += 1

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly at random (with replacement).

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of arrays
            whose first dimension is ``batch_size``.
        """
        # Only the slots that have actually been written are eligible.
        max_mem = min(self.counter, self.size)
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        next_states = self.next_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, next_states, dones

In [3]:
class CriticNetwork(nn.Module):
    """Q-network: maps a (state, action) pair to a single scalar value.

    The state and action are concatenated along dim 1 and passed through two
    ReLU hidden layers followed by a linear output of size 1.
    """

    def __init__(self, lr, input_dims, n_actions, fc1_dims=256, fc2_dims=256,
                 name='critic', checkpoint_dir='tmp/sac'):
        """Build the layers, the Adam optimizer and move the module to its device.

        Args:
            lr: learning rate for the Adam optimizer.
            input_dims: observation shape; only ``input_dims[0]`` is used.
            n_actions: number of action components.
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            name: prefix of the checkpoint file name.
            checkpoint_dir: directory the checkpoint is written to / read from.
        """
        super().__init__()

        self.input_dims = input_dims
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims

        self.fc1 = nn.Linear(self.input_dims[0] + n_actions, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.q = nn.Linear(self.fc2_dims, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

        self.checkpoint_file = os.path.join(checkpoint_dir, name + '_sac')

    def forward(self, state, action):
        """Return the value estimate, shape ``(batch, 1)``, for each (state, action) row."""
        action_value = self.fc1(T.cat([state, action], dim=1))
        action_value = F.relu(action_value)
        action_value = self.fc2(action_value)
        action_value = F.relu(action_value)
        q = self.q(action_value)
        return q

    def save_checkpoint(self):
        """Write the state dict to ``self.checkpoint_file``, creating its directory if needed."""
        checkpoint_dir = os.path.dirname(self.checkpoint_file)
        if checkpoint_dir:
            # T.save does not create missing parent directories.
            os.makedirs(checkpoint_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict from ``self.checkpoint_file`` onto this module's device."""
        # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
        self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

In [4]:
class ValueNetwork(nn.Module):
    """State-value network: maps a state to a single scalar value.

    Two ReLU hidden layers followed by a linear output of size 1.
    """

    def __init__(self, lr, input_dims, fc1_dims=256, fc2_dims=256,
                 name='value', checkpoint_dir='tmp/sac'):
        """Build the layers, the Adam optimizer and move the module to its device.

        Args:
            lr: learning rate for the Adam optimizer.
            input_dims: observation shape, unpacked into the first layer's input size.
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            name: prefix of the checkpoint file name.
            checkpoint_dir: directory the checkpoint is written to / read from.
        """
        super().__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims

        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.v = nn.Linear(self.fc2_dims, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

        self.checkpoint_file = os.path.join(checkpoint_dir, name + '_sac')

    def forward(self, state):
        """Return the value estimate, shape ``(batch, 1)``, for each state row."""
        state_value = self.fc1(state)
        state_value = F.relu(state_value)
        state_value = self.fc2(state_value)
        state_value = F.relu(state_value)
        v = self.v(state_value)
        return v

    def save_checkpoint(self):
        """Write the state dict to ``self.checkpoint_file``, creating its directory if needed."""
        checkpoint_dir = os.path.dirname(self.checkpoint_file)
        if checkpoint_dir:
            # T.save does not create missing parent directories.
            os.makedirs(checkpoint_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict from ``self.checkpoint_file`` onto this module's device."""
        # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
        self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

In [None]:
class ActorNetwork(nn.Module):
    """Stochastic policy: maps a state to the mean and std of a Normal distribution.

    Actions are obtained by sampling that distribution, squashing the sample
    with tanh and scaling it by ``max_action``.
    """

    def __init__(self, lr, input_dims, n_actions, max_action, fc1_dims=256, fc2_dims=256,
                 name='actor', checkpoint_dir='tmp/sac'):
        """Build the layers, the Adam optimizer and move the module to its device.

        Args:
            lr: learning rate for the Adam optimizer.
            input_dims: observation shape, unpacked into the first layer's input size.
            n_actions: number of action components.
            max_action: scale applied to the tanh-squashed sample (scalar or
                per-component sequence/array).
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            name: prefix of the checkpoint file name.
            checkpoint_dir: directory the checkpoint is written to / read from.
        """
        super().__init__()
        self.input_dims = input_dims
        self.max_action = max_action
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims

        self.reparam_noise = 1e-6  # so we do not take log(0)

        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.mu = nn.Linear(self.fc2_dims, self.n_actions)
        self.sigma = nn.Linear(self.fc2_dims, self.n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

        # Same '<name>_sac' convention as the critic and value networks.
        self.checkpoint_file = os.path.join(checkpoint_dir, name + '_sac')
        # Earlier revisions wrote '<name>_s ac' (typo); kept so such files still load.
        self._legacy_checkpoint_file = os.path.join(checkpoint_dir, name + '_s ac')

    def forward(self, state):
        """Return ``(mu, sigma)`` of the action distribution for each state row."""
        prob = self.fc1(state)
        prob = F.relu(prob)
        prob = self.fc2(prob)
        prob = F.relu(prob)

        mu = self.mu(prob)
        sigma = self.sigma(prob)

        # A sigmoid activation would also bound sigma to (0, 1); clamping is cheaper.
        # The lower bound keeps the standard deviation strictly positive.
        sigma = T.clamp(sigma, min=self.reparam_noise, max=1)

        return mu, sigma

    def sample_normal(self, state, reparameterize=True):
        """Sample bounded actions and their log-probabilities.

        Args:
            state: batch of states, one per row.
            reparameterize: when True, sample with ``rsample`` so gradients can
                flow from the sample back to ``mu``/``sigma``; when False, use
                ``sample`` (no gradient through the sample).

        Returns:
            ``(action, log_probabilities)`` where ``action`` has shape
            ``(batch, n_actions)`` and lies in ``[-max_action, max_action]``,
            and ``log_probabilities`` has shape ``(batch, 1)``.
        """
        mu, sigma = self.forward(state)
        probabilities = Normal(mu, sigma)

        if reparameterize:
            raw_actions = probabilities.rsample()  # differentiable w.r.t. mu and sigma
        else:
            raw_actions = probabilities.sample()  # plain sample, detached from the graph

        squashed = T.tanh(raw_actions)  # in (-1, 1)
        max_action = T.as_tensor(self.max_action, dtype=T.float, device=self.device)
        action = squashed * max_action  # scale the action beyond +- 1

        log_probabilities = probabilities.log_prob(raw_actions)
        # Change-of-variables correction for the tanh squashing (appendix of the
        # paper). It must use the *unscaled* tanh output: with the scaled action,
        # 1 - action^2 becomes negative whenever max_action > 1 and log() is NaN.
        log_probabilities = log_probabilities - T.log(1 - squashed.pow(2) + self.reparam_noise)

        # one scalar log-probability per sample: sum over the action components
        log_probabilities = log_probabilities.sum(1, keepdim=True)

        return action, log_probabilities

    def save_checkpoint(self):
        """Write the state dict to ``self.checkpoint_file``, creating its directory if needed."""
        checkpoint_dir = os.path.dirname(self.checkpoint_file)
        if checkpoint_dir:
            # T.save does not create missing parent directories.
            os.makedirs(checkpoint_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict onto this module's device.

        Falls back to the legacy (typo) file name when only that file exists.
        """
        path = self.checkpoint_file
        if not os.path.exists(path) and os.path.exists(self._legacy_checkpoint_file):
            path = self._legacy_checkpoint_file
        # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
        self.load_state_dict(T.load(path, map_location=self.device))

In [None]:
class Agent:
    """Soft Actor-Critic agent: actor, two critics, value and target-value networks.

    Transitions are collected in a replay buffer; ``learn`` performs one
    update of the value network, the actor and both critics from a sampled
    batch, followed by a soft update of the target-value network.
    """

    def __init__(self, env, input_dims, n_actions, lr_alpha=0.0003, lr_beta=0.0003, gamma=0.99,
                 max_size=1000000, tau=0.005, layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
        """Create the replay buffer and all networks.

        Args:
            env: environment; only ``env.action_space.high`` is read (action scale).
            input_dims: observation shape.
            n_actions: number of action components.
            lr_alpha: learning rate of the actor.
            lr_beta: learning rate of the critics and value networks.
            gamma: discount factor.
            max_size: replay buffer capacity.
            tau: interpolation factor of the soft target-value update.
            layer1_size: width of the first hidden layer of every network.
            layer2_size: width of the second hidden layer of every network.
            batch_size: number of transitions sampled per ``learn`` call.
            reward_scale: factor the reward is multiplied by in the critic target.
        """
        self.gamma = gamma
        # Instead of hard-copying the value network into the target-value
        # network, each update blends them: target <- tau*value + (1-tau)*target.
        self.tau = tau

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        # layer1_size / layer2_size are forwarded to every network (they used to
        # be accepted but ignored); the defaults match the networks' own defaults.
        self.actor = ActorNetwork(lr_alpha, input_dims, self.n_actions,
                                  max_action=env.action_space.high,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size)

        # we take the min of the evaluation of these two networks in the loss calculation
        self.critic_1 = CriticNetwork(lr_beta, input_dims, self.n_actions,
                                      fc1_dims=layer1_size, fc2_dims=layer2_size, name='critic_1')
        self.critic_2 = CriticNetwork(lr_beta, input_dims, self.n_actions,
                                      fc1_dims=layer1_size, fc2_dims=layer2_size, name='critic_2')

        self.value = ValueNetwork(lr_beta, input_dims,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size, name='value')
        self.target_value = ValueNetwork(lr_beta, input_dims,
                                         fc1_dims=layer1_size, fc2_dims=layer2_size, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)  # hard copy in the beginning for target networks

    def choose_action(self, observation):
        """Sample one action for a single observation and return it as a NumPy array."""
        # Add the batch dimension the networks expect.
        state = T.as_tensor(observation, dtype=T.float, device=self.actor.device).unsqueeze(0)
        # Acting needs no gradients.
        with T.no_grad():
            actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().numpy()[0]

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, next_state, done)

    def update_network_parameters(self, tau=None):
        """Blend the value network into the target-value network.

        ``target <- tau * value + (1 - tau) * target``; ``tau=None`` uses
        ``self.tau`` and ``tau=1`` is a hard copy.
        """
        if tau is None:
            tau = self.tau

        value_params = dict(self.value.named_parameters())

        # Pure parameter bookkeeping: no autograd graph is needed.
        with T.no_grad():
            for name, target_param in self.target_value.named_parameters():
                target_param.copy_(tau * value_params[name] + (1 - tau) * target_param)

    def save_models(self):
        """Save a checkpoint of every network."""
        print('Saving models')
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        """Load the checkpoint of every network."""
        print('Loading models')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        """Run one SAC update step; does nothing until a full batch is available."""
        if self.memory.counter < self.batch_size:
            return

        state, action, reward, next_state, done = self.memory.sample(self.batch_size)

        device = self.actor.device
        state = T.tensor(state, dtype=T.float).to(device)
        action = T.tensor(action, dtype=T.float).to(device)
        reward = T.tensor(reward, dtype=T.float).to(device)
        next_state = T.tensor(next_state, dtype=T.float).to(device)
        done = T.tensor(done, dtype=T.bool).to(device)

        # ---- value network update --------------------------------------
        # The regression target is a constant w.r.t. the value network, so it
        # is built without a graph. Taking the min of both critics reduces
        # overestimation bias (see the TD3 paper).
        with T.no_grad():
            actions, log_probs = self.actor.sample_normal(state, reparameterize=False)
            q1_new_policy = self.critic_1(state, actions)
            q2_new_policy = self.critic_2(state, actions)
            critic_value = T.min(q1_new_policy, q2_new_policy).view(-1)
            value_target = critic_value - log_probs.view(-1)

        value = self.value(state).view(-1)
        self.value.optimizer.zero_grad()
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward()
        self.value.optimizer.step()

        # ---- actor update ----------------------------------------------
        # Reparameterized sample: gradients flow through the action into the actor.
        actions, log_probs = self.actor.sample_normal(state, reparameterize=True)
        log_probs = log_probs.view(-1)

        q1_new_policy = self.critic_1(state, actions)
        q2_new_policy = self.critic_2(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy).view(-1)

        actor_loss = T.mean(log_probs - critic_value)
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # ---- critic update ---------------------------------------------
        # Bootstrapped target from the target-value network; terminal next
        # states contribute no future value. Built without a graph so no
        # gradients reach the target network.
        with T.no_grad():
            target_value = self.target_value(next_state).view(-1)
            target_value[done] = 0.0
            q_hat = self.scale * reward + self.gamma * target_value

        # Also clears the critic gradients left over from the actor's backward pass.
        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q1_old_policy = self.critic_1(state, action).view(-1)
        q2_old_policy = self.critic_2(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()

In [None]:
import matplotlib.pyplot as plt

def plot_learning_curve(x, scores, figure_file):
    """Plot the running average of ``scores`` against ``x`` and save it to ``figure_file``.

    Each point is the mean of the score at that index and up to 99 scores
    before it, i.e. a window of at most 100 scores, matching the plot title.

    Args:
        x: x-axis values, one per score.
        scores: sequence of per-episode scores.
        figure_file: path the figure is saved to; its directory is created if missing.
    """
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        # Window [i-99, i] holds 100 entries; starting at i-100 would average 101.
        running_avg[i] = np.mean(scores[max(0, i - 99):(i + 1)])

    # savefig does not create missing directories.
    out_dir = os.path.dirname(figure_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    plt.plot(x, running_avg)
    plt.title('Running average of previous 100 scores')
    plt.savefig(figure_file)

In [None]:
import pybullet_envs
import gym


# Train or evaluate the SAC agent on the PyBullet inverted pendulum task.
env = gym.make('InvertedPendulumBulletEnv-v0')
# Size the agent from the environment's observation and action spaces.
agent = Agent(env, input_dims=env.observation_space.shape, n_actions=env.action_space.shape[0])

n_games = 250
filename = 'inverted_pendulum.png'
figure_file = 'plots/' + filename

# Initial value for the best running average seen so far.
# NOTE(review): presumably reward_range[0] is the env's lowest possible reward — confirm.
best_score = env.reward_range[0]
score_history = []
# True: load saved weights and only run the policy (no learning, no saving, no plot).
# False: train, save on every new best running average, and plot at the end.
load_checkpoint = True

if load_checkpoint:
    agent.load_models()
    # NOTE(review): render() is called before the first reset(); presumably the
    # PyBullet envs need this ordering to open the GUI — confirm before moving it.
    env.render(mode='human')
    
for i in range(n_games):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, info = env.step(action)
        
        score += reward
        # The transition is stored even in evaluation mode; it is only used by learn().
        agent.remember(state, action, reward, next_state, done)
        
        if not load_checkpoint:
            agent.learn()
        
        state = next_state
    
    score_history.append(score)
    # Running average over the most recent (up to) 100 episodes.
    avg_score = np.mean(score_history[-100:])
    
    if avg_score > best_score:
        best_score = avg_score
        
        # Checkpoints are only written while training.
        if not load_checkpoint:
            agent.save_models()
            
    print('Episode {:3}, Score: {:5.1f}, Avg Score: {:5.1f}'.format(i, score, avg_score))
    
if not load_checkpoint:
    # 1-based episode numbers for the x-axis.
    x = [i+1 for i in range(n_games)]
    plot_learning_curve(x, score_history, figure_file)



Loading models
Episode   0, Score: 1000.0, Avg Score: 1000.0
Episode   1, Score: 1000.0, Avg Score: 1000.0
Episode   2, Score: 1000.0, Avg Score: 1000.0
Episode   3, Score: 1000.0, Avg Score: 1000.0
Episode   4, Score: 1000.0, Avg Score: 1000.0
Episode   5, Score: 1000.0, Avg Score: 1000.0
Episode   6, Score: 1000.0, Avg Score: 1000.0
Episode   7, Score: 1000.0, Avg Score: 1000.0
Episode   8, Score: 1000.0, Avg Score: 1000.0
Episode   9, Score: 1000.0, Avg Score: 1000.0
Episode  10, Score: 1000.0, Avg Score: 1000.0
Episode  11, Score: 1000.0, Avg Score: 1000.0
