# Multi-Agent Soft Actor Critic Algorithm

---

This notebook implements a multi-agent variant of the Soft Actor-Critic (SAC) algorithm described in the paper [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf).


In [None]:
# Import libraries
from IPython.display import clear_output
import random
import math
from collections import namedtuple, deque
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from unityagents import UnityEnvironment
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
class Soft_Q_Network(nn.Module):
    """Q-value approximator: a fully connected network with two hidden layers.

    The network receives a state batch and an action batch, joins them along
    the feature axis and maps the result to ``output_size`` values per row.

    Args:
        input_size: width of the concatenated (state, action) input.
        h1_size: width of the first hidden layer.
        h2_size: width of the second hidden layer.
        output_size: width of the output layer.
    """

    def __init__(self, input_size, h1_size, h2_size, output_size):
        super().__init__()

        # Keep the layer dimensions available as plain attributes.
        self.input_size, self.h1_size = input_size, h1_size
        self.h2_size, self.output_size = h2_size, output_size

        # input -> h1 -> h2 -> output
        self.fc1 = nn.Linear(input_size, h1_size)
        self.fc2 = nn.Linear(h1_size, h2_size)
        self.fc3 = nn.Linear(h2_size, output_size)

        # Re-draw the output layer (weight first, then bias) from
        # U(-3e-3, 3e-3) so the initial outputs are close to zero.
        bound = 3e-3
        for tensor in (self.fc3.weight, self.fc3.bias):
            tensor.data.uniform_(-bound, bound)

    def forward(self, state, action):
        """Return the network output for a batch of (state, action) pairs.

        ``state`` and ``action`` are concatenated along dimension 1, so both
        must share the same leading (batch) dimension.
        """
        joint = torch.cat((state, action), dim=1)
        hidden = F.relu(self.fc1(joint))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)


In [None]:
class Soft_Policy_Network(nn.Module):
    """Stochastic policy: a Gaussian over pre-activations, squashed by tanh.

    A two-hidden-layer network produces, for each action dimension, a mean
    and a log standard deviation. Actions are obtained by drawing from that
    normal distribution and applying ``tanh``, so they lie in [-1, 1].

    Args:
        input_size: width of the state vector fed to the network.
        h1_size: width of the first hidden layer.
        h2_size: width of the second hidden layer.
        output_mean_size: width of the mean head.
        output_std_size: width of the log-std head.
    """

    # Bounds applied to the predicted log standard deviation.
    LOG_STD_MIN = -20
    LOG_STD_MAX = 0
    # Interval the tanh output is rescaled to; [-1, 1] makes it the identity.
    ACTION_RANGE = (-1, 1)

    def __init__(self, input_size, h1_size, h2_size, output_mean_size, output_std_size):
        super(Soft_Policy_Network, self).__init__()

        # state, hidden layer, action sizes
        self.input_size = input_size
        self.h1_size = h1_size
        self.h2_size = h2_size
        self.output_mean_size = output_mean_size
        self.output_std_size = output_std_size

        # Preferred device. Kept as a public attribute for existing callers;
        # get_action() itself follows the device the weights are really on.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # define layers
        self.fc1 = nn.Linear(self.input_size, self.h1_size)
        self.fc2 = nn.Linear(self.h1_size, self.h2_size)
        self.fc3_mean = nn.Linear(self.h2_size, self.output_mean_size)
        self.fc3_log_std = nn.Linear(self.h2_size, self.output_std_size)

        # Small uniform init of both heads (same draw order as the layers
        # are listed: mean weight, mean bias, log-std weight, log-std bias).
        init_w = 3e-3
        self.fc3_mean.weight.data.uniform_(-init_w, init_w)
        self.fc3_mean.bias.data.uniform_(-init_w, init_w)
        self.fc3_log_std.weight.data.uniform_(-init_w, init_w)
        self.fc3_log_std.bias.data.uniform_(-init_w, init_w)

    def forward(self, state):
        """Return ``(mean, log_std)`` of the pre-tanh normal distribution.

        ``mean`` is not the mean of the final action: the action is
        ``tanh`` of a draw from this distribution. ``log_std`` is clamped to
        ``[LOG_STD_MIN, LOG_STD_MAX]``.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        mean = self.fc3_mean(x)
        log_std = torch.clamp(self.fc3_log_std(x), self.LOG_STD_MIN, self.LOG_STD_MAX)
        return mean, log_std

    def sample(self, state, epsilon=1e-6):
        """Draw differentiable actions and their log-probabilities.

        Uses ``rsample`` so gradients flow through the draw.

        Args:
            state: batch of states, one row per sample.
            epsilon: small constant keeping the log argument positive.

        Returns:
            ``(action, log_pi)`` where ``action`` is in [-1, 1] and
            ``log_pi`` is summed over dimension 1 (kept as a size-1 axis).
        """
        mean, log_std = self.forward(state)
        normal = Normal(mean, log_std.exp())
        z = normal.rsample()
        action = torch.tanh(z)
        # Normal log-density minus the log-derivative term of the tanh squash.
        log_pi = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_pi = log_pi.sum(1, keepdim=True)
        return action, log_pi

    def get_action(self, state, deterministic):
        """Return one action as a NumPy array for a single (unbatched) state.

        Args:
            state: a single state (array-like of floats).
            deterministic: if true return ``tanh(mean)``; otherwise draw from
                the normal distribution before squashing.

        Returns:
            NumPy array holding the action, rescaled to ``ACTION_RANGE``.
        """
        # Follow the weights rather than the cached attribute, so the call
        # also works when the module was never moved to ``self.device``.
        device = next(self.parameters()).device

        # Pure inference: no autograd graph is needed here.
        with torch.no_grad():
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            mean, log_std = self.forward(state)

            if deterministic:
                action = torch.tanh(mean)
            else:
                z = Normal(mean, log_std.exp()).sample()
                action = torch.tanh(z)  # squash into [-1, 1]

        action = action.cpu().detach().squeeze(0).numpy()
        return self.rescale_action(action)

    def rescale_action(self, action):
        """Map an action from [-1, 1] linearly onto ``ACTION_RANGE``."""
        low, high = self.ACTION_RANGE
        return action * (high - low) / 2.0 + (high + low) / 2.0


In [None]:
class Centralized_ReplayBuffer:
    """Fixed-size buffer storing joint experience tuples for all agents.

    Data structure: each stored experience holds five per-agent sequences
    (``states``, ``actions``, ``rewards``, ``next_states``, ``dones``). The
    i-th element of each sequence belongs to the agent of index i, e.g. the
    experience of agent 2 is ``(e.states[2], e.actions[2], e.rewards[2],
    e.next_states[2], e.dones[2])``.
    """

    def __init__(self, buffer_size, batch_size, seed, num_agents):
        """Initialize the buffer.

        Params
        ======
            buffer_size (int): maximum number of experiences kept; the oldest
                are dropped first
            batch_size (int): number of experiences returned by ``sample``
            seed (int): seed applied to Python's global ``random`` module
            num_agents (int): number of agents recorded in each experience
        """
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        # random.seed() returns None, so store the seed value itself.
        random.seed(seed)
        self.seed = seed
        self.num_agents = num_agents
        self.memory = deque(maxlen=buffer_size)
        self.all_fields_names = ["states", "actions", "rewards", "next_states", "dones"]
        self.record_length = len(self.all_fields_names)
        self.experience = namedtuple("Experience", field_names=self.all_fields_names)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def add(self, states, actions, rewards, next_states, dones):
        """Add a new joint experience to memory."""
        self.memory.append(self.experience(states, actions, rewards, next_states, dones))

    def _stack_field(self, experiences, field, as_flags=False):
        """Return one float tensor per agent for ``field``, rows = experiences."""
        per_agent = []
        for i in range(self.num_agents):
            rows = np.vstack([getattr(e, field)[i] for e in experiences])
            if as_flags:
                # Done flags are turned into 0/1 integers before the float cast.
                rows = rows.astype(np.uint8)
            per_agent.append(torch.from_numpy(rows).float().to(self.device))
        return tuple(per_agent)

    def sample(self):
        """Randomly sample ``batch_size`` experiences from memory.

        Returns:
            ``(all_states, all_actions, all_rewards, all_next_states,
            all_dones)``; each item is a tuple with one float tensor per
            agent, located on ``self.device``, whose rows are the sampled
            experiences.

        Raises:
            ValueError: if fewer than ``batch_size`` experiences are stored
                (raised by ``random.sample``).
        """
        experiences = random.sample(self.memory, k=self.batch_size)

        all_states = self._stack_field(experiences, "states")
        all_actions = self._stack_field(experiences, "actions")
        all_rewards = self._stack_field(experiences, "rewards")
        all_next_states = self._stack_field(experiences, "next_states")
        all_dones = self._stack_field(experiences, "dones", as_flags=True)

        return (all_states, all_actions, all_rewards, all_next_states, all_dones)

    def buffer_len(self):
        """Return the current size of internal memory."""
        return len(self.memory)

    def __len__(self):
        """Return the current size of internal memory (same as ``buffer_len``)."""
        return len(self.memory)


In [None]:
class SAC_Agent:
    """One Soft Actor-Critic learner with centralized critics.

    The agent owns a policy that only sees its own state, plus two Q-networks
    (and their target copies) that take the concatenated states and actions of
    all agents as input.

    Args:
        index: position of this agent in the joint tuples (starts at 0).
        state_dim: width of one agent's state vector.
        action_dim: width of one agent's action vector.
        layer_size: width of both hidden layers in every network.
        qf_lr: learning rate of both Q-network optimizers.
        policy_lr: learning rate of the policy optimizer.
        a_lr: learning rate of the temperature (log-alpha) optimizer.
        auto_entropy_tuning: if true, the temperature ``alpha`` is learned;
            otherwise it stays fixed at ``exp(0) = 1``.
        soft_target_tau: interpolation factor of the target-network update.
        discount: reward discount factor.
        num_agents: number of agents whose states and actions feed the
            critics (defaults to 2).
    """

    def __init__(self, index, state_dim=24, action_dim=2, layer_size=128, qf_lr=0.0006,
                 policy_lr=0.0003, a_lr=0.0006, auto_entropy_tuning=True, soft_target_tau=0.02,
                 discount=0.99, num_agents=2):

        self.index = index  # starts at 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.layer_size = layer_size
        self.qf_lr = qf_lr
        self.policy_lr = policy_lr
        self.a_lr = a_lr
        self.auto_entropy_tuning = auto_entropy_tuning
        self.soft_target_tau = soft_target_tau
        self.discount = discount
        self.num_agents = num_agents

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Online critics and their optimizers.
        self.qf1 = self._build_critic()
        self.qf1_optimizer = optim.Adam(self.qf1.parameters(), lr=qf_lr)
        self.qf2 = self._build_critic()
        self.qf2_optimizer = optim.Adam(self.qf2.parameters(), lr=qf_lr)

        # Target critics, made identical to the online critics below.
        self.target_qf1 = self._build_critic()
        self.target_qf2 = self._build_critic()

        self.policy = Soft_Policy_Network(
            input_size=state_dim,
            h1_size=layer_size,
            h2_size=layer_size,
            output_mean_size=action_dim,
            output_std_size=action_dim,
        ).to(self.device)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=policy_lr)

        # Temperature. ``alpha`` is always defined so update() also works when
        # tuning is disabled; it is stored detached because it is only ever
        # used as a constant weight in the losses.
        self.log_alpha = torch.zeros(1, requires_grad=bool(self.auto_entropy_tuning), device=self.device)
        if self.auto_entropy_tuning:
            self.target_entropy = -0.000
            self.log_alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)
        self.alpha = self.log_alpha.exp().detach()

        # copy parameters of qf1/qf2 to target_qf1/target_qf2
        self.target_qf1.load_state_dict(self.qf1.state_dict())
        self.target_qf2.load_state_dict(self.qf2.state_dict())

    def _build_critic(self):
        """Create one Q-network over the joint (states, actions) of all agents."""
        return Soft_Q_Network(
            input_size=self.num_agents * (self.state_dim + self.action_dim),
            h1_size=self.layer_size,
            h2_size=self.layer_size,
            output_size=1,
        ).to(self.device)

    @staticmethod
    def _soft_update(target_net, source_net, tau):
        """Move ``target_net`` towards ``source_net``: t <- tau*s + (1-tau)*t."""
        for target_params, params in zip(target_net.parameters(), source_net.parameters()):
            target_params.data.copy_(tau * params.data + (1 - tau) * target_params.data)

    def get_action(self, state, deterministic):
        """Return this agent's action for a single state (delegates to the policy)."""
        return self.policy.get_action(state, deterministic)

    def save_weights(self):
        """Save both critics and the policy under ``Models/``."""
        import os

        # torch.save does not create missing directories.
        os.makedirs("Models", exist_ok=True)
        torch.save(self.qf1.state_dict(), "Models/checkpoint_masac_critic1_Agent"+str(self.index)+".pth")
        torch.save(self.qf2.state_dict(), "Models/checkpoint_masac_critic2_Agent"+str(self.index)+".pth")
        torch.save(self.policy.state_dict(), "Models/checkpoint_masac_actor_Agent"+str(self.index)+".pth")

    def load_weights(self, critic1_path, critic2_path, actor_path):
        """Load both critics and the policy from the given checkpoint files.

        ``map_location`` remaps the stored tensors onto this agent's device,
        so a checkpoint written on a GPU host can be read on a CPU-only one.
        Target critics are not touched.
        """
        self.qf1.load_state_dict(torch.load(critic1_path, map_location=self.device))
        self.qf1.eval()
        print ("Models: {} loaded...".format(critic1_path))
        self.qf2.load_state_dict(torch.load(critic2_path, map_location=self.device))
        self.qf2.eval()
        print ("Models: {} loaded...".format(critic2_path))
        self.policy.load_state_dict(torch.load(actor_path, map_location=self.device))
        self.policy.eval()
        print ("Models: {} loaded...".format(actor_path))

    def update(self, states, actions, rewards, next_states, dones, ma_agents):
        """Run one SAC update of this agent's critics, policy and temperature.

        Args:
            states, actions, rewards, next_states, dones: per-agent sequences
                of batched tensors, indexed by agent index.
            ma_agents: all agents; assumed ordered so that
                ``ma_agents[i].index == i`` (the joint tensors are built in
                list order and read back by ``self.index``).
        """
        num_agents = len(ma_agents)
        # Joint inputs of the centralized critics.
        all_states = torch.cat(tuple(states[i] for i in range(num_agents)), dim=1)
        all_actions = torch.cat(tuple(actions[i] for i in range(num_agents)), dim=1)
        all_next_states = torch.cat(tuple(next_states[i] for i in range(num_agents)), dim=1)
        local_rewards = rewards[self.index]
        local_dones = dones[self.index]

        ###### POLICY EVALUATION STEP ######

        # The regression target is a constant for the critic losses, so it is
        # computed without recording an autograd graph.
        with torch.no_grad():
            next_actions = []
            next_log_pis = []
            for agent in ma_agents:
                local_next_actions, local_next_log_pis = agent.policy.sample(next_states[agent.index])
                next_actions.append(local_next_actions)
                next_log_pis.append(local_next_log_pis)
            all_next_actions = torch.cat(next_actions, dim=1)

            next_qf1 = self.target_qf1.forward(all_next_states, all_next_actions)
            next_qf2 = self.target_qf2.forward(all_next_states, all_next_actions)
            next_q_target = torch.min(next_qf1, next_qf2) - self.alpha * next_log_pis[self.index]
            expected_q = local_rewards + (1 - local_dones) * self.discount * next_q_target

        curr_qf1 = self.qf1.forward(all_states, all_actions)
        curr_qf2 = self.qf2.forward(all_states, all_actions)
        qf1_loss = F.mse_loss(curr_qf1, expected_q)
        qf2_loss = F.mse_loss(curr_qf2, expected_q)

        # Update critic1 weights
        self.qf1_optimizer.zero_grad()
        qf1_loss.backward()
        self.qf1_optimizer.step()

        # Update critic2 weights
        self.qf2_optimizer.zero_grad()
        qf2_loss.backward()
        self.qf2_optimizer.step()

        ###### POLICY IMPROVEMENT STEP ######

        # Fresh actions for the current states. Only this agent's policy is
        # optimized here, so the other agents' actions enter as constants.
        new_actions = []
        new_log_pis = []
        for agent in ma_agents:
            local_new_actions, local_new_log_pis = agent.policy.sample(states[agent.index])
            if agent.index != self.index:
                local_new_actions = local_new_actions.detach()
            new_actions.append(local_new_actions)
            new_log_pis.append(local_new_log_pis)

        all_new_actions = torch.cat(new_actions, dim=1)
        local_log_pis = new_log_pis[self.index]

        min_q = torch.min(self.qf1.forward(all_states, all_new_actions),
                          self.qf2.forward(all_states, all_new_actions))

        policy_loss = (self.alpha * local_log_pis - min_q).mean()

        # Update actor weights
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Update target network weights at every iteration
        self._soft_update(self.target_qf1, self.qf1, self.soft_target_tau)
        self._soft_update(self.target_qf2, self.qf2, self.soft_target_tau)

        # Adjust entropy temperature
        if self.auto_entropy_tuning:
            self.log_alpha_optim.zero_grad()
            alpha_loss = (self.log_alpha * (-local_log_pis - self.target_entropy).detach()).mean()
            alpha_loss.backward()
            self.log_alpha_optim.step()
            self.alpha = self.log_alpha.exp().detach()

In [None]:
class Game:
    """Description of the task being learned plus its running score board.

    Args:
        name: display name of the game.
        solve_score: moving-average score at which the game counts as solved.
        state_dim: width of one agent's state vector.
        action_dim: width of one agent's action vector.
        num_agents: number of agents playing.
        num_steps_per_epoch: maximum number of environment steps per epoch.
    """

    def __init__(self, name, solve_score, state_dim, action_dim, num_agents, num_steps_per_epoch=1000):
        self.name, self.solve_score = name, solve_score
        self.state_dim, self.action_dim = state_dim, action_dim
        self.num_agents = num_agents
        self.num_steps_per_epoch = num_steps_per_epoch
        # Score tracker created with its default settings.
        self.game_score = GameScore()

    def compute_episod_score(self, scores):
        """Return the episode score: the largest of the per-agent scores."""
        return np.max(scores)

    def test_for_ending(self):
        """Tell whether the game is solved.

        True when the latest moving average has reached ``solve_score`` and
        at least 100 episode scores have been recorded.
        """
        board = self.game_score
        target_reached = board.last_moving_average >= self.solve_score
        enough_episodes = len(board.all_scores) >= 100
        return target_reached and enough_episodes

In [None]:
class Multiple_Agents:
    """Group of SAC agents sharing one centralized replay buffer.

    Args:
        game: the ``Game`` describing dimensions, agent count and scoring.
        replay_buffer_size: capacity of the shared replay buffer.
        batch_size: number of experiences sampled per update.
        load_mode: if true, load every agent's networks from
            ``Models/checkpoint_masac_*`` at construction.
        save_mode: if false, ``save_weights`` does nothing.
        episods_before_update: threshold on the internal counter before
            learning starts. NOTE: despite its name, the counter is advanced
            once per environment step (see ``update_per_step``).
    """

    def __init__(self, game, replay_buffer_size=50000, batch_size=256, load_mode=False,
                 save_mode=True, episods_before_update=10):

        self.game = game
        self.replay_buffer_size = replay_buffer_size
        self.batch_size = batch_size
        self.load_mode = load_mode
        self.save_mode = save_mode
        self.episods_before_update = episods_before_update

        self.episod_index = 0
        self.num_agents = self.game.num_agents

        # Creation of the agents
        self.agents = [SAC_Agent(index=i, state_dim=self.game.state_dim, action_dim=self.game.action_dim)
                       for i in range(self.num_agents)]
        # Creation of the shared replay buffer (seed fixed to 1)
        self.replay_buffer = Centralized_ReplayBuffer(self.replay_buffer_size, self.batch_size, 1, self.num_agents)

        # Loading of existing agents
        if load_mode:
            file_name = "Models/checkpoint_masac_"
            for agent in self.agents:
                suffix = "_Agent" + str(agent.index) + ".pth"
                agent.load_weights(file_name + "critic1" + suffix,
                                   file_name + "critic2" + suffix,
                                   file_name + "actor" + suffix)

    def get_action(self, states, deterministic):
        """Return one action per agent, pairing ``states[i]`` with agent i."""
        return [agent.get_action(state, deterministic) for state, agent in zip(states, self.agents)]

    def update_per_step(self):
        """Advance the step counter and, once allowed, update every agent."""
        self.episod_index += 1

        if self.update_gateway():
            # Sample a batch of experiences from the centralized buffer
            states, actions, rewards, next_states, dones = self.replay_buffer.sample()

            # Update critics and actors
            for agent in self.agents:
                agent.update(states, actions, rewards, next_states, dones, self.agents)

    def update_gateway(self):
        """Return whether learning may run.

        Requires at least one full batch in the buffer and a counter strictly
        above ``episods_before_update``.
        """
        enough_samples = self.replay_buffer.buffer_len() >= self.batch_size
        warmed_up = self.episod_index > self.episods_before_update
        return enough_samples and warmed_up

    def save_weights(self):
        """Save every agent's networks when ``save_mode`` is enabled."""
        if self.save_mode:
            for agent in self.agents:
                agent.save_weights()

    def training(self, num_epochs):
        """Train the agents for at most ``num_epochs`` episodes.

        Opens the Unity environment, plays episodes while learning after
        every step, and stops early (saving the weights) once the game
        reports it is solved. The environment is closed on every exit path,
        including exceptions.
        """
        # Launch the environment
        env = UnityEnvironment(file_name="Tennis.app")
        try:
            # get the default brain
            brain_name = env.brain_names[0]

            for each_iteration in range(num_epochs):
                env_info = env.reset(train_mode=True)[brain_name]
                states = env_info.vector_observations
                scores = np.zeros(self.game.num_agents)

                # The range bound already caps the episode length.
                for _ in range(self.game.num_steps_per_epoch):
                    # interact with the environment and collect the outcome
                    actions = self.get_action(states, deterministic=False)
                    env_info = env.step(actions)[brain_name]
                    next_states = env_info.vector_observations
                    rewards = env_info.rewards
                    dones = env_info.local_done
                    # accumulate the scores of all the agents
                    scores += rewards

                    # Store the transition in the replay buffer
                    self.replay_buffer.add(states, actions, rewards, next_states, dones)

                    # update the critics and actors
                    self.update_per_step()
                    states = next_states

                    if np.any(dones):
                        break

                episod_score = self.game.compute_episod_score(scores)
                self.game.game_score.update_scores(episod_score)
                self.game.game_score.display_score()

                if self.game.test_for_ending():
                    # each_iteration is 0-based; report the number of episodes played.
                    print("\nGame {} solved in {:d} episodes!".format(self.game.name, each_iteration + 1))
                    self.save_weights()
                    break
        finally:
            env.close()

In [None]:
class GameScore:
    """Tracks episode scores and their moving average, and displays them.

    Args:
        window_len: number of most recent episodes in the moving average.
        frequency_display: ``display_score`` only outputs when the episode
            counter is a multiple of this value.
        display_chart: if true, a score-per-episode chart is drawn as well.
        clear_output: value passed as ``wait`` to IPython's ``clear_output``
            before each display.
    """

    def __init__(self, window_len=100, frequency_display=10, display_chart=True, clear_output=True):
        self.window_len = window_len
        self.frequency_display = frequency_display
        self.display_chart = display_chart
        self.clear_output = clear_output

        # Initialization
        self.episod_index = 0
        # every episode score and the moving average recorded after it
        self.all_scores = []
        self.all_moving_averages = []
        # last [window_len] scores - used for calculating the moving average
        self.scores_window = deque(maxlen=window_len)
        self.last_score = 0
        self.last_moving_average = 0

    def update_scores(self, episod_score):
        """Record one episode score and refresh the moving average."""
        self.episod_index += 1
        self.all_scores.append(episod_score)
        self.scores_window.append(episod_score)
        self.all_moving_averages.append(np.mean(self.scores_window))

        # Update general statistics
        self.last_score = self.all_scores[-1]
        self.last_moving_average = self.all_moving_averages[-1]

    def _score_summary(self):
        """Return the one-line status text shown by ``display_score``."""
        # The average only spans the scores still in the window, so report
        # the window's current length rather than the total episode count.
        return "Last Score: {:.2f} - Moving Average over last {} episods: {:.2f}".\
            format(self.last_score, len(self.scores_window), self.last_moving_average)

    def display_score(self):
        """Print the status line (and optionally plot) every ``frequency_display`` episodes."""
        if self.episod_index % self.frequency_display != 0:
            return

        clear_output(wait=self.clear_output)
        print (self._score_summary())

        if self.display_chart:
            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.plot(np.arange(len(self.all_scores)), self.all_scores)
            ax.set_ylabel('Score')
            ax.set_xlabel('Episode #')
            plt.show()
            # Release the figure so repeated displays do not pile up.
            plt.close(fig)

In [None]:
# --- Run configuration --------------------------------------------------------
# train_mode: True trains the agents; False selects test (watch) mode.
# load_mode:  True restores saved networks when the agents are built;
#             set it to True in test mode (train_mode = False).
# save_mode:  True lets the networks be saved - train_mode only.
train_mode = True
load_mode = True
save_mode = True

my_game = Game(
    name="Unity Tennis",
    solve_score=1.5,
    state_dim=24,
    action_dim=2,
    num_agents=2,
)
ma = Multiple_Agents(
    game=my_game,
    replay_buffer_size=50000,
    batch_size=256,
    load_mode=load_mode,
    save_mode=save_mode,
    episods_before_update=10,
)

if train_mode:
    # The Unity environment is opened inside Multiple_Agents.training().
    ma.training(num_epochs=3000)

### Watch the game!

In [None]:
if not train_mode:

    # Open the environment here: no earlier cell leaves `env`, `brain_name`
    # or `env_info` defined at module level, so this cell must create them.
    env = UnityEnvironment(file_name="Tennis.app")
    try:
        brain_name = env.brain_names[0]                        # get the default brain
        env_info = env.reset(train_mode=False)[brain_name]     # reset in watch (non-training) mode

        states = env_info.vector_observations                  # get the current state (for each agent)
        rewards_history = []                                   # accumulated over all episodes
        nb_episodes = 10

        for episodes in range(nb_episodes):

            scores = np.zeros(my_game.num_agents)              # initialize the score (for each agent)
            t_step = 0
            while True:

                actions = ma.get_action(states, True)          # select a deterministic action (for each agent)
                env_info = env.step(actions)[brain_name]       # send all actions to the environment
                next_states = env_info.vector_observations     # get next state (for each agent)
                rewards = env_info.rewards                     # get reward (for each agent)
                dones = env_info.local_done                    # see if episode finished
                scores += rewards                              # update the score (for each agent)
                states = next_states                           # roll over states to next time step
                t_step += 1
                rewards_history.append(rewards)

                if np.all(dones) or t_step > 2000:             # exit loop if episode finished or ran too long
                    break
            print ("Nadal vs Federer: {}".format(np.sum(rewards_history,axis=0)))
            print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))
        print ('Total score: {}'.format(np.sum(rewards_history)))
    finally:
        # Always release the Unity process, even if the loop above fails.
        env.close()

In [None]:
# Disabled example kept as a bare string literal (nothing in it is executed):
# it shows the same setup and training call with `Game` and `Multiple_Agents`
# imported from modules named `game` and `multiple_agents`.
'''
from game import Game
from multiple_agents import Multiple_Agents

my_game = Game(name="Unity Tennis", solve_score=1.5, state_dim = 24, action_dim=2,\
               num_agents=2,num_steps_per_epoch = 1000)

ma = Multiple_Agents(game = my_game,replay_buffer_size=50000,batch_size=256,\
                     load_mode=True,save_mode=True, episods_before_update=10)

ma.training(num_epochs = 3000)
'''
