In [None]:
# Importing the required libraries
import torch
import torch.distributions as dist
import torch.nn as nn
import gym
import numpy as np
import random
import wandb
from copy import deepcopy
import torch.multiprocessing as mp
import matplotlib.pyplot as plt

In [None]:
# Setting seeds
def make_deterministic(seed):
  """Seed the PyTorch, Python `random` and NumPy generators with the same `seed`."""
  # Order kept as torch -> random -> numpy.
  for seed_generator in (torch.manual_seed, random.seed, np.random.seed):
    seed_generator(seed)

In [None]:
# Building a class for the shared Adam optimizer (from chapter 11 of Morales(2020))
class SharedAdam(torch.optim.Adam):
    """Adam optimizer whose moment estimates and update counter are placed in shared memory.

    The running averages (`exp_avg`, `exp_avg_sq`, optionally `max_exp_avg_sq`)
    and the counter `shared_step` are created with `share_memory_()` so that
    several processes can use the same optimizer state. Before every update the
    value of `shared_step` is written to the `step` entry that Adam reads for
    its bias correction, and `shared_step` is then incremented.

    Args:
        params: iterable of parameters (or parameter groups) to optimize.
        lr, betas, eps, weight_decay, amsgrad: forwarded unchanged to
            `torch.optim.Adam`.
    """

    # Cached result of `_uses_tensor_step`; None until the first probe.
    _step_is_tensor = None

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False):
        super(SharedAdam, self).__init__(
            params, lr=lr, betas=betas, eps=eps,
            weight_decay=weight_decay, amsgrad=amsgrad)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = self._as_step(0)
                state['shared_step'] = torch.zeros(1).share_memory_()
                state['exp_avg'] = torch.zeros_like(p.data).share_memory_()
                state['exp_avg_sq'] = torch.zeros_like(p.data).share_memory_()
                if weight_decay:
                    state['weight_decay'] = torch.zeros_like(p.data).share_memory_()
                if amsgrad:
                    state['max_exp_avg_sq'] = torch.zeros_like(p.data).share_memory_()

    @classmethod
    def _uses_tensor_step(cls):
        """Return True if the installed Adam stores its `step` counter as a tensor.

        The answer is obtained by running one update of a stock Adam on a
        throw-away parameter and inspecting the state it creates, so the class
        adapts to whichever representation the installed PyTorch expects.
        """
        if SharedAdam._step_is_tensor is None:
            probe = torch.nn.Parameter(torch.zeros(1))
            probe.grad = torch.zeros(1)
            reference = torch.optim.Adam([probe])
            reference.step()
            SharedAdam._step_is_tensor = torch.is_tensor(reference.state[probe]['step'])
        return SharedAdam._step_is_tensor

    @classmethod
    def _as_step(cls, count):
        """Wrap `count` in the representation Adam expects for `state['step']`."""
        if cls._uses_tensor_step():
            return torch.tensor(float(count))
        return count

    def step(self, closure=None):
        """Perform one Adam update driven by the shared step counter.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss; forwarded to `torch.optim.Adam.step`.

        Returns:
            Whatever `torch.optim.Adam.step` returns (the closure's loss, or None).
        """
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                state = self.state[p]
                # Adam reads the key 'step'; feed it the shared count so the
                # bias correction reflects the updates recorded in shared memory.
                state['step'] = self._as_step(state['shared_step'].item())
                state['shared_step'] += 1
        return super().step(closure)

In [None]:
# Building a class for the neural network
class Neural_Network(nn.Module):
  """Fully connected network with three activated layers followed by a linear output layer.

  Args:
      in_dim: size of the input vector.
      out_dim: size of the output vector.
      hidden_dim: indexable with at least three entries; entries 0, 1 and 2
          give the widths of the three hidden representations.
  """

  def __init__(self, in_dim, out_dim, hidden_dim):
    super().__init__()
    first_width, second_width, third_width = hidden_dim[0], hidden_dim[1], hidden_dim[2]
    self.input_layer = nn.Linear(in_dim, first_width)
    self.hidden_layer_1 = nn.Linear(first_width, second_width)
    self.hidden_layer_2 = nn.Linear(second_width, third_width)
    self.output_layer = nn.Linear(third_width, out_dim)

  def forward (self, x, activation_function = nn.ReLU()):
    """Apply the three activated layers in order and return the raw output-layer values."""
    hidden = x
    for layer in (self.input_layer, self.hidden_layer_1, self.hidden_layer_2):
      hidden = activation_function(layer(hidden))
    return self.output_layer(hidden)

In [None]:
# Defining function for playing during and after training to obtain performance of the agent
def play(episodes, act_episode, end, envs, model, worker = None, step = 0):
  """Run the policy for a number of episodes and report the average return.

  Args:
      episodes: number of episodes to play.
      act_episode: training episode number, only used in the progress message.
      end: if True, average over (at most) the last 100 episodes and print the
          final-result message; otherwise average over (at most) the last 25
          episodes and print the progress message.
      envs: environment exposing `reset()` and `step(action)` returning
          `(next_state, reward, done, info)`.
      model: callable mapping a float32 state tensor to action logits.
      worker: identifier of the calling worker, only used in the progress message.
      step: accepted for backward compatibility; not used.

  Returns:
      The average return over the selected window, or 0.0 if no episode was played.
  """
  returns = []

  # Evaluation only: no gradients are needed for the forward passes.
  with torch.no_grad():
    for _ in range(episodes):
      episode_return = 0
      state = envs.reset()
      done = False
      while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32)
        logits = model(state_tensor)
        action = dist.Categorical(logits = logits).sample()
        state, reward, done, _ = envs.step(action.item())
        episode_return += reward
      returns.append(episode_return)

  # Average over the episodes actually in the window instead of a fixed count,
  # so the result is a true mean even when fewer episodes were played.
  window = returns[-100:] if end else returns[-25:]
  avg_return = sum(window) / len(window) if window else 0.0

  if end:
    print("Average return over 100 episodes of trained agent:", avg_return)
  else:
    print("Episode:", act_episode, ", Average return over 25 episodes of playing:", avg_return, ", Worker:", worker)

  return avg_return

In [None]:
# Building function for local agent
def _discounted_returns(rewards, bootstrap_value, discount):
  """Compute the discounted return of every step in one rollout segment.

  Works backwards through `rewards` with G_t = r_t + discount * G_{t+1},
  starting from `bootstrap_value` as the return that follows the last reward.
  Returns a list with one entry per reward, in the same order as `rewards`.
  """
  discounted = []
  running_return = bootstrap_value
  for reward in reversed(rewards):
    running_return = reward + discount * running_return
    discounted.append(running_return)
  discounted.reverse()
  return discounted


def _update_shared_model(loss, local_model, shared_model, shared_optimizer):
  """Backpropagate `loss` through `local_model`, apply the gradients to `shared_model`, and re-sync."""
  # Start from clean local gradients so nothing from a previous update leaks in.
  local_model.zero_grad()
  shared_optimizer.zero_grad()
  loss.backward()

  # Point the shared parameters at the freshly computed local gradients.
  for local_param, shared_param in zip(local_model.parameters(), shared_model.parameters()):
    shared_param._grad = local_param.grad

  shared_optimizer.step()

  # Pull the updated shared weights back into the local copy.
  local_model.load_state_dict(shared_model.state_dict())


def worker(n, seed_loc, optimizer, optimizer_2, policy_model, state_value_model, seed):
  """Train one local actor-critic agent against the shared networks.

  The worker plays CartPole-v1 with local copies of the policy and state-value
  networks. After every `n` environment steps, or when the episode ends, it
  computes n-step returns, backpropagates the policy and value losses through
  the local networks, applies those gradients to the shared networks through
  the shared optimizers and reloads the shared weights. Every 500 environment
  steps the local policy is evaluated with `play`. Training stops once at least
  2500 steps were taken and the mean of the last five evaluations is >= 475
  (a learning curve is then plotted), or after `num_episodes` episodes.

  Reads the module-level names `num_episodes`, `gamma` and `hidden_dims_1`.

  Args:
      n: maximum number of environment steps between two updates.
      seed_loc: index of this worker, used to derive its seeds.
      optimizer: shared optimizer of the policy network.
      optimizer_2: shared optimizer of the state-value network.
      policy_model: shared policy network.
      state_value_model: shared state-value network.
      seed: base seed of the current training run.

  Returns:
      None.
  """

  # Calculating and setting local seed
  seed_local = seed + seed_loc + 1
  make_deterministic(seed_local)

  # Creating two environments: one for training and one for playing
  env_local = gym.make("CartPole-v1")
  env_local_play = gym.make("CartPole-v1")

  # Setting seed for local environments
  env_local.seed(seed + seed_loc)
  env_local.action_space.seed(seed + seed_loc)
  env_local_play.seed(seed + seed_loc)
  env_local_play.action_space.seed(seed + seed_loc)

  # Creating local networks, sized from this worker's own environment.
  # Both use hidden_dims_1 so their state dicts match the shared networks.
  state_dim = int(np.prod(env_local.observation_space.shape))
  policy_model_local = Neural_Network(in_dim = state_dim, out_dim = env_local.action_space.n, hidden_dim=hidden_dims_1)
  state_value_model_local = Neural_Network(in_dim = state_dim, out_dim = 1, hidden_dim=hidden_dims_1)

  # Copying weights from shared networks to local networks
  policy_model_local.load_state_dict(policy_model.state_dict())
  state_value_model_local.load_state_dict(state_value_model.state_dict())

  steps = 0          # environment steps taken by this worker in total
  returns_play = []  # average return of every evaluation
  x_axis = []        # total step count at every evaluation

  # Training the agent for number of episodes
  for episode in range(0, num_episodes):

    state = env_local.reset()
    done = False

    # One episode consists of consecutive rollout segments of at most n steps;
    # the environment is only reset when the episode has really ended.
    while not done:
      step = 0
      reward_list = []
      logs = []
      state_values = []

      while True:
        steps += 1
        step += 1

        # Choosing action and obtaining log value of action probability
        state_tensor = torch.tensor(state, dtype=torch.float32)
        distribution = dist.Categorical(logits = policy_model_local(state_tensor))
        action = distribution.sample()
        logs.append(distribution.log_prob(action))
        state_values.append(state_value_model_local(state_tensor))

        state, reward, done, _ = env_local.step(action.item())
        reward_list.append(reward)

        # Playing every 500 steps to obtain the performance of the local agent
        if steps % 500 == 0:
          avg_return_play = play(episodes = 25, act_episode = episode + 1, end = False, envs = env_local_play, model = policy_model_local, worker = seed_local, step = steps)
          returns_play.append(avg_return_play)
          x_axis.append(steps)

        if done or step == n:
          break

      # A finished episode has no future return; otherwise bootstrap from the
      # value estimate of the state reached after the last step of the segment.
      if done:
        bootstrap_value = 0.0
      else:
        with torch.no_grad():
          bootstrap_value = state_value_model_local(torch.tensor(state, dtype=torch.float32)).item()

      returns_tensor = torch.tensor(_discounted_returns(reward_list, bootstrap_value, gamma), dtype=torch.float32)
      logs_tensor = torch.stack(logs)
      # Each value estimate has shape (1,); drop that axis so the subtraction
      # below is element-wise instead of broadcasting to a (T, T) matrix.
      state_value_tensor = torch.stack(state_values).squeeze(-1)

      advantage = returns_tensor - state_value_tensor

      # Optimizing shared policy network (advantage is treated as a constant)
      policy_loss = (- advantage.detach() * logs_tensor).mean()
      _update_shared_model(policy_loss, policy_model_local, policy_model, optimizer)

      # Optimizing shared state-value network
      value_loss = advantage.pow(2).mean()
      _update_shared_model(value_loss, state_value_model_local, state_value_model, optimizer_2)

    # If training steps >= 2500 and the agent has obtained an average reward of 475 during last 5 times of playing, end training
    if steps >= 2500 and sum(returns_play[-5:]) / 5 >= 475:
      plt.plot(x_axis, returns_play, color ='black')
      plt.xlabel('step')
      plt.ylabel('average return')
      plt.title('Average return over 25 episodes of playing (agent ' + str(seed_local) + ')')
      plt.show()
      return None

In [None]:
##### A3C ###########
# Seeds for repeatable training results
SEEDS = (12, 34, 56)

# Defining hyperparameters (the same for every seed, so they are set once;
# the worker function reads gamma, num_episodes and hidden_dims_1 as globals)
gamma = 0.99 # gamma
learning_rate_1 = 0.001 # learning rate for shared policy network
learning_rate_2 = 0.00055 # learning rate for shared state-value network
num_episodes = 10000 # maximum number of episodes
hidden_dims_1 = [128, 64, 32] # size of hidden layers in the policy networks
hidden_dims_2 = [128, 64, 32] # size of hidden layers in the state-value networks
worker_number = 3 # number of workers
n = 100 # bootstrapping step-size

# Looping over different seeds for training different agents
for seed in SEEDS:
  # Creating environment for obtaining end results
  env = gym.make("CartPole-v1")
  make_deterministic(seed)
  # Setting seed for the environment
  env.seed(seed)
  env.action_space.seed(seed)
  print("SEED:", seed)

  wandb.init(project="A3C-CartPole", entity="raisa")
  try:
    config = wandb.config

    # Logging hyperparameters
    config.learning_rate_policy = learning_rate_1
    config.learning_rate_value = learning_rate_2
    config.gamma = gamma
    config.num_episodes = num_episodes
    config.hidden_dim_policy = hidden_dims_1
    config.hidden_dim_value = hidden_dims_2
    config.seed = seed
    config.worker_number = worker_number
    config.n_step = n

    # Creating shared neural networks and setting optimizers
    # NOTE(review): the state-value network is built with hidden_dims_1, like the
    # local copies created in worker(); hidden_dims_2 is only recorded in the config.
    # Keep both lists equal, or change both places together.
    state_dim = int(np.prod(env.observation_space.shape))
    policy_model = Neural_Network(in_dim = state_dim, out_dim = env.action_space.n, hidden_dim=hidden_dims_1)
    state_value_model = Neural_Network(in_dim = state_dim, out_dim = 1, hidden_dim=hidden_dims_1)
    optimizer = SharedAdam(policy_model.parameters(), lr=learning_rate_1)
    optimizer_2 = SharedAdam(state_value_model.parameters(), lr=learning_rate_2)
    policy_model.share_memory()
    state_value_model.share_memory()

    # Start running local actors and wait until all of them have finished
    workers = [mp.Process(target=worker, args=(n , seed_loc, optimizer, optimizer_2, policy_model, state_value_model, seed)) for seed_loc in range(worker_number)]
    for process in workers:
      process.start()
    for process in workers:
      process.join()

    # Playing after training to obtain end results
    final_avg_return = play(episodes = 100, act_episode = None, end = True, envs = env, model = policy_model)
    wandb.log({"final_avg_return": final_avg_return})
  finally:
    # Close this seed's run so the next seed gets a run of its own.
    wandb.finish()