In [1]:
import copy

import torch
import gym
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from torch.distributions import Normal
import itertools

In [2]:
class SACBuffer:
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.counter = 0
        self.states_memory = np.zeros((self.mem_size, *input_shape))
        self.actions_memory = np.zeros((self.mem_size, n_actions))
        self.new_states_memory = np.zeros((self.mem_size, *input_shape))
        self.rewards_memory = np.zeros(self.mem_size)
        self.dones = np.zeros(self.mem_size, dtype = np.bool)

    def store_transition(self, state, action, reward, new_state, done):
        self.index = self.counter % self.mem_size
        self.states_memory[index] = state
        self.actions_memory[index] = action
        self.rewards_memory[index] = reward
        self.new_states_memory[index] = new_state
        self.dones[index] = done
        self.counter += 1

    def sample_buffer(self, batch_size):
        max_mem = self.counter
        if self.counter > self.mem_size:
            max_mem = self.mem_size
        batch = np.random.choice(max_mem, batch_size)

        states = self.states_memory[batch]
        rewards = self.rewards_memory[batch]
        new_states = self.new_states_memory[batch]
        actions = self.actions_memory[batch]
        dones = self.dones[batch]

        return states, rewards, actions, new_states, dones

class ActionEstimator(nn.Module):
    def __init__(self, input_dims, actions, max_action, hidden_sizes):
        super(ActionEstimator, self).__init__()
        self.max_action = max_action
        self.hidden_sizes = hidden_sizes
        self.reparam_noise = 1e-6
        self.LOG_STD_MAX = 2
        self.LOG_STD_MIN = -20
        self.SeqLayer = nn.Sequential(
            nn.Linear(*input_dims, hidden_sizes[0]),
            nn.ReLU(),
            nn.Linear(hidden_sizes[0], hidden_sizes[1]),
            nn.ReLU(),
        )
        self.mu = nn.Linear(hidden_sizes[1], actions)
        self.sigma = nn.Linear(hidden_sizes[1], actions)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, state):
        out = self.SeqLayer(state)
        mu = self.mu(out)
        log_std = self.sigma(out)
        log_std = torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)
        return mu, log_std

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        mu, log_std = self.forward(state)
        sigma = torch.exp(log_std)
        gaussian_dist = Normal(mu, sigma)
        actions = gaussian_dist.rsample()
        actions = torch.tanh(actions)
        return actions.cpu().detach().numpy()[0]*self.max_action

    def evaluate(self, state, reparameterize=False):
        mu, log_std = self.forward(state)
        sigma = torch.exp(log_std)
        action_dist = Normal(mu, sigma)
        if reparameterize:
            action = action_dist.rsample()
        else:
            action = action_dist.sample()

        # log_prob = action_dist.log_prob(action)-torch.log(1-action.pow(2)+self.reparam_noise)
        # log_prob = log_prob.sum(dim=-1)
        log_prob = action_dist.log_prob(action).sum(axis=-1)
        log_prob -= (2*(np.log(2) - action - F.softplus(-2*action))).sum(axis=1)

        return torch.tanh(action)*self.max_action, log_prob

class Q_Net(nn.Module):
    def __init__(self, input_dims, actions, hidden_sizes, learning_rate = 1e-3):
        super(Q_Net, self).__init__()
        self.hidden_sizes = hidden_sizes
        self.SeqLayer = nn.Sequential(
            nn.Linear(input_dims[0]+actions, self.hidden_sizes[0]),
            nn.ReLU(),
            nn.Linear(self.hidden_sizes[0], self.hidden_sizes[1]),
            nn.ReLU(),
            nn.Linear(self.hidden_sizes[1], 1)
        )

    def forward(self, state, actions):
        out = self.SeqLayer(torch.cat([state, actions], dim = -1))
        return out

class Value_Net(nn.Module):
    def __init__(self, input_dims, hidden_sizes, learning_rate = 1e-3):
        super(Value_Net, self).__init__()
        self.hidden_sizes = hidden_sizes
        self.SeqLayer = nn.Sequential(
            nn.Linear(input_dims[0], self.hidden_sizes[0]),
            nn.ReLU(),
            nn.Linear(self.hidden_sizes[0], self.hidden_sizes[1]),
            nn.ReLU(),
            nn.Linear(self.hidden_sizes[1], 1)
        )

    def forward(self, state):
        out = self.SeqLayer(state)
        return out


class Agent:
    def __init__(self,  max_action, n_actions, input_dims, hidden_sizes = [256, 256], gamma=0.95, batch_size=256, learning_rate = 1e-3):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = gamma
        self.max_action = max_action
        self.batch_size = batch_size
        self.alpha = 0.2
        self.tau = 0.005
        self.actor = ActionEstimator(input_dims, n_actions, max_action, hidden_sizes).to(self.device)
        self.Q1 = Q_Net(input_dims, n_actions, hidden_sizes).to(self.device)
        self.Q2 = Q_Net(input_dims, n_actions, hidden_sizes).to(self.device)
        self.value_net = Value_Net(input_dims, hidden_sizes).to(self.device)
        self.target_value_net = copy.deepcopy(self.value_net)

        self.buffer = SACBuffer(1000000, input_shape = input_dims, n_actions = n_actions)
        self.MSE_Criterion = nn.MSELoss()

        self.V_params = itertools.chain(self.value_net.parameters())
        self.V_targ_params = itertools.chain(self.target_value_net.parameters())
        self.q1_optim = torch.optim.Adam(self.Q1.parameters(), lr=0.00003)
        self.q2_optim = torch.optim.Adam(self.Q2.parameters(), lr=0.00003)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=0.00003)
        self.v_optim = torch.optim.Adam(self.value_net.parameters(), lr=0.00003)
        for p in self.V_targ_params:
            p.requires_grad = False

    def choose_action(self, state):
        actions,_ = self.actor.evaluate(state)
        return actions.cpu().detach().numpy()[0]

    def update_network_parameters(self):
        target_value_params = self.target_value_net.named_parameters()
        value_params = self.value_net.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = self.tau*value_state_dict[name].clone() + \
                                     (1-self.tau)*target_value_state_dict[name].clone()

        self.target_value_net.load_state_dict(value_state_dict)

    def remember(self, s, a, r, ns, done):
        self.buffer.store_transition(s,a,r,ns,done)

    def update(self):
        if self.buffer.counter < self.batch_size:
            return
        states, rewards, actions, new_states, dones = self.buffer.sample_buffer(self.batch_size)

        states_t = torch.from_numpy(states).float().to(self.device)
        rewards_t = torch.from_numpy(rewards).float().to(self.device)
        actions_t = torch.from_numpy(actions).float().to(self.device)
        new_states_t = torch.from_numpy(new_states).float().to(self.device)
        dones_t = torch.from_numpy(dones).to(self.device)


        actions_pi, log_probs = self.actor.evaluate(states_t)
        q1_value = self.Q1(states_t, actions_pi)
        q2_value = self.Q2(states_t, actions_pi)
        next_val = self.target_value_net(new_states_t).view(-1)
        state_val = self.value_net(states_t).view(-1)
        next_val[dones_t] = 0.0
        predicted_new_q = torch.min(q1_value, q2_value).view(-1)

        self.v_optim.zero_grad()
        target_value_func = predicted_new_q - log_probs.view(-1)
        value_loss = 0.5*self.MSE_Criterion(state_val, target_value_func.detach())
        value_loss.backward(retain_graph=True)
        self.v_optim.step()

        # LOSS FOR ACTOR
        actions_pip, log_probsp = self.actor.evaluate(states_t, reparameterize = True)
        log_probsp = log_probsp.view(-1)

        q1_new_policy = self.Q1(states_t, actions_pip)
        q2_new_policy = self.Q2(states_t, actions_pip)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        policy_loss = (log_probsp - critic_value).mean()
        self.actor_optim.zero_grad()
        policy_loss.backward(retain_graph = True)
        self.actor_optim.step()

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        self.q1_optim.zero_grad()
        self.q2_optim.zero_grad()
        q_hat = rewards_t + self.gamma * next_val
        q1_old_policy = self.Q1(states_t, actions_t).view(-1)
        q2_old_policy = self.Q2(states_t, actions_t).view(-1)
        critic1_loss = 0.5*self.MSE_Criterion(q1_old_policy, q_hat)
        critic2_loss = 0.5*self.MSE_Criterion(q2_old_policy, q_hat)

        critic_loss = critic1_loss + critic2_loss
        critic_loss.backward()
        self.q1_optim.step()
        self.q2_optim.step()

        # Updating Policy

        # Finally, update target networks by polyak averaging.
        # with torch.no_grad():
        #     for p_targ, p in zip(self.V_targ_params, self.V_params):
        #         # NB: We use an in-place operations "mul_", "add_" to update target
        #         # params, as opposed to "mul" and "add", which would make new tensors.
        #         p_targ.data.mul_(self.polyak)
        #         p_targ.data.add_((1 - self.polyak) * p.data)

        self.update_network_parameters()


env = gym.make("LunarLanderContinuous-v2")
writer = SummaryWriter()
max_action = env.action_space.high[0]
n_actions = env.action_space.shape[0]
obs_dim = env.observation_space.shape
n_games = 4000
agent = Agent(max_action, n_actions, obs_dim)
exploration_end = 10000
start_updates = 1000
update_steps = 50
index = 0
eps_rewards = []
updating = 0
for i in range(n_games):
    done = False
    score = 0
    obs = env.reset()
    while not done:
        if index < exploration_end:
            action = env.action_space.sample()
        else:
            action = agent.choose_action(obs)
        index+=1
        new_state, reward, done, info = env.step(action)
        agent.remember(obs, action, reward, new_state, done)
        if index > start_updates:
            for j in range(5):
                agent.update()
        score += reward
        obs = new_state
    eps_rewards.append(score)
    avg_score = np.mean(eps_rewards[-100:])
    writer.add_scalar("Episode total reward", eps_rewards[i], i)

    print('episode ', i, " score %.2f " %score,
          "100 game average %.2f" % avg_score)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.dones = np.zeros(self.mem_size, dtype = np.bool)


episode  0  score -57.59  100 game average -57.59
episode  1  score -485.57  100 game average -271.58
episode  2  score -43.06  100 game average -195.40
episode  3  score -315.78  100 game average -225.50
episode  4  score -43.39  100 game average -189.08
episode  5  score -446.95  100 game average -232.06
episode  6  score -96.03  100 game average -212.62
episode  7  score -39.22  100 game average -190.95
episode  8  score -342.53  100 game average -207.79
episode  9  score -98.66  100 game average -196.88
episode  10  score -62.83  100 game average -184.69
episode  11  score -401.66  100 game average -202.77
episode  12  score -136.60  100 game average -197.68
episode  13  score -343.37  100 game average -208.09
episode  14  score -229.67  100 game average -209.53
episode  15  score -73.35  100 game average -201.02
episode  16  score -350.54  100 game average -209.81
episode  17  score -360.68  100 game average -218.19
episode  18  score -280.22  100 game average -221.46
episode  19 

TypeError: linear(): argument 'input' (position 1) must be Tensor, not numpy.ndarray