In [1]:
import copy

import torch
import gym
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from torch.distributions import Normal
import itertools

In [3]:
class SACBuffer:
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.counter = 0
        self.states_memory = np.zeros((self.mem_size, *input_shape))
        self.actions_memory = np.zeros((self.mem_size, n_actions))
        self.new_states_memory = np.zeros((self.mem_size, *input_shape))
        self.rewards_memory = np.zeros(self.mem_size)
        self.dones = np.zeros(self.mem_size)

    def store_transition(self, state, action, reward, new_state, done):
        self.index = self.counter % self.mem_size
        self.states_memory[index] = state
        self.actions_memory[index] = action
        self.rewards_memory[index] = reward
        self.new_states_memory[index] = new_state
        self.dones[index] = done
        self.counter += 1

    def sample_buffer(self, batch_size):
        max_mem = self.counter
        if self.counter > self.mem_size:
            max_mem = self.mem_size
        batch = np.random.choice(max_mem, batch_size)

        states = self.states_memory[batch]
        rewards = self.rewards_memory[batch]
        new_states = self.new_states_memory[batch]
        actions = self.actions_memory[batch]
        dones = self.dones[batch]

        return states, rewards, actions, new_states, dones

class ActionEstimator(nn.Module):
    def __init__(self, input_dims, actions, max_action, hidden_sizes):
        super(ActionEstimator, self).__init__()
        self.max_action = max_action
        self.hidden_sizes = hidden_sizes
        self.reparam_noise = 1e-6
        self.LOG_STD_MAX = 2
        self.LOG_STD_MIN = -20
        self.SeqLayer = nn.Sequential(
            nn.Linear(*input_dims, hidden_sizes[0]),
            nn.ReLU(),
            nn.Linear(hidden_sizes[0], hidden_sizes[1]),
            nn.ReLU(),
        )
        self.mu = nn.Linear(hidden_sizes[1], actions)
        self.sigma = nn.Linear(hidden_sizes[1], actions)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, state):
        out = self.SeqLayer(state)
        mu = self.mu(out)
        log_std = self.sigma(out)
        log_std = torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)
        return mu, log_std

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        mu, log_std = self.forward(state)
        sigma = torch.exp(log_std)
        gaussian_dist = Normal(mu, sigma)
        actions = gaussian_dist.rsample()
        actions = torch.tanh(actions)
        return actions.cpu().detach().numpy()[0]*self.max_action

    def evaluate(self, state):
        mu, log_std = self.forward(state)
        sigma = torch.exp(log_std)
        action_dist = Normal(mu, sigma)
        action = action_dist.rsample()

        log_prob = action_dist.log_prob(action)-torch.log(1-action.pow(2)+self.reparam_noise)
        log_prob = log_prob.sum(dim=-1)
        # log_prob = action_dist.log_prob(action).sum(axis=-1)
        # log_prob -= (2*(np.log(2) - action - F.softplus(-2*action))).sum(axis=1)

        return torch.tanh(action)*self.max_action, log_prob

class Q_Net(nn.Module):
    def __init__(self, input_dims, actions, hidden_sizes, learning_rate = 1e-3):
        super(Q_Net, self).__init__()
        self.hidden_sizes = hidden_sizes
        self.SeqLayer = nn.Sequential(
            nn.Linear(input_dims[0]+actions, self.hidden_sizes[0]),
            nn.ReLU(),
            nn.Linear(self.hidden_sizes[0], self.hidden_sizes[1]),
            nn.ReLU(),
            nn.Linear(self.hidden_sizes[1], 1)
        )

    def forward(self, state, actions):
        out = self.SeqLayer(torch.cat([state, actions], dim = -1))
        return out


class Agent:
    def __init__(self,  max_action, n_actions, input_dims, hidden_sizes = [256, 256], gamma=0.95, batch_size=256, learning_rate = 1e-3):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = gamma
        self.max_action = max_action
        self.batch_size = batch_size
        self.polyak = 0.995
        self.ALPHA_INITIAL = 1
        self.tau = 0.01
        self.actor = ActionEstimator(input_dims, n_actions, max_action, hidden_sizes).to(self.device)
        self.Q1 = Q_Net(input_dims, n_actions, hidden_sizes).to(self.device)
        self.Q2 = Q_Net(input_dims, n_actions, hidden_sizes).to(self.device)
        self.Q1_targ = copy.deepcopy(self.Q1)
        self.Q2_targ = copy.deepcopy(self.Q2)
        self.target_entropy = 0.98 * -np.log(1 / n_actions)
        self.log_alpha = torch.tensor(np.log(self.ALPHA_INITIAL), requires_grad=True)
        self.alpha = self.log_alpha
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=1e-4)

        self.buffer = SACBuffer(50000, input_shape = input_dims, n_actions = n_actions)
        self.MSE_Criterion = nn.MSELoss()

        self.q_params = itertools.chain(self.Q1.parameters(), self.Q2.parameters())
        self.q_targ_params = itertools.chain(self.Q1_targ.parameters(), self.Q2_targ.parameters())
        self.q_optim = torch.optim.Adam(self.q_params, lr=0.00003)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=0.00003)
        for p in self.q_targ_params:
            p.requires_grad = False

    def choose_action(self, state):
        action = self.actor.act(state)
        return action

    def update_network_parameters(self):
        for target_param, param in zip(self.Q1_targ.parameters(), self.Q1.parameters()):
            target_param.data.copy_(self.tau*param.data + (1-self.tau)*target_param.data)

        for target_param, param in zip(self.Q2_targ.parameters(), self.Q2.parameters()):
            target_param.data.copy_(self.tau*param.data + (1-self.tau)*target_param.data)

    def remember(self, s, a, r, ns, done):
        self.buffer.store_transition(s,a,r,ns,done)

    def update(self):
        if self.buffer.counter < self.batch_size:
            return
        states, rewards, actions, new_states, dones = self.buffer.sample_buffer(self.batch_size)

        states_t = torch.from_numpy(states).float().to(self.device)
        rewards_t = torch.from_numpy(rewards).float().to(self.device)
        actions_t = torch.from_numpy(actions).float().to(self.device)
        new_states_t = torch.from_numpy(new_states).float().to(self.device)
        dones_t = torch.from_numpy(dones).to(self.device)

        # LOSS FOR Q FUNCTION
        self.q_optim.zero_grad()

        q1_value = self.Q1(states_t, actions_t)
        q2_value = self.Q2(states_t, actions_t)

        with torch.no_grad():
            actions2, log_probs = self.actor.evaluate(new_states_t)
            q1_p_targ = self.Q1_targ(new_states_t, actions2)
            q2_p_targ = self.Q2_targ(new_states_t, actions2)
            q_min = torch.min(q1_p_targ, q2_p_targ)
            backup = rewards_t+self.gamma*(1-dones_t)*(q_min-self.alpha*log_probs)

        loss_q1 = ((q1_value-backup)**2).mean()
        loss_q2 = ((q2_value-backup)**2).mean()
        loss_qf = loss_q1 + loss_q2
        loss_qf.backward()
        self.q_optim.step()

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in self.q_params:
            p.requires_grad = False

        # Updating Policy
        self.actor_optim.zero_grad()
        new_act, new_log_probs = self.actor(states_t)
        q1_p = self.Q1(states_t, new_act)
        q2_p = self.Q2(states_t, new_act)
        q_p_min = torch.min(q1_p, q2_p)
        policy_loss = (self.alpha*new_log_probs-q_p_min).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in self.q_params:
            p.requires_grad = True

        alpha_loss = -(self.log_alpha * (log_probs + self.target_entropy).detach()).mean()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha.exp()
        # Finally, update target networks by polyak averaging.
        self.update_network_parameters()


env = gym.make("LunarLanderContinuous-v2")
writer = SummaryWriter()
max_action = env.action_space.high[0]
n_actions = env.action_space.shape[0]
obs_dim = env.observation_space.shape
n_games = 4000
agent = Agent(max_action, n_actions, obs_dim)
exploration_end = 10000
start_updates = 1000
update_steps = 50
index = 0
eps_rewards = []
updating = 0
for i in range(n_games):
    done = False
    score = 0
    obs = env.reset()
    while not done:
        action = agent.choose_action(obs)
        index+=1
        new_state, reward, done, info = env.step(action)
        agent.remember(obs, action, reward, new_state, int(done))
        if index > start_updates and index % update_steps == 0:
            agent.update()
        score += reward
        obs = new_state
    eps_rewards.append(score)
    avg_score = np.mean(eps_rewards[-100:])
    writer.add_scalar("Episode total reward", eps_rewards[i], i)

    print('episode ', i, " score %.2f " %score,
          "100 game average %.2f" % avg_score)

episode  0  score -118.20  100 game average -118.20
episode  1  score -423.15  100 game average -270.67
episode  2  score -70.63  100 game average -203.99
episode  3  score -62.91  100 game average -168.72
episode  4  score -96.43  100 game average -154.26
episode  5  score -446.72  100 game average -203.01
episode  6  score -285.32  100 game average -214.77
episode  7  score -183.45  100 game average -210.85
episode  8  score -407.55  100 game average -232.71
episode  9  score -115.97  100 game average -221.03
episode  10  score -273.12  100 game average -225.77
episode  11  score -406.18  100 game average -240.80
episode  12  score -219.27  100 game average -239.15
episode  13  score -172.60  100 game average -234.39
episode  14  score -55.69  100 game average -222.48
episode  15  score -66.42  100 game average -212.73
episode  16  score -91.63  100 game average -205.60
episode  17  score -309.32  100 game average -211.37
episode  18  score -68.19  100 game average -203.83
episode  1

IndexError: index 50000 is out of bounds for axis 0 with size 50000