In [17]:
import copy

import torch
import gym
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from torch.distributions import Normal
from torch.distributions import Categorical
import itertools

In [18]:
class SACBuffer:
    """Fixed-capacity ring buffer of (state, action, reward, next state, done) transitions.

    Once ``max_size`` transitions have been stored, new ones overwrite the
    oldest slots. Actions and done flags are kept as ``int64``; states and
    rewards use NumPy's default float dtype.
    """

    def __init__(self, max_size, input_shape):
        """Pre-allocate storage for ``max_size`` transitions of shape ``input_shape``."""
        self.mem_size = max_size
        self.counter = 0  # total number of transitions ever stored

        state_storage_shape = (self.mem_size, *input_shape)
        self.states_memory = np.zeros(state_storage_shape)
        self.new_states_memory = np.zeros(state_storage_shape)
        self.actions_memory = np.zeros(self.mem_size).astype(np.int64)
        self.rewards_memory = np.zeros(self.mem_size)
        self.dones = np.zeros(self.mem_size).astype(np.int64)

    def store_transition(self, state, action, reward, new_state, done):
        """Write one transition into the next slot, wrapping around when full."""
        slot = self.counter % self.mem_size
        self.index = slot  # last written slot, exposed as an attribute
        writes = (
            (self.states_memory, state),
            (self.actions_memory, action),
            (self.rewards_memory, reward),
            (self.new_states_memory, new_state),
            (self.dones, done),
        )
        for storage, value in writes:
            storage[slot] = value
        self.counter += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns:
            Tuple ``(states, rewards, actions, new_states, dones)`` of arrays
            indexed by the same random slots.
        """
        # Only slots that have actually been written are eligible.
        filled = min(self.counter, self.mem_size)
        picks = np.random.choice(filled, batch_size)
        return (
            self.states_memory[picks],
            self.rewards_memory[picks],
            self.actions_memory[picks],
            self.new_states_memory[picks],
            self.dones[picks],
        )

# Module-wide default device: CUDA when PyTorch reports it available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [19]:
class ActionEstimator(nn.Module):
    """Categorical policy network: maps a state to a probability for each action.

    Args:
        input_dims: Shape tuple of a single observation; unpacked into the
            first ``nn.Linear`` (so a 1-tuple such as ``(4,)`` is expected).
        actions: Number of discrete actions (size of the output layer).
        hidden_sizes: Sequence whose first two entries give the hidden widths.
    """

    def __init__(self, input_dims, actions, hidden_sizes):
        super().__init__()
        self.hidden_sizes = hidden_sizes
        # Added to the probabilities before taking the log so log(0) never occurs.
        self.reparam_noise = 1e-6
        self.SeqLayer = nn.Sequential(
            nn.Linear(*input_dims, hidden_sizes[0]),
            nn.ReLU(),
            nn.Linear(hidden_sizes[0], hidden_sizes[1]),
            nn.ReLU(),
            nn.Linear(hidden_sizes[1], actions),
            # Normalise over the action axis; dim=-1 equals dim=1 for batched
            # (N, actions) input and additionally accepts unbatched input.
            nn.Softmax(dim=-1)
        )

    def forward(self, state):
        """Return ``(action_probabilities, log_probabilities)`` for ``state``."""
        action_dist = self.SeqLayer(state)
        log_probs = torch.log(action_dist + self.reparam_noise)
        return action_dist, log_probs

    def act(self, state):
        """Sample one action index for a single observation.

        Args:
            state: One observation (NumPy array or array-like) without a
                batch dimension.

        Returns:
            The sampled action as a Python ``int``.
        """
        # Use the device this module's parameters are on, rather than a
        # module-level global that may be undefined or point elsewhere.
        module_device = next(self.parameters()).device
        state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0).to(module_device)
        # Action selection needs no gradients; skip building an autograd graph.
        with torch.no_grad():
            action_dist, _ = self.forward(state_t)
        action = Categorical(action_dist).sample()
        return action.item()

class Q_Net(nn.Module):
    """Feed-forward network that outputs one Q-value per discrete action.

    Args:
        input_dims: Shape tuple of a single observation; unpacked into the
            first ``nn.Linear``.
        actions: Number of discrete actions (width of the output layer).
        hidden_sizes: Sequence whose first two entries give the hidden widths.
    """

    def __init__(self, input_dims, actions, hidden_sizes):
        super().__init__()
        self.hidden_sizes = hidden_sizes
        width_a = self.hidden_sizes[0]
        width_b = self.hidden_sizes[1]
        # Layers are created in input -> output order, two hidden ReLU layers
        # followed by a linear head with no output activation.
        stack = [
            nn.Linear(*input_dims, width_a),
            nn.ReLU(),
            nn.Linear(width_a, width_b),
            nn.ReLU(),
            nn.Linear(width_b, actions),
        ]
        self.SeqLayer = nn.Sequential(*stack)

    def forward(self, state):
        """Return the Q-values of every action for ``state``."""
        return self.SeqLayer(state)



In [21]:
class Agent:
    """Discrete-action soft actor-critic style agent.

    Holds a categorical policy (``actor``), two Q-networks with slowly
    tracking target copies, a replay buffer and a learnable entropy
    temperature ``alpha`` (optimised through ``log_alpha``).

    Args:
        n_actions: Number of discrete actions.
        input_dims: Shape tuple of a single observation.
        hidden_sizes: Hidden layer widths passed to every network.
        gamma: Discount factor used in the Q backup.
        batch_size: Number of transitions sampled per update.
    """

    def __init__(self, n_actions, input_dims, hidden_sizes = [256, 256], gamma=0.95, batch_size=256):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = gamma
        self.batch_size = batch_size
        self.tau = 0.01  # interpolation factor for the target-network soft update
        self.ALPHA_INITIAL = 1
        # Copy so the shared default list object is never aliased by the networks.
        hidden_sizes = list(hidden_sizes)
        self.actor = ActionEstimator(input_dims, n_actions, hidden_sizes).to(self.device)
        self.Q1 = Q_Net(input_dims, n_actions, hidden_sizes).to(self.device)
        self.Q2 = Q_Net(input_dims, n_actions, hidden_sizes).to(self.device)
        self.Q1_targ = copy.deepcopy(self.Q1)
        self.Q2_targ = copy.deepcopy(self.Q2)

        self.buffer = SACBuffer(50000, input_shape = input_dims)
        self.MSE_Criterion = nn.MSELoss()

        self.q1_optim = torch.optim.Adam(self.Q1.parameters(), lr=1e-4)
        self.q2_optim = torch.optim.Adam(self.Q2.parameters(), lr=1e-4)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
        # Target networks are only ever changed by the soft update, never by
        # an optimiser. Kept as a list so the attribute can be iterated again.
        self.q_targ_p = list(itertools.chain(self.Q1_targ.parameters(), self.Q2_targ.parameters()))
        for parameter in self.q_targ_p:
            parameter.requires_grad = False

        self.target_entropy = 0.98 * -np.log(1 / n_actions)
        self.log_alpha = torch.tensor(
            float(np.log(self.ALPHA_INITIAL)),
            dtype=torch.float32,
            device=self.device,
            requires_grad=True,
        )
        # alpha is exp(log_alpha); kept detached so using it in other losses
        # never sends gradient into the temperature parameter.
        self.alpha = self.log_alpha.exp().detach()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=1e-4)

    def choose_action(self, state):
        """Sample an action for ``state`` from the current policy."""
        action = self.actor.act(state)
        return action

    def _soft_update(self, target_net, source_net):
        """Move ``target_net`` a fraction ``tau`` of the way towards ``source_net``."""
        with torch.no_grad():
            for target_param, param in zip(target_net.parameters(), source_net.parameters()):
                target_param.copy_(self.tau * param + (1 - self.tau) * target_param)

    def update_network_parameters(self):
        """Soft-update both target Q-networks from their online counterparts."""
        self._soft_update(self.Q1_targ, self.Q1)
        self._soft_update(self.Q2_targ, self.Q2)

    def remember(self, s, a, r, ns, done):
        """Store one transition in the replay buffer."""
        self.buffer.store_transition(s,a,r,ns,done)

    def update(self):
        """Run one gradient step on the critics, the actor and the temperature.

        Does nothing until the buffer has received at least ``batch_size``
        transitions. Finishes with a soft update of the target networks.
        """
        if self.buffer.counter < self.batch_size:
            return
        states, rewards, actions, new_states, dones = self.buffer.sample_buffer(self.batch_size)

        states_t = torch.from_numpy(states).float().to(self.device)
        rewards_t = torch.from_numpy(rewards).float().to(self.device)
        actions_t = torch.from_numpy(actions).long().to(self.device)  # gather needs int64 indices
        new_states_t = torch.from_numpy(new_states).float().to(self.device)
        dones_t = torch.from_numpy(dones).float().to(self.device)

        # Temperature as a constant for the critic and actor losses.
        alpha = self.log_alpha.detach().exp()

        # ---- critics -------------------------------------------------------
        with torch.no_grad():
            next_probs, next_log_probs = self.actor(new_states_t)
            next_q = torch.min(self.Q1_targ(new_states_t), self.Q2_targ(new_states_t))
            # Expected soft value of the next state under the current policy.
            next_value = (next_probs * (next_q - alpha * next_log_probs)).sum(dim=-1)
            backup = rewards_t + self.gamma * (1 - dones_t) * next_value

        q1 = self.Q1(states_t).gather(1, actions_t.unsqueeze(-1)).squeeze(-1)
        q2 = self.Q2(states_t).gather(1, actions_t.unsqueeze(-1)).squeeze(-1)
        loss_q1 = self.MSE_Criterion(q1, backup)
        loss_q2 = self.MSE_Criterion(q2, backup)
        self.q1_optim.zero_grad()
        self.q2_optim.zero_grad()
        loss_q1.backward()
        loss_q2.backward()
        self.q1_optim.step()
        self.q2_optim.step()

        # ---- actor ---------------------------------------------------------
        action_dist, log_probs = self.actor(states_t)
        # Q-values act as fixed targets here; no gradient should reach the critics.
        with torch.no_grad():
            qp_min = torch.min(self.Q1(states_t), self.Q2(states_t))
        pi_loss = (action_dist * (alpha * log_probs - qp_min)).sum(dim=-1).mean()
        self.actor_optim.zero_grad()
        pi_loss.backward()
        self.actor_optim.step()

        # ---- temperature ---------------------------------------------------
        # Policy entropy per state: expectation of -log pi under pi itself
        # (probability-weighted, not a plain average over actions).
        entropy = -(action_dist * log_probs).sum(dim=-1).detach()
        # Gradient w.r.t. log_alpha is (entropy - target): log_alpha grows
        # when the entropy is below the target and shrinks when above.
        alpha_loss = (self.log_alpha * (entropy - self.target_entropy)).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha.exp().detach()

        self.update_network_parameters()

# Training driver: run the agent on CartPole-v1, logging each episode's total
# reward to TensorBoard and printing a running 100-episode average.
env = gym.make("CartPole-v1")
writer = SummaryWriter()
n_actions = env.action_space.n
obs_dim = env.observation_space.shape
n_games = 4000
agent = Agent(n_actions, obs_dim)
exploration_end = 10000  # NOTE(review): not read by the loop below
start_updates = 1000     # environment steps collected before learning starts
update_steps = 50        # NOTE(review): not read by the loop below
index = 0                # total environment steps taken so far
eps_rewards = []
updating = 0             # NOTE(review): not read by the loop below
try:
    for i in range(n_games):
        done = False
        score = 0
        # Newer Gym releases return (obs, info) from reset(); older ones return obs.
        reset_result = env.reset()
        obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        while not done:
            action = agent.choose_action(obs)
            index += 1
            step_result = env.step(action)
            if len(step_result) == 5:
                # Five-value step API: termination and truncation are separate.
                new_state, reward, terminated, truncated, info = step_result
            else:
                # Four-value step API: the time-limit wrapper marks truncation in info.
                new_state, reward, env_done, info = step_result
                truncated = bool(info.get("TimeLimit.truncated", False))
                terminated = bool(env_done) and not truncated
            done = terminated or truncated
            # Only a true termination should cut off bootstrapping in the
            # Q backup; hitting the step limit is not a terminal state.
            agent.remember(obs, action, reward, new_state, terminated)
            if index > start_updates:
                agent.update()
            score += reward
            obs = new_state
        eps_rewards.append(score)
        avg_score = np.mean(eps_rewards[-100:])
        writer.add_scalar("Episode total reward", score, i)

        print('episode ', i, " score %.2f " %score,
              "100 game average %.2f" % avg_score)
finally:
    # Flush pending TensorBoard events even if training is interrupted.
    writer.close()

episode  0  score 20.00  100 game average 20.00
episode  1  score 18.00  100 game average 19.00
episode  2  score 10.00  100 game average 16.00
episode  3  score 23.00  100 game average 17.75
episode  4  score 15.00  100 game average 17.20
episode  5  score 27.00  100 game average 18.83
episode  6  score 11.00  100 game average 17.71
episode  7  score 14.00  100 game average 17.25
episode  8  score 26.00  100 game average 18.22
episode  9  score 17.00  100 game average 18.10
episode  10  score 11.00  100 game average 17.45
episode  11  score 13.00  100 game average 17.08
episode  12  score 10.00  100 game average 16.54
episode  13  score 19.00  100 game average 16.71
episode  14  score 16.00  100 game average 16.67
episode  15  score 64.00  100 game average 19.62
episode  16  score 21.00  100 game average 19.71
episode  17  score 13.00  100 game average 19.33
episode  18  score 17.00  100 game average 19.21
episode  19  score 15.00  100 game average 19.00
episode  20  score 24.00  100 