Actor-Critic using TD

In [2]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Hyperparameters shared by the model and the training loop.
learning_rate = 0.0002  # step size handed to the Adam optimizer
gamma = 0.98  # discount factor applied to the bootstrapped next-state value
n_rollout = 10  # maximum number of environment steps collected per update

class ActorCritic(nn.Module):
    """Actor-critic network with a shared hidden layer and a TD(0) update.

    The policy head (`pi`) and the value head (`v`) share the first fully
    connected layer. Transitions are buffered with `put_data` and consumed
    in one gradient step by `train_net`.

    Args:
        state_dim: Size of an observation vector (default 4).
        action_dim: Number of discrete actions (default 2).
        hidden_dim: Width of the shared hidden layer (default 256).
        lr: Adam learning rate. When None, the module-level
            `learning_rate` is used.
        discount: Discount factor for the TD target. When None, the
            module-level `gamma` is read each time `train_net` runs.
    """

    def __init__(self, state_dim=4, action_dim=2, hidden_dim=256,
                 lr=None, discount=None):
        super().__init__()
        self.data = []  # buffered (s, a, r, s_prime, done) transitions
        self.discount = discount
        self.fc1 = nn.Linear(state_dim, hidden_dim)  # shared fully connected layer
        self.fc_pi = nn.Linear(hidden_dim, action_dim)  # policy head
        self.fc_v = nn.Linear(hidden_dim, 1)  # value head
        self.optimizer = optim.Adam(
            self.parameters(),
            lr=learning_rate if lr is None else lr,
        )

    def pi(self, x, softmax_dim=0):
        """Return action probabilities for state(s) `x`.

        Use `softmax_dim=0` for a single state vector and `softmax_dim=1`
        for a batch of states.
        """
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob

    def v(self, x):
        """Return the state-value estimate for state(s) `x`."""
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v

    def put_data(self, transition):
        """Buffer one `(s, a, r, s_prime, done)` transition."""
        self.data.append(transition)

    def make_batch(self):
        """Turn the buffered transitions into tensors and clear the buffer.

        Returns:
            Tuple `(s, a, r, s_prime, done_mask)`; with N buffered
            transitions, `s` and `s_prime` have N rows and `a`, `r`,
            `done_mask` have shape (N, 1). Rewards are divided by 100 and
            `done_mask` is 0.0 where the episode ended, 1.0 otherwise.
            An empty buffer yields correctly shaped zero-row tensors.
        """
        if not self.data:
            state_dim = self.fc1.in_features
            return (
                torch.empty((0, state_dim), dtype=torch.float),
                torch.empty((0, 1), dtype=torch.long),
                torch.empty((0, 1), dtype=torch.float),
                torch.empty((0, state_dim), dtype=torch.float),
                torch.empty((0, 1), dtype=torch.float),
            )

        states, actions, rewards, next_states, dones = zip(*self.data)

        # Convert each state on its own and stack: passing a list of numpy
        # arrays straight to torch.tensor is slow and triggers a warning.
        s_batch = torch.stack(
            [torch.as_tensor(s, dtype=torch.float) for s in states])
        s_prime_batch = torch.stack(
            [torch.as_tensor(s, dtype=torch.float) for s in next_states])
        # gather() in train_net needs int64 indices.
        a_batch = torch.tensor([[a] for a in actions], dtype=torch.long)
        r_batch = torch.tensor([[r / 100.0] for r in rewards], dtype=torch.float)
        # Mask is 0 for terminal transitions so they do not bootstrap.
        done_batch = torch.tensor(
            [[0.0 if d else 1.0] for d in dones], dtype=torch.float)

        self.data = []  # clear data after make_batch
        return s_batch, a_batch, r_batch, s_prime_batch, done_batch

    def train_net(self):
        """Run one actor-critic gradient step on the buffered transitions.

        Does nothing when the buffer is empty.
        """
        if not self.data:
            return

        s, a, r, s_prime, done = self.make_batch()
        discount = gamma if self.discount is None else self.discount

        v_s = self.v(s)  # evaluated once, reused for advantage and value loss
        # The TD target is a constant for both losses, so skip its graph.
        with torch.no_grad():
            td_target = r + discount * self.v(s_prime) * done
        advantage = (td_target - v_s).detach()

        pi_a = self.pi(s, softmax_dim=1).gather(1, a)
        policy_loss = -(torch.log(pi_a) * advantage).mean()
        value_loss = F.smooth_l1_loss(v_s, td_target)
        loss = policy_loss + value_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [3]:
# Build the CartPole environment and report the sizes of its spaces.
env = gym.make('CartPole-v1')
num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.n
print("Size of Action Space ->  {}".format(num_actions))

model = ActorCritic()
print_interval = 50
num_episodes = 500  # small test run; a full training run would use 10000
score = 0.0
episodes_in_window = 0  # episodes accumulated in `score` since the last report

try:
    for n_epi in range(num_episodes):
        done = False
        # gym >= 0.26 returns (observation, info); older versions return
        # just the observation.
        reset_out = env.reset()
        s = reset_out[0] if isinstance(reset_out, tuple) else reset_out

        while not done:
            # Collect up to n_rollout transitions, then do one update.
            for _ in range(n_rollout):
                # Sampling needs no gradients; train_net recomputes pi.
                with torch.no_grad():
                    prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()  # .item() converts the tensor to an int

                step_out = env.step(a)
                if len(step_out) == 5:
                    # gym >= 0.26: a time-limit truncation ends the episode
                    # but is not a terminal state, so only `terminated`
                    # is stored for the bootstrap mask.
                    s_prime, r, terminated, truncated, info = step_out
                    done = terminated or truncated
                else:
                    s_prime, r, done, info = step_out
                    terminated = done
                model.put_data((s, a, r, s_prime, terminated))

                s = s_prime
                score += r

                if done:
                    break

            model.train_net()

        episodes_in_window += 1
        if n_epi % print_interval == 0 and n_epi != 0:
            # Divide by the real number of episodes in the window: the first
            # report covers episodes 0..print_interval inclusive.
            print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/episodes_in_window))
            score = 0.0
            episodes_in_window = 0
finally:
    # Release the environment even if training is interrupted.
    env.close()

Size of State Space ->  4
Size of Action Space ->  2
# of episode :50, avg score : 20.6
# of episode :100, avg score : 26.1
# of episode :150, avg score : 31.9
# of episode :200, avg score : 44.1
# of episode :250, avg score : 58.9
# of episode :300, avg score : 73.1
# of episode :350, avg score : 114.8
# of episode :400, avg score : 129.9
# of episode :450, avg score : 171.2


1