In [2]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
buffer_limit  = 50000
batch_size    = 32

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit) # sequence containers with dynamic sizes that can be expanded or contracted on both ends
        # buffer has [[s,a,r,s',done],[s,a,r,s',done],[s,a,r,s',done],...]
        
    def put(self, transition):
        self.buffer.append(transition)
    
    """
    done_mask is zero or one
    """
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n) # because of random.sample, mini_batch is a subset of buffer
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        # return variables, type of torch.tensor
        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
               torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
               torch.tensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__() # Call initialization of parent
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
      
    """
    Input tensor -> output scalar
    """
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random()
        # epsilon-greedy
        # goal is to return a scalar (not tensor)
        if coin < epsilon:
            return random.randint(0,1) # draw 0 or 1
        else : 
            return out.argmax().item() # returning action. (Not action network)

"""
q (Qnet): 
q_target (Qnet): 
memory (ReplayBuffer):
optimizer (optim.Adam):
"""
def train(q, q_target, memory, optimizer):
    for i in range(10):
        s,a,r,s_prime,done_mask = memory.sample(batch_size) # s,a,r,s_prime,done_mask are torch.tensor
        # those are tensors

        q_out = q(s) # q(s) is equivalent to q.forward(s)
        q_a = q_out.gather(1,a) # current greedy-action. Usually dim=1 since the tensor looks like [[a,b,c,d,...]]
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1) # q_target(s_prime).max(1)[0]: take tensor of value. unsqueeze(1): make dimension of 1
        target = r + gamma * max_q_prime * done_mask # only gamma is a scalar
        loss = F.smooth_l1_loss(q_a, target)
        
        optimizer.zero_grad() # initialization to get rid of accumulated gradients
        loss.backward()
        optimizer.step()


In [99]:
env = gym.make('CartPole-v1')
num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.n
print("Size of Action Space ->  {}".format(num_actions))
q = Qnet()
q_target = Qnet()
q_target.load_state_dict(q.state_dict()) # load pre-trained model
memory = ReplayBuffer()

print_interval = 20
print_interval = 100
score = 0.0  
optimizer = optim.Adam(q.parameters(), lr=learning_rate) # Define the optimizer for the parameters of q

for n_epi in range(10000):
#for n_epi in range(500):
    epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
    s = env.reset() # numpy ndarray
    done = False

    while not done:
        a = q.sample_action(torch.from_numpy(s).float(), epsilon) # convert numpy to torch tensor
        s_prime, r, done, info = env.step(a)
        done_mask = 0.0 if done else 1.0
        memory.put((s,a,r/100.0,s_prime, done_mask))
        s = s_prime

        score += r
        if done:
            break

    if memory.size()>2000:
        train(q, q_target, memory, optimizer)

    if n_epi%print_interval==0 and n_epi!=0:
        q_target.load_state_dict(q.state_dict())
        print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
                                                        n_epi, score/print_interval, memory.size(), epsilon*100))
        score = 0.0


n_episode :100, score : 9.8, n_buffer : 983, eps : 7.5%
n_episode :200, score : 9.9, n_buffer : 1971, eps : 7.0%
n_episode :300, score : 17.2, n_buffer : 3695, eps : 6.5%
n_episode :400, score : 15.3, n_buffer : 5222, eps : 6.0%
n_episode :500, score : 23.0, n_buffer : 7520, eps : 5.5%
n_episode :600, score : 91.0, n_buffer : 16619, eps : 5.0%
n_episode :700, score : 235.5, n_buffer : 40167, eps : 4.5%
n_episode :800, score : 203.6, n_buffer : 50000, eps : 4.0%
n_episode :900, score : 191.5, n_buffer : 50000, eps : 3.5%
n_episode :1000, score : 141.1, n_buffer : 50000, eps : 3.0%
n_episode :1100, score : 184.4, n_buffer : 50000, eps : 2.5%
n_episode :1200, score : 185.2, n_buffer : 50000, eps : 2.0%
n_episode :1300, score : 190.1, n_buffer : 50000, eps : 1.5%
n_episode :1400, score : 170.2, n_buffer : 50000, eps : 1.0%
n_episode :1500, score : 264.0, n_buffer : 50000, eps : 1.0%
n_episode :1600, score : 205.8, n_buffer : 50000, eps : 1.0%
n_episode :1700, score : 225.8, n_buffer : 5000

In [101]:
# Test
score = 0.0  
for n_epi in range(1000):
    epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
    s = env.reset() # numpy ndarray
    done = False

    while not done:
        a = q.sample_action(torch.from_numpy(s).float(), epsilon) # convert numpy to torch tensor
        s_prime, r, done, info = env.step(a)
        done_mask = 0.0 if done else 1.0
        s = s_prime

        score += r
        if done:
            break

    if n_epi%print_interval==0 and n_epi!=0:
        q_target.load_state_dict(q.state_dict()) # update q_target network
        print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
                                                        n_epi, score/print_interval, memory.size(), epsilon*100))
        score = 0.0



n_episode :100, score : 505.0, n_buffer : 50000, eps : 7.5%
n_episode :200, score : 500.0, n_buffer : 50000, eps : 7.0%
n_episode :300, score : 500.0, n_buffer : 50000, eps : 6.5%
n_episode :400, score : 500.0, n_buffer : 50000, eps : 6.0%
n_episode :500, score : 500.0, n_buffer : 50000, eps : 5.5%
n_episode :600, score : 500.0, n_buffer : 50000, eps : 5.0%
n_episode :700, score : 500.0, n_buffer : 50000, eps : 4.5%
n_episode :800, score : 500.0, n_buffer : 50000, eps : 4.0%
n_episode :900, score : 500.0, n_buffer : 50000, eps : 3.5%


In [None]:
env.close()