In [1]:
import collections
import random

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Hyperparameters
learning_rate = 0.0005  # step size passed to the Adam optimizer in main()
gamma = 0.98  # discount factor applied to the bootstrapped value in train()
buffer_limit = 50000  # maxlen of the deque that backs ReplayBuffer
batch_size = 32  # number of transitions drawn per mini-batch in train()

### 리플레이 버퍼 클래스

In [3]:
class ReplayBuffer():
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Each stored transition is a tuple ``(s, a, r, s_prime, done_mask)``.
    The backing container is a ``collections.deque`` with ``maxlen`` set, so
    once the buffer is full every new transition evicts the oldest one.
    """

    def __init__(self, limit=None):
        """Create an empty buffer.

        Args:
            limit: Maximum number of transitions kept. When omitted, the
                module-level ``buffer_limit`` is used.
        """
        capacity = buffer_limit if limit is None else limit
        self.buffer = collections.deque(maxlen=capacity)

    def put(self, transition):
        """Append one ``(s, a, r, s_prime, done_mask)`` tuple to the buffer."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions uniformly at random.

        Args:
            n: Number of transitions to draw.

        Returns:
            A 5-tuple of tensors ``(s, a, r, s_prime, done_mask)``. ``a``,
            ``r`` and ``done_mask`` have a trailing dimension of size 1;
            ``a`` is int64 (usable as a ``gather`` index) and the others are
            float32.

        Raises:
            ValueError: Propagated from ``random.sample`` when ``n`` exceeds
                the number of stored transitions.
        """
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []

        for s, a, r, s_prime, done_mask in mini_batch:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        # Collapse the per-transition states into ONE ndarray before handing
        # them to torch: building a tensor straight from a list of ndarrays
        # is the slow path PyTorch warns about.
        # Dtypes are explicit so integer rewards/masks still yield float
        # tensors instead of silently becoming int64.
        return (
            torch.as_tensor(np.asarray(s_lst), dtype=torch.float),
            torch.tensor(a_lst, dtype=torch.long),
            torch.tensor(r_lst, dtype=torch.float),
            torch.as_tensor(np.asarray(s_prime_lst), dtype=torch.float),
            torch.tensor(done_mask_lst, dtype=torch.float),
        )

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

        

최대 5만 개(buffer_limit)의 transition을 저장하고 있다가, 필요할 때 batch_size만큼을 무작위로 뽑아서 제공해준다. 버퍼가 가득 차면 가장 오래된 데이터부터 버려진다.


### Q밸류 네트워크 클래스

In [11]:
class Qnet(nn.Module):
    """Three-layer fully connected network mapping a state to per-action Q-values.

    The default sizes (4 inputs, 2 outputs, 128 hidden units) match the
    original hard-coded architecture, so ``Qnet()`` builds the same layers
    as before and existing state dicts still load.
    """

    def __init__(self, state_dim=4, n_actions=2, hidden=128):
        """Declare the layers.

        Args:
            state_dim: Size of the observation vector fed to ``fc1``.
            n_actions: Number of Q-values produced by ``fc3``.
            hidden: Width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, n_actions)

    def forward(self, x):
        """Return Q-values for ``x``; the last layer has no activation."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def sample_action(self, obs, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            obs: Observation tensor for a single state.
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if random.random() < epsilon:
            # Exploration: range follows the output layer instead of a
            # hard-coded (0, 1), and no forward pass is wasted.
            return random.randint(0, self.fc3.out_features - 1)
        # Exploitation: action selection never needs gradients.
        with torch.no_grad():
            return self.forward(obs).argmax().item()

nn.Module을 상속받으며, __init__에서 각 레이어를 선언해준다(여기서는 FC 레이어만 사용된다). 입력이 (카트 위치, 카트 속도, 막대 각도, 막대 각속도)의 4차원 벡터로 정해져 있기 때문인 것 같다.

마지막 아웃풋은 Q밸류이고 Q밸류는 음수도 될 수 있으므로 ReLU를 쓰지 않는다. sample_action은 코인 토스를 통해 epsilon의 확률로는 무작위 액션을, 그 외에는 Q밸류가 가장 큰 액션을 선택한다(epsilon-greedy 방식).

### 학습 함수

In [5]:
def train(q, q_target, memory, optimizer, n_updates=10):
    """Run several mini-batch Q-learning updates on ``q``.

    For each update a batch of ``batch_size`` transitions is drawn from
    ``memory``, the TD target ``r + gamma * max_a' Q_target(s', a') * done_mask``
    is formed, and ``q`` is stepped on the smooth-L1 loss between that target
    and ``Q(s, a)``.

    Args:
        q: Online network being optimized.
        q_target: Target network used only to evaluate the bootstrap value.
        memory: Object whose ``sample(n)`` returns
            ``(s, a, r, s_prime, done_mask)`` tensors.
        optimizer: Optimizer holding ``q``'s parameters.
        n_updates: Number of mini-batch updates to perform (default 10).
    """
    for _ in range(n_updates):
        s, a, r, s_prime, done_mask = memory.sample(batch_size)

        # Q-value of the action that was actually taken in each transition.
        q_a = q(s).gather(1, a)

        # The target is a constant for this update: build it without autograd
        # so backward() does not traverse q_target or accumulate unused
        # gradients on its parameters.
        with torch.no_grad():
            max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
            target = r + gamma * max_q_prime * done_mask

        loss = F.smooth_l1_loss(q_a, target)

        optimizer.zero_grad()
        loss.backward()  # autograd recorded the forward pass; this yields d(loss)/d(params of q)
        optimizer.step()

에피소드가 하나 끝날 때마다(버퍼에 데이터가 2000개 넘게 쌓인 이후부터) 함수가 호출되며, 미니배치를 10번 뽑아 매번 loss와 그 gradient를 구한 뒤 파라미터를 업데이트한다.

q_a는 선택된 action의 Q값에 해당한다.

### 메인 함수

In [17]:
def main(n_episodes=10000, print_interval=20, min_buffer_size=2000):
    """Train a DQN agent on CartPole-v1 and print periodic progress.

    Args:
        n_episodes: Number of episodes to run (default 10000).
        print_interval: Every this many episodes the target network is
            synced with the online network and the mean score is printed.
        min_buffer_size: Training starts only once the replay buffer holds
            more than this many transitions.
    """
    env = gym.make('CartPole-v1')
    # try/finally so the environment is closed even if training is
    # interrupted (e.g. KeyboardInterrupt in the notebook) or fails.
    try:
        q = Qnet()
        q_target = Qnet()
        q_target.load_state_dict(q.state_dict())
        memory = ReplayBuffer()

        score = 0.0
        # q and q_target start identical; only q is stepped by the optimizer.
        optimizer = optim.Adam(q.parameters(), lr=learning_rate)

        for n_epi in range(n_episodes):
            # Linear decay from 0.08 down to a floor of 0.01
            # (the floor is reached at episode 1400).
            epsilon = max(0.01, 0.08 - 0.01*(n_epi/200))

            reset_out = env.reset()
            # NOTE(review): newer Gym/Gymnasium releases return (obs, info)
            # from reset(); older ones return obs alone. Accept both.
            s = reset_out[0] if isinstance(reset_out, tuple) else reset_out
            done = False

            while not done:
                a = q.sample_action(torch.from_numpy(s).float(), epsilon)
                step_out = env.step(a)
                if len(step_out) == 5:
                    # Newer API: (obs, reward, terminated, truncated, info).
                    s_prime, r, terminated, truncated, _ = step_out
                    done = bool(terminated or truncated)
                else:
                    # Older API: (obs, reward, done, info).
                    s_prime, r, done, _ = step_out
                done_mask = 0.0 if done else 1.0
                # Reward is scaled by 1/100 before being stored.
                memory.put((s, a, r/100.0, s_prime, done_mask))
                s = s_prime
                score += r

            if memory.size() > min_buffer_size:
                train(q, q_target, memory, optimizer)

            if n_epi % print_interval == 0 and n_epi != 0:
                q_target.load_state_dict(q.state_dict())
                print("n_episode:{}, score : {:.1f}, n_buffer :{}, eps : {:.1f}%".format(n_epi, score/print_interval, memory.size(), epsilon*100))
                score = 0.0
    finally:
        env.close()

In [18]:
# Entry point: start training only when this code runs as the main program.
if __name__ == '__main__':
    main()

n_episode:20, score : 10.2, n_buffer :205, eps : 7.9%
n_episode:40, score : 9.4, n_buffer :394, eps : 7.8%
n_episode:60, score : 9.6, n_buffer :586, eps : 7.7%
n_episode:80, score : 9.7, n_buffer :780, eps : 7.6%
n_episode:100, score : 9.7, n_buffer :974, eps : 7.5%
n_episode:120, score : 9.8, n_buffer :1171, eps : 7.4%
n_episode:140, score : 9.4, n_buffer :1360, eps : 7.3%
n_episode:160, score : 9.7, n_buffer :1553, eps : 7.2%
n_episode:180, score : 9.8, n_buffer :1749, eps : 7.1%
n_episode:200, score : 9.6, n_buffer :1940, eps : 7.0%




n_episode:220, score : 9.8, n_buffer :2135, eps : 6.9%
n_episode:240, score : 9.9, n_buffer :2334, eps : 6.8%
n_episode:260, score : 10.3, n_buffer :2540, eps : 6.7%
n_episode:280, score : 12.7, n_buffer :2793, eps : 6.6%
n_episode:300, score : 14.7, n_buffer :3087, eps : 6.5%
n_episode:320, score : 16.3, n_buffer :3413, eps : 6.4%
n_episode:340, score : 42.2, n_buffer :4258, eps : 6.3%
n_episode:360, score : 76.3, n_buffer :5785, eps : 6.2%
n_episode:380, score : 122.2, n_buffer :8229, eps : 6.1%
n_episode:400, score : 103.0, n_buffer :10290, eps : 6.0%
n_episode:420, score : 102.8, n_buffer :12346, eps : 5.9%
n_episode:440, score : 114.4, n_buffer :14634, eps : 5.8%
n_episode:460, score : 159.2, n_buffer :17817, eps : 5.7%
n_episode:480, score : 224.0, n_buffer :22297, eps : 5.6%
n_episode:500, score : 171.6, n_buffer :25728, eps : 5.5%
n_episode:520, score : 187.1, n_buffer :29470, eps : 5.4%
n_episode:540, score : 170.6, n_buffer :32881, eps : 5.3%
n_episode:560, score : 194.0, n_b

KeyboardInterrupt: ignored

에피소드가 늘어날수록 평균 스코어는 오르지만 안정적으로 수렴하지는 않고(예: 224.0 → 171.6), eps는 8%에서 시작해 200 에피소드마다 1%p씩 선형으로 감소하는 것을 확인하였다.