Policy-based RL: REINFORCE (Monte Carlo policy gradient) on CartPole-v1

In [5]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Hyperparameters
learning_rate = 0.0002  # step size handed to the Adam optimizer in Policy.__init__
gamma         = 0.98    # discount factor used when accumulating returns in train_net


class Policy(nn.Module):
    """Small softmax policy network trained with a REINFORCE-style update.

    The network is ``Linear -> ReLU -> Linear -> softmax`` and owns its own
    Adam optimizer. Per-step ``(reward, prob_of_chosen_action)`` pairs are
    buffered with :meth:`put_data` and consumed by :meth:`train_net`.

    Args:
        state_dim: Size of the observation vector (default 4).
        action_dim: Number of discrete actions (default 2).
        hidden_dim: Width of the hidden layer (default 128).
    """

    def __init__(self, state_dim=4, action_dim=2, hidden_dim=128):
        super(Policy, self).__init__()
        # Buffer of (reward, prob) pairs; the prob tensors still carry their
        # autograd graph, which is what train_net backpropagates through.
        self.data = []

        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Return action probabilities for ``x``.

        ``x`` may be a single state of shape ``(state_dim,)`` or a batch of
        shape ``(..., state_dim)``; the softmax is taken over the last
        (action) dimension, so each state gets its own distribution.
        """
        x = F.relu(self.fc1(x))
        # dim=-1 equals the former dim=0 for 1-D input, and is also correct
        # for batched input (dim=0 would normalise across the batch instead).
        return F.softmax(self.fc2(x), dim=-1)

    def put_data(self, item):
        """Append one ``(reward, prob)`` pair to the buffer."""
        self.data.append(item)

    def train_net(self):
        """Apply one optimizer step from the buffered data, then clear it.

        Walks the buffer from last to first so the discounted return
        ``R = r + gamma * R`` can be built incrementally, and accumulates the
        gradient of ``-log(prob) * R`` for every step before a single
        optimizer step. Does nothing when the buffer is empty.
        """
        if not self.data:
            # Nothing collected: skip the optimizer step entirely.
            return

        R = 0
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]:  # backward calculation
            R = r + gamma * R
            loss = -torch.log(prob) * R
            loss.backward()  # gradients accumulate across steps
        self.optimizer.step()
        self.data = []



In [6]:
# Build the environment and report the sizes of its state and action spaces.
env = gym.make('CartPole-v1')
num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.n
print("Size of Action Space ->  {}".format(num_actions))

pi = Policy()
score = 0.0          # sum of rewards since the last progress report
print_interval = 100  # number of episodes between progress reports

# NOTE(review): the unpacking below assumes the older gym API, where reset()
# returns only the observation and step() returns a 4-tuple — confirm the
# installed gym version.
for n_epi in range(10000):
    s = env.reset()
    done = False

    while not done:  # CartPole-v1 is forced to terminate at 500 steps.
        prob = pi(torch.from_numpy(s).float())
        m = Categorical(prob)
        a = m.sample()
        s_prime, r, done, info = env.step(a.item())
        # Keep the reward and the probability of the sampled action for the
        # policy update at the end of the episode.
        pi.put_data((r, prob[a]))
        s = s_prime
        score += r

    # One policy update per finished episode.
    pi.train_net()

    # Report after every `print_interval` completed episodes, so the average
    # always covers exactly `print_interval` episodes (including the first one).
    if (n_epi + 1) % print_interval == 0:
        print("# of episode :{}, avg score : {}".format(n_epi + 1, score/print_interval))
        score = 0.0


Size of State Space ->  4
Size of Action Space ->  2
# of episode :100, avg score : 25.48
# of episode :200, avg score : 31.31
# of episode :300, avg score : 37.8
# of episode :400, avg score : 39.75
# of episode :500, avg score : 45.93
# of episode :600, avg score : 62.73
# of episode :700, avg score : 67.64
# of episode :800, avg score : 91.72
# of episode :900, avg score : 119.42
# of episode :1000, avg score : 148.91
# of episode :1100, avg score : 167.75
# of episode :1200, avg score : 196.79
# of episode :1300, avg score : 206.51
# of episode :1400, avg score : 223.02
# of episode :1500, avg score : 220.87
# of episode :1600, avg score : 253.33
# of episode :1700, avg score : 282.35
# of episode :1800, avg score : 290.35
# of episode :1900, avg score : 272.93
# of episode :2000, avg score : 312.82
# of episode :2100, avg score : 338.51
# of episode :2200, avg score : 339.02
# of episode :2300, avg score : 300.67
# of episode :2400, avg score : 314.37
# of episode :2500, avg score

Test: evaluate the trained policy without further training

In [8]:
# Evaluate the trained policy: run episodes and report the average score,
# without collecting training data or updating the network.
score = 0.0

for n_epi in range(1000):
    s = env.reset()
    done = False

    while not done:  # CartPole-v1 is forced to terminate at 500 steps.
        # No gradients are needed for evaluation, so skip building the
        # autograd graph on every step.
        with torch.no_grad():
            prob = pi(torch.from_numpy(s).float())
        m = Categorical(prob)
        a = m.sample()
        s_prime, r, done, info = env.step(a.item())
        # Nothing is pushed into pi.data here: train_net() is never called in
        # this loop, so buffering would only grow the list without bound.
        s = s_prime
        score += r

    # Report after every `print_interval` completed episodes, so the average
    # always covers exactly `print_interval` episodes (including the first one).
    if (n_epi + 1) % print_interval == 0:
        print("# of episode :{}, avg score : {}".format(n_epi + 1, score/print_interval))
        score = 0.0



# of episode :100, avg score : 468.23
# of episode :200, avg score : 475.74
# of episode :300, avg score : 477.46
# of episode :400, avg score : 471.5
# of episode :500, avg score : 469.75
# of episode :600, avg score : 479.85
# of episode :700, avg score : 479.53
# of episode :800, avg score : 469.23
# of episode :900, avg score : 482.2


In [9]:
# Close the CartPole environment created earlier in the notebook.
env.close()
