In [1]:
import gym
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions import Categorical

In [12]:
# class Net(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.affine1 = nn.Linear(4, 128)
#         self.affine2 = nn.Linear(128, 2)

# #         self.saved_log_probs = []
# #         self.rewards = []

#     def forward(self, x):
#         x = F.relu(self.affine1(x))
#         action_scores = self.affine2(x)
#         return F.softmax(action_scores, dim=1)
    
def Net(n_inputs=4, n_hidden=128, n_actions=2):
    """Build the policy network: Linear -> ReLU -> Linear -> Softmax.

    The defaults reproduce the original fixed architecture (4 observation
    features, 128 hidden units, 2 actions).

    Args:
        n_inputs: size of the observation vector.
        n_hidden: width of the single hidden layer.
        n_actions: number of discrete actions (size of the output).

    Returns:
        An ``nn.Sequential`` mapping a ``(batch, n_inputs)`` tensor to
        ``(batch, n_actions)`` action probabilities (softmax over dim 1).
    """
    return nn.Sequential(
        nn.Linear(n_inputs, n_hidden),
        nn.ReLU(),
        nn.Linear(n_hidden, n_actions),
        nn.Softmax(dim=1),
    )


class PolicyGradient:
    """Per-episode bookkeeping for a REINFORCE-style policy-gradient agent.

    While an episode is running the object records the log-probability of
    every action it selected and every reward it received.  At the end of the
    episode ``get_loss_and_clear`` converts those buffers into a loss and
    empties them for the next episode.

    Args:
        model: network called with a ``(1, n_obs)`` float tensor; its output
            is treated as action probabilities along dim 1.
        gamma: discount factor forwarded to ``get_policy_loss``.
        eps: small constant forwarded to ``get_policy_loss`` (added to the
            standard deviation when returns are normalised).
        running_gamma: smoothing factor of the exponential moving average
            kept in ``running_reward``.
        episode2thresh: maps the number of finished episodes to the
            exploration threshold handed to the module-level
            ``select_action`` (a uniformly random action is taken when a
            uniform draw does not exceed it).  Pass ``lambda i: 0`` to
            disable exploration.
    """

    def __init__(self, model, gamma=0.99, eps=1.e-6, running_gamma=0.99,
                 episode2thresh=lambda i: 0.05+0.9*np.exp(-1. * i / 100) if i>10 else 0):
        # Default schedule: no exploration while the episode count is <= 10,
        # afterwards a threshold of 0.05 + 0.9 * exp(-i / 100) that decays
        # towards 0.05.
        self.model = model
        self.gamma = gamma
        self.eps = eps
        self.log_probs = []      # log pi(a_t | s_t) for the current episode
        self.rewards = []        # r_t for the current episode
        self.total_rewards = []  # undiscounted return of every finished episode
        self.running_reward = 0  # moving average of episode returns, starts at 0
        self.running_gamma = running_gamma
        self.episode2thresh = episode2thresh

    @property
    def episodes(self):
        """Number of episodes completed so far."""
        return len(self.total_rewards)

    def select_action(self, obs):
        """Choose an action for ``obs`` in training mode and remember its log-probability."""
        self.model.train(True)
        thresh = self.episode2thresh(self.episodes)
        # The bare name resolves to the module-level ``select_action``
        # function, not to this method.
        action, log_prob = select_action(obs, self.model, thresh=thresh)
        self.log_probs.append(log_prob)
        return action

    def get_loss_and_clear(self):
        """Finish the episode: update statistics, build the loss, reset buffers.

        Returns:
            The policy loss computed by ``get_policy_loss`` from the recorded
            log-probabilities and rewards.
        """
        total_reward = sum(self.rewards)
        self.total_rewards.append(total_reward)
        self.running_reward = (self.running_gamma * self.running_reward
                               + (1 - self.running_gamma) * total_reward)
        policy_loss = get_policy_loss(self.log_probs, self.rewards, self.gamma, self.eps)
        # Clear in place so the same list objects are reused next episode.
        del self.log_probs[:]
        del self.rewards[:]
        return policy_loss

    def take_action(self, action, env, render=False):
        """Step ``env`` with ``action``, record the reward, optionally render.

        Returns:
            The ``(obs, reward, done, info)`` tuple produced by ``env.step``.
        """
        obs, reward, done, info = env.step(action)
        self.rewards.append(reward)
        if render:
            env.render()
        return obs, reward, done, info

    def greedy_policy(self, obs):
        """Return the most probable action for ``obs`` as a plain Python int.

        The model is switched to evaluation mode.  The index is converted
        with ``int`` because indexing a tensor yields a 0-dim tensor on
        PyTorch >= 0.4, which is not a valid discrete action for the
        environment.
        """
        self.model.train(False)
        state = torch.from_numpy(obs).float().unsqueeze(0)
        prob = self.model(Variable(state))
        _, action = prob.max(dim=1)
        return int(action.data[0])

def select_action(obs, model, thresh=0):
    """Pick an action for a single observation, with optional random exploration.

    Args:
        obs: 1-D numpy observation; it is converted to a ``(1, n_obs)`` float
            tensor before being fed to ``model``.
        model: callable returning action probabilities of shape
            ``(1, n_actions)``.
        thresh: exploration threshold.  A uniform draw from ``[0, 1)`` is
            compared against it: above the threshold the action is sampled
            from the policy, otherwise it is drawn uniformly at random.

    Returns:
        ``(action, log_prob)`` where ``action`` is a plain Python ``int`` and
        ``log_prob`` is the policy's log-probability of that action (also for
        random exploratory actions), attached to the autograd graph.
    """
    state = torch.from_numpy(obs).float().unsqueeze(0)
    probs = model(Variable(state))
    m = Categorical(probs)
    if np.random.random() > thresh:
        try:
            action = m.sample()
        except Exception:
            # Dump the distribution to help diagnose the failure, then re-raise.
            print(probs, m)
            raise
    else:
        action_space = probs.size(1)
        random_index = np.random.randint(action_space, size=1)
        # Cast to int64 so log_prob does not depend on numpy's platform
        # default integer type.
        action = Variable(torch.from_numpy(random_index).long())
    # Indexing a tensor yields a 0-dim tensor on PyTorch >= 0.4; the
    # environment expects a plain integer action.
    return int(action.data[0]), m.log_prob(action)
    
def get_normalized_rewards(rewards, gamma, eps):
    """Turn per-step rewards into standardised discounted returns.

    For every step ``t`` the discounted return
    ``G_t = r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...`` is computed,
    then the vector of returns is shifted to zero mean and divided by
    ``std + eps``.

    Args:
        rewards: sequence of per-step rewards for one episode.
        gamma: discount factor.
        eps: constant added to the standard deviation to avoid division by zero.

    Returns:
        1-D float tensor (no gradient) with one entry per step.  An empty
        episode yields an empty tensor; a one-step episode yields ``[0.]``
        because the sample standard deviation of a single value is undefined
        (NaN) and would otherwise poison the loss.
    """
    returns = []
    running = 0
    # Accumulate from the last step backwards so each return reuses the next one.
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()

    ret = Variable(torch.Tensor(returns), requires_grad=False)
    if len(returns) == 0:
        return ret
    centered = ret - ret.mean()
    if len(returns) == 1:
        # std() of a single element is NaN; the centred value is already 0.
        return centered
    return centered / (ret.std() + eps)

def get_policy_loss(log_probs, rewards, gamma, eps):
    """REINFORCE loss for one episode.

    Concatenates the per-step log-probabilities, weights each one by its
    normalised discounted return and returns the negated sum (a sum over
    steps, not a mean).
    """
    stacked_log_probs = torch.cat(log_probs)
    weights = get_normalized_rewards(rewards, gamma, eps)
    weighted_sum = torch.dot(stacked_log_probs, weights)
    return -weighted_sum

In [13]:
def clip_grads(net, low=-10, high=10):
    """Clamp each gradient of ``net`` element-wise into [low, high], in place.

    Parameters that have no gradient yet are skipped.
    """
    for param in net.parameters():
        grad = param.grad
        if grad is None:
            continue
        grad.data.clamp_(low, high)
        
# clip_grads = nn.utils.clip_grad_norm

In [20]:
# A small learning rate makes training more stable.
net = Net()
optimizer = optim.Adam(
    net.parameters(),
    lr=1e-3,
    weight_decay=0.001,
)
trainer = PolicyGradient(model=net)

In [21]:
# Train with one gradient update per episode until the trainer's running
# reward exceeds the environment's reward threshold (or 10000 episodes pass).
env = gym.make('CartPole-v1')

for episode in range(10000):
    obs = env.reset()
    total_reward = 0
    # Cap each episode at 10000 steps. `_` is rebound to the step's info
    # value inside the loop; the range iteration itself is unaffected.
    for _ in range(10000):
        action = trainer.select_action(obs)
        obs, reward, done, _ = trainer.take_action(action, env, render=False)
        total_reward+=reward
        if done:
             break
    # Builds the episode loss and clears the trainer's per-episode buffers.
    policy_loss = trainer.get_loss_and_clear()
#     print(policy_loss)
    # Order matters: reset old gradients, backpropagate, clip, then step.
    optimizer.zero_grad()
    policy_loss.backward()
    clip_grads(trainer.model,-10,10)
#     nn.utils.clip_grad_norm(trainer.model.parameters(), 40)
    optimizer.step()
    # Exponential moving average of episode returns; it is initialised to 0,
    # so it starts far below the actual returns.
    running_reward = trainer.running_reward
    print(episode, total_reward,running_reward)
    if running_reward>env.spec.reward_threshold:
        break
print("Finished: %s@%s" %(trainer.running_reward,episode))

0 23.0 0.2300000000000002
1 14.0 0.36770000000000036
2 12.0 0.4840230000000005
3 14.0 0.6191827700000005
4 27.0 0.8829909423000007
5 11.0 0.9841610328770009
6 17.0 1.144319422548231
7 22.0 1.352876228322749
8 24.0 1.5793474660395217
9 16.0 1.7235539913791267
10 36.0 2.0663184514653357
11 32.0 2.3656552669506827
12 34.0 2.681998714281176
13 12.0 2.7751787271383646
14 22.0 2.9674269398669813
15 45.0 3.387752670468312
16 28.0 3.6338751437636287
17 16.0 3.7575363923259926
18 40.0 4.119961028402733
19 11.0 4.188761418118705
20 52.0 4.666873803937519
21 12.0 4.740205065898144
22 25.0 4.942803015239162
23 12.0 5.013374985086771
24 24.0 5.203241235235904
25 13.0 5.281208822883545
26 34.0 5.56839673465471
27 12.0 5.632712767308163
28 14.0 5.716385639635082
29 15.0 5.809221783238731
30 42.0 6.1711295654063445
31 17.0 6.279418269752281
32 27.0 6.486624087054758
33 9.0 6.51175784618421
34 24.0 6.686640267722368
35 17.0 6.789773865045144
36 46.0 7.181876126394693
37 15.0 7.260057365130746
38 11.0 7

In [22]:
# Persist the trained policy's parameters (state_dict only) to a file in the
# current working directory.
torch.save(net.state_dict(), "pg_exploration.pth")

In [23]:
# Evaluate the trained policy: 20 rendered episodes using the greedy
# (arg-max) action, bypassing the trainer's reward bookkeeping.
for i in range(20):
    obs = env.reset()
    total_reward=0
    # Cap each episode at 10000 steps. `_` is rebound to the step's info
    # value inside the loop; the range iteration itself is unaffected.
    for _ in range(10000):
        action = trainer.greedy_policy(obs)
        obs, reward, done, _ = env.step(action)
        env.render()
        total_reward+=reward
        if done:
             break
    print("Encore: %s"%total_reward)

Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Learning curve: undiscounted total reward of every training episode.
plt.plot(trainer.total_rewards)
plt.xlabel("Episode")
plt.ylabel("Total reward")
plt.title("Policy-gradient training on CartPole-v1")
plt.show()  # also suppresses the repr of the last expression in the cell output