In [1]:
import gym
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions import Categorical

In [2]:
# class Net(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.affine1 = nn.Linear(4, 128)
#         self.affine2 = nn.Linear(128, 2)

# #         self.saved_log_probs = []
# #         self.rewards = []

#     def forward(self, x):
#         x = F.relu(self.affine1(x))
#         action_scores = self.affine2(x)
#         return F.softmax(action_scores, dim=1)
    
def Net():
    """Build a fresh policy network.

    Layout: Linear(4 -> 128), ReLU, Linear(128 -> 2), then a softmax taken
    over dim 1, so each row of the output sums to one.
    """
    layers = [
        nn.Linear(4, 128),
        nn.ReLU(),
        nn.Linear(128, 2),
        nn.Softmax(dim=1),
    ]
    return nn.Sequential(*layers)


class PolicyGradient:
    """Episode bookkeeping for a policy-gradient (REINFORCE-style) learner.

    The object records the log-probability of every action chosen through
    `select_action` and every reward observed through `take_action`. Calling
    `get_loss_and_clear` at the end of an episode turns those buffers into a
    scalar loss and empties them for the next episode.
    """

    def __init__(self, model, gamma=0.99, eps=1.e-6, running_gamma=0.99,
                episode2thresh=lambda i: 0.05+0.9*np.exp(-1. * i / 100) if i>150 else 0):
        """Store the model and hyper-parameters and start with empty buffers.

        Args:
            model: network called on a float tensor with a leading batch
                dimension of 1; its output is treated as per-action
                probabilities along dim 1.
            gamma: discount factor forwarded to `get_policy_loss`.
            eps: small constant forwarded to `get_policy_loss` (added to the
                standard deviation when returns are normalized).
            running_gamma: smoothing factor of the exponential moving
                average kept in `running_reward`.
            episode2thresh: maps the number of finished episodes to the
                exploration threshold handed to the module-level
                `select_action`. With the default schedule, exploration
                only starts after 150 episodes; pass `lambda i: 0` to train
                without exploration.
        """
        self.model = model
        self.gamma = gamma
        self.eps = eps
        self.log_probs = []      # log-probabilities of the current episode's actions
        self.rewards = []        # per-step rewards of the current episode
        self.total_rewards = []  # one undiscounted total per finished episode
        self.running_reward = 0  # moving average of episode totals (starts at 0)
        self.running_gamma = running_gamma
        self.episode2thresh = episode2thresh

    @property
    def episodes(self):
        """Number of episodes finished so far."""
        return len(self.total_rewards)

    def select_action(self, obs):
        """Choose an action for `obs` and remember its log-probability."""
        self.model.train(True)
        thresh = self.episode2thresh(self.episodes)
        # Resolves to the module-level `select_action`, not to this method.
        action, log_prob = select_action(obs, self.model, thresh=thresh)
        self.log_probs.append(log_prob)
        return action

    def get_loss_and_clear(self):
        """Close the current episode and return its policy loss.

        Appends the episode's total reward to `total_rewards`, updates
        `running_reward`, computes the loss from the buffered log-probs and
        rewards, then empties both buffers.
        """
        total_reward = sum(self.rewards)
        self.total_rewards.append(total_reward)
        self.running_reward = (self.running_gamma * self.running_reward
                               + (1 - self.running_gamma) * total_reward)
        policy_loss = get_policy_loss(self.log_probs, self.rewards, self.gamma, self.eps)
        del self.log_probs[:]
        del self.rewards[:]
        return policy_loss

    def take_action(self, action, env, render=False):
        """Step `env` with `action`, record the reward, optionally render.

        Returns:
            The `(obs, reward, done, info)` tuple produced by `env.step`.
        """
        obs, reward, done, info = env.step(action)
        self.rewards.append(reward)
        if render:
            env.render()
        return obs, reward, done, info

    def greedy_policy(self, obs):
        """Return the highest-probability action for `obs` as a Python int.

        Nothing is recorded in the episode buffers; the model is switched to
        evaluation mode first.
        """
        self.model.train(False)
        state = torch.from_numpy(obs).float().unsqueeze(0)
        prob = self.model(Variable(state))
        _, action = prob.max(dim=1)
        # `action.data[0]` is a 0-dim tensor on current PyTorch releases;
        # int() yields a plain Python integer regardless of version.
        return int(action.data[0])

def select_action(obs, model, thresh=0):
    """Pick an action for a single observation.

    With probability of roughly `1 - thresh` the action is sampled from the
    categorical distribution defined by the model output; otherwise an
    action index is drawn uniformly at random (exploration).

    Args:
        obs: NumPy array holding one observation.
        model: network called on the observation with a leading batch
            dimension of 1; its output is used as per-action probabilities.
        thresh: exploration threshold compared against a uniform draw
            from `np.random.random()`.

    Returns:
        `(action, log_prob)` where `action` is a plain Python int and
        `log_prob` is the log-probability of that action under the model's
        distribution.
    """
    state = torch.from_numpy(obs).float().unsqueeze(0)
    probs = model(Variable(state))
    m = Categorical(probs)
    if np.random.random() > thresh:
        try:
            action = m.sample()
        except Exception:
            # Debug aid: show the offending probabilities before the error
            # propagates. `Exception` (not a bare except) leaves
            # KeyboardInterrupt / SystemExit untouched.
            print(probs, m)
            raise
    else:
        n_actions = probs.size(1)
        # .long() guarantees an int64 index even where NumPy's default
        # integer type is 32-bit.
        action = Variable(torch.from_numpy(np.random.randint(n_actions, size=1)).long())
    # NOTE(review): an explored action whose model probability has underflowed
    # to 0 may yield a log-probability of -inf on some PyTorch versions and
    # poison the loss -- TODO confirm against the installed version.
    return int(action.data[0]), m.log_prob(action)
    
def get_normalized_rewards(rewards, gamma, eps):
    """Turn per-step rewards into standardized discounted returns.

    The return at step t is `r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...`.
    The vector of returns is then shifted to zero mean and divided by its
    standard deviation plus `eps`.

    Args:
        rewards: sequence of per-step rewards for one episode.
        gamma: discount factor.
        eps: small constant added to the standard deviation to avoid
            dividing by zero.

    Returns:
        1-D float tensor (no gradient) with one entry per reward. For
        episodes with fewer than two steps the result is all zeros.
    """
    discounted = []
    running = 0
    # Walk backwards so each return reuses the one that follows it.
    for r in reversed(rewards):
        running = r + gamma * running
        discounted.append(running)
    ret = Variable(torch.Tensor(discounted[::-1]), requires_grad=False)
    if len(discounted) < 2:
        # The sample standard deviation of fewer than two values is NaN and
        # would turn the loss (and then the weights) into NaN. A lone return
        # equals its own mean, so its standardized value is zero.
        return ret - ret
    return (ret - ret.mean()) / (ret.std() + eps)

def get_policy_loss(log_probs, rewards, gamma, eps):
    """Return the scalar policy-gradient loss for one episode.

    The loss is the negated dot product between the concatenated action
    log-probabilities and the normalized discounted returns computed by
    `get_normalized_rewards`.

    Args:
        log_probs: list of tensors, one log-probability per step.
        rewards: per-step rewards, same length as `log_probs`.
        gamma: discount factor.
        eps: stabilizer passed through to the normalization.
    """
    stacked_log_probs = torch.cat(log_probs)
    normalized_returns = get_normalized_rewards(rewards, gamma, eps)
    return -torch.dot(stacked_log_probs, normalized_returns)

In [3]:
def clip_grads(net, low=-10, high=10):
    """Clamp every gradient element of `net` into [low, high], in place.

    Parameters that have no gradient yet (`grad is None`) are skipped.
    """
    with_grad = (p for p in net.parameters() if p.grad is not None)
    for param in with_grad:
        param.grad.data.clamp_(low, high)

In [4]:
# Policy network, the episode bookkeeping wrapper around it, and the Adam
# optimizer that updates the network's parameters.
net = Net()
trainer = PolicyGradient(model=net)
optimizer = optim.Adam(params=net.parameters(), lr=1e-2, weight_decay=0.001)

In [6]:
env = gym.make('CartPole-v1')

for episode in range(30):
    obs, total_reward = env.reset(), 0
    # The inner range caps the number of steps within one episode
    # (it is not a limit on the number of episodes).
    for _ in range(10000):
        action = trainer.select_action(obs)
        obs, reward, done, _ = trainer.take_action(action, env, render=False)
        total_reward += reward
        if done:
            break

    # One gradient update per finished episode.
    policy_loss = trainer.get_loss_and_clear()
    optimizer.zero_grad()
    policy_loss.backward()
    clip_grads(trainer.model, -5, 5)
    optimizer.step()

    running_reward = trainer.running_reward
    print(episode, total_reward, running_reward)
    # Stop early once the smoothed reward beats the environment's threshold.
    if running_reward > env.spec.reward_threshold:
        break
print("Finished: %s@%s" % (trainer.running_reward, episode))

0 47.0 18.523583911480916
1 40.0 18.738348072366108
2 51.0 19.06096459164245
3 41.0 19.280354945726025
4 50.0 19.587551396268765
5 49.0 19.88167588230608
6 59.0 20.27285912348302
7 45.0 20.52013053224819
8 43.0 20.744929226925706
9 44.0 20.97747993465645
10 72.0 21.487705135309888
11 57.0 21.84282808395679
12 51.0 22.134399803117223
13 69.0 22.60305580508605
14 50.0 22.87702524703519
15 51.0 23.15825499456484
16 52.0 23.44667244461919
17 101.0 24.222205720172997
18 44.0 24.41998366297127
19 79.0 24.965783826341557
20 59.0 25.30612598807814
21 68.0 25.73306472819736
22 133.0 26.805734080915386
23 102.0 27.55767674010623
24 122.0 28.50209997270517
25 122.0 29.43707897297812
26 263.0 31.77270818324834
27 78.0 32.234981101415855
28 140.0 33.3126312904017
29 69.0 33.66950497749768
Finished: 33.66950497749768@29


In [8]:
env = gym.make('CartPole-v1')

for episode in range(10000):
    obs, total_reward = env.reset(), 0
    # The inner range caps the number of steps within one episode
    # (it is not a limit on the number of episodes).
    for _ in range(10000):
        action = trainer.select_action(obs)
        obs, reward, done, _ = trainer.take_action(action, env, render=False)
        total_reward += reward
        if done:
            break

    # One gradient update per finished episode.
    policy_loss = trainer.get_loss_and_clear()
    optimizer.zero_grad()
    policy_loss.backward()
    clip_grads(trainer.model, -5, 5)
    optimizer.step()

    running_reward = trainer.running_reward
    print(episode, total_reward, running_reward)
    # Stop early once the smoothed reward beats the environment's threshold.
    if running_reward > env.spec.reward_threshold:
        break
print("Finished: %s@%s" % (trainer.running_reward, episode))

0 116.0 43.6323291637344
1 74.0 43.936005872097056
2 65.0 44.14664581337608
3 65.0 44.35517935524232
4 81.0 44.7216275616899
5 100.0 45.274411286073
6 126.0 46.08166717321227
7 57.0 46.19085050148014
8 57.0 46.29894199646534
9 50.0 46.33595257650068
10 89.0 46.762593050735674
11 92.0 47.214967120228316
12 93.0 47.672817449026034
13 97.0 48.16608927453577
14 48.0 48.164428381790415
15 118.0 48.86278409797251
16 57.0 48.944156256992784
17 67.0 49.124714694422856
18 39.0 49.023467547478624
19 74.0 49.27323287200384
20 41.0 49.190500543283804
21 103.0 49.728595537850964
22 55.0 49.78130958247245
23 43.0 49.71349648664773
24 46.0 49.67636152178125
25 52.0 49.69959790656344
26 63.0 49.832601927497805
27 60.0 49.93427590822283
28 91.0 50.3449331491406
29 64.0 50.4814838176492
30 87.0 50.8466689794727
31 46.0 50.798202289677974
32 47.0 50.76022026678119
33 56.0 50.81261806411338
34 68.0 50.984491883472245
35 51.0 50.98464696463752
36 48.0 50.95480049499115
37 40.0 50.84525249004123
38 107.0 51

RuntimeError: invalid argument 2: invalid multinomial distribution (encountering probability entry < 0) at /home/jiancheng/apt/pytorch/aten/src/TH/generic/THTensorRandom.c:306

In [None]:
# Write only the network's parameters (its state dict) to a file whose path
# is resolved relative to the current working directory.
torch.save(obj=net.state_dict(), f="pg_exploration.pth")

In [9]:
# Play 20 episodes with the greedy (arg-max) policy and report each total.
for i in range(20):
    obs, total_reward = env.reset(), 0
    # The inner range caps the number of steps within one episode
    # (it is not a limit on the number of episodes).
    for _ in range(10000):
        action = trainer.greedy_policy(obs)
        obs, reward, done, _ = env.step(action)
        env.render()
        total_reward += reward
        if done:
            break
    print("Encore: %s" % total_reward)

Encore: 9.0
Encore: 9.0
Encore: 8.0
Encore: 10.0
Encore: 9.0
Encore: 8.0
Encore: 9.0
Encore: 8.0
Encore: 9.0
Encore: 11.0
Encore: 10.0
Encore: 10.0
Encore: 9.0
Encore: 9.0
Encore: 8.0
Encore: 10.0
Encore: 8.0
Encore: 10.0
Encore: 10.0
Encore: 10.0


In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Learning curve: undiscounted total reward of every finished episode.
# Title and axis labels let the figure stand alone; the trailing semicolon
# hides the textual repr of the last expression.
fig, ax = plt.subplots()
ax.plot(trainer.total_rewards)
ax.set(title="Total reward per training episode",
       xlabel="Episode", ylabel="Total reward");