https://github.com/tsmatz/reinforcement-learning-tutorials/blob/master/02-policy-gradient.ipynb

In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class PolicyPi(nn.Module):
    """Small MLP policy: state vector -> unnormalised action logits.

    The network is one hidden ``Linear`` layer with ReLU followed by a
    ``Linear`` output layer. It returns raw logits; callers apply softmax
    (for sampling) or cross-entropy (for the loss) themselves.

    Args:
        hidden_dim: Width of the hidden layer.
        state_dim: Number of features in one state vector. The default (4)
            is the size this notebook uses with CartPole-v1.
        n_actions: Number of discrete actions, i.e. number of logits
            produced. The default (2) is the size this notebook uses with
            CartPole-v1.
    """

    def __init__(self, hidden_dim=64, state_dim=4, n_actions=2):
        super().__init__()
        # Attribute names and creation order are kept stable so state_dict
        # keys and seeded initialisation do not change.
        self.hidden = nn.Linear(state_dim, hidden_dim)
        self.classify = nn.Linear(hidden_dim, n_actions)

    def forward(self, s):
        """Map a batch of states ``(..., state_dim)`` to logits ``(..., n_actions)``."""
        outs = F.relu(self.hidden(s))
        return self.classify(outs)
    
# Module-level policy network built with the default layer sizes; the cells
# below read it as a global.
policy_pi = PolicyPi()

In [3]:
# Discount factor applied to later rewards when computing each step's return.
gamma = 0.99

def pick_sample(s):
    """Sample one action for state ``s`` from the global policy ``policy_pi``.

    Args:
        s: A single (unbatched) state; anything ``np.expand_dims`` accepts,
            e.g. the observation array returned by the environment.

    Returns:
        int: Index of the sampled action.
    """
    # Rollout only: no autograd graph is needed while collecting samples.
    with torch.no_grad():
        # Add a leading batch axis of size 1 before feeding the network.
        s_batch = np.expand_dims(s, axis=0)
        s_batch = torch.tensor(s_batch, dtype=torch.float)

        # Call the module itself (not .forward) so registered hooks run,
        # then drop the batch axis again.
        logits = policy_pi(s_batch).squeeze(dim=0)

        # Pass the axis explicitly: the implicit-dim form of softmax is
        # deprecated and emits a UserWarning on every call.
        probs = F.softmax(logits, dim=-1)

        # multinomial returns a 1-element tensor holding the sampled index.
        a = torch.multinomial(probs, num_samples=1)

        return a.item()

# Environment the policy is trained on.
env = gym.make("CartPole-v1")
# Total (undiscounted) reward of each training episode, in episode order.
reward_records = []
# AdamW over the policy network's parameters.
opt = torch.optim.AdamW(policy_pi.parameters(), lr=0.001)

# Policy-gradient training: one gradient step per collected episode.
for i in range(1000):

    # --- Roll out one full episode with the current policy ---
    # Uses the gym API where reset() returns the observation and step()
    # returns a 4-tuple.
    done = False
    states = []
    actions = []
    rewards = []
    s = env.reset()
    while not done:
        # Record the state the action was chosen in (before stepping).
        states.append(s.tolist())
        a = pick_sample(s)
        s, r, done, _ = env.step(a)
        actions.append(a)
        rewards.append(r)

    # --- Discounted return for every time step ---
    # G_t = r_t + gamma * G_{t+1}, filled in from the last step backwards so
    # each step can reuse the return already computed for the step after it.
    # The buffer is explicitly float64: np.zeros_like(rewards) would inherit
    # an integer dtype from integer rewards and truncate the discounted sums.
    reward_len = len(rewards)
    cum_rewards = np.zeros(reward_len, dtype=np.float64)
    running_return = 0.0
    for j in reversed(range(reward_len)):
        running_return = rewards[j] + gamma * running_return
        cum_rewards[j] = running_return

    # --- Gradient step ---
    states = torch.tensor(states, dtype=torch.float)
    actions = torch.tensor(actions, dtype=torch.int64)
    cum_rewards = torch.tensor(cum_rewards, dtype=torch.float)
    opt.zero_grad()
    logits = policy_pi(states)
    # Per-sample cross-entropy is -log pi(a|s); negate to get log pi(a|s).
    log_probs = -F.cross_entropy(logits, actions, reduction="none")
    # Minimising -log pi(a|s) * G_t raises the probability of actions that
    # were followed by high return.
    loss = -log_probs * cum_rewards
    loss.sum().backward()
    opt.step()

    # Record total rewards in episode (max 500)
    episode_reward = sum(rewards)
    print("Run episode{} with rewards {}".format(i, episode_reward), end="\r")
    reward_records.append(episode_reward)

print("\nDone")
env.close()

  # Remove the CWD from sys.path while we load stuff.


Run episode999 with rewards 500.0
Done


In [8]:
# Roll out and render one episode with the trained policy.
# The training cell already called env.close(), so build a fresh environment
# here instead of resetting a closed one.
env = gym.make("CartPole-v1")
try:
    s = env.reset()
    done = False

    while not done:
        # Actions are sampled from the policy, the same way as in training.
        a = pick_sample(s)
        s, r, done, _ = env.step(a)
        env.render()
finally:
    # Release the render window even if the rollout is interrupted.
    env.close()

  # Remove the CWD from sys.path while we load stuff.
