In [1]:
# https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py
import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
# One record per environment step: the log-probability of the action that was
# sampled and the critic's value estimate produced alongside it.
HistoricalAction = namedtuple('HistoricalAction', 'log_prob value')

class ActorCritic(nn.Module):
    """Actor and critic heads on top of one shared hidden layer.

    The actor head yields a probability distribution over discrete actions;
    the critic head yields a single value estimate for the same input.

    Args:
        obs_dim: Number of input features per observation.
        hidden_dim: Width of the shared hidden layer.
        n_actions: Number of discrete actions the actor chooses between.

    The defaults reproduce the original fixed 4 -> 128 -> (2, 1) layout, so
    ``ActorCritic()`` behaves exactly as before.
    """

    def __init__(self, obs_dim: int = 4, hidden_dim: int = 128, n_actions: int = 2):
        super().__init__()
        # Layers are created in the same order as before so that a seeded
        # parameter initialisation yields the same weights.
        self.linear = nn.Linear(obs_dim, hidden_dim)
        self.head_action = nn.Linear(hidden_dim, n_actions)
        self.head_value = nn.Linear(hidden_dim, 1)

        # Episode buffers: appended to while an episode runs and emptied
        # again by end_episode() after the optimisation step.
        self.historical_actions = []
        self.rewards = []

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for input ``x``.

        The probabilities come from a softmax over the last dimension of the
        actor head's output; the state value is the raw critic head output.
        """
        x = F.relu(self.linear(x))
        action_scores = self.head_action(x)
        state_values = self.head_value(x)
        return F.softmax(action_scores, dim=-1), state_values


# Shared network instance and the Adam optimiser that updates its parameters.
model = ActorCritic()
optimizer = optim.Adam(params=model.parameters(), lr=0.03)
# float32 machine epsilon as a plain Python float; added to the standard
# deviation in end_episode() so the normalisation never divides by zero.
eps = float(np.finfo(np.float32).eps)


def choose_action(state):
    """Sample an action for ``state`` and remember what is needed to learn from it.

    ``state`` is a NumPy array; it is converted to a float tensor and fed to
    the global ``model``. The sampled action's log-probability and the
    critic's value estimate are appended to ``model.historical_actions``.

    Returns:
        The sampled action as a plain Python number.
    """
    observation = torch.from_numpy(state).float()
    action_probs, value_estimate = model(observation)

    # Draw from the categorical distribution defined by the actor's output.
    policy = Categorical(action_probs)
    sampled = policy.sample()

    record = HistoricalAction(log_prob=policy.log_prob(sampled), value=value_estimate)
    model.historical_actions.append(record)
    return sampled.item()


def end_episode():
    """Perform one actor-critic update from the finished episode.

    Reads ``model.rewards`` and ``model.historical_actions``, builds the
    discounted returns using the module-level ``gamma``, takes a single
    ``optimizer`` step on the summed policy and value losses, and finally
    empties both buffers in place.
    """
    # Accumulate discounted returns from the last step backwards. Appending
    # and reversing once avoids the quadratic cost of inserting at index 0.
    discounted = 0
    returns = []
    for reward in reversed(model.rewards):
        discounted = reward + gamma * discounted
        returns.append(discounted)
    returns.reverse()
    returns = torch.tensor(returns)

    # The sample standard deviation of a single value is NaN, which would
    # propagate into the loss and then into the weights. Only normalise when
    # there are at least two returns; a lone return is used as-is.
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    losses_policy = []
    losses_value = []
    for (log_prob, value), R in zip(model.historical_actions, returns):
        # .item() yields a plain number, so no gradient from the policy term
        # flows back through the critic's estimate.
        advantage = R - value.item()
        losses_policy.append(-log_prob * advantage)
        # One-element target, same shape the original torch.tensor([R])
        # produced, without re-wrapping an existing tensor element.
        losses_value.append(F.smooth_l1_loss(value, R.reshape(1)))

    optimizer.zero_grad()
    loss = torch.stack(losses_policy).sum() + torch.stack(losses_value).sum()
    loss.backward()
    optimizer.step()

    # Clear in place so other references to these lists see them emptied.
    del model.rewards[:]
    del model.historical_actions[:]


def main():
    """Train episode by episode until the smoothed reward beats the threshold.

    Each episode runs for at most 9999 steps, choosing actions with
    ``choose_action`` and storing every reward in ``model.rewards``. After the
    episode, an exponentially smoothed reward is updated and ``end_episode``
    performs the learning step. Progress is printed every ``log_interval``
    episodes, and training stops once the smoothed reward exceeds
    ``environment.spec.reward_threshold``.
    """
    smoothing = 0.05
    progress_line = 'Episode number {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'
    solved_line = ("Solved! Running reward is {} and "
                   "the last episode runs to {} time steps!")

    running_reward = 10
    for episode in count(1):
        state = environment.reset()
        episode_reward = 0

        for step in range(1, 10000):
            chosen = choose_action(state)
            state, reward, done, _ = environment.step(chosen)
            if render:
                environment.render()
            model.rewards.append(reward)
            episode_reward += reward
            if done:
                break

        # Exponential moving average of the per-episode reward.
        running_reward = smoothing * episode_reward + (1 - smoothing) * running_reward
        end_episode()

        if episode % log_interval == 0:
            print(progress_line.format(episode, episode_reward, running_reward))
        if running_reward > environment.spec.reward_threshold:
            # `step` still holds the length of the episode that just finished.
            print(solved_line.format(running_reward, step))
            break

# Run configuration, read as module-level globals by the functions above.
seed = 100
gamma = 0.99  # discount factor used when accumulating returns in end_episode()
# This used to be the string 'store_true' (the name of an argparse action),
# which switched rendering on only because any non-empty string is truthy.
# A real boolean keeps that behaviour; set it to False to train without
# drawing every step.
render = True
log_interval = 10  # main() prints progress every this many episodes

environment = gym.make('CartPole-v0')
environment.seed(seed)
# NOTE: `model` is constructed earlier in the file, before this call, so its
# initial weights are not covered by this seed; only the random draws made
# after this point are.
torch.manual_seed(seed)

main()

Episode number 10	Last reward: 24.00	Average reward: 21.77
Episode number 20	Last reward: 200.00	Average reward: 88.68
Episode number 30	Last reward: 100.00	Average reward: 101.79
Episode number 40	Last reward: 134.00	Average reward: 115.30
Episode number 50	Last reward: 170.00	Average reward: 131.04
Episode number 60	Last reward: 140.00	Average reward: 130.99
Episode number 70	Last reward: 200.00	Average reward: 145.82
Episode number 80	Last reward: 173.00	Average reward: 162.88
Episode number 90	Last reward: 141.00	Average reward: 169.09
Episode number 100	Last reward: 174.00	Average reward: 164.69
Episode number 110	Last reward: 200.00	Average reward: 176.47
Episode number 120	Last reward: 183.00	Average reward: 185.06
Episode number 130	Last reward: 200.00	Average reward: 182.33
Episode number 140	Last reward: 187.00	Average reward: 187.80
Episode number 150	Last reward: 191.00	Average reward: 186.57
Episode number 160	Last reward: 192.00	Average reward: 181.48
Episode number 170	L