In [1]:
# import dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
import gym
from collections import deque

In [2]:
# define policy network
class policy_net(nn.Module):
    """Small MLP policy: maps a state vector to probabilities over discrete actions.

    Args:
        nS: size of the state (observation) vector.
        nH: number of neurons in the single hidden layer.
        nA: number of discrete actions (size of the action space).
    """

    def __init__(self, nS, nH, nA):
        super().__init__()
        self.h = nn.Linear(nS, nH)
        self.out = nn.Linear(nH, nA)

    def forward(self, x):
        """Return action probabilities for ``x``.

        One hidden layer with ReLU activation, followed by a softmax over the
        output layer. The softmax is taken over the last dimension (the action
        dimension produced by ``self.out``), so a batch of states ``(N, nS)``
        and a single unbatched state ``(nS,)`` are both accepted.
        """
        x = F.relu(self.h(x))
        # dim=-1 instead of dim=1: same result for 2-D input, but does not
        # fail on 1-D input or normalise the wrong axis on higher-rank input.
        return F.softmax(self.out(x), dim=-1)

In [4]:
# Build the CartPole-v1 environment the agent interacts with.
env = gym.make("CartPole-v1")
# Policy network: input width taken from the observation vector, 20 hidden
# units, one output per discrete action.
policy = policy_net(
    nS=env.observation_space.shape[0],
    nH=20,
    nA=env.action_space.n,
)
# Adam over all policy parameters, left at its default hyperparameters.
optimizer = torch.optim.Adam(params=policy.parameters())

In [11]:
# initialize gamma and stats
gamma = 0.99
n_episode = 1
returns = deque(maxlen=100)  # rolling window used for the printed average return
render_rate = 100  # render every render_rate episodes

# REINFORCE training loop. It runs until the kernel is interrupted; the
# try/finally guarantees the environment is closed when that happens.
try:
    while True:
        rewards = []
        actions = []
        states = []
        # reset environment
        state, _ = env.reset()
        while True:
            # render episode every render_rate episodes
            # NOTE(review): env is created without a render_mode; on gym >= 0.26
            # this call presumably only emits a warning -- confirm.
            if n_episode % render_rate == 0:
                env.render()

            # calculate probabilities of taking each action; no autograd graph is
            # needed here because the probabilities are recomputed with gradients
            # for the whole episode after the rollout
            with torch.no_grad():
                probs = policy(torch.as_tensor(state, dtype=torch.float32).unsqueeze(0))
            # sample an action from that set of probs
            action = Categorical(probs).sample().item()

            # use that action in the environment
            new_state, reward, terminated, truncated, info = env.step(action)
            # store state, action and reward
            states.append(state)
            actions.append(action)
            rewards.append(reward)

            state = new_state
            # the episode is over when the environment reports either flag;
            # checking only the first one ignores the other end-of-episode signal
            if terminated or truncated:
                break

        # preprocess rewards
        rewards = np.array(rewards)
        # calculate rewards to go for less variance:
        #   R[i] = sum_{k >= i} gamma**k * rewards[k]
        # computed in O(T) as a reversed cumulative sum of the discounted rewards
        discounted = rewards * gamma ** np.arange(len(rewards))
        # .copy() because torch cannot wrap the negatively-strided reversed view
        R = torch.from_numpy(np.cumsum(discounted[::-1])[::-1].copy())
        # or use the following line instead for normal (total-episode) rewards
        # R = torch.sum(torch.tensor(rewards))

        # preprocess states and actions; stack the per-step arrays first so torch
        # converts one ndarray instead of a Python list of ndarrays
        states = torch.as_tensor(np.array(states), dtype=torch.float32)
        actions = torch.as_tensor(actions)

        # calculate gradient
        probs = policy(states)
        sampler = Categorical(probs)
        # "-" because the optimizer performs gradient descent, but we want gradient ascent
        log_probs = -sampler.log_prob(actions)
        # loss that when differentiated with autograd gives the gradient of J(θ)
        pseudo_loss = torch.sum(log_probs * R)
        # update policy weights
        optimizer.zero_grad()
        pseudo_loss.backward()
        optimizer.step()

        # calculate average return and print it out
        returns.append(np.sum(rewards))
        print("Episode: {:6d}\tAvg. Return: {:6.2f}".format(n_episode, np.mean(returns)))
        n_episode += 1
finally:
    # close environment (also reached when the loop is stopped by an interrupt)
    env.close()

Episode:      1	Avg. Return:  53.00
Episode:      2	Avg. Return:  47.50
Episode:      3	Avg. Return:  47.00
Episode:      4	Avg. Return:  44.25
Episode:      5	Avg. Return:  38.60
Episode:      6	Avg. Return:  36.50
Episode:      7	Avg. Return:  35.86
Episode:      8	Avg. Return:  33.25
Episode:      9	Avg. Return:  31.22
Episode:     10	Avg. Return:  30.00
Episode:     11	Avg. Return:  29.09
Episode:     12	Avg. Return:  30.75
Episode:     13	Avg. Return:  30.00
Episode:     14	Avg. Return:  30.29
Episode:     15	Avg. Return:  31.33
Episode:     16	Avg. Return:  30.81
Episode:     17	Avg. Return:  29.94
Episode:     18	Avg. Return:  30.33
Episode:     19	Avg. Return:  30.74
Episode:     20	Avg. Return:  30.75
Episode:     21	Avg. Return:  30.43
Episode:     22	Avg. Return:  30.45
Episode:     23	Avg. Return:  29.61
Episode:     24	Avg. Return:  29.42
Episode:     25	Avg. Return:  28.72
Episode:     26	Avg. Return:  28.15
Episode:     27	Avg. Return:  28.19
Episode:     28	Avg. Return:

KeyboardInterrupt: 