https://www.sefidian.com/2021/03/01/policy-g/

In [1]:
import numpy as np
import torch
import gym
from matplotlib import pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
env = gym.make('CartPole-v0')

In [3]:
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
HIDDEN_SIZE = 256

model = torch.nn.Sequential(
    torch.nn.Linear(obs_size, HIDDEN_SIZE),
    torch.nn.ReLU(),
    torch.nn.Linear(HIDDEN_SIZE, n_actions),
    torch.nn.Softmax(dim=0) # turns logits to probability
)

In [5]:
learning_rate = 0.003
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
 
Horizon = 500
MAX_TRAJECTORIES = 1000
gamma = 0.99
score = []

for trajectory in range(MAX_TRAJECTORIES):
    curr_state = env.reset()
    done = False
    transitions = []

    for t in range(Horizon):
        act_prob = model(torch.from_numpy(curr_state).float())
        action = np.random.choice(np.array([0,1]), p=act_prob.data.numpy()) # choose action according to probability
        prev_state = curr_state
        curr_state, _, done, info = env.step(action)
        transitions.append((prev_state, action, t+1)) # define reward ourselves. the longer time goes on, the more reward
        if done:
            break
    score.append(len(transitions)) #  keep track of the trajectory length over training time . score should be greater as training goes on
    reward_batch = torch.tensor([r for (s,a,r) in transitions]).flip(dims=(0,)) # reward for each episode in reverse order

    batch_Gvals = []
    for i in range(len(transitions)): # len(transitions) is the steps in each episode
                                      # for each of the transitions, calculate expected return
        new_Gval = 0
        power = 0
        for j in range(i, len(transitions)): # calculate expected return for each transition
            new_Gval=new_Gval+((gamma**power)*reward_batch[j]).numpy() # as j increase, the fewer the expected reward (because time has passed and we expect the episode to terminate)
            power += 1
        batch_Gvals.append(new_Gval)

    # normalize
    # expected_returns_batch stores expected returns for all the transitions(step) of the current episode
    # expected_returns_batch = R(tau) = (G_0, G_1, G_k) where tau represents current trajectory(episode) and k represent each transition(step)
    expected_returns_batch=torch.FloatTensor(batch_Gvals)
    expected_returns_batch /= expected_returns_batch.max()

    state_batch = torch.Tensor([s for (s,a,r) in transitions])
    action_batch = torch.Tensor([a for (s,a,r) in transitions])

    # group the action probabilities associated with the actions that were taken
    pred_batch = model(state_batch)    
    prob_batch = pred_batch.gather(dim=1,index=action_batch.long().view(-1,1)).squeeze()

    loss = - torch.sum(torch.log(prob_batch) * expected_returns_batch)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step() 

    if trajectory % 100 == 0 and trajectory>0:
            print('Trajectory {}\tAverage Score: {:.2f}'.format(trajectory, np.mean(score[-50:-1])))


Trajectory 100	Average Score: 185.59
Trajectory 200	Average Score: 176.88
Trajectory 300	Average Score: 195.61
Trajectory 400	Average Score: 194.49
Trajectory 500	Average Score: 193.33
Trajectory 600	Average Score: 183.76
Trajectory 700	Average Score: 196.63
Trajectory 800	Average Score: 182.41
Trajectory 900	Average Score: 185.27


In [31]:
state = env.reset()
done = False

while not done:
    act_prob = model(torch.from_numpy(state).float())
    action = np.random.choice(np.array([0,1]), p=act_prob.data.numpy()) # choose action according to probability
    state, reward, done, info = env.step(action)
    env.render()

env.close()