In [27]:
import numpy as np
import matplotlib.pyplot as plt
import gym

import torch as T
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions.categorical import Categorical

In [38]:
class PolicyNetwork(nn.Module):
    """Fully connected network that scores every discrete action.

    Two hidden layers of 128 ReLU units feed a linear output layer with one
    unit per action. The output is left unnormalised; callers are expected to
    turn it into a distribution themselves. The module owns its Adam
    optimizer and places itself on the first CUDA device when one is
    available, otherwise on the CPU.
    """

    def __init__(self, lr, input_dims, 
                 n_actions):
        """
        Args:
            lr: learning rate for the Adam optimizer.
            input_dims: sequence unpacked into the first layer's input size,
                e.g. ``[8]``.
            n_actions: width of the output layer.
        """
        super().__init__()
        hidden_units = 128

        # Layers are created in input -> output order so that parameter
        # initialisation consumes the RNG in a fixed sequence.
        self.fc1 = nn.Linear(*input_dims, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)

        if T.cuda.is_available():
            self.device = T.device('cuda:0')
        else:
            self.device = T.device('cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the raw (pre-softmax) action scores for ``state``."""
        hidden = T.relu(self.fc1(state))
        hidden = T.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [46]:
class PolicyGradientAgent(object):
    """REINFORCE (Monte-Carlo policy gradient) agent for discrete actions.

    While an episode runs, the agent records the log-probability of every
    action it samples (``action_memory``) and every reward it is handed
    (``reward_memory``). ``learn`` then takes one gradient step over the
    recorded trajectory and clears both memories.
    """

    def __init__(self, lr, input_dims, gamma=0.99, n_actions=4):
        """
        Args:
            lr: learning rate passed on to the policy network.
            input_dims: sequence describing the observation size, passed on
                to the policy network.
            gamma: discount factor applied to future rewards.
            n_actions: number of discrete actions to choose from.
        """
        self.lr = lr
        self.gamma = gamma
        self.reward_memory = []  # one scalar reward per step
        self.action_memory = []  # log pi(a_t | s_t) per step, graph attached

        self.policy = PolicyNetwork(self.lr, input_dims, n_actions)

    def choose_action(self, observation):
        """Sample an action for ``observation`` and remember its log-prob.

        Args:
            observation: a single (unbatched) observation, array-like.

        Returns:
            The sampled action as a plain Python int.
        """
        # np.asarray first: building a tensor from a *list* of ndarrays is
        # slow. unsqueeze(0) adds the batch dimension the network expects.
        state = T.as_tensor(np.asarray(observation), dtype=T.float,
                            device=self.policy.device).unsqueeze(0)
        logits = self.policy(state)

        # Passing logits lets Categorical normalise over the last (action)
        # dimension itself, instead of relying on softmax's deprecated
        # implicit-dim behaviour.
        action_dist = T.distributions.Categorical(logits=logits)
        action = action_dist.sample()

        self.action_memory.append(action_dist.log_prob(action))
        return action.item()  # de-reference with item()

    def store_rewards(self, reward):
        """Append the reward received for the most recent action."""
        self.reward_memory.append(reward)

    def _discounted_returns(self):
        """Return G_t for every step of the stored trajectory.

        G_t = R_{t+1} + gamma * R_{t+2} + gamma**2 * R_{t+3} + ...
            = sum_{k>=0} gamma**k * R_{t+k+1}

        Computed back-to-front with G_t = r_t + gamma * G_{t+1}, so the cost
        is linear in the episode length. The result is always float64, even
        when the stored rewards are Python ints (an integer buffer would
        silently truncate the discounted sums).
        """
        returns = np.zeros(len(self.reward_memory), dtype=np.float64)
        running = 0.0
        for t in reversed(range(len(self.reward_memory))):
            running = self.reward_memory[t] + self.gamma * running
            returns[t] = running
        return returns

    def learn(self):
        """Take one REINFORCE gradient step, then clear the memories.

        Does nothing except clear the memories when no step was recorded.
        """
        if not self.reward_memory or not self.action_memory:
            # Nothing to differentiate; avoid calling backward() on a
            # non-tensor loss.
            self.action_memory = []
            self.reward_memory = []
            return

        returns = T.as_tensor(self._discounted_returns(), dtype=T.float,
                              device=self.policy.device)
        log_probs = T.stack(self.action_memory).reshape(-1)

        # Pair returns and log-probs step by step; if the two memories ever
        # differ in length, only the common prefix is used.
        steps = min(returns.shape[0], log_probs.shape[0])
        loss = -(returns[:steps] * log_probs[:steps]).sum()

        self.policy.optimizer.zero_grad()
        loss.backward()
        self.policy.optimizer.step()

        self.action_memory = []
        self.reward_memory = []

In [47]:
def plot_learning_curve(score, x, figure_file=None, window=100):
    """Plot the trailing running average of a sequence of scores.

    Args:
        score: sequence of per-episode scores.
        x: x-axis values, one per entry of ``score``.
        figure_file: path to save the figure to; when falsy the figure is
            shown instead. Missing parent directories are created.
        window: number of most recent scores (including the current one)
            averaged at each point.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be at least 1')

    # Use the argument rather than a same-looking global, so the function
    # works for any caller.
    values = np.asarray(score, dtype=float)
    running_avg = np.zeros(len(values))
    for i in range(len(values)):
        # Slice holds at most `window` entries ending at index i.
        running_avg[i] = values[max(0, i - window + 1):i + 1].mean()

    plt.plot(x, running_avg)
    plt.title('Running average of previous %d scores' % window)
    if figure_file:
        import os
        target_dir = os.path.dirname(figure_file)
        if target_dir:
            # savefig does not create directories; do not lose the plot
            # after a long run just because the folder is missing.
            os.makedirs(target_dir, exist_ok=True)
        plt.savefig(figure_file)
    else:
        plt.show()

In [48]:
# Train a REINFORCE agent on LunarLander and plot its learning curve.
env = gym.make('LunarLander-v2')
n_games = 3000
agent = PolicyGradientAgent(gamma=0.99, lr=0.0005, input_dims=[8],
                            n_actions=4)

fname = 'REINFORCE_' + 'lunar_lunar_lr' + str(agent.lr) + '_' \
        + str(n_games) + 'games'
figure_file = 'plots/' + fname + '.png'

scores = []
try:
    for i in range(n_games):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, _ = env.step(action)
            score += reward
            agent.store_rewards(reward)
            observation = observation_
        # One policy update per finished episode (Monte-Carlo return).
        agent.learn()
        scores.append(score)

        avg_score = np.mean(scores[-100:])
        print('episode ', i, 'score %.2f' % score,
                'average score %.2f' % avg_score)
finally:
    # Release the environment even when training is interrupted from the
    # keyboard. A single close() on the outermost env is sufficient.
    env.close()

x = [i+1 for i in range(len(scores))]
plot_learning_curve(scores, x, figure_file)


  if sys.path[0] == '':


episode  0 score -174.03 average score -174.03
episode  1 score -340.24 average score -257.14
episode  2 score -77.17 average score -197.15
episode  3 score -135.03 average score -181.62
episode  4 score -238.07 average score -192.91
episode  5 score -120.96 average score -180.92
episode  6 score -237.16 average score -188.95
episode  7 score -281.66 average score -200.54
episode  8 score -115.34 average score -191.07
episode  9 score -131.56 average score -185.12
episode  10 score -155.52 average score -182.43
episode  11 score -400.78 average score -200.63
episode  12 score -334.23 average score -210.90
episode  13 score -321.00 average score -218.77
episode  14 score -183.87 average score -216.44
episode  15 score -138.99 average score -211.60
episode  16 score -125.14 average score -206.51
episode  17 score -202.79 average score -206.31
episode  18 score -282.05 average score -210.29
episode  19 score -382.57 average score -218.91
episode  20 score -104.24 average score -213.45
epi

episode  172 score -261.35 average score -195.37
episode  173 score -506.94 average score -196.96
episode  174 score -90.38 average score -194.18
episode  175 score -242.56 average score -193.44
episode  176 score -247.22 average score -192.38
episode  177 score -188.00 average score -189.52
episode  178 score -128.43 average score -189.43
episode  179 score -449.16 average score -193.12
episode  180 score -92.85 average score -193.20
episode  181 score -153.03 average score -193.77
episode  182 score -143.35 average score -194.06
episode  183 score -302.47 average score -193.73
episode  184 score -102.51 average score -193.56
episode  185 score -175.47 average score -191.53
episode  186 score -399.15 average score -194.48
episode  187 score -92.97 average score -192.22
episode  188 score -147.90 average score -192.59
episode  189 score -104.05 average score -191.13
episode  190 score -9.71 average score -190.64
episode  191 score -161.26 average score -187.38
episode  192 score -133.4

KeyboardInterrupt: 