In [1]:
### Implement REINFORCE (Monte Carlo Policy Gradients) to solve LunarLander task in OpenAI-gym

In [2]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical #bernoulli distribution

In [3]:
env_id = 'LunarLander-v2'
env = gym.make(env_id)

In [4]:
env.observation_space

Box(8,)

In [5]:
env.action_space

Discrete(4)

In [15]:
## check for bounds
env.observation_space.is_bounded()

False

In [16]:
LEARNING_RATE = 1e-2 #set learning rate

## Define Policy

In [7]:
class Policy(nn.Module):
    """Defines the general policy for an agent following simple NN architecture"""
    def __init__(self, action_size=4, state_size=8, h1=16, h2=8):
        """Creates the model using a 3 Hidden layer NN"""
        super(Policy, self).__init__()#inherit methods from parent class & override forward f(x)
        self.action_space = action_size
        self.state_space = state_size
        self.fc1 = nn.Linear(in_features=state_size, out_features=h1, bias=True)
        self.fc2 = nn.Linear(in_features=h1, out_features=h2, bias=True)
        self.fc3 = nn.Linear(in_features=h2, out_features=action_size, bias=True)
    def forward(self, x):
        """
        Performs one-pass from state -> action mapping.
        @Param:
        1. x - input state
        @return:
        x - action as a set of vector following stochastic measure. softmax output to logits from NN.
        """
        if(type(x) != torch.Tensor):
            try:
                x = torch.from_numpy(x).float().unsqueeze(0)#convert ndarray to torch.Tensor object
            except:
                raise TypeError(f"expected type torch.Tensor. got {type(x)}")
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=1)
    def act(self, state):
        """
        Uses current deterministic policy to determine the set of action to perform
        @param:
        1. state: input state of env. shape = env.observation_space.shape[0]
        @return:
        - action: (int) discrete action to take by the agent.
        - log_probs: (array_like) log of output from softmax unit. set of log probabilities.
        """
        probs = self.forward(state) #get estimated action following stochastic measure
        m = Categorical(probs)#get Bernoulli distribution of action
        action = m.sample() #returns the action based on the probability of each based on Benoulli(probs)
        return action, m.log_prob(probs)

In [8]:
policy = Policy()#instantiate Policy obj

In [9]:
state = env.reset() #sample random state

In [13]:
policy.act(state) #Test to see if policy works (random weights initially)

(tensor([3]),
 tensor([[-1.6292, -1.6292, -1.6292, -1.6292]], grad_fn=<SqueezeBackward1>))

In [17]:
optim.Adam(params=policy.parameters(), lr=LEARNING_RATE)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0
)

### Train using REINFORCE

In [None]:
def REINFORCE(num_episode=1000, max_tau=1000, gamma=1.0, print_every=10):
    """
    Implements the Reinforce algorithm.
    See paper for more details: https://bit.ly/REINFORCE_paper
    @param:
    1. num_episode: number of epochs to train for.
    2. max_tau: length of trajectory, 𝝉.
    3. gamma: discounted return, γ.
    4. print_every: pprint details after very X epochs.
    @return:
    - scores: (array_like) expected return over epochs.
    """
    scores_deque = deque(maxlen=100)#determines if episode is solved by determining score of the last N episodes
    scores = []
    for i_episode in range(1, num_episode + 1):
        saved_log_probs = 