In [8]:
import os

import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym

import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim 
from torch.distributions import Categorical

from collections import deque
from IPython.display import Image
from matplotlib import animation
from tqdm.notebook import tqdm

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

np.random.seed(369)

# Problem statement

Given LunarLander-v2 environment, train an RL Agent to complete this game using Policy Gradient

## Environment Description

+ Environment type: **Stochastic** environment. 
+ Action Space: Discrete(4). 
+ Game Objective: Control the rocket to landing correctly at landing pad. 
+ Terminate State: The rocker landed on the surface. 
+  Reward function:
    + +100 points for landing at landing pad. 
    + -100 points for rocket crashed. 
    + -0.3 points for every timestep. 
    + +10 points for each rocket's leg contact with ground. 

In [5]:
game_name = 'LunarLander-v2'
env = gym.make(game_name, render_mode='rgb_array')

state_space = env.observation_space.shape[0]
action_space = env.action_space.n

print(f'State space : {state_space}\n'\
      f'Action space: {action_space}')

State space : 8
Action space: 4


In [9]:
class Policy(nn.Module): 
    def __init__(self, s_size, a_size, h_size): 
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, h_size*2)
        self.fc3 = nn.Linear(h_size*2, a_size)

    def forward(self, x): 
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x), dim=1)
        return x
    
    def act(self, state): 
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()

        return action.item(), m.log_prob(action)

In [27]:
def reinforce(policy, 
              optimizer, 
              n_training_episodes, 
              max_steps, 
              gamma, 
              print_every): 
    
    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, n_training_episodes + 1): 
        saved_log_probs = []
        rewards = []
        state, _ = env.reset()

        for t in range(max_steps): 
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            new_state, reward, done, _, info = env.step(action)
            rewards.append(reward)
            
            if done: 
                break

            state = new_state
        # Calculate expected discounted return
        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))
        returns = deque(maxlen=max_steps)
        n_steps = len(rewards)

        for t in range(n_steps)[::-1]: 
            disc_return_t = returns[0] if len(returns) > 0 else 0
            returns.appendleft(gamma * disc_return_t + rewards[t])

        eps = np.finfo(np.float32).eps.item()
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)
        
        # Policy Gradient
        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns): 
            policy_loss.append(-log_prob * disc_return) # Since PyTorch/Tensorflow prefer minization, we put a negative sign to the objective function. 
        policy_loss = torch.cat(policy_loss).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

    return scores

In [28]:
h_size = 20
n_training_episodes = 3000
n_eval_episodes = 100
lr = 0.8

max_steps = 99
gamma = 0.95
eval_seed = range(n_eval_episodes)

policy = Policy(s_size= state_space, 
                a_size=action_space, 
                h_size=h_size).to(device)

optimizer = optim.Adam(policy.parameters(), lr=lr)
scores = reinforce(policy, 
                   optimizer, 
                   n_training_episodes, 
                   max_steps,
                   gamma,
                   print_every=100)

In [29]:
scores

[-130.8771257038721,
 -616.4897471168301,
 -260.0353836001714,
 -477.802883284622,
 -510.0255370225717,
 -422.82854808873736,
 -612.8053477496264,
 -443.2372095084932,
 -589.0136211373135,
 -781.4600081801733,
 -279.54746471164407,
 -369.74110533414034,
 -577.5561391111107,
 -435.24218070637016,
 -470.5842906682385,
 -304.9924035656309,
 -408.73161789875627,
 -209.90631941693007,
 -493.5741006409929,
 -588.9454470866588,
 -641.8719878625874,
 -419.48941617479113,
 -579.1277944120345,
 -373.6732049216024,
 -428.5729169374209,
 -616.0241013163939,
 -451.29960580906436,
 -449.91504473337017,
 -387.4673875225285,
 -587.6139759231925,
 -351.000658772778,
 -625.8705812946016,
 -509.8168568951348,
 -652.4636236026184,
 -415.76297780695865,
 -540.3970760349436,
 -472.5226633552841,
 -457.53301498287715,
 -604.1095487693929,
 -438.49353880115456,
 -613.4776726410713,
 -392.96639384617583,
 -469.29079032417724,
 -687.7478185597688,
 -584.7706946472906,
 -562.5631092781359,
 -468.1675549754996,
 

In [30]:
def evaluate_agent(env,
                   max_steps, 
                   n_eval_episodes, 
                   policy, 
                   seed): 
    episode_rewards = []
    for episode in range(n_eval_episodes): 
        if seed: 
            state, _ = env.reset(seed=seed[episode])
        else: 
            state, _ = env.reset()

        step = 0
        done = False
        total_rewards_ep = 0

        for step in range(max_steps): 
            action, _ = policy.act(state)
            new_state, reward, done, _, info = env.step(action)
            total_rewards_ep += reward

            if done: 
                break

            state = new_state

        episode_rewards.append(total_rewards_ep)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward

In [31]:
mean_reward, std_reward = evaluate_agent(env, 
                                         max_steps, 
                                         n_eval_episodes, 
                                         policy, 
                                         eval_seed)
print(f'Mean Reward: {mean_reward} +/- {std_reward}')

Mean Reward: -498.06676173610964 +/- 95.38669941514625
