# Training Agents using Upside-Down Reinforcement Learning

... TODO: Intro ...

<img src="images/rl_vs_udrl.png" />

<img src="images/toy_env.png" />

In [1]:
import numpy as np
import torch
from helpers import make_episode

In [2]:
import gym
import rocket_lander_gym

# Environment shared by every cell below (it is read as a module-level global).
# Environment ids listed by the author as alternatives:
# RocketLander-v0 | LunarLander-v2 | MountainCar-v0 | CartPole-v0
env = gym.make('RocketLander-v0')

  result = entry_point.load(False)


## Upside-Down RL Hyperparameters

In [3]:
# Number of (input, target) pairs per batch used for training the behavior function
batch_size = 128

# Scaling factor for desired horizon input
horizon_scale = 0.02

# Number of episodes from the end of the replay buffer used for sampling exploratory
# commands
last_few = 100

# Learning rate for the ADAM optimizer
learning_rate = 0.002

# Number of exploratory episodes generated per step of UDRL training
n_episodes_per_iter = 100

# Number of gradient-based updates of the behavior function per step of UDRL training
n_updates_per_iter = 15

# Number of warm up episodes at the beginning of training
n_warm_up_episodes = 50

# Maximum size of the replay buffer (in episodes)
replay_size = 700

# Scaling factor for desired return input
return_scale = 0.02

## 2.3.1 Replay Buffer

UDRL does not explicitly maximize returns, but instead relies on exploration to continually discover higher return trajectories so that the behavior function can be trained on them. To drive learning progress, we found it helpful to use a replay buffer containing a fixed maximum number of trajectories with the highest returns seen so far, sorted in increasing order by return. The maximum buffer size is a hyperparameter. Since the agent starts learning with zero experience, an initial set of trajectories is generated by executing random actions in the environment. The trajectories are added to the replay buffer and used to start training the agent’s behavior function.

In [36]:
# TODO: I guess we need to get samples

class ReplayBuffer():
    """Store of episodes that can be ranked by return and capped in size.

    Episodes are any objects exposing a ``total_return`` attribute. ``add``
    only appends; the size cap and the ascending-return ordering are applied
    when ``sort_and_trim`` is called.
    """

    def __init__(self, size):
        """Create an empty buffer that keeps at most ``size`` episodes after trimming."""
        self.size = size
        self.buffer = []

    def add(self, episode):
        """Append ``episode`` to the end of the buffer (no sorting, no trimming)."""
        self.buffer.append(episode)

    def get_last(self, num):
        """Return the last ``num`` episodes (fewer if the buffer is smaller).

        A non-positive ``num`` yields an empty list.
        """
        # Guard needed because ``self.buffer[-0:]`` is the whole list, not an empty one.
        if num <= 0:
            return []
        return self.buffer[-num:]

    def sort_and_trim(self):
        """Sort episodes by ascending ``total_return`` and keep the best ``size``."""
        ranked = sorted(self.buffer, key=lambda episode: episode.total_return)
        # Same ``-0`` slicing pitfall as in get_last: a zero capacity keeps nothing.
        self.buffer = ranked[-self.size:] if self.size > 0 else []

    def __len__(self):
        """Number of episodes currently stored."""
        return len(self.buffer)

## 2.3.4 Sampling Exploratory Commands

After each training phase, the agent can attempt to generate new, previously infeasible behavior, potentially achieving higher returns. To profit from such exploration through generalization, one must first create a set of new initial commands c<sub>0</sub> to be used in Algorithm 2. We use the following procedure to sample commands:

1. A number of episodes from the end of the replay buffer (i.e., with the highest returns) are selected. This number is a hyperparameter and remains fixed during training.

2. The exploratory desired horizon d<sup>h</sup><sub>0</sub> is set to the mean of the lengths of the selected episodes.

3. The exploratory desired returns d<sup>r</sup><sub>0</sub> are sampled from the uniform distribution U\[M, M + S\] where M is the mean and S is the standard deviation of the selected episodic returns.

In [37]:
def sample_command(buffer, last_few):
    """Sample an exploratory command ``[desired_return, desired_horizon]``.

    Args:
        buffer: object supporting ``len()`` and ``get_last(n)``; the returned
            episodes must expose ``length`` and ``total_return``.
        last_few: number of episodes taken from the end of the buffer.

    Returns:
        ``[1, 1]`` when the buffer is empty. Otherwise a two-element list:
        a return drawn uniformly from ``[mean, mean + std]`` of the selected
        episodes' returns, followed by the mean of their lengths.
    """
    if len(buffer) == 0:
        return [1, 1]

    # Step 1: take the episodes at the end of the buffer.
    top_episodes = buffer.get_last(last_few)

    # Step 2: the desired horizon is the average episode length.
    desired_horizon = np.mean([episode.length for episode in top_episodes])

    # Step 3: the desired return is drawn from U[mean, mean + std].
    episode_returns = [episode.total_return for episode in top_episodes]
    lower = np.mean(episode_returns)
    spread = np.std(episode_returns)
    desired_returns = np.random.uniform(lower, lower + spread)

    return [desired_returns, desired_horizon]

## 3.2 Setup

All agents were implemented using artificial neural networks. The behavior function for UDRL agents was implemented using fully-connected feed-forward networks for LunarLander-v2, and convolutional neural networks (CNNs; 16) for TakeCover-v0. The command inputs were scaled by a fixed scaling factor, transformed by a fully-connected sigmoidal layer, and then multiplied element-wise with an embedding of the observed inputs (after the first layer for fully-connected networks; after all convolutional layers for CNNs). Apart from this small modification regarding UDRL command inputs, the network architectures were identical for all algorithms.

In [38]:
import torch

class Behavior(torch.nn.Module):
    """Behavior function mapping a (state, command) pair to action logits.

    The state and the scaled command are each passed through a linear layer
    followed by a sigmoid, multiplied element-wise, and the product is fed
    to a two-layer head that outputs one logit per action.

    Args:
        state_size: number of features in a state vector.
        action_size: number of discrete actions (size of the output).
        hidden_size: width of the embeddings and of the hidden layer.
        command_scale: two multipliers applied element-wise to the command
            before it is embedded.
    """

    def __init__(self,
                 state_size,
                 action_size,
                 hidden_size,
                 command_scale=(1, 1)):
        super().__init__()

        # Plain attribute (not a parameter or buffer), as in the original code.
        self.command_scale = torch.FloatTensor(command_scale)

        self.state_fc = torch.nn.Sequential(torch.nn.Linear(state_size,
                                                            hidden_size),
                                            torch.nn.Sigmoid())

        self.command_fc = torch.nn.Sequential(torch.nn.Linear(2, hidden_size),
                                              torch.nn.Sigmoid())

        self.output_fc = torch.nn.Sequential(torch.nn.Linear(hidden_size,
                                                             hidden_size),
                                             torch.nn.ReLU(),
                                             torch.nn.Linear(hidden_size,
                                                             action_size))

    def forward(self, state, command):
        """Return action logits for tensor inputs ``state`` and ``command``."""
        state_output = self.state_fc(state)
        command_output = self.command_fc(command * self.command_scale)
        # The command embedding gates the state embedding element-wise.
        embedding = torch.mul(state_output, command_output)
        return self.output_fc(embedding)

    def action(self, state, command):
        """Sample one action index from the categorical distribution over logits.

        ``state`` and ``command`` may be tensors or anything
        ``torch.as_tensor`` accepts (lists, tuples, numpy arrays), so the
        plain Python lists produced while rolling out an episode work too.

        Returns:
            The sampled action as a Python ``int``.
        """
        # Convert to the network's dtype so list / float64 inputs do not fail in Linear.
        dtype = self.state_fc[0].weight.dtype
        state = torch.as_tensor(state, dtype=dtype)
        command = torch.as_tensor(command, dtype=dtype)

        # Only the sampled index is returned, so no autograd graph is needed.
        with torch.no_grad():
            logits = self.forward(state, command)
            probs = torch.softmax(logits, dim=-1)

        dist = torch.distributions.Categorical(probs)
        return dist.sample().item()

## Algorithm 1: Upside-Down Reinforcement Learning: High-level Description

<img src="images/udrl_algo1.png" />

In [39]:
def random_policy(state = None, command = None):
    """Pick a uniformly random action index; ``state`` and ``command`` are ignored.

    Reads the action count from the module-level ``env`` (``env.action_space.n``).
    """
    n_actions = env.action_space.n
    return np.random.randint(n_actions)

def initialize_replay_buffer(replay_size, n_warm_up_episodes, last_few):
    """Create a replay buffer and fill it with random-policy warm-up episodes.

    Args:
        replay_size: capacity passed to ``ReplayBuffer``.
        n_warm_up_episodes: number of episodes to generate with ``random_policy``.
        last_few: number of buffer episodes used when sampling each command.

    Returns:
        The ``ReplayBuffer`` holding the generated episodes, in generation order.
    """
    replay_buffer = ReplayBuffer(replay_size)
    for _ in range(n_warm_up_episodes):
        state = env.reset().tolist()
        # Sample from the buffer being filled (it is empty on the first pass).
        command = sample_command(replay_buffer, last_few)
        episode = generate_episode(random_policy, state, command)
        replay_buffer.add(episode)

    return replay_buffer

def initialize_behavior_function(state_size, action_size, hidden_size, command_scale):
    """Construct and return a new ``Behavior`` network from the given sizes and command scale."""
    behavior_fn = Behavior(state_size, action_size, hidden_size, command_scale)
    return behavior_fn

def stopping_criteria():
    """Placeholder for the training stopping criterion; currently returns ``None``."""
    return None

# TODO

## Algorithm 2: Generates an Episode using the Behavior Function

<img src="images/udrl_algo2.png" />

In [31]:
def behavior_policy(state, command):
    """Policy that delegates to the module-level ``behavior`` object's ``action`` method."""
    chosen_action = behavior.action(state, command)
    return chosen_action

def generate_episode(policy, state, command = [1, 1]):
    """Roll out one episode in the module-level ``env`` using ``policy``.

    Args:
        policy: callable ``policy(state, command)`` returning an action.
        state: initial state, as already obtained from the environment.
        command: ``[desired_return, desired_horizon]``. The list is never
            mutated here; a fresh list is built after every step.

    Returns:
        The result of ``make_episode(states, actions, rewards, total_reward,
        n_steps)``.
    """
    remaining_return = command[0]
    remaining_horizon = command[1]

    visited_states, taken_actions, collected_rewards = [], [], []

    while True:
        chosen = policy(state, command)
        observation, reward, finished, _ = env.step(chosen)

        # Record the transition against the state the action was chosen in.
        visited_states.append(state)
        taken_actions.append(chosen)
        collected_rewards.append(reward)

        # Advance: subtract the reward just received and one step of horizon.
        state = observation.tolist()
        remaining_return -= reward
        remaining_horizon -= 1
        command = [remaining_return, remaining_horizon]

        if finished:
            break

    # One reward is stored per step, so the list length is the step count.
    return make_episode(visited_states,
                        taken_actions,
                        collected_rewards,
                        sum(collected_rewards),
                        len(collected_rewards))