Lunar lander source code: https://github.com/openai/gym/blob/master/gym/envs/box2d/lunar_lander.py

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gym
import time
import pickle
from tqdm import tqdm
from copy import deepcopy
from rl_src.animations.video import write_video, show_video, simulate_episode

# Seed NumPy's global RNG, which drives weight initialization (WeightsInit), replay sampling
# (ExperienceList.get_sample) and action sampling (Agent.select_action).
# NOTE(review): the gym environment itself is not seeded in the cells shown here.
np.random.seed(42)

# Lunar Lander environment

In [None]:
"""
Rocket trajectory optimization is a classic topic in Optimal Control.

According to Pontryagin's maximum principle it's optimal to fire the engine at full throttle or
turn it off. That's the reason it is OK for this environment to have discrete actions (engine on or off).

The landing pad is always at coordinates (0,0). The coordinates are the first two numbers in the state vector.
Reward for moving from the top of the screen to the landing pad and zero speed is about 100..140 points.
If the lander moves away from the landing pad it loses reward. The episode finishes if the lander crashes or
comes to rest, receiving an additional -100 or +100 points. Each leg with ground contact is +10 points.
Firing the main engine is -0.3 points each frame. Firing the side engine is -0.03 points each frame.
Solved is 200 points.

Landing outside the landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
on its first attempt. Please see the source code for details.
"""


# Shared environment instance used by every cell below (training and the demo episodes).
env = gym.make("LunarLander-v2")

In [None]:
# Observation vector (8 values): pos_x, pos_y, vel_x, vel_y, lander_angle, lander_angular_velocity,
# ground contact left leg, ground contact right leg
print(env.observation_space)

# Discrete actions (4 values): nop, fire left engine, fire main engine, fire right engine
print(env.action_space)

# Experience class

In [None]:
class ExperienceList:
    """Bounded replay buffer of past experiences.

    Each experience is stored as the list
    ``[state, action, reward, terminal, next_state]``. Once more than
    ``max_list_size`` experiences are held, the oldest ones are discarded.

    Args:
        max_list_size: maximum number of experiences kept in the buffer.
        batch_size: number of experiences returned by ``get_sample``.
    """

    def __init__(self, max_list_size=10000, batch_size=16):
        self.max_list_size = max_list_size
        self.batch_size = batch_size
        self.experiences = []

    def add_experience(self, state, action, reward, terminal, next_state):
        """Append one experience and drop the oldest entries beyond the size limit."""
        self.experiences.append([state, action, reward, terminal, next_state])
        # Trim in place and only when the limit is exceeded. Re-slicing with
        # ``[-max_list_size:]`` would copy the whole buffer on every insert and,
        # for ``max_list_size == 0``, keep everything because ``lst[-0:]`` is the full list.
        overflow = len(self.experiences) - self.max_list_size
        if overflow > 0:
            del self.experiences[:overflow]

    def get_sample(self):
        """Return ``batch_size`` distinct experiences drawn uniformly at random.

        Raises:
            ValueError: (from ``np.random.choice``) if fewer than ``batch_size``
                experiences are stored, since sampling is without replacement.
        """
        # An int population is sampled as if it were np.arange(n).
        indices = np.random.choice(len(self.experiences), size=self.batch_size, replace=False)
        return [self.experiences[i] for i in indices]

# Weight initializer

In [None]:
class WeightsInit:
    """Namespace of weight initializers for dense layers.

    Every initializer takes ``shape = (n_out, n_in)`` and returns a float array
    of exactly that shape, drawing from NumPy's global random state.
    """

    @staticmethod
    def he(shape):
        """Zero-mean Gaussian weights with standard deviation ``sqrt(2 / n_in)``."""
        n_out, n_in = shape
        std = np.sqrt(2/n_in)
        return np.random.normal(loc=0, scale=std, size=(n_out, n_in))

    @staticmethod
    def saxe(shape):
        """Orthogonal weights obtained from the QR decomposition of a Gaussian matrix.

        The decomposition is always done on the "tall" orientation of the matrix,
        so the result has orthonormal columns when ``n_out >= n_in`` and
        orthonormal rows when ``n_out < n_in``.
        """
        n_out, n_in = shape
        is_wide = n_out < n_in
        gaussian = np.random.normal(0, 1, shape)
        tall = gaussian.T if is_wide else gaussian
        # q has orthonormal columns, r is upper triangular
        q, r = np.linalg.qr(tall)
        # flip each column by the sign of the matching diagonal entry of r
        q = q * np.sign(np.diag(r, 0))
        return q.T if is_wide else q

# Optimizer

In [None]:
class SGD:
    """Fixed-step optimizer: adds ``learning_rate * update`` to every weight."""

    def __init__(self, learning_rate=0.001):
        self.learning_rate = learning_rate

    def gradient_step(self, weights, updates):
        """Apply one step to ``weights`` in place and return the same dict.

        ``updates`` must hold an entry for every key of ``weights``. The update is
        added (not subtracted), so it is expected to already point in the
        direction the weights should move.
        """
        step_size = self.learning_rate
        for name in weights:
            weights[name] += step_size * updates[name]
        return weights


class Adam:
    """Adam optimizer with bias-corrected first and second moment estimates.

    One pair of moment arrays is kept per entry of the ``weights`` dict passed
    at construction time, so the same keys must be used in ``gradient_step``.
    """

    def __init__(self, weights, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07):
        self.learning_rate = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon

        # Adam's m and v, zero-initialized with the shape of the matching weight
        self.first_moment = {name: np.zeros(w.shape) for name, w in weights.items()}
        self.second_moment = {name: np.zeros(w.shape) for name, w in weights.items()}

        # running beta_1**t and beta_2**t, used for the bias correction
        self.beta_1_product = self.beta_1
        self.beta_2_product = self.beta_2

    def gradient_step(self, weights, updates):
        """Apply one Adam step to ``weights`` in place and return the same dict.

        ``updates`` should be based on the gradient multiplied by the td errors;
        the resulting step is added to the weights.
        """
        b1, b2 = self.beta_1, self.beta_2
        first_correction = 1 - self.beta_1_product
        second_correction = 1 - self.beta_2_product

        for name in weights:
            grad = updates[name]
            m = b1*self.first_moment[name] + (1-b1)*grad
            v = b2*self.second_moment[name] + (1-b2)*grad**2
            self.first_moment[name] = m
            self.second_moment[name] = v
            m_hat = m / first_correction
            v_hat = v / second_correction
            weights[name] += self.learning_rate / (np.sqrt(v_hat) + self.epsilon) * m_hat

        # advance the bias-correction terms to the next time step
        self.beta_1_product *= b1
        self.beta_2_product *= b2

        return weights

# Action-Value neural network

In [None]:
class ActionValueNN:
    """Action-value network with one ReLU hidden layer and a linear output layer.

    The weights live in the dict ``self.weights`` under the keys ``'w0'``, ``'b0'``
    (hidden layer) and ``'w1'``, ``'b1'`` (output layer). Inside the network all
    data is laid out column-wise, i.e. ``(n_features, batch_size)``.

    Args:
        weight_initializer: name of a ``WeightsInit`` static method.
        n_features: number of input features per sample.
        n_hidden_nodes: width of the hidden layer.
        n_outputs: number of actions (one output per action).
        gamma: discount factor.
        tau: softmax temperature used when evaluating the next state.
        n_td_steps: number of steps the bootstrap target looks ahead.
    """

    def __init__(self, weight_initializer='he', n_features=10, n_hidden_nodes=128, n_outputs=4, gamma=1., tau=1., n_td_steps=0):
        self.weight_initializer = weight_initializer
        # the layer sizes have to be known before the weights can be created
        self.n_features = n_features
        self.n_hidden_nodes = n_hidden_nodes
        self.n_outputs = n_outputs
        self.weights = self.initialize_weights(weight_initializer)
        self.gamma = gamma
        self.tau = tau
        self.n_td_steps = n_td_steps

    def initialize_weights(self, weight_initializer):
        """Create the weight dict: initializer-drawn matrices and zero biases."""
        make_matrix = getattr(WeightsInit, weight_initializer)
        n_hidden, n_out = self.n_hidden_nodes, self.n_outputs
        # dict entries are evaluated top to bottom, so 'w0' is drawn before 'w1'
        return {
            'w0': make_matrix((n_hidden, self.n_features)),
            'b0': np.zeros(shape=(n_hidden, 1)),
            'w1': make_matrix((n_out, n_hidden)),
            'b1': np.zeros(shape=(n_out, 1)),
        }

    @staticmethod
    def generate_features(state):
        """Append two engineered columns to a batch of states.

        ``state`` has shape (batch_size, n_state_space) with columns
        [pos_x, pos_y, vel_x, vel_y, ang, ang_vel, leg_l, leg_r]. The returned
        array has two extra columns:
        - distance from target (norm of the position)
        - total velocity (norm of the velocity)
        """
        pos_x, pos_y = state[:,0:1], state[:,1:2]
        vel_x, vel_y = state[:,2:3], state[:,3:4]
        distance = (pos_x**2 + pos_y**2)**0.5
        velocity = (vel_x**2 + vel_y**2)**0.5
        return np.concatenate((state, distance, velocity), axis=1)

    @staticmethod
    def softmax(logits, tau=1.):
        """Column-wise softmax with temperature ``tau``.

        logits shape: (n_outputs, batch_size); the output has the same shape.
        """
        # shift by the column maximum so the exponentials cannot overflow
        shifted = (logits - np.max(logits, axis=0, keepdims=True)) / tau
        exponentials = np.exp(shifted)
        return exponentials / np.sum(exponentials, axis=0)

    def forward_propagation(self, features):
        """Run the network and return ``(z1, a0, z0)``.

        features shape: (n_features, batch_size)
        z1 (action values) shape: (n_outputs, batch_size)
        a0 / z0 (hidden activation / pre-activation) shape: (n_hidden_nodes, batch_size)
        """
        params = self.weights
        hidden_pre = np.dot(params['w0'], features) + params['b0']
        hidden_act = np.maximum(hidden_pre, 0)  # ReLU
        outputs = np.dot(params['w1'], hidden_act) + params['b1']  # linear output layer
        return outputs, hidden_act, hidden_pre

    def get_action_values(self, features):
        """Return only the action values of ``forward_propagation``."""
        return self.forward_propagation(features)[0]

    def calculate_td_error(self, features, actions, rewards, terminals, next_features, frozen_nn):
        """Return the TD error of each sample, shape (batch_size,).

        The target is ``rewards + gamma**n_td_steps * V(next state)``, where V is the
        softmax-weighted average of ``frozen_nn``'s action values (expected sarsa)
        and is zeroed for terminal samples.
        """
        # value of the next state according to the frozen network
        next_q = frozen_nn.get_action_values(next_features)
        next_policy = frozen_nn.softmax(next_q, self.tau)
        next_value = np.sum(next_policy * next_q, axis=0)
        next_value *= (1 - terminals)
        # value of the action that was actually taken in the current state
        current_q = self.get_action_values(features)
        sample_index = np.arange(current_q.shape[1])
        taken_q = current_q[actions, sample_index]
        discount = self.gamma**self.n_td_steps
        return rewards + discount * next_value - taken_q

    def calculate_td_updates(self, features, td_error_matrix):
        """Backpropagate the TD errors and return one batch-averaged update per weight.

        Shapes:
        features -> (n_features, batch_size)
        td_error_matrix -> (n_actions, batch_size)
        hidden activations and their derivative -> (n_hidden_nodes, batch_size)
        """
        averaging = 1 / features.shape[1]
        _, hidden_act, _ = self.forward_propagation(features)
        relu_grad = (hidden_act > 0).astype('float32')
        # error reaching the hidden layer, shape: (n_hidden_nodes, batch_size)
        hidden_error = np.dot(self.weights['w1'].T, td_error_matrix) * relu_grad
        return {
            'w1': np.dot(td_error_matrix, hidden_act.T) * averaging,  # (n_outputs, n_hidden_nodes)
            'b1': np.sum(td_error_matrix, axis=1, keepdims=True) * averaging,  # (n_outputs, 1)
            'w0': np.dot(hidden_error, features.T) * averaging,  # (n_hidden_nodes, n_features)
            'b0': np.sum(hidden_error, axis=1, keepdims=True) * averaging,  # (n_hidden_nodes, 1)
        }

    def get_weights_updates(self, experiences, frozen_nn):
        """Turn a batch of experiences into weight updates.

        ``experiences`` is a sequence of ``[state, action, reward, terminal, next_state]``
        entries; ``frozen_nn`` supplies the bootstrap targets.
        """
        n_samples = len(experiences)
        states, actions, rewards, terminals, next_states = zip(*experiences)
        # states are (batch_size, n_state_space); the network wants (n_features, batch_size)
        features = self.generate_features(np.array(states)).T
        next_features = self.generate_features(np.array(next_states)).T
        actions = np.array(actions)
        td_error = self.calculate_td_error(
            features, actions, np.array(rewards), np.array(terminals), next_features, frozen_nn
        )
        # only the action that was actually taken receives a non-zero error
        td_error_matrix = np.zeros((self.n_outputs, n_samples))
        td_error_matrix[actions, np.arange(n_samples)] = td_error
        return self.calculate_td_updates(features, td_error_matrix)

# Agent

In [None]:
class Agent:
    """n-step expected-sarsa agent with experience replay.

    The agent keeps the last ``n_td_steps`` states, actions and rewards of the
    running episode. As soon as ``n_td_steps`` rewards are available, the oldest
    state/action pair is stored in the replay buffer together with its discounted
    n-step reward sum, and the network is trained on random replay batches.

    Args:
        env: environment providing ``observation_space.shape`` and ``action_space.n``.
        agent_setup: dict with optional keys ``n_replay_steps`` (default 4),
            ``n_td_steps`` (default 10) and ``freeze_weights`` (default True).
        exp_list_setup: keyword arguments for ``ExperienceList``.
        nn_setup: keyword arguments for ``ActionValueNN`` (besides ``n_td_steps``,
            ``n_features`` and ``n_outputs``, which are set here).
        opt_setup: keyword arguments for ``Adam`` (besides the weights).
    """

    def __init__(self, env, agent_setup, exp_list_setup, nn_setup, opt_setup):
        self.n_state_space = env.observation_space.shape[0]
        self.n_features = self.n_state_space + 2  # generate_features adds two columns
        self.action_space = env.action_space
        self.actions = list(range(self.action_space.n))

        self.n_replay_steps = agent_setup.get('n_replay_steps', 4)
        self.n_td_steps = agent_setup.get('n_td_steps', 10)
        self.freeze_weights = agent_setup.get('freeze_weights', True)
        self.experience_list = ExperienceList(**exp_list_setup)
        self.nn = ActionValueNN(n_td_steps=self.n_td_steps, n_features=self.n_features, n_outputs=self.action_space.n, **nn_setup)
        self.gamma = self.nn.gamma
        self.opt = Adam(self.nn.weights, **opt_setup)

        # per-episode histories, created in agent_start
        self.reward_history = None
        self.state_history = None
        self.action_history = None

    def select_action(self, state):
        """Sample an action from the softmax over the network's action values.

        NOTE(review): the softmax here uses its default temperature (1.0), not
        ``self.nn.tau``, which is only applied to the bootstrap target - confirm
        that this difference is intended.
        """
        features = self.nn.generate_features(state.reshape((1, self.n_state_space)))
        action_logits = self.nn.get_action_values(features=features.T)
        softmax_probs = self.nn.softmax(action_logits)
        return np.random.choice(self.actions, p=softmax_probs.reshape(-1))

    def agent_start(self, state):
        """Begin an episode: reset the histories and return the first action."""
        action = self.select_action(state)
        self.reward_history = []
        self.state_history = [state]
        self.action_history = [action]
        return action

    def _record_reward(self, reward):
        """Append ``reward`` and keep only the last ``n_td_steps`` rewards."""
        self.reward_history.append(reward)
        self.reward_history = self.reward_history[-self.n_td_steps:]

    def _store_n_step_experience(self, terminal, next_state):
        """Store the oldest tracked state/action with its discounted n-step reward sum.

        Nothing is stored until ``n_td_steps`` rewards have been collected.
        """
        if len(self.reward_history) < self.n_td_steps:
            return
        reward_sum = sum(self.gamma**i * r for i, r in enumerate(self.reward_history))
        # index 0 is the right element because only the necessary history is kept
        self.experience_list.add_experience(
            self.state_history[0], self.action_history[0], reward_sum, terminal, next_state
        )

    def _replay(self):
        """Train on ``n_replay_steps`` random batches once the buffer holds a full batch."""
        if len(self.experience_list.experiences) < self.experience_list.batch_size:
            return
        # a frozen copy keeps the bootstrap targets fixed during the replay steps
        frozen_nn = deepcopy(self.nn) if self.freeze_weights else self.nn
        for _ in range(self.n_replay_steps):
            experience_sample = self.experience_list.get_sample()
            updates = self.nn.get_weights_updates(experience_sample, frozen_nn)
            self.nn.weights = self.opt.gradient_step(self.nn.weights, updates)

    def agent_step(self, reward, state):
        """Process a non-terminal transition.

        Records the reward, stores an n-step experience if possible, selects the
        next action (appended to ``action_history``) and trains on replay batches.
        """
        self._record_reward(reward)
        self._store_n_step_experience(terminal=0, next_state=state)
        # select the next action before learning, then learn by replaying experiences
        action = self.select_action(state)
        self._replay()
        # save state and action, keeping only the necessary history
        self.state_history.append(state)
        self.action_history.append(action)
        self.state_history = self.state_history[-self.n_td_steps:]
        self.action_history = self.action_history[-self.n_td_steps:]

    def agent_end(self, reward):
        """Process the terminal transition of an episode.

        The stored experience uses an all-zero dummy next state and ``terminal=1``.
        Its reward sum is discounted exactly like in ``agent_step``.
        """
        dummy_state = np.zeros(self.state_history[-1].shape)  # set dummy terminal state
        self._record_reward(reward)
        self._store_n_step_experience(terminal=1, next_state=dummy_state)
        self._replay()

# Helper Functions

In [None]:
def save_object(path, obj):
    """Pickle ``obj`` to ``path``, creating the parent directory if needed.

    Args:
        path: destination file path.
        obj: any picklable object.
    """
    directory = os.path.dirname(path)
    # dirname is '' for a bare file name, which os.makedirs would reject
    if directory:
        os.makedirs(directory, exist_ok=True)
    # the context manager guarantees the file is flushed and closed
    with open(path, "wb") as file:
        pickle.dump(obj, file)


def load_object(path):
    """Return the object unpickled from the file at ``path``.

    SECURITY: unpickling can execute arbitrary code, so only load files that
    were written by a trusted source.
    """
    with open(path, "rb") as file:
        return pickle.load(file)


def episode(env, agent):
    """Run one (training) episode.

    Every non-terminal transition is passed to ``agent.agent_step``; the terminal
    transition is passed to ``agent.agent_end`` only. Handing it to both would
    make the agent count the terminal reward twice and store a non-terminal
    experience that bootstraps from the terminal observation.

    Args:
        env: environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(observation, reward, terminal, info)``.
        agent: object providing ``agent_start``, ``agent_step``, ``agent_end``
            and an ``action_history`` list whose last entry is the next action.

    Returns:
        Tuple ``(cumulative_reward, episode_n_steps)``.
    """
    cumulative_reward = 0
    episode_n_steps = 0
    agent.agent_start(env.reset())
    while True:
        observation, reward, terminal, info = env.step(agent.action_history[-1])
        cumulative_reward += reward
        episode_n_steps += 1
        if terminal:
            break
        agent.agent_step(reward, observation)
    agent.agent_end(reward)
    return cumulative_reward, episode_n_steps


def run_experiment(env, agent, n_episodes=100, checkpoint_freq=None, checkpoint_path_template=None, results_output_path=None, df_prev_results=None):
    """Train ``agent`` for ``n_episodes`` episodes and collect per-episode results.

    Args:
        env: environment passed on to ``episode``.
        agent: agent passed on to ``episode`` and pickled at checkpoints.
        n_episodes: number of episodes to run in this call.
        checkpoint_freq: checkpoint every this many episodes; ``None`` or 0 disables
            checkpointing.
        checkpoint_path_template: format string receiving the episode number; when
            ``None`` the agent is not saved, but the results are still written.
        results_output_path: CSV destination for the results, or ``None`` to skip.
        df_prev_results: results of an earlier run (columns ``reward`` and
            ``episode_steps``) to continue from; episode numbering resumes after it.

    Returns:
        DataFrame with one row per episode and the columns ``reward`` and
        ``episode_steps`` (including the rows of ``df_prev_results``).
    """
    reward_list = []
    episode_n_steps_list = []
    ep_start = 1
    if df_prev_results is not None:
        # if previous results given, then this means continue training from certain point
        reward_list = df_prev_results['reward'].tolist()
        episode_n_steps_list = df_prev_results['episode_steps'].tolist()
        ep_start = len(df_prev_results) + 1
        print("Continuing from older results from episode {}".format(len(df_prev_results)))

    def build_results():
        """Snapshot the results collected so far as a DataFrame."""
        return pd.DataFrame(data={'reward': reward_list, 'episode_steps': episode_n_steps_list})

    def write_results(df_results):
        """Write the results CSV (if requested), creating its directory first."""
        if results_output_path is None:
            return
        # only filesystem paths have a directory to create; buffers are passed through
        if isinstance(results_output_path, (str, os.PathLike)):
            directory = os.path.dirname(results_output_path)
            if directory:
                os.makedirs(directory, exist_ok=True)
        df_results.to_csv(results_output_path, index=False)

    for i in tqdm(range(ep_start, ep_start + n_episodes)):
        episode_reward, episode_n_steps = episode(env, agent)
        reward_list.append(episode_reward)
        episode_n_steps_list.append(episode_n_steps)
        # a falsy frequency (None or 0) disables checkpoints instead of dividing by zero
        if checkpoint_freq and (i % checkpoint_freq) == 0:
            if checkpoint_path_template is not None:
                save_object(checkpoint_path_template.format(i), agent)
            write_results(build_results())

    df_results = build_results()
    write_results(df_results)
    return df_results


def animate_episode(env, agent):
    """Render one episode of at most 500 steps at roughly 30 frames per second.

    ``agent`` is either the string ``'random'`` (actions are sampled from
    ``env.action_space``) or an object with a ``select_action(observation)``
    method. The environment is closed once the episode is over.
    """
    if agent == 'random':
        def choose_action(_observation):
            return env.action_space.sample()
    else:
        def choose_action(observation):
            return agent.select_action(observation)

    observation = env.reset()
    for _ in range(500):
        observation, _, terminal, _ = env.step(choose_action(observation))
        env.render()
        time.sleep(1/30)
        if terminal:
            break
    env.close()

# Example episodes

In [None]:
# random agent
# Baseline: the policy ignores the observation and samples a random action every step.
simulate_episode(env, lambda x: env.action_space.sample(), width=600, play_type='autoplay')

# Train agent

In [None]:
# Where run_experiment writes the per-episode results (columns: reward, episode_steps).
results_output_path = 'agents/train_results.csv'
# Checkpoint file name; the placeholder is filled with the episode number.
checkpoint_path_template = 'agents/lunarlander_ep{:04}.pickle'
# The agent is pickled every `checkpoint_frequency` episodes.
checkpoint_frequency = 100
n_episodes = 1000
config = {
    # n_td_steps: length of the n-step return; n_replay_steps: replay batches trained per step;
    # freeze_weights: use a deep copy of the network for the bootstrap targets during replay
    'agent_setup': {'n_td_steps': 10, 'n_replay_steps': 4, 'freeze_weights': True},
    # replay buffer capacity and size of each sampled batch
    'exp_list_setup': {'max_list_size': 50000, 'batch_size': 8},
    # weight_initializer: name of a WeightsInit method; gamma: discount factor;
    # tau: softmax temperature used for the expected-sarsa target
    'nn_setup': {'weight_initializer': 'saxe', 'n_hidden_nodes': 256, 'gamma': 0.99, 'tau': 0.001},
    # keyword arguments of the Adam optimizer
    'opt_setup': {'learning_rate': 0.001, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-08}
}


agent = Agent(env, **config)

In [None]:
# Train the agent; checkpoints and the results CSV are written every `checkpoint_frequency`
# episodes, and the results CSV once more after the last episode.
results = run_experiment(
    env, 
    agent, 
    n_episodes=n_episodes, 
    checkpoint_freq=checkpoint_frequency, 
    checkpoint_path_template=checkpoint_path_template,
    results_output_path=results_output_path
)

# Inspect results

In [None]:
# Reload the saved results so this section also works without re-running the training.
# The file has two columns (reward, episode_steps), so it is read as a DataFrame. The
# former `squeeze=True` argument had no effect on a multi-column file and was removed
# from `read_csv` in pandas 2.0, where passing it raises a TypeError.
results = pd.read_csv(results_output_path)

In [None]:
# Rolling-mean view of the training run: reward on the left axis, episode length on the right.
fig, ax1 = plt.subplots(figsize=(15,5))

# both curves are smoothed with the same rolling window
window = 100
min_periods = 10
rolling_kwargs = dict(window=window, min_periods=min_periods)
rol_rewards = results['reward'].rolling(**rolling_kwargs).mean()
rol_steps = results['episode_steps'].rolling(**rolling_kwargs).mean()

# left axis: smoothed reward, with a dashed vertical marker every 100 episodes
rol_rewards.plot(color='green', alpha=0.8, ax=ax1)
episode_marks = range(0, len(results)+1, 100)
ax1.vlines(episode_marks, rol_rewards.min(), rol_rewards.max(), color='black', lw=0.5, ls='--', alpha=0.5)
ax1.set_xlabel('episode number', fontsize=16)
ax1.set_ylabel('cumulative reward', fontsize=16)

# right axis: smoothed number of steps per episode
ax2 = ax1.twinx()
rol_steps.plot(color='blue', alpha=0.8, ax=ax2)
ax2.set_ylabel('number of steps', fontsize=16)

ax1.legend(loc=2, fontsize=14)
ax2.legend(loc=4, fontsize=14);

In [None]:
# Load the checkpoint written after the last training episode. That file only exists when
# n_episodes is a multiple of checkpoint_frequency (checkpoints are saved on those episodes).
agent = load_object(checkpoint_path_template.format(n_episodes))

In [None]:
# trained agent
# Replay one episode with the trained agent, which samples actions from its softmax policy.
simulate_episode(env, agent.select_action, width=800, play_type='controls')