## Testing the PPO Implementation

### Imports

In [None]:
from constants import LANDER, BIPEDAL
from PPO.PPO_agent import DiscretePPOAgent, ContinuousPPOAgent
from environments.environment import Environment
from environments.vizdoom_environment import VizDoomEnvironment
from utils.results_plotter import plot_test_results

import os
import numpy as np
from time import sleep

In [2]:
# Folder layout: <root>/results/<environment> and <root>/weights/<environment>.
# Each pair below is built by joining the environment folder onto both bases.
root_path = './'
results_path, weights_path = (
    os.path.join(root_path, folder) for folder in ('results', 'weights')
)

# LunarLander
lander_results_path, lander_weights_path = (
    os.path.join(base, LANDER) for base in (results_path, weights_path)
)

# BipedalWalker
bipedal_results_path, bipedal_weights_path = (
    os.path.join(base, BIPEDAL) for base in (results_path, weights_path)
)

# ViZDoom "basic" scenario
basic_results_path, basic_weights_path = (
    os.path.join(base, 'basic') for base in (results_path, weights_path)
)

# ViZDoom "health_gathering" scenario
health_results_path, health_weights_path = (
    os.path.join(base, 'health_gathering') for base in (results_path, weights_path)
)

### Functions to test the environments

In [3]:
def create_test_environment(environment_name, render):
    """Build the environment wrapper that matches `environment_name`.

    The names 'basic' and 'health_gathering' are handled by
    `VizDoomEnvironment`; every other name is handed to the generic
    `Environment` wrapper.

    Args:
        environment_name: Name of the environment/scenario to create.
        render: Forwarded to the wrapper as its `render` keyword argument.

    Returns:
        The newly constructed environment wrapper.
    """
    # Only the selected class name is looked up, exactly like the if/else form.
    env_class = (
        VizDoomEnvironment
        if environment_name in ('basic', 'health_gathering')
        else Environment
    )
    return env_class(environment_name, render=render)

def create_test_agent(environment_name, state_shape, action_space, weights_path):
    """Build the PPO agent in test mode for the given environment.

    Args:
        environment_name: Name of the environment/scenario. 'basic' and
            'health_gathering' get a `DiscretePPOAgent`; anything else gets a
            `ContinuousPPOAgent`.
        state_shape: Forwarded unchanged to the agent's `test` constructor.
        action_space: For the discrete agent this value is passed through
            as-is (used as the number of actions). For the continuous agent it
            must expose `.shape`, `.low` and `.high`.
        weights_path: Forwarded as the first argument of the agent's `test`
            constructor.

    Returns:
        Whatever `DiscretePPOAgent.test` / `ContinuousPPOAgent.test` returns.
    """
    if environment_name not in ('basic', 'health_gathering'):
        # Continuous control: pack (dimension, lower bound, upper bound).
        bounds = (action_space.shape[0], action_space.low, action_space.high)
        return ContinuousPPOAgent.test(weights_path, state_shape, bounds)

    # Discrete control: the action space value itself is the action count.
    return DiscretePPOAgent.test(weights_path, state_shape, action_space)

In [4]:
def test_agent(results_path, weights_path, environment_name, render=True):
    env = create_test_environment(environment_name, render)
    state_shape = env.get_state_shape()
    action_space = env.get_action_space()
    agent = create_test_agent(environment_name, state_shape, action_space, weights_path)

    test_episodes = 100
    episode_rewards = []
    sleep_time = 0.05 if (environment_name == 'basic' or environment_name == 'health_gathering') else 0

    for i in range(test_episodes):
        done = False
        state = env.start()
        total_reward = 0

        while not done:
            action = agent.test_step(state)
            next_state, reward, done = env.step(action)
            total_reward += reward
            state = next_state

            # Slow a little bit the ViZDoom environments to appreciate the visualization
            sleep(sleep_time)

        episode_rewards.append(total_reward)
        print(f'Episode {i+1}/{test_episodes}: Reward: {total_reward:.2f}')

    avg_reward = np.mean(episode_rewards)
    print(f'Avg reward of {test_episodes} episodes: {avg_reward}')
    plot_test_results(episode_rewards, environment_name, results_path)
    env.end()

### Test PPO on LunarLanderContinuous-v2

In [5]:
# 'execution_2' is the LunarLander run selected as the best one.
best_lander_results_path, best_lander_weights_path = (
    os.path.join(base, 'execution_2')
    for base in (lander_results_path, lander_weights_path)
)

test_agent(
    best_lander_results_path,
    best_lander_weights_path,
    LANDER,
    render=False,
)

Episode 1/100: Reward: 310.31
Episode 2/100: Reward: 277.10
Episode 3/100: Reward: 273.31
Episode 4/100: Reward: 261.94
Episode 5/100: Reward: 284.80
Episode 6/100: Reward: 226.71
Episode 7/100: Reward: 286.23
Episode 8/100: Reward: 273.18
Episode 9/100: Reward: 292.83
Episode 10/100: Reward: 284.18
Episode 11/100: Reward: 270.04
Episode 12/100: Reward: 279.72
Episode 13/100: Reward: 290.73
Episode 14/100: Reward: 242.16
Episode 15/100: Reward: 243.59
Episode 16/100: Reward: 304.38
Episode 17/100: Reward: 244.41
Episode 18/100: Reward: 296.36
Episode 19/100: Reward: 307.58
Episode 20/100: Reward: 306.25
Episode 21/100: Reward: 270.45
Episode 22/100: Reward: 295.65
Episode 23/100: Reward: 286.25
Episode 24/100: Reward: 223.10
Episode 25/100: Reward: 302.86
Episode 26/100: Reward: 287.06
Episode 27/100: Reward: 272.32
Episode 28/100: Reward: 243.00
Episode 29/100: Reward: 274.29
Episode 30/100: Reward: 304.58
Episode 31/100: Reward: 285.99
Episode 32/100: Reward: 295.91
Episode 33/100: R

### Test PPO on BipedalWalker-v3

In [6]:
# First BipedalWalker execution: its weights and results come from the run
# that learned to walk in the safest way.
bipedal_results_1_path, bipedal_weights_1_path = (
    os.path.join(base, 'execution_1')
    for base in (bipedal_results_path, bipedal_weights_path)
)

test_agent(
    bipedal_results_1_path,
    bipedal_weights_1_path,
    BIPEDAL,
    render=False,
)

Episode 1/100: Reward: 316.29
Episode 2/100: Reward: 84.69
Episode 3/100: Reward: 317.45
Episode 4/100: Reward: 317.74
Episode 5/100: Reward: 316.13
Episode 6/100: Reward: 316.14
Episode 7/100: Reward: 316.38
Episode 8/100: Reward: 317.24
Episode 9/100: Reward: 316.96
Episode 10/100: Reward: 317.09
Episode 11/100: Reward: -80.62
Episode 12/100: Reward: 317.59
Episode 13/100: Reward: 316.47
Episode 14/100: Reward: 316.81
Episode 15/100: Reward: 122.03
Episode 16/100: Reward: 316.58
Episode 17/100: Reward: 316.45
Episode 18/100: Reward: 315.81
Episode 19/100: Reward: 317.17
Episode 20/100: Reward: 317.62
Episode 21/100: Reward: 156.47
Episode 22/100: Reward: 61.37
Episode 23/100: Reward: 315.95
Episode 24/100: Reward: 318.67
Episode 25/100: Reward: 316.33
Episode 26/100: Reward: 315.56
Episode 27/100: Reward: 316.00
Episode 28/100: Reward: 316.57
Episode 29/100: Reward: -29.92
Episode 30/100: Reward: 77.09
Episode 31/100: Reward: 315.26
Episode 32/100: Reward: 317.00
Episode 33/100: Rewa

In [7]:
# Second BipedalWalker execution: its weights and results come from the run
# that learned to walk better, although not in the fastest way.
bipedal_results_2_path, bipedal_weights_2_path = (
    os.path.join(base, 'execution_2')
    for base in (bipedal_results_path, bipedal_weights_path)
)

test_agent(
    bipedal_results_2_path,
    bipedal_weights_2_path,
    BIPEDAL,
    render=False,
)

Episode 1/100: Reward: 324.56
Episode 2/100: Reward: 322.05
Episode 3/100: Reward: 323.71
Episode 4/100: Reward: 323.95
Episode 5/100: Reward: 324.95
Episode 6/100: Reward: 322.93
Episode 7/100: Reward: 317.38
Episode 8/100: Reward: 323.43
Episode 9/100: Reward: 324.55
Episode 10/100: Reward: 324.49
Episode 11/100: Reward: 325.29
Episode 12/100: Reward: 324.01
Episode 13/100: Reward: 324.19
Episode 14/100: Reward: 323.99
Episode 15/100: Reward: 324.70
Episode 16/100: Reward: 325.28
Episode 17/100: Reward: 324.34
Episode 18/100: Reward: 324.06
Episode 19/100: Reward: 325.11
Episode 20/100: Reward: 323.80
Episode 21/100: Reward: 324.49
Episode 22/100: Reward: 324.14
Episode 23/100: Reward: 325.21
Episode 24/100: Reward: 323.45
Episode 25/100: Reward: 325.75
Episode 26/100: Reward: 324.07
Episode 27/100: Reward: 324.66
Episode 28/100: Reward: 324.99
Episode 29/100: Reward: 323.08
Episode 30/100: Reward: 324.06
Episode 31/100: Reward: 324.32
Episode 32/100: Reward: 324.35
Episode 33/100: R

### Test PPO on ViZDoom basic

In [5]:
# ViZDoom 'basic' scenario, first execution.
basic_results_1_path, basic_weights_1_path = (
    os.path.join(base, 'execution_1')
    for base in (basic_results_path, basic_weights_path)
)

test_agent(
    basic_results_1_path,
    basic_weights_1_path,
    'basic',
    render=False,
)

Episode 1/100: Reward: 95.00
Episode 2/100: Reward: 79.00
Episode 3/100: Reward: 79.00
Episode 4/100: Reward: 95.00
Episode 5/100: Reward: 75.00
Episode 6/100: Reward: 95.00
Episode 7/100: Reward: 95.00
Episode 8/100: Reward: 95.00
Episode 9/100: Reward: 75.00
Episode 10/100: Reward: 79.00
Episode 11/100: Reward: 87.00
Episode 12/100: Reward: 95.00
Episode 13/100: Reward: 79.00
Episode 14/100: Reward: 71.00
Episode 15/100: Reward: 91.00
Episode 16/100: Reward: 95.00
Episode 17/100: Reward: 95.00
Episode 18/100: Reward: 83.00
Episode 19/100: Reward: 95.00
Episode 20/100: Reward: 95.00
Episode 21/100: Reward: 87.00
Episode 22/100: Reward: 95.00
Episode 23/100: Reward: 64.00
Episode 24/100: Reward: 71.00
Episode 25/100: Reward: 67.00
Episode 26/100: Reward: 95.00
Episode 27/100: Reward: 87.00
Episode 28/100: Reward: 67.00
Episode 29/100: Reward: 95.00
Episode 30/100: Reward: 95.00
Episode 31/100: Reward: 95.00
Episode 32/100: Reward: 95.00
Episode 33/100: Reward: 71.00
Episode 34/100: Rew

### Test PPO on ViZDoom health_gathering

In [6]:
# ViZDoom 'health_gathering' scenario, first execution.
health_results_1_path, health_weights_1_path = (
    os.path.join(base, 'execution_1')
    for base in (health_results_path, health_weights_path)
)

test_agent(
    health_results_1_path,
    health_weights_1_path,
    'health_gathering',
    render=False,
)

Episode 1/100: Reward: 2100.00
Episode 2/100: Reward: 2100.00
Episode 3/100: Reward: 2100.00
Episode 4/100: Reward: 2100.00
Episode 5/100: Reward: 2100.00
Episode 6/100: Reward: 2100.00
Episode 7/100: Reward: 2100.00
Episode 8/100: Reward: 2100.00
Episode 9/100: Reward: 2100.00
Episode 10/100: Reward: 2100.00
Episode 11/100: Reward: 2100.00
Episode 12/100: Reward: 2100.00
Episode 13/100: Reward: 2100.00
Episode 14/100: Reward: 2100.00
Episode 15/100: Reward: 2100.00
Episode 16/100: Reward: 2100.00
Episode 17/100: Reward: 2100.00
Episode 18/100: Reward: 2100.00
Episode 19/100: Reward: 2100.00
Episode 20/100: Reward: 2100.00
Episode 21/100: Reward: 2100.00
Episode 22/100: Reward: 2100.00
Episode 23/100: Reward: 2100.00
Episode 24/100: Reward: 2100.00
Episode 25/100: Reward: 2100.00
Episode 26/100: Reward: 2100.00
Episode 27/100: Reward: 2100.00
Episode 28/100: Reward: 2100.00
Episode 29/100: Reward: 2100.00
Episode 30/100: Reward: 2100.00
Episode 31/100: Reward: 2100.00
Episode 32/100: R