# Testing the PPO Algorithm
## (gym env, no imitation)

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym
import pybullet_envs

import torch
from ppo import Agent

All the policy setup is here. It needs to match the parameters used to train the model originally.

In [2]:
# Identifier of the saved policy: reused below as the video run name passed to
# make_env and as the file stem of the checkpoint under policies/.
policy_name = "ppo_policy_6"

def make_env(gym_id, seed, idx, capture_video, run_name):
    """Return a zero-argument factory that builds one wrapped, seeded environment.

    Nothing is constructed until the returned callable is invoked, so callers
    can hand the factory to code that creates the environment later.

    Args:
        gym_id: Environment id forwarded to ``gym.make``.
        seed: Seed applied to the environment and to its action and
            observation spaces.
        idx: Index of this environment; video recording is only enabled
            for index 0.
        capture_video: When truthy (and ``idx == 0``) the environment is
            wrapped in ``RecordVideo``.
        run_name: Used to build the ``videos/{run_name}`` argument given to
            ``RecordVideo``.

    Returns:
        A callable taking no arguments that returns the wrapped environment.
    """

    def clip_observation(obs):
        # Bound observations to [-10, 10] after normalization.
        return np.clip(obs, -10, 10)

    def clip_reward(reward):
        # Bound rewards to [-10, 10] after normalization.
        return np.clip(reward, -10, 10)

    def thunk():
        wrappers = gym.wrappers
        env = wrappers.RecordEpisodeStatistics(gym.make(gym_id))

        # Only the first environment records video.
        if capture_video and idx == 0:
            env = wrappers.RecordVideo(env, f"videos/{run_name}")

        env = wrappers.ClipAction(env)

        # NOTE(review): the normalization wrappers are constructed fresh here;
        # nothing in this function restores statistics from training --
        # confirm that is intended when evaluating a saved policy.
        env = wrappers.NormalizeObservation(env)
        env = wrappers.TransformObservation(env, clip_observation)
        env = wrappers.NormalizeReward(env)
        env = wrappers.TransformReward(env, clip_reward)

        # Seed the environment itself and both of its spaces.
        env.seed(seed)
        for space in (env.action_space, env.observation_space):
            space.seed(seed)
        return env

    return thunk


# Vectorized wrapper around a single HalfCheetah environment (seed 1, video
# capture on, recordings filed under the policy name).
env_fns = [
    make_env("HalfCheetahBulletEnv-v0", 1 + env_idx, env_idx, True, policy_name)
    for env_idx in range(1)
]
envs = gym.vector.SyncVectorEnv(env_fns)

# Flattened sizes of a single environment's observation and action spaces.
obs_dim = np.array(envs.single_observation_space.shape).prod()
act_dim = np.array(envs.single_action_space.shape).prod()

# Initialization scale shared by the hidden linear layers of both networks.
hidden_std = np.sqrt(2)

# Policy network: obs_dim -> 64 -> 64 -> act_dim with Tanh activations.
policy_arch = [
    {'Layer': 'Linear', 'Input': obs_dim, 'Output': 64, 'std': hidden_std},
    {'Layer': 'Tanh'},
    {'Layer': 'Linear', 'Input': 64, 'Output': 64, 'std': hidden_std},
    {'Layer': 'Tanh'},
    {'Layer': 'Linear', 'Input': 64, 'Output': act_dim, 'std': 0.01},
]

# Value network: obs_dim -> 64 -> 64 -> 1 with Tanh activations.
value_arch = [
    {'Layer': 'Linear', 'Input': obs_dim, 'Output': 64, 'std': hidden_std},
    {'Layer': 'Tanh'},
    {'Layer': 'Linear', 'Input': 64, 'Output': 64, 'std': hidden_std},
    {'Layer': 'Tanh'},
    {'Layer': 'Linear', 'Input': 64, 'Output': 1, 'std': 1.0},
]

Instantiate the agent, put it in evaluation mode with `set_eval_mode(True)` so that actions are deterministic, and evaluate on the HalfCheetah environment.

In [3]:
# Build the PPO agent from the policy/value architectures, load the saved
# policy from disk, and switch the agent to evaluation mode.
checkpoint_path = f"policies/{policy_name}.pt"
agent = Agent(envs, policy_arch, value_arch)
agent.load_policy(checkpoint_path)
agent.set_eval_mode(True)

And now roll out the loaded policy, then do some plotting to examine the results.

In [4]:
# Roll out the loaded policy, stopping after the first step on which any
# sub-environment reports `done`.
obs = envs.reset()

# One flag per sub-environment, sized from the vector env instead of a
# hard-coded length, so it always matches the shape returned by envs.step().
done = np.zeros(envs.num_envs, dtype=bool)

with torch.no_grad():  # no gradients are needed while stepping the policy
    while not done.any():
        # NOTE(review): this assumes get_action_and_value returns a single
        # action tensor (e.g. when the agent is in eval mode); if it returns
        # a tuple, the .cpu() call below will fail -- confirm against ppo.Agent.
        action = agent.get_action_and_value(torch.Tensor(obs))

        # Step result is (obs, reward, done, ...); only obs and done are used.
        obs, _, done, *_ = envs.step(action.cpu().numpy())