# Autotrainer PPO2 on Gym Pendulum
Test the autotrainer on OpenAI Gym's Pendulum environment, a continuous-control task that is considered an easy environment to solve.

## Ensure that Tensorflow is using the GPU

In [1]:
import tensorflow as tf
# Query the device name once and reuse it; per the TensorFlow docs an empty
# string is returned when no GPU device is available.
gpu_device_name = tf.test.gpu_device_name()
if gpu_device_name:
    print('Default GPU Device: {}'.format(gpu_device_name))
else:
    print("Please install GPU version of TF")

Default GPU Device: /device:GPU:0


## Define Experiment Tags

In [2]:
# Tags passed along with this experiment's config and training run.
TAGS = [
    'gym-pendulum',
    'gpu',
]

## Parse CLI arguments and register w/ wandb

This experiment uses the autotrainer to manage all of the hyperparameters.

In [3]:
from auto_trainer import params
import auto_trainer

# Override the trainer module's PROJECT_NAME before the config is synced below.
# NOTE(review): presumably this is the wandb project the run is logged under — confirm in auto_trainer.
auto_trainer.trainer.PROJECT_NAME = 'autotrainer-gym-baselines'

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
# Start from the parsed parameters, then pin the settings for this experiment.
config = params.WandbParameters().parse()

experiment_settings = {
    'episodes': 10000,
    'episode_length': 750,
    'num_workers': 8,
    'eval_frequency': 25,
    'eval_episodes': 5,
    'fps': 20,
}
for setting_name, setting_value in experiment_settings.items():
    setattr(config, setting_name, setting_value)

# Create a 4 second gif: 4 * fps is the number of frames in a 4 second clip.
frames_in_gif = 4 * config.fps
config.eval_render_freq = int(config.episode_length / frames_in_gif)

config

Namespace(algorithm='PPO2', episode_length=750, episodes=10000, eval_episodes=5, eval_frequency=25, eval_render_freq=9, fps=20, num_workers=8, policy='MlpPolicy')

In [5]:
# Rebind `config` to the synced version returned by the autotrainer and keep
# `run` so it can be handed to `auto_trainer.train` later in the notebook.
# NOTE(review): presumably this registers the run (with TAGS) on wandb — confirm in auto_trainer.
config, run = auto_trainer.get_synced_config(config, TAGS)
# Bare expression so the notebook displays the synced config.
config

[34m[1mwandb[0m: Currently logged in as: [33magupta231[0m (use `wandb login --relogin` to force relogin)


{'episodes': 10000, 'episode_length': 750, 'policy': 'MlpPolicy', 'algorithm': 'PPO2', 'num_workers': 8, 'eval_episodes': 5, 'eval_frequency': 25, 'eval_render_freq': 9, 'fps': 20}

## Create a virtual display for environment rendering

In [6]:
import pyvirtualdisplay
# Start an off-screen (visible=False) 1400x900 virtual display for environment rendering.
# NOTE(review): the name `display` shadows IPython's built-in `display()` helper
# in this notebook's namespace — consider renaming if rich display is needed later.
display = pyvirtualdisplay.Display(visible=False, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7fd20f950910>

## Create a normalized wrapper for the Pendulum Environment
The vanilla Pendulum environment has its action and observation spaces outside of $[-1, 1]$. Create a simple wrapper to apply min/max scaling to the respective values. Note that the default Pendulum environment doesn't have a termination state, so artificially create a termination condition.

In [7]:
from gym.envs.classic_control import pendulum
from gym import spaces
import gym

class NormalizedPendulum(pendulum.PendulumEnv):
    """Pendulum environment with action and observation spaces declared as [-1, 1].

    Actions are multiplied by ``max_torque`` before being forwarded to the
    parent environment, and observations are divided by the upper bound of the
    parent's original observation space. Because the stock environment has no
    termination state, ``done`` is forced to ``True`` whenever the number of
    steps since the last reset is a multiple of ``length``.
    """

    def __init__(self, length: int = 1000):
        """Build the environment.

        Args:
            length: Number of steps after which an episode is reported as done.
        """
        super().__init__()
        # Remember the parent's bounds before replacing them: `_get_obs`
        # divides by their upper limit.
        self.unscaled_obs_space = self.observation_space
        self.action_space = spaces.Box(low=-1., high=1., shape=(1,))
        self.observation_space = spaces.Box(low=-1., high=1., shape=(3,))

        self._length = length
        self._cnt = 0

    def reset(self):
        """Zero the step counter and return the parent's reset observation."""
        self._cnt = 0
        return super().reset()

    def step(self, u):
        """Advance the simulation by one step.

        Args:
            u: Action expressed in the normalized [-1, 1] range.

        Returns:
            The ``(obs, reward, done, info)`` tuple from the parent step, with
            ``done`` replaced by ``True`` when the step limit is reached.
        """
        self._cnt += 1

        # Scale the normalized action back up by the parent's torque limit.
        obs, reward, done, info = super().step(u * self.max_torque)
        hit_limit = self._cnt % self._length == 0
        return obs, reward, (True if hit_limit else done), info

    def _get_obs(self):
        """Return the parent's observation divided by its original upper bound."""
        raw_obs = super()._get_obs()
        upper_bound = self.unscaled_obs_space.high
        return raw_obs / upper_bound

Create the environment generator

In [9]:
def make_env(length):
    """Return a zero-argument factory that builds a ``NormalizedPendulum``.

    Nothing is constructed until the returned callable is invoked; every
    invocation creates a fresh environment with the given ``length``.
    """
    def build_pendulum():
        return NormalizedPendulum(length)

    return build_pendulum

### Create the Envs
Import the desired vectorized env

In [10]:
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.common.vec_env import VecNormalize

Create training & testing environments

In [11]:
# One environment factory per worker, handed to the vectorized training env.
worker_env_fns = [make_env(config.episode_length) for _ in range(config.num_workers)]
train_env = SubprocVecEnv(worker_env_fns)

# Evaluation uses a single environment built directly from the factory.
test_env = make_env(config.episode_length)()

## Learning
And we're off!

In [None]:
# Launch training, reusing the run created when the config was synced.
model, config, run = auto_trainer.train(
    train_env,
    test_env,
    config,
    TAGS,
    log_freq=250,
    full_logging=False,
    run=run,
)





Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




---------------------------------------
| approxkl           | 1.7475563e-06  |
| clipfrac           | 0.0            |
| explained_variance | 0.0079         |
| fps                | 1759           |
| n_updates          | 1              |
| policy_entropy     | 1.4193635      |
| policy_loss        | -2.7332295e-05 |
| serial_timesteps   | 128            |
| time_elapsed       | 0.00023        |
| total_timesteps    | 1024           |
| value_loss         | 5202.129       |
---------------------------------------





--------------------------------------
| approxkl           | 0.00032426487 |
| clipfrac           | 0.0           |
| explained_variance | 0.00168       |
| fps                | 3712          |
| n_updates          | 250           |
| policy_entropy     | 1.4304162     |
| policy_loss        | -0.0004787963 |
| serial_timesteps   | 32000         |
| time_elapsed       | 147           |
| total_timesteps    | 256000        |
| value_loss         | 2807.2314     |
--------------------------------------
---------------------------------------
| approxkl           | 1.3010788e-05  |
| clipfrac           | 0.0            |
| explained_variance | 0.0001         |
| fps                | 2855           |
| n_updates          | 500            |
| policy_entropy     | 1.4360393      |
| policy_loss        | -8.5928186e-05 |
| serial_timesteps   | 64000          |
| time_elapsed       | 290            |
| total_timesteps    | 512000         |
| value_loss         | 1539.0027      |
-------------