# PPO2 on Solo8 v2 Vanilla for Quadrupedal Standing
Try to get the Solo to stand stably on all four feet.

## Ensure that Tensorflow is using the GPU

In [1]:
import tensorflow as tf
# Query the GPU device name once and reuse it instead of asking TensorFlow
# twice; an empty result is treated as "no GPU available".
gpu_device_name = tf.test.gpu_device_name()
if gpu_device_name:
    print('Default GPU Device: {}'.format(gpu_device_name))
else:
    print("Please install GPU version of TF")

Default GPU Device: /device:GPU:0


## Define Experiment Tags

In [2]:
# Tags handed to the auto trainer (get_synced_config / train) for this run.
TAGS = [
    'solov2vanilla',
    'gpu',
    'home_pos_split_task',
    'unnormalized_actions',
]

## Import required libraries

In [3]:
from gym_solo.envs import solo8v2vanilla
from gym_solo.core import obs
from gym_solo.core import rewards
from gym_solo.core import termination as terms

import gym
import gym_solo



## Parse CLI arguments and register w/ wandb

This experiment uses the auto trainer to handle all of the hyperparameter management.

In [4]:
from auto_trainer import params
import auto_trainer

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



Give the robot a total of 2 seconds of simulation time per episode to learn how to stand.

In [5]:
# Episode length in steps: the episode duration divided by the config's `dt`.
# NOTE(review): presumably `dt` is the simulation step in seconds, making this
# a 2 second episode — confirm against Solo8VanillaConfig.
EPISODE_SECONDS = 2
episode_length = EPISODE_SECONDS / solo8v2vanilla.Solo8VanillaConfig.dt
episode_length

2000.0

Create a basic config

In [6]:
config = params.WandbParameters().parse()

# Experiment-specific values layered on top of whatever `parse()` returned.
# They are applied in this exact order, one attribute at a time.
overrides = {
    'episodes': 12500,
    'episode_length': episode_length,
    'target_torso_height': 0.33698,  # Found experimentally
    'num_workers': 6,
    'eval_frequency': 50,
    'eval_episodes': 3,
    'fps': 15,
}
for attr_name, attr_value in overrides.items():
    setattr(config, attr_name, attr_value)

# Create a 3 second gif.
# NOTE(review): presumably eval_render_freq is the number of steps between
# rendered frames, so episode_length / (seconds * fps) yields ~3 s — confirm.
gif_seconds = 3
config.eval_render_freq = int(config.episode_length / (gif_seconds * config.fps))

config

Namespace(algorithm='PPO2', episode_length=2000.0, episodes=12500, eval_episodes=3, eval_frequency=50, eval_render_freq=44, fps=15, num_workers=6, policy='MlpPolicy', target_torso_height=0.33698)

In [7]:
# Sync the local config with wandb. This rebinds `config` to the value
# returned by get_synced_config and keeps the returned `run`, which is later
# passed to auto_trainer.train.
# NOTE(review): re-running this cell feeds the already-synced config back
# in — confirm that get_synced_config tolerates that.
config, run = auto_trainer.get_synced_config(config, TAGS)
config

[34m[1mwandb[0m: Currently logged in as: [33magupta231[0m (use `wandb login --relogin` to force relogin)


{'episodes': 12500, 'episode_length': 2000.0, 'policy': 'MlpPolicy', 'algorithm': 'PPO2', 'num_workers': 6, 'eval_episodes': 3, 'eval_frequency': 50, 'eval_render_freq': 44, 'fps': 15, 'target_torso_height': 0.33698}

Add the following inputs to the robot / environment:

**Observations**
- TorsoIMU
- Motor encoder current values

**Reward**
- How flat the torso is
- Minimize the amount of control in the joints
- Minimize the amount of torso movement
- Keeping the torso at a given height

**Termination Criteria**
- Terminate after $n$ timesteps

In [8]:
def make_env(length, quad_standing_height):
    """Build a zero-argument factory for a configured Solo8 vanilla env.

    Args:
        length: Value passed to `terms.TimeBasedTermination`.
        quad_standing_height: Value passed to `rewards.TorsoHeightReward`.

    Returns:
        A callable `_init` that, when invoked, creates the
        'solo8vanilla-v0' gym environment (with `normalize_actions=False`),
        registers its observations, termination and weighted rewards, and
        returns it.
    """
    def _init():
        env = gym.make('solo8vanilla-v0',
                       config=solo8v2vanilla.Solo8VanillaConfig(),
                       normalize_actions=False)
        robot = env.robot

        # Observations, registered in this order.
        for observation_cls in (obs.TorsoIMU, obs.MotorEncoder):
            env.obs_factory.register_observation(observation_cls(robot))

        env.termination_factory.register_termination(
            terms.TimeBasedTermination(length))

        # (weight, reward class, extra constructor args after the robot).
        weighted_rewards = (
            (.2, rewards.SmallControlReward, ()),
            (.2, rewards.HorizontalMoveSpeedReward, (0,)),
            (.3, rewards.FlatTorsoReward, ()),
            (.3, rewards.TorsoHeightReward, (quad_standing_height,)),
        )
        for weight, reward_cls, extra_args in weighted_rewards:
            env.reward_factory.register_reward(
                weight, reward_cls(robot, *extra_args))

        return env
    return _init

### Create the Envs
Import the desired vectorized env

In [9]:
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.common.vec_env import VecNormalize

Create training & testing environments

In [10]:
# One environment factory per worker, all built from the same settings.
worker_env_fns = [
    make_env(config.episode_length, config.target_torso_height)
    for _ in range(config.num_workers)
]
train_env = SubprocVecEnv(worker_env_fns)

# Single test environment: build the factory and call it immediately.
test_env = make_env(config.episode_length, config.target_torso_height)()



## Learning
And we're off!

In [None]:
# Start training with the vectorized training envs and the single test env,
# reusing the wandb `run` obtained from get_synced_config. Rebinds `config`
# and `run` to the values returned by the trainer.
# NOTE(review): log_freq=1000 appears to match the n_updates spacing of the
# logged tables below (1, 1000, 2000) — confirm against auto_trainer.train.
model, config, run = auto_trainer.train(train_env, test_env, config, TAGS, 
                                        log_freq=1000, full_logging=False, run=run)





Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




-------------------------------------
| approxkl           | 0.0020415455 |
| clipfrac           | 0.0146484375 |
| explained_variance | 0.00457      |
| fps                | 846          |
| n_updates          | 1            |
| policy_entropy     | 17.030174    |
| policy_loss        | -0.007542923 |
| serial_timesteps   | 128          |
| time_elapsed       | 0.000398     |
| total_timesteps    | 768          |
| value_loss         | 25.362036    |
-------------------------------------





-------------------------------------
| approxkl           | 0.0031291447 |
| clipfrac           | 0.031575523  |
| explained_variance | 0.826        |
| fps                | 2043         |
| n_updates          | 1000         |
| policy_entropy     | 19.829273    |
| policy_loss        | -0.005137819 |
| serial_timesteps   | 128000       |
| time_elapsed       | 478          |
| total_timesteps    | 768000       |
| value_loss         | 2.128748     |
-------------------------------------
--------------------------------------
| approxkl           | 0.0039522056  |
| clipfrac           | 0.03841146    |
| explained_variance | 0.962         |
| fps                | 2125          |
| n_updates          | 2000          |
| policy_entropy     | 22.701862     |
| policy_loss        | -0.0034469045 |
| serial_timesteps   | 256000        |
| time_elapsed       | 934           |
| total_timesteps    | 1536000       |
| value_loss         | 1.6868482     |
--------------------------------------