In [1]:
from spinup import ppo_pytorch as ppo
import torch
import gym

In [2]:
### Shared configuration: environment factory, network settings and logger target

# Zero-argument factory; the environment is only built when the factory is called.
env_fn = lambda: gym.make('LunarLander-v2')

# Actor-critic keyword arguments: hidden layer sizes and activation class.
ac_kwargs = {
    'hidden_sizes': [64, 64],
    'activation': torch.nn.ReLU,
}

# Output directory and experiment name forwarded to the logger.
logger_kwargs = {
    'output_dir': '/home/sherif/user/python/spinningup/data/ppo',
    'exp_name': 'Lunar_PPO_1',
}

In [3]:
### Train a single PPO agent with the settings defined above

# 250 epochs of 5000 environment steps each, logged via logger_kwargs.
ppo(
    env_fn=env_fn,
    ac_kwargs=ac_kwargs,
    steps_per_epoch=5000,
    epochs=250,
    logger_kwargs=logger_kwargs,
)

[32;1mLogging data to /home/sherif/user/python/spinningup/data/ppo/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            64,
            64
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "clip_ratio":	0.2,
    "env_fn":	"<function <lambda> at 0x7a10bfdf0ea0>",
    "epochs":	250,
    "exp_name":	"Lunar_PPO_1",
    "gamma":	0.99,
    "lam":	0.97,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7a10bfdefac8>":	{
            "epoch_dict":	{},
            "exp_name":	"Lunar_PPO_1",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"/home/sherif/user/python/spinningup/data/ppo",
            "output_file":	{
                "<_io.TextIOWrapper name='/home/sherif/user/python/spinningup/data/ppo/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode":	"w"
                }
            }
        }




---------------------------------------
|             Epoch |               0 |
|      AverageEpRet |            -193 |
|          StdEpRet |            95.1 |
|          MaxEpRet |           -8.13 |
|          MinEpRet |            -406 |
|             EpLen |            96.3 |
|      AverageVVals |          -0.153 |
|          StdVVals |          0.0489 |
|          MaxVVals |         -0.0691 |
|          MinVVals |          -0.398 |
| TotalEnvInteracts |           5e+03 |
|            LossPi |       -2.24e-08 |
|             LossV |        1.31e+04 |
|       DeltaLossPi |         -0.0113 |
|        DeltaLossV |        -2.7e+03 |
|           Entropy |            1.38 |
|                KL |         0.00502 |
|          ClipFrac |           0.067 |
|          StopIter |              79 |
|              Time |            2.36 |
---------------------------------------
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -136 |


KeyboardInterrupt: 

In [11]:
### Using ExperimentGrid to train multiple hyperparameter configurations of the same algorithm sequentially

from spinup.utils.run_utils import ExperimentGrid
import argparse
import sys  # not used by this cell; kept in case other cells depend on it

parser = argparse.ArgumentParser()
parser.add_argument('--cpu', type=int, default=4)
parser.add_argument('--num_runs', type=int, default=3)
# Hand the argument list to parse_args() directly instead of overwriting
# sys.argv: the parsed values are the same, but the kernel's process-wide
# argv is left untouched and the parser never sees the kernel's own flags.
args = parser.parse_args(['--cpu', '4', '--num_runs', '3'])

eg = ExperimentGrid(name='ppo-pyt-bench')
# Empty shorthand with in_name=True: the environment name appears in each
# experiment name without a prefix (e.g. "..._cartpole-v0_...").
eg.add('env_name', 'CartPole-v0', shorthand='', in_name=True)
# One seed per requested run: 0, 10, 20, ...
eg.add('seed', [10*i for i in range(args.num_runs)])
eg.add('epochs', 10)
eg.add('steps_per_epoch', 4000)
# "key:subkey" entries set fields inside the ac_kwargs dict; the two options
# for each of these parameters give 2 x 2 = 4 variants per seed.
eg.add('ac_kwargs:hidden_sizes', [(32, ), (64, 64)], shorthand='hid')
eg.add('ac_kwargs:activation', [torch.nn.Tanh, torch.nn.ReLU], shorthand='')
# Runs every variant in turn with PPO, using args.cpu processes per experiment.
eg.run(ppo, num_cpu=args.cpu)

[32;1mExperimentGrid [ppo-pyt-bench] runs over parameters:
[0m
 [36;1menv_name                                [0m [] 

	CartPole-v0

 [36;1mseed                                    [0m [see] 

	0
	10
	20

 [36;1mepochs                                  [0m [epo] 

	10

 [36;1msteps_per_epoch                         [0m [ste] 

	4000

 [36;1mac_kwargs:hidden_sizes                  [0m [hid] 

	(32,)
	(64, 64)

 [36;1mac_kwargs:activation                    [0m [] 

	Tanh
	ReLU

 Variants, counting seeds:               12
 Variants, not counting seeds:           4


[32;1mPreparing to run the following experiments...[0m

ppo-pyt-bench_cartpole-v0_hid32_relu
ppo-pyt-bench_cartpole-v0_hid32_tanh
ppo-pyt-bench_cartpole-v0_hid64-64_relu
ppo-pyt-bench_cartpole-v0_hid64-64_tanh

[36;1m
Launch delayed to give you a few seconds to review your experiments.

To customize or disable this behavior, change WAIT_BEFORE_LAUNCH in
spinup/user_config.py.



                                                                                

[36;1mRunning experiment:
[0m
ppo-pyt-bench_cartpole-v0_hid32_tanh

[36;1mwith kwargs:
[0m
{
    "ac_kwargs":	{
        "activation":	"Tanh",
        "hidden_sizes":	[
            32
        ]
    },
    "env_name":	"CartPole-v0",
    "epochs":	10,
    "seed":	0,
    "steps_per_epoch":	4000
}







End of experiment.


Plot results from this run with:

[32mpython -m spinup.run plot /home/sherif/user/spinningup/data/ppo-pyt-bench_cartpole-v0_hid32_tanh/ppo-pyt-bench_cartpole-v0_hid32_tanh_s0[0m


Watch the trained agent with:

[32mpython -m spinup.run test_policy /home/sherif/user/spinningup/data/ppo-pyt-bench_cartpole-v0_hid32_tanh/ppo-pyt-bench_cartpole-v0_hid32_tanh_s0[0m







[36;1mRunning experiment:
[0m
ppo-pyt-bench_cartpole-v0_hid32_relu

[36;1mwith kwargs:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            32
        ]
    },
    "env_name":	"CartPole-v0",
    "epochs":	10,
    "seed":	0,
    "steps_per_epoch":	4000
}





In [None]:
### Simulating trained policies (Doesn't work inside Jupyter Notebooks)

from spinup.utils.test_policy import load_policy_and_env, run_policy

# Load from the experiment's own output directory (the output_dir given to the
# logger for the LunarLander PPO run), not from its parent data folder: the
# saved model lives inside the experiment directory.
_, get_action = load_policy_and_env('/home/sherif/user/python/spinningup/data/ppo')
# The first element of the returned pair is ignored; a new LunarLander-v2
# environment is built for the rollout instead.
env = gym.make('LunarLander-v2')
run_policy(env, get_action)