In [1]:
### Resources:

# PPO Overview: https://spinningup.openai.com/en/latest/algorithms/ppo.html

# PPO Implementation Details: https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/

# PPO Paper: https://arxiv.org/abs/1707.06347

In [1]:
import os

import gym
import torch
from torch import nn
import numpy as np
from spinup import ppo_pytorch as ppo
from spinup.utils.run_utils import setup_logger_kwargs

from models import MLPActorCritic
from train import PPOTrainer

In [2]:
### Smoke-test the Actor-Critic module on a discrete and a continuous action space

# Hidden-layer widths and activation classes shared by both agents built below.
hidden_sizes_actor = [64, 64]
hidden_sizes_critic = [64, 64]
hidden_acts_actor = nn.Tanh
hidden_acts_critic = nn.Tanh

# --- Discrete action space (CartPole) ---
env_discrete = gym.make('CartPole-v1')
agent_discrete = MLPActorCritic(
    env_discrete,
    hidden_sizes_actor,
    hidden_sizes_critic,
    hidden_acts_actor,
    hidden_acts_critic,
)
print('Discrete Agent Summary \n\n')
agent_discrete.layer_summary()

# --- Continuous action space (Pendulum, created with its `g` argument set to 9.81) ---
env_box = gym.make('Pendulum-v0', g=9.81)
agent_continuous = MLPActorCritic(
    env_box,
    hidden_sizes_actor,
    hidden_sizes_critic,
    hidden_acts_actor,
    hidden_acts_critic,
)
print('Continuous Agent Summary \n\n')
agent_continuous.layer_summary()

Discrete Agent Summary 


Actor Summary: 

Linear input & output shapes:	 torch.Size([1, 4]) torch.Size([1, 64])
Tanh input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Tanh input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 2])


Critic Summary: 

Linear input & output shapes:	 torch.Size([1, 4]) torch.Size([1, 64])
Tanh input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Tanh input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 1])


Continuous Agent Summary 


Actor Summary: 

Linear input & output shapes:	 torch.Size([1, 3]) torch.Size([1, 64])
Tanh input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) to



In [4]:
### Define parameters needed for training

# Actor-critic (AC) module parameters.
# Re-declared here so this cell does not depend on the summary cell having been run.
hidden_sizes_actor, hidden_sizes_critic = [64, 64], [64, 64]
hidden_acts_actor, hidden_acts_critic = nn.Tanh, nn.Tanh
# Keyword arguments handed to the custom trainer as `ac_kwargs`.
ac_kwargs = dict(hidden_sizes_actor=hidden_sizes_actor,
                 hidden_sizes_critic=hidden_sizes_critic,
                 hidden_acts_actor=hidden_acts_actor,
                 hidden_acts_critic=hidden_acts_critic)

# Training parameters.
# Each name matches the keyword it is passed under in the training cells;
# `batch_size` is only passed to the custom `PPOTrainer.train_mod` call.
seed = 0
steps_per_epoch, max_ep_len = 4096, 1000
batch_size, epochs = 256, 50
gamma, lam = 0.99, 0.97
clip_ratio, target_kl = 0.2, 0.01
pi_lr, vf_lr = 3e-4, 1e-3
train_pi_iters, train_v_iters = 80, 80
save_freq = 10

# Experiment parameters.
# Root directory passed to `setup_logger_kwargs` as `data_dir`.
# The original notebook hard-coded one user's home directory here; set the
# PPO_DATA_DIR environment variable to redirect output on another machine.
# When it is unset or empty, the original location is used unchanged.
data_dir = os.environ.get('PPO_DATA_DIR') or '/home/sherif/user/python/DRL/data/ppo'

In [10]:
### Baseline: train on the discrete CartPole environment with the Spinup PPO implementation

exp_name_spinup = 'ppo_spinup_discrete'
# Kept as a lambda: the callable is handed to `ppo` as `env_fn`.
env_fn = lambda: gym.make('CartPole-v1')

# Logger settings for this experiment name under `data_dir`.
logger_kwargs = setup_logger_kwargs(exp_name=exp_name_spinup, data_dir=data_dir)

# Only `hidden_sizes` is forwarded to Spinup's actor-critic; the actor sizes are used for it.
ppo(
    env_fn=env_fn,
    ac_kwargs={'hidden_sizes': hidden_sizes_actor},
    seed=seed,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    max_ep_len=max_ep_len,
    gamma=gamma,
    lam=lam,
    clip_ratio=clip_ratio,
    target_kl=target_kl,
    pi_lr=pi_lr,
    vf_lr=vf_lr,
    train_pi_iters=train_pi_iters,
    train_v_iters=train_v_iters,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)

[32;1mLogging data to /home/sherif/user/python/DRL/data/ppo/ppo_spinup_discrete/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            64,
            64
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "clip_ratio":	0.2,
    "env_fn":	"<function <lambda> at 0x72a0d2e5dc80>",
    "epochs":	50,
    "exp_name":	"ppo_spinup_discrete",
    "gamma":	0.99,
    "lam":	0.97,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x72a0d2e43b70>":	{
            "epoch_dict":	{},
            "exp_name":	"ppo_spinup_discrete",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"/home/sherif/user/python/DRL/data/ppo/ppo_spinup_discrete",
            "output_file":	{
                "<_io.TextIOWrapper name='/home/sherif/user/python/DRL/data/ppo/ppo_spinup_discrete/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode":	"w"
                }


In [5]:
### Train on the discrete CartPole environment with the custom PPO implementation

exp_name_custom = 'ppo_custom_discrete'
# Kept as a lambda: the callable is handed to the trainer as `env_fn`.
env_fn = lambda: gym.make('CartPole-v1')

# Logger settings for this experiment name under `data_dir`.
logger_kwargs = setup_logger_kwargs(exp_name=exp_name_custom, data_dir=data_dir)

# The actor-critic class itself (not an instance) is passed, together with its kwargs.
trainer = PPOTrainer()
trainer.train_mod(
    env_fn=env_fn,
    ac=MLPActorCritic,
    ac_kwargs=ac_kwargs,
    seed=seed,
    steps_per_epoch=steps_per_epoch,
    batch_size=batch_size,
    epochs=epochs,
    max_ep_len=max_ep_len,
    gamma=gamma,
    lam=lam,
    clip_ratio=clip_ratio,
    target_kl=target_kl,
    pi_lr=pi_lr,
    vf_lr=vf_lr,
    train_pi_iters=train_pi_iters,
    train_v_iters=train_v_iters,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)

[32;1mLogging data to /home/sherif/user/python/DRL/data/ppo/ppo_custom_discrete/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac":	"MLPActorCritic",
    "ac_kwargs":	{
        "hidden_acts_actor":	"Tanh",
        "hidden_acts_critic":	"Tanh",
        "hidden_sizes_actor":	[
            64,
            64
        ],
        "hidden_sizes_critic":	[
            64,
            64
        ]
    },
    "ac_mod":	{
        "MLPActorCritic(\n  (actor): MLPActorDiscrete(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=4, out_features=64, bias=True)\n      (activation_1): Tanh()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=True)\n      (activation_2): Tanh()\n      (output): Linear(in_features=64, out_features=2, bias=True)\n    )\n  )\n  (critic): MLPCritic(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=4, out_features=64, bias=True)\n      (activation_1): Tanh()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=True)\n     