In [1]:
import os
import gym
import torch
from torch import nn
import numpy as np
from spinup import vpg_pytorch as vpg
from spinup.utils.run_utils import setup_logger_kwargs

from models import MLPActorDiscrete, MLPActorContinuous, MLPCritic, MLPActorCritic
from train import VPGTrainer

In [2]:
### Test out the MLP Actor for discrete action spaces

# Network spec: 4-dim observations, 2 discrete actions, two hidden layers of
# 32 units. The activations are given as a list, one entry per hidden layer
# (Tanh after the first, ReLU after the second).
obs_dim = 4
act_dim = 2
hidden_sizes = [32, 32]
hidden_acts = [nn.Tanh, nn.ReLU]

actor = MLPActorDiscrete(obs_dim, act_dim, hidden_sizes, hidden_acts)
actor.layer_summary()  # prints the input/output shapes of every layer

# Feed a single random observation through the actor, then ask it for the
# log-probability of the action it returned.
obs = np.random.rand(obs_dim)
obs_tensor = torch.as_tensor(obs, dtype=torch.float32)
act = actor(obs_tensor)
act_tensor = torch.as_tensor(act, dtype=torch.float32)
log_prob = actor.log_prob_no_grad(act_tensor)
print(f'Action for obs = {obs} is {act} with prob {np.exp(log_prob)}')


Linear input & output shapes:	 torch.Size([1, 4]) torch.Size([1, 32])
Tanh input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
Linear input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
ReLU input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
Linear input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 2])


Action for obs = [0.30993708 0.39984592 0.07399921 0.61547044] is 1 with prob 0.5192795991897583


In [3]:
### Test out the MLP Actor for continuous action spaces

# Network spec: 4-dim observations, 2-dim continuous actions, two hidden
# layers of 32 units. A single activation class is passed here (instead of a
# list), and the layer summary shows it applied after each hidden layer.
obs_dim = 4
act_dim = 2
hidden_sizes = [32, 32]
hidden_acts = nn.ReLU

actor = MLPActorContinuous(obs_dim, act_dim, hidden_sizes, hidden_acts)
actor.layer_summary()  # prints the input/output shapes of every layer

# Feed a single random observation through the actor, then ask it for the
# log-probability of the action it returned.
obs = np.random.rand(obs_dim)
obs_tensor = torch.as_tensor(obs, dtype=torch.float32)
act = actor(obs_tensor)
act_tensor = torch.as_tensor(act, dtype=torch.float32)
log_prob = actor.log_prob_no_grad(act_tensor)
print(f'Action for obs = {obs} is {act} with prob {np.exp(log_prob)}')

Linear input & output shapes:	 torch.Size([1, 4]) torch.Size([1, 32])
ReLU input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
Linear input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
ReLU input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
Linear input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 2])


Action for obs = [0.33377039 0.2496791  0.89045715 0.4089222 ] is [0.46185857 0.6891026 ] with prob 0.2244461327791214


In [4]:
### Test out the MLP Critic

# Critic spec: 2-dim observations, hidden layers of 64 and 32 units, ReLU
# after each hidden layer.
obs_dim, hidden_sizes, hidden_acts = 2, [64, 32], nn.ReLU

critic = MLPCritic(obs_dim, hidden_sizes, hidden_acts)
critic.layer_summary()  # prints the input/output shapes of every layer

# Evaluate the critic on one observation drawn from a standard normal.
obs = np.random.randn(obs_dim)
print(
    f'Value function approximation for observation = {obs} is {critic(torch.as_tensor(obs, dtype=torch.float32))}'
)

Linear input & output shapes:	 torch.Size([1, 2]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 32])
ReLU input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
Linear input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 1])


Value function approximation for observation = [ 0.48242617 -1.0727886 ] is 0.13343486189842224


In [5]:
### Test out the MLP Actor-Critic for both Discrete and Box (continuous) action spaces

# Both agents share the same network spec: a two-layer ReLU actor and a
# single-layer Sigmoid critic.
hidden_sizes_actor = [32, 32]
hidden_sizes_critic = [64]
hidden_acts_actor = nn.ReLU
hidden_acts_critic = nn.Sigmoid
net_spec = (hidden_sizes_actor, hidden_sizes_critic,
            hidden_acts_actor, hidden_acts_critic)

# Discrete action space: CartPole.
env_discrete = gym.make('CartPole-v1')
agent_discrete = MLPActorCritic(env_discrete, *net_spec)
print('Discrete Agent Summary \n\n')
agent_discrete.layer_summary()

# Box (continuous) action space: Pendulum, created with g=9.81.
env_box = gym.make('Pendulum-v0', g=9.81)
agent_continuous = MLPActorCritic(env_box, *net_spec)
print('Continuous Agent Summary \n\n')
agent_continuous.layer_summary()



Discrete Agent Summary 


Actor Summary: 

Linear input & output shapes:	 torch.Size([1, 4]) torch.Size([1, 32])
ReLU input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
Linear input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
ReLU input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
Linear input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 2])


Critic Summary: 

Linear input & output shapes:	 torch.Size([1, 4]) torch.Size([1, 64])
Sigmoid input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 1])


Continuous Agent Summary 


Actor Summary: 

Linear input & output shapes:	 torch.Size([1, 3]) torch.Size([1, 32])
ReLU input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
Linear input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
ReLU input & output shapes:	 torch.Size([1, 32]) torch.Size([1, 32])
Linear input & output shapes:	 torch.Size([1, 32])



In [8]:
### Setup parameters for the actual VPG module and the environment for discrete action spaces
#
# Everything a reader might want to tune lives in this cell; the two training
# cells below (Spinup baseline and custom VPGTrainer) read these names.

# AC module parameters: hidden layer sizes and activation classes for the
# actor and the critic networks.
hidden_sizes_actor, hidden_sizes_critic = [64, 64], [64, 64]
hidden_acts_actor, hidden_acts_critic = nn.Tanh, nn.Tanh


# Training parameters (forwarded as-is to the training calls below).
seed = 0
steps_per_epoch, max_ep_len, num_epochs = 4000, 1000, 50
gamma, lam = 0.99, 0.97
pi_lr, val_lr, train_v_iters = 3e-4, 1e-3, 80

# Experiment parameters
# The output directory used to be a hard-coded absolute path that only exists
# on the original author's machine. It can now be overridden by setting the
# VPG_DATA_DIR environment variable before starting the kernel; when the
# variable is unset the original path is used, so existing runs are unchanged.
data_dir = os.environ.get('VPG_DATA_DIR', '/home/sherif/user/python/DRL/data/vpg')
exp_name = 'vpg_custom_discrete_2'
save_freq = 10

In [9]:
### Perform training experiment with the Spinup VPG implementation
#
# Baseline run: trains Spinning Up's own VPG on CartPole with the training
# parameters from the configuration cell above.

# Environment factory (vpg is handed a callable, not an environment instance).
env_fn = lambda : gym.make('CartPole-v1')

# NOTE(review): the activation is hard-coded here rather than taken from
# hidden_acts_actor; it currently matches the config cell (both are Tanh), so
# keep the two in sync by hand if the config changes.
ac_kwargs = dict(hidden_sizes=hidden_sizes_actor, activation=torch.nn.Tanh)
# Plain string literal: the original used an f-string with no placeholders.
logger_kwargs = setup_logger_kwargs(exp_name='vpg_spinup_discrete_2', data_dir=data_dir)

# save_freq now comes from the configuration cell instead of a duplicated
# literal 10, so this baseline and the custom run always checkpoint at the
# same interval.
vpg(env_fn=env_fn, ac_kwargs=ac_kwargs, seed=seed, steps_per_epoch=steps_per_epoch,
    epochs=num_epochs, gamma=gamma, pi_lr=pi_lr, vf_lr=val_lr, train_v_iters=train_v_iters,
    lam=lam, max_ep_len=max_ep_len, logger_kwargs=logger_kwargs, save_freq=save_freq)

[32;1mLogging data to /home/sherif/user/python/DRL/data/vpg/vpg_spinup_discrete_2/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"Tanh",
        "hidden_sizes":	[
            64,
            64
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "env_fn":	"<function <lambda> at 0x7d5439c97c80>",
    "epochs":	50,
    "exp_name":	"vpg_spinup_discrete_2",
    "gamma":	0.99,
    "lam":	0.97,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7d5438c9b518>":	{
            "epoch_dict":	{},
            "exp_name":	"vpg_spinup_discrete_2",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"/home/sherif/user/python/DRL/data/vpg/vpg_spinup_discrete_2",
            "output_file":	{
                "<_io.TextIOWrapper name='/home/sherif/user/python/DRL/data/vpg/vpg_spinup_discrete_2/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode":	"w"
 



---------------------------------------
|             Epoch |               0 |
|      AverageEpRet |            21.5 |
|          StdEpRet |            11.2 |
|          MaxEpRet |              87 |
|          MinEpRet |               8 |
|             EpLen |            21.5 |
|      AverageVVals |          -0.259 |
|          StdVVals |           0.067 |
|          MaxVVals |         -0.0174 |
|          MinVVals |          -0.438 |
| TotalEnvInteracts |           4e+03 |
|            LossPi |         0.00776 |
|             LossV |             255 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -146 |
|           Entropy |           0.689 |
|                KL |        4.77e-10 |
|              Time |            2.34 |
---------------------------------------
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |              21 |
|          StdEpRet |            10.1 |
|          MaxEpRet |              66 |


In [None]:
### Start the training process using the custom VPG implementation

# Environment factory handed to the trainer (a callable, not an instance).
env_discrete_fn = lambda: gym.make('CartPole-v1')

# Keyword arguments for the actor-critic class, taken from the config cell.
ac_kwargs = {
    'hidden_sizes_actor': hidden_sizes_actor,
    'hidden_sizes_critic': hidden_sizes_critic,
    'hidden_acts_actor': hidden_acts_actor,
    'hidden_acts_critic': hidden_acts_critic,
}
logger_kwargs = setup_logger_kwargs(exp_name=exp_name, seed=seed, data_dir=data_dir)

# Run the custom trainer; steps_per_epoch is passed as the buffer size.
trainer = VPGTrainer()
trainer.train_mod(
    env_discrete_fn,
    ac=MLPActorCritic,
    ac_kwargs=ac_kwargs,
    seed=seed,
    buf_size=steps_per_epoch,
    max_ep_len=max_ep_len,
    num_epochs=num_epochs,
    gamma=gamma,
    lam=lam,
    pi_lr=pi_lr,
    val_lr=val_lr,
    train_v_iters=train_v_iters,
    save_freq=save_freq,
    logger_kwargs=logger_kwargs,
)