In [1]:
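### Resources:

# SAC algorithm overview (OpenAI Spinning Up): https://spinningup.openai.com/en/latest/algorithms/sac.html

# SAC paper 1, "Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor": https://arxiv.org/abs/1801.01290

# SAC paper 2, "Soft Actor-Critic Algorithms and Applications": https://arxiv.org/abs/1812.05905

# SAC paper 3, "Learning to Walk via Deep Reinforcement Learning": https://arxiv.org/abs/1812.11103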

In [1]:
import os

import gym
import numpy as np
import torch
from spinup import sac_pytorch as sac
from spinup.utils.run_utils import setup_logger_kwargs
from torch import nn

from models import MLPActorCritic
from train import SACTrainer

In [3]:
### Sanity check: Gaussian sampling / log-likelihood shapes and batch-dim indexing
from torch.distributions import Normal

# Use a float fill value for the mean. With an integer fill_value, current
# PyTorch releases make torch.full return an int64 tensor, which is not a
# valid mean/loc for sampling from (or evaluating) a Normal distribution.
mu = torch.full((32, 1), fill_value=0.0)
log_std = torch.full((32, 1), fill_value=0.5)
std = torch.exp(log_std)  # computed once and shared by the sampler and the distribution
act = torch.normal(mean=mu, std=std)

# Compute log likelihood with torch.distributions; summing over the last
# dimension collapses the (32, 1) per-element log-probs to shape (32,).
dist = Normal(loc=mu, scale=std)
log_prob_1 = dist.log_prob(act).sum(dim=-1)

print(act[0], torch.exp(log_prob_1).shape)

# Indexing with None prepends a dimension of size 1: (10,) -> (1, 10).
obs = torch.arange(10)
print(obs[None, :].shape)

tensor([-1.9270]) torch.Size([32])
torch.Size([1, 10])


In [4]:
### Test MLPActorCritic class for continuous environments

# Environment handed to the actor-critic constructor for this shape check.
env_continuous = gym.make('Pendulum-v0')

# The same hidden layer widths and activation class are reused for every
# size/activation argument below.
hidden_sizes = [64, 64]
hidden_acts = nn.ReLU

# NOTE(review): positional order presumably mirrors the ac_kwargs keys used
# later in this notebook (sizes actor, sizes critic, acts actor, acts critic)
# -- confirm against models.MLPActorCritic.
ac = MLPActorCritic(
    env_continuous,
    hidden_sizes,
    hidden_sizes,
    hidden_acts,
    hidden_acts,
)

print('SAC Actor-Critic Summary \n\n')
ac.layer_summary()

SAC Actor-Critic Summary 


Actor Summary: 

Linear input & output shapes:	 torch.Size([1, 3]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 2])


Critic Summary: 

Linear input & output shapes:	 torch.Size([1, 4]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 1])






In [5]:
### Define parameters needed for training
# Every value here is read by the training cells further down in this notebook.

# Actor-Critic module parameters
hidden_sizes = [64, 64]
hidden_acts = nn.ReLU
ac_kwargs = dict(hidden_sizes_actor=hidden_sizes,
                 hidden_sizes_critic=hidden_sizes,
                 hidden_acts_actor=hidden_acts,
                 hidden_acts_critic=hidden_acts)

# Training parameters
seed = 0
steps_per_epoch = 4000
epochs = 50
buf_size = 1000000
gamma = 0.99
polyak = 0.995
lr = 1e-3
alpha = 0.2
auto_alpha = False
batch_size = 100
start_steps = 10000
learning_starts = 1000  # forwarded as `update_after` to the Spinning Up sac call
update_every = 50
num_test_episodes = 10
max_ep_len = 1000
save_freq = 10

# Experiment parameters
# The output directory can be overridden with the SAC_DATA_DIR environment
# variable so the notebook is runnable on machines other than the author's;
# the original location stays the default so existing runs are unaffected.
data_dir = os.environ.get('SAC_DATA_DIR',
                          '/home/sherif/user/python/DRL/data/sac')

In [5]:
### Start the training process using the Spinning up SAC implementation
# Hyperparameters (seed, gamma, lr, ...) and data_dir come from the
# "Define parameters needed for training" cell, which must be run first.

# Zero-argument callable that creates the environment.
env_fn = lambda: gym.make('Pendulum-v0')
exp_name_custom = 'sac_spinup'

logger_kwargs = setup_logger_kwargs(
    exp_name=exp_name_custom,
    data_dir=data_dir,
    seed=seed,
)

# Only hidden_sizes is forwarded to the Spinning Up actor-critic, and this
# notebook's `learning_starts` is passed under the name `update_after`.
sac(
    env_fn=env_fn,
    ac_kwargs={'hidden_sizes': hidden_sizes},
    seed=seed,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    replay_size=buf_size,
    gamma=gamma,
    polyak=polyak,
    lr=lr,
    alpha=alpha,
    batch_size=batch_size,
    start_steps=start_steps,
    update_after=learning_starts,
    update_every=update_every,
    num_test_episodes=num_test_episodes,
    max_ep_len=max_ep_len,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)

[32;1mLogging data to /home/sherif/user/python/DRL/data/sac/sac_spinup/sac_spinup_s0/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            64,
            64
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.2,
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x7e026191ce18>",
    "epochs":	50,
    "exp_name":	"sac_spinup",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7e026192c2e8>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_spinup",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"/home/sherif/user/python/DRL/data/sac/sac_spinup/sac_spinup_s0",
            "output_file":	{
                "<_io.TextIOWrapper name='/home/sherif/user/python/DRL/data/sac/sac_spinup/sac_spinup_s0/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode":	"w"
                }
  

In [6]:
### Start the training process using the custom SAC implementation (fixed alpha)
# Uses the hyperparameters from the "Define parameters needed for training"
# cell as they are bound when this cell runs (auto_alpha is False there).

# Zero-argument callable that creates the environment.
env_fn = lambda: gym.make('Pendulum-v0')
exp_name_custom = 'sac_custom_fixed'

logger_kwargs = setup_logger_kwargs(
    exp_name=exp_name_custom,
    data_dir=data_dir,
    seed=seed,
)

trainer = SACTrainer()
# The actor-critic class itself (not an instance) is passed together with its
# constructor keyword arguments.
trainer.train_mod(
    env_fn,
    ac=MLPActorCritic,
    ac_kwargs=ac_kwargs,
    seed=seed,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    buf_size=buf_size,
    gamma=gamma,
    polyak=polyak,
    lr=lr,
    alpha=alpha,
    auto_alpha=auto_alpha,
    batch_size=batch_size,
    start_steps=start_steps,
    learning_starts=learning_starts,
    update_every=update_every,
    num_test_episodes=num_test_episodes,
    max_ep_len=max_ep_len,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)

[32;1mLogging data to /home/sherif/user/python/DRL/data/sac/sac_custom_fixed/sac_custom_fixed_s0/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac":	"MLPActorCritic",
    "ac_kwargs":	{
        "hidden_acts_actor":	"ReLU",
        "hidden_acts_critic":	"ReLU",
        "hidden_sizes_actor":	[
            64,
            64
        ],
        "hidden_sizes_critic":	[
            64,
            64
        ]
    },
    "ac_mod":	{
        "MLPActorCritic(\n  (actor): MLPActor(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=3, out_features=64, bias=True)\n      (activation_1): ReLU()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=True)\n      (activation_2): ReLU()\n      (output): Linear(in_features=64, out_features=2, bias=True)\n    )\n  )\n  (critic_1): MLPDQN(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=4, out_features=64, bias=True)\n      (activation_1): ReLU()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=True

In [8]:
### Start the training process using the custom SAC implementation (trained alpha)

# Zero-argument callable that creates the environment.
env_fn = lambda: gym.make('Pendulum-v0')
exp_name_custom = 'sac_custom_trained'

# NOTE: these rebind the notebook-level `lr` and `auto_alpha`, so any cell
# executed after this one sees the new values until the parameter cell is
# run again.
lr = 3e-4
auto_alpha = True

logger_kwargs = setup_logger_kwargs(
    exp_name=exp_name_custom,
    data_dir=data_dir,
    seed=seed,
)

trainer = SACTrainer()
# The actor-critic class itself (not an instance) is passed together with its
# constructor keyword arguments.
trainer.train_mod(
    env_fn,
    ac=MLPActorCritic,
    ac_kwargs=ac_kwargs,
    seed=seed,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    buf_size=buf_size,
    gamma=gamma,
    polyak=polyak,
    lr=lr,
    alpha=alpha,
    auto_alpha=auto_alpha,
    batch_size=batch_size,
    start_steps=start_steps,
    learning_starts=learning_starts,
    update_every=update_every,
    num_test_episodes=num_test_episodes,
    max_ep_len=max_ep_len,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)



[32;1mLogging data to /home/sherif/user/python/DRL/data/sac/sac_custom_trained/sac_custom_trained_s0/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac":	"MLPActorCritic",
    "ac_kwargs":	{
        "hidden_acts_actor":	"ReLU",
        "hidden_acts_critic":	"ReLU",
        "hidden_sizes_actor":	[
            64,
            64
        ],
        "hidden_sizes_critic":	[
            64,
            64
        ]
    },
    "ac_mod":	{
        "MLPActorCritic(\n  (actor): MLPActor(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=3, out_features=64, bias=True)\n      (activation_1): ReLU()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=True)\n      (activation_2): ReLU()\n      (output): Linear(in_features=64, out_features=2, bias=True)\n    )\n  )\n  (critic_1): MLPDQN(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=4, out_features=64, bias=True)\n      (activation_1): ReLU()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=