In [1]:
### Resources:

# TD3 Algorithm: https://spinningup.openai.com/en/latest/algorithms/td3.html

# TD3 Paper: https://arxiv.org/abs/1802.09477

In [1]:
import os

import gym
import torch
from torch import nn
import numpy as np
from spinup.utils.run_utils import setup_logger_kwargs
from spinup import td3_pytorch as td3

from models import MLPActorCritic
from train import TD3Trainer

In [2]:
### Sanity-check the MLPActorCritic class on a continuous-action environment

# Environment handed to the actor-critic constructor
env_continuous = gym.make('Pendulum-v0')

# Network configuration; the same sizes and activation are passed twice below
# (presumably once for the actor and once for the critic -- confirm against
# the MLPActorCritic signature in models.py)
hidden_sizes = [64, 64]
hidden_acts = nn.ReLU
action_std = 0.1

ac = MLPActorCritic(
    env_continuous,
    hidden_sizes, hidden_sizes,
    hidden_acts, hidden_acts,
    action_std,
)

# Header line followed by the per-layer input/output shape report
print('TD3 Actor-Critic Summary \n\n')
ac.layer_summary()

TD3 Actor-Critic Summary 


Actor Summary: 

Linear input & output shapes:	 torch.Size([1, 3]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 1])
Tanh input & output shapes:	 torch.Size([1, 1]) torch.Size([1, 1])


Critic Summary: 

Linear input & output shapes:	 torch.Size([1, 4]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 65]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 1])






In [4]:
### Define parameters needed for training

# Actor-Critic module parameters
# These are bundled into `ac_kwargs`, which the custom-trainer cell forwards
# as `ac_kwargs=` to TD3Trainer.train_mod. The Spinning Up cell only reuses
# `hidden_sizes` (as ac_kwargs) and `action_std` (as `act_noise`).
hidden_sizes, hidden_acts = [64, 64], nn.ReLU
action_std = 0.1
ac_kwargs = dict(hidden_sizes_actor=hidden_sizes,
                 hidden_sizes_critic=hidden_sizes,
                 hidden_acts_actor=hidden_acts,
                 hidden_acts_critic=hidden_acts,
                 action_std=action_std)

# Training parameters
# Shared by both training cells so the two runs use identical settings.
seed = 0
steps_per_epoch, epochs = 4000, 50
# Passed as `replay_size` to Spinning Up td3 and as `buf_size` to train_mod.
buf_size = 1000000
gamma, polyak = 0.99, 0.995
pi_lr, q_lr = 1e-3, 1e-3
batch_size = 100
# `learning_starts` is passed as `update_after` to Spinning Up td3.
start_steps, learning_starts = 10000, 1000
target_noise, noise_clip = 0.2, 0.5
policy_delay = 2
update_every, num_test_episodes = 50, 10
max_ep_len, save_freq = 1000, 10

# Experiment parameters
# Root directory handed to setup_logger_kwargs as `data_dir`. Set the
# TD3_DATA_DIR environment variable to relocate the logs on another machine;
# when it is unset or empty the original location is used unchanged.
data_dir = (os.environ.get('TD3_DATA_DIR')
            or '/home/sherif/user/python/DRL/data/td3')

In [5]:
### Train with the Spinning Up reference TD3 implementation

# Experiment name used for this run's logger configuration
exp_name_custom = 'td3_spinup'

# Environment factory passed to td3 as `env_fn`
env_fn = lambda: gym.make('Pendulum-v0')

logger_kwargs = setup_logger_kwargs(
    exp_name=exp_name_custom,
    data_dir=data_dir,
    seed=seed,
)

td3(
    # environment and networks
    env_fn=env_fn,
    ac_kwargs={'hidden_sizes': hidden_sizes},
    seed=seed,
    # run length and replay buffer
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    replay_size=buf_size,
    # optimisation
    gamma=gamma,
    polyak=polyak,
    pi_lr=pi_lr,
    q_lr=q_lr,
    batch_size=batch_size,
    # exploration / update schedule
    start_steps=start_steps,
    update_after=learning_starts,
    update_every=update_every,
    # noise settings (action_std doubles as the exploration noise here)
    act_noise=action_std,
    target_noise=target_noise,
    noise_clip=noise_clip,
    policy_delay=policy_delay,
    # evaluation and logging
    num_test_episodes=num_test_episodes,
    max_ep_len=max_ep_len,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)

[32;1mLogging data to /home/sherif/user/python/DRL/data/td3/td3_spinup/td3_spinup_s0/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            64,
            64
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x799759dbe950>",
    "epochs":	50,
    "exp_name":	"td3_spinup",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x799759b0f710>":	{
            "epoch_dict":	{},
            "exp_name":	"td3_spinup",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"/home/sherif/user/python/DRL/data/td3/td3_spinup/td3_spinup_s0",
            "output_file":	{
                "<_io.TextIOWrapper name='/home/sherif/user/python/DRL/data/td3/td3_spinup/td3_spinup_s0/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode":	"w"
                

In [5]:
### Train with the custom TD3 implementation

# Experiment name used for this run's logger configuration
exp_name_custom = 'td3_custom'

# Environment factory passed to the trainer as its first argument
env_fn = lambda: gym.make('Pendulum-v0')

logger_kwargs = setup_logger_kwargs(
    exp_name=exp_name_custom,
    data_dir=data_dir,
    seed=seed,
)

trainer = TD3Trainer()
trainer.train_mod(
    env_fn,
    # actor-critic class and its constructor kwargs (action_std travels in ac_kwargs)
    ac=MLPActorCritic,
    ac_kwargs=ac_kwargs,
    seed=seed,
    # run length and replay buffer
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    buf_size=buf_size,
    # optimisation
    gamma=gamma,
    polyak=polyak,
    pi_lr=pi_lr,
    q_lr=q_lr,
    batch_size=batch_size,
    # exploration / update schedule
    start_steps=start_steps,
    learning_starts=learning_starts,
    update_every=update_every,
    # target-policy noise settings
    target_noise=target_noise,
    noise_clip=noise_clip,
    policy_delay=policy_delay,
    # evaluation and logging
    num_test_episodes=num_test_episodes,
    max_ep_len=max_ep_len,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)

[32;1mLogging data to /home/sherif/user/python/DRL/data/td3/td3_custom/td3_custom_s0/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac":	"MLPActorCritic",
    "ac_kwargs":	{
        "action_std":	0.1,
        "hidden_acts_actor":	"ReLU",
        "hidden_acts_critic":	"ReLU",
        "hidden_sizes_actor":	[
            64,
            64
        ],
        "hidden_sizes_critic":	[
            64,
            64
        ]
    },
    "ac_mod":	{
        "MLPActorCritic(\n  (actor): MLPActor(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=3, out_features=64, bias=True)\n      (activation_1): ReLU()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=True)\n      (activation_2): ReLU()\n      (output): Linear(in_features=64, out_features=1, bias=True)\n      (output_act): Tanh()\n    )\n    (net_target): Sequential(\n      (hidden_1): Linear(in_features=3, out_features=64, bias=True)\n      (activation_1): ReLU()\n      (hidden_2): Linear(in_features=64, out_