In [1]:
### Resources:

# TD3 Algorithm: https://spinningup.openai.com/en/latest/algorithms/td3.html

# TD3 Paper: https://arxiv.org/abs/1802.09477

In [1]:
import gym
import torch
from torch import nn
import numpy as np
from spinup.utils.run_utils import setup_logger_kwargs
from spinup import td3_pytorch as td3

from models import MLPActorCriticTD3
from train import TD3Trainer
from envs import sparse_rew_pendulum, sample_goal_pendulum
from her_strat import FinalStrategy, EpisodeStrategy, FutureStrategy

In [2]:
### Comparing dense and sparse rewards

# --- Dense reward: take a single random action in the stock environment ---
def env_fn():
    """Create a fresh Pendulum-v0 environment."""
    return gym.make('Pendulum-v0')

env = env_fn()
obs = env.reset()
# One-dimensional action sampled from N(0, 0.5); not seeded, so the printed
# dense reward differs from run to run.
act = np.random.normal(loc=0, scale=0.5, size=(1,))
obs, rew, done, _ = env.step(act)
print(f'Dense Rewards: {rew}')

# --- Sparse reward: evaluate a hand-written observation against a goal ---
# presumably obs is (cos(theta), sin(theta), theta_dot) and the goal is the
# (cos, sin) pair of a 45 degree angle -- TODO confirm against envs.py
obs = np.array([1.0, 0.0, 0.3])
goal = np.array([0.7071, 0.7071])
rew = sparse_rew_pendulum(obs, act, goal, eps=np.deg2rad(15))
print(f'Sparse Rewards: {rew}')

Dense Rewards: -8.481493693577573
Sparse Rewards: [-1.00901253]




In [3]:
### Testing the different HER goal selection strategies

# Six random 3-component observations shared by every strategy below.
# presumably goal_mask lists the observation indices that form a goal -- TODO confirm
goal_mask = [0, 1]
obs = np.random.rand(6, 3)
print(f'Observations (x): {obs[:, 0]}\n')

# (title printed before the result, strategy class) pairs. The order matters:
# it fixes both the order of the printed sections and the order in which any
# random sampling inside the strategies is performed.
strategy_demos = (
    ('Final goal selection strategy', FinalStrategy),
    ('Episode goal selection strategy', EpisodeStrategy),
    ('Future goal selection strategy', FutureStrategy),
)

for title, strategy_cls in strategy_demos:
    # Second constructor argument is 4 for every strategy
    # (presumably the number of goals sampled per observation -- TODO confirm).
    strat = strategy_cls(goal_mask, 4)
    goals = strat.get_goals(obs)
    print(title)
    # Show only the first goal component so it can be compared with obs[:, 0].
    print(f'Goal shape: {goals.shape}, Goals (x): {goals[:, :, 0]}\n')

Observations (x): [0.17488444 0.28024612 0.02573273 0.60293543 0.01738222 0.39302828]

Final goal selection strategy
Goal shape: (6, 1, 2), Goals (x): [[0.39302828]
 [0.39302828]
 [0.39302828]
 [0.39302828]
 [0.39302828]
 [0.39302828]]

Episode goal selection strategy
Goal shape: (6, 4, 2), Goals (x): [[0.17488444 0.39302828 0.39302828 0.17488444]
 [0.28024612 0.60293543 0.02573273 0.01738222]
 [0.17488444 0.02573273 0.17488444 0.28024612]
 [0.01738222 0.17488444 0.39302828 0.17488444]
 [0.02573273 0.28024612 0.28024612 0.28024612]
 [0.17488444 0.17488444 0.17488444 0.39302828]]

Future goal selection strategy
Goal shape: (6, 4, 2), Goals (x): [[0.17488444 0.02573273 0.17488444 0.39302828]
 [0.02573273 0.02573273 0.39302828 0.60293543]
 [0.01738222 0.39302828 0.01738222 0.01738222]
 [0.39302828 0.60293543 0.39302828 0.39302828]
 [0.39302828 0.39302828 0.39302828 0.39302828]
 [0.39302828 0.39302828 0.39302828 0.39302828]]



In [4]:
### Test MLPActorCritic class for continuous environments 

# Create the environment in this cell and bind it to `env`, the name that is
# actually handed to the model below. This keeps the cell self-contained: it
# no longer relies on an `env` object left behind by an earlier cell.
env_fn = lambda: gym.make('Pendulum-v0')
env = env_fn()

# Network configuration used only for this shape check.
goal_dim = 2
hidden_sizes, hidden_acts = [64, 64], nn.ReLU
action_std = 0.1

# The same sizes/activations are passed twice
# (presumably actor first, then critic -- TODO confirm against models.py).
ac = MLPActorCriticTD3(env, goal_dim, hidden_sizes, hidden_sizes, 
                       hidden_acts, hidden_acts, action_std)
print('TD3 Actor-Critic Summary \n\n')
ac.layer_summary()

TD3 Actor-Critic Summary 


Actor Summary: 

Linear input & output shapes:	 torch.Size([1, 5]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 1])


Critic Summary: 

Linear input & output shapes:	 torch.Size([1, 6]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 65]) torch.Size([1, 64])
ReLU input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 64])
Linear input & output shapes:	 torch.Size([1, 64]) torch.Size([1, 1])






In [None]:
### Define parameters needed for training

# --- Actor-Critic module parameters ---
hidden_sizes = [64, 64]
hidden_acts = nn.ReLU
action_std = 0.1          # also passed to the Spinning Up run as `act_noise`
ac_kwargs = {
    'hidden_sizes_actor': hidden_sizes,
    'hidden_sizes_critic': hidden_sizes,
    'hidden_acts_actor': hidden_acts,
    'hidden_acts_critic': hidden_acts,
    'action_std': action_std,
}

# --- Environment goal parameters ---
rew_fn = sparse_rew_pendulum
rew_eps = np.deg2rad(15)  # 15 degrees, expressed in radians
goal_fn = sample_goal_pendulum
goal_mask = [0, 1]

# --- HER goal selection strategy parameters ---
her_strat = 'future'
her_k = 4

# --- Training parameters ---
seed = 0
steps_per_epoch = 4000
epochs = 50
buf_size = 1_000_000      # passed to the Spinning Up run as `replay_size`
gamma = 0.99
polyak = 0.995
pi_lr = 1e-3
q_lr = 1e-3
batch_size = 100
start_steps = 10000
learning_starts = 1000    # passed to the Spinning Up run as `update_after`
target_noise = 0.2
noise_clip = 0.5
policy_delay = 2
update_every = 50
num_test_episodes = 10
max_ep_len = 1000
save_freq = 10

# --- Experiment parameters ---
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_dir = '/home/sherif/user/python/DRL/data/her'

In [16]:
### Start the training process using the Spinning up TD3 implementation

# Reference run on the unmodified Pendulum-v0 environment, using the
# hyper-parameters defined in the configuration cell.
env_fn = lambda : gym.make('Pendulum-v0')
exp_name_custom = 'td3_spinup'

logger_kwargs = setup_logger_kwargs(
    exp_name=exp_name_custom,
    data_dir=data_dir,
    seed=seed,
)

td3(
    # environment and networks
    env_fn=env_fn,
    ac_kwargs=dict(hidden_sizes=hidden_sizes),
    seed=seed,
    # run length and replay buffer
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    replay_size=buf_size,
    max_ep_len=max_ep_len,
    # optimisation
    gamma=gamma,
    polyak=polyak,
    pi_lr=pi_lr,
    q_lr=q_lr,
    batch_size=batch_size,
    # exploration and update schedule
    start_steps=start_steps,
    update_after=learning_starts,
    update_every=update_every,
    policy_delay=policy_delay,
    # action / target noise
    act_noise=action_std,
    target_noise=target_noise,
    noise_clip=noise_clip,
    # evaluation and logging
    num_test_episodes=num_test_episodes,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)

[32;1mLogging data to /home/sherif/user/python/DRL/data/her/td3_spinup/td3_spinup_s0/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            64,
            64
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x78ae5d619730>",
    "epochs":	50,
    "exp_name":	"td3_spinup",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x78ae46f17da0>":	{
            "epoch_dict":	{},
            "exp_name":	"td3_spinup",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"/home/sherif/user/python/DRL/data/her/td3_spinup/td3_spinup_s0",
            "output_file":	{
                "<_io.TextIOWrapper name='/home/sherif/user/python/DRL/data/her/td3_spinup/td3_spinup_s0/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode":	"w"
                

In [None]:
### Start the training process using the custom TD3 implementation

# Run the project's own TD3 trainer with the goal / HER settings and the
# hyper-parameters defined in the configuration cell.
env_fn = lambda: gym.make('Pendulum-v0')
exp_name_custom = 'her_td3_custom'

logger_kwargs = setup_logger_kwargs(
    exp_name=exp_name_custom,
    data_dir=data_dir,
    seed=seed,
)

trainer = TD3Trainer()
trainer.train_mod(
    env_fn,
    # goal-conditioned reward and goal sampling
    rew_fn=rew_fn,
    rew_eps=rew_eps,
    goal_fn=goal_fn,
    goal_mask=goal_mask,
    # HER goal selection
    her_strat=her_strat,
    her_k=her_k,
    # actor-critic class and its constructor arguments
    ac=MLPActorCriticTD3,
    ac_kwargs=ac_kwargs,
    seed=seed,
    # run length and replay buffer
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    buf_size=buf_size,
    max_ep_len=max_ep_len,
    # optimisation
    gamma=gamma,
    polyak=polyak,
    pi_lr=pi_lr,
    q_lr=q_lr,
    batch_size=batch_size,
    # exploration and update schedule
    start_steps=start_steps,
    learning_starts=learning_starts,
    update_every=update_every,
    policy_delay=policy_delay,
    # target noise
    target_noise=target_noise,
    noise_clip=noise_clip,
    # evaluation and logging
    num_test_episodes=num_test_episodes,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)



[32;1mLogging data to /home/sherif/user/python/DRL/data/her/her_td3_custom/her_td3_custom_s0/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_mod":	{
        "MLPActorCriticTD3(\n  (actor): MLPActorTD3(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=5, out_features=64, bias=True)\n      (activation_1): ReLU()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=True)\n      (activation_2): ReLU()\n      (output): Linear(in_features=64, out_features=1, bias=True)\n      (output_act): Tanh()\n    )\n    (net_target): Sequential(\n      (hidden_1): Linear(in_features=5, out_features=64, bias=True)\n      (activation_1): ReLU()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=True)\n      (activation_2): ReLU()\n      (output): Linear(in_features=64, out_features=1, bias=True)\n      (output_act): Tanh()\n    )\n  )\n  (critic_1): MLPCriticTD3(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=6, out_features=64, bias=True)\n      