# TD3 results on the Gymnasium Pendulum-v1 environment

In [6]:
from train_loops import *
import gymnasium as gym
import wandb 


## Hyperparameters used (wandb setup)

In [8]:
# Authenticate this session with Weights & Biases before the sweep is created.
wandb.login()
def _uniform(low, high):
    """Build a sweep parameter spec drawn uniformly from [low, high]."""
    return {'distribution': 'uniform', 'min': low, 'max': high}


def _fixed(value):
    """Build a sweep parameter spec that is held at a single constant value."""
    return {'value': value}


# Objective reported by the training loop that the sweep tries to maximise.
metric = {'name': 'Average return', 'goal': 'maximize'}

parameters = {
    # --- Hyperparameters explored by the sweep ---
    'lr_critic': _uniform(1e-5, 2e-3),
    'lr_actor': _uniform(1e-5, 5e-4),
    # NOTE(review): the lower bound of 0 allows tau values arbitrarily close to
    # zero; presumably tau is the target-network update rate — confirm that
    # near-zero values are intended.
    'tau': _uniform(0, 0.1),
    'sigma': _uniform(0.1, 1),
    'sigma_explore': _uniform(0.1, 1),
    'noise_clip': _uniform(0.1, 2),
    # --- Settings kept constant across all runs ---
    'gamma': _fixed(0.99),
    'policy_interval': _fixed(2),
    'buffer_size': _fixed(50000),
    'num_episodes': _fixed(100),
    'max_length_episode': _fixed(200),
    'batch_size': _fixed(256),
    # NOTE(review): the stored agent output in this notebook logs
    # warmup_amount: 100, so it was presumably produced with an earlier
    # version of this config — re-run to refresh the outputs.
    'warmup_amount': _fixed(10),
}

# Full sweep definition: search method, search space, and optimisation target.
sweep_config = {
    'method': 'bayes',
    'parameters': parameters,
    'metric': metric,
}
# Register the sweep in the "Pendulum-TD3" project; the returned id is what
# the agent cell below needs in order to pull run configurations.
sweep_id = wandb.sweep(sweep=sweep_config, project="Pendulum-TD3")

wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: senna-renting (senna-renting-radboud-university). Use `wandb login --relogin` to force relogin


Create sweep with ID: o35iimk6
Sweep URL: https://wandb.ai/senna-renting-radboud-university/Pendulum-TD3/sweeps/o35iimk6


## Activate wandb sweep agent 

In [None]:
# Start a sweep agent that calls `wandb_train_td3` for at most 5 runs of the
# sweep created above.
wandb.agent(sweep_id, function=wandb_train_td3, count=5)

wandb: Agent Starting Run: jkzee3tj with config:
wandb: 	batch_size: 256
wandb: 	buffer_size: 50000
wandb: 	gamma: 0.99
wandb: 	lr_actor: 0.0004755500563723337
wandb: 	lr_critic: 0.000561173550715831
wandb: 	max_length_episode: 200
wandb: 	noise_clip: 1.4676908416019383
wandb: 	num_episodes: 100
wandb: 	policy_interval: 2
wandb: 	sigma: 0.7973193538846707
wandb: 	sigma_explore: 0.765581788144996
wandb: 	tau: 0.07637355981110483
wandb: 	warmup_amount: 100


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Actor average loss,▁▂▃▄▅▆▇▇▇▇▇▇▇▇▇█▇██▇████████████████████
Average return,▄▄█▅▅▄▄▄▃▄▃▄▃▃▃▄▃▂▃▃▁▃▃▅▂▃▃▂▆▂▃▂▆▂▅▃▃▄▆▃
Epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇█████
Highest reward,████▇█▇▇▇▆▆▇▇▇▇▇▇▆▇▁▇▅▇▇▆▆▆▅▆▇▆█▇▆▆▆▆▆▇▆
Qnet1 average loss,▅▃▁▃▄▅▇▅▅▅▆▅▅▆▆▆▅▆▆▆▅▆▅▆▇▇▆▇▇▆▇█▇▇▇▇▇▇▇█
Qnet2 average loss,▁▅▃▄▆▆▅▅▆▅▆▆▅▆▅▅▅▆▅▆▆▅▆▅▆▆▇▇█▆▆▇▇▇▇▇█▇▇▆

0,1
Actor average loss,7.29138
Average return,-7.74478
Epoch,100.0
Highest reward,-0.96039
Qnet1 average loss,0.30314
Qnet2 average loss,0.29926


wandb: Agent Starting Run: iktdwvn8 with config:
wandb: 	batch_size: 256
wandb: 	buffer_size: 50000
wandb: 	gamma: 0.99
wandb: 	lr_actor: 8.563448642302808e-05
wandb: 	lr_critic: 0.0017036102254398638
wandb: 	max_length_episode: 200
wandb: 	noise_clip: 1.0738948935126578
wandb: 	num_episodes: 100
wandb: 	policy_interval: 2
wandb: 	sigma: 0.8935491535780713
wandb: 	sigma_explore: 0.16294230068778875
wandb: 	tau: 0.0023819010155006317
wandb: 	warmup_amount: 100


# Regular non-wandb run

In [None]:
# Stand-alone training run with hand-picked hyperparameters (no wandb sweep).
#
# The environment is created through the `gymnasium` module imported as `gym`
# in the first cell. The episode cap is passed to `gym.make` directly instead
# of wrapping with `TimeLimit` from `gym.wrappers`: that import statement
# resolves the legacy `gym` package (the `gym` alias does not apply to
# `from ... import`), which mixes two libraries or fails if legacy gym is not
# installed.
env = gym.make("Pendulum-v1", max_episode_steps=40)  # truncate episodes after 40 steps
num_episodes = 50
batch_size = 256
buffer_size = 50000
warmup_steps = 0
train_td3(env, num_episodes, batch_size, buffer_size, warmup_steps, policy_interval=2, gamma=0.96, lr_P=0.0056, lr_Q=0.0095, noise_clip=0.61, sigma=0.94, tau=0.45)

# Random run average return

In [10]:
# Baseline: average return of a random policy, for comparison with TD3.
# NOTE(review): the label below says "10 episodes" while 1000 is passed as the
# first argument to random_run — presumably that argument is the episode
# count; confirm and align the label with it.
random_avg_return = random_run(1000, seed=13, max_length_episodes=200)
print("Average return over 10 episodes: ", random_avg_return)

Average return over 10 episodes:  -6.176537942057198


# Test a model

In [4]:
# Identify the saved model by "<sweep_id>/<run_name>".
# NOTE(review): this sweep id differs from the one created earlier in this
# notebook, so it presumably points at a previously stored run — confirm it
# still exists.
SAVED_SWEEP_ID = "i4073qge"
SAVED_RUN_NAME = "floral-sweep-1"
wandb_code = f"{SAVED_SWEEP_ID}/{SAVED_RUN_NAME}"
wandb_test_saved_td3(wandb_code)

