In [1]:
import os

import gymnasium as gym
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from assignment3.training_loop import training_loop
from assignment3.models import PolicyNetwork, ValueNetwork
from assignment3.dim_alignment import ENV_ACT_DIM, max_output_dim, max_input_dim
from assignment3.optuna_search import OptunaSearch, StudyFloatParamRange
from assignment3.device import get_device

In [2]:
def generalized_actor_critic(
        env_name,
        input_dim,
        output_dim,
        hidden_sizes_theta,
        hidden_sizes_w,
        dropout_layers,
        alpha_theta=0.001,
        alpha_w=0.001,
        episodes=500,
        gamma=0.99,
        dropout_p=0.7,
        log_dir="runs/actor_critic"
):
    """
    Train a policy and value network using Actor-Critic, with padded inputs/outputs.

    Args:
        env_name: Gymnasium environment id. Passed to ``gym.make``, used as the
            key into ``ENV_ACT_DIM`` and appended to the TensorBoard log dir.
        input_dim: Input size of both networks; also forwarded to ``training_loop``.
        output_dim: Output size of the policy network.
        hidden_sizes_theta: Hidden layer sizes forwarded to ``PolicyNetwork``.
        hidden_sizes_w: Hidden layer sizes forwarded to ``ValueNetwork``.
        dropout_layers: Forwarded unchanged to ``PolicyNetwork``.
        alpha_theta: Adam learning rate for the policy network.
        alpha_w: Adam learning rate for the value network.
        episodes: Forwarded to ``training_loop``.
        gamma: Forwarded to ``training_loop``.
        dropout_p: Forwarded unchanged to ``PolicyNetwork``.
        log_dir: Prefix of the TensorBoard log directory; the directory actually
            used is ``f"{log_dir}_{env_name}"``.

    Returns:
        Tuple ``(policy_network, value_network, rewards_per_episode, train_time)``.
        ``rewards_per_episode`` is the list object handed to ``training_loop``
        (presumably filled in place there -- confirm in ``training_loop``), and
        ``train_time`` is whatever ``training_loop`` returns.

    Raises:
        KeyError: If ``env_name`` has no entry in ``ENV_ACT_DIM``.
    """
    device = get_device()
    env = gym.make(env_name)
    try:
        writer = SummaryWriter(log_dir=f"{log_dir}_{env_name}")
        try:
            policy_network = PolicyNetwork(
                input_dim, hidden_sizes_theta, output_dim, dropout_layers, dropout_p
            ).to(device)
            value_network = ValueNetwork(input_dim, hidden_sizes_w).to(device)

            policy_optimizer = optim.Adam(policy_network.parameters(), lr=alpha_theta)
            value_optimizer = optim.Adam(value_network.parameters(), lr=alpha_w)

            rewards_per_episode = []

            # Identify the actual dimensionalities for this env
            actual_act_dim = ENV_ACT_DIM[env_name]

            train_time = training_loop(
                input_dim=input_dim,
                actual_act_dim=actual_act_dim,
                policy_network=policy_network,
                value_network=value_network,
                policy_optimizer=policy_optimizer,
                value_optimizer=value_optimizer,
                env=env,
                env_name=env_name,
                episodes=episodes,
                gamma=gamma,
                writer=writer,
                rewards_per_episode=rewards_per_episode,
            )
        finally:
            # Always close the writer, even when training raises, so the
            # event-file handle is not left open across repeated trials.
            writer.close()
    finally:
        # Release the environment on every exit path as well.
        env.close()

    return policy_network, value_network, rewards_per_episode, train_time

In [3]:
# TODO: tune per environment -- give every env its own search ranges and
# parameter choices instead of sharing the ones defined below.

# Default budget used by run_experiment unless a call overrides it.
episodes = 2000
n_trials = 10
overall_results = {}

# Candidate hidden-layer layouts, kept as strings (this is also how they
# appear in the recorded Optuna trial parameters).
hidden_sizes_theta_values = [
    "[16, 32, 16]",
    "[32, 64, 32]",
]
hidden_sizes_w_values = [
    "[16, 32, 16]",
    "[32, 64, 32]",
]
dropout_layers = [1]

# Stepped float ranges searched for each scalar hyperparameter.
gamma_values = StudyFloatParamRange(
    low=0.95, high=0.99, step=0.01,
)
alpha_theta_values = StudyFloatParamRange(
    low=0.0005, high=0.0008, step=0.0001,
)
alpha_w_values = StudyFloatParamRange(
    low=0.0005, high=0.0008, step=0.0001,
)
dropout_p_values = StudyFloatParamRange(
    low=0.2, high=0.5, step=0.1,
)

In [4]:
def run_experiment(env_name,
                   hidden_sizes_theta_values=hidden_sizes_theta_values,
                   hidden_sizes_w_values=hidden_sizes_w_values,
                   dropout_layers=dropout_layers,
                   gamma_values=gamma_values,
                   alpha_theta_values=alpha_theta_values,
                   alpha_w_values=alpha_w_values,
                   dropout_p_values=dropout_p_values,
                   episodes=episodes,
                   n_trials=n_trials):
    """
    Run an Optuna hyperparameter search for one environment and save the result.

    Builds an ``OptunaSearch`` around ``generalized_actor_critic``, runs
    ``n_trials`` trials in a study named ``f"{env_name}_actor_critic_study"``,
    prints the best parameters and best value, and writes the state dicts of the
    returned networks to ``pretrained_models/{env_name}_policy.pth`` and
    ``pretrained_models/{env_name}_value.pth``.

    Args:
        env_name: Environment id forwarded to ``OptunaSearch``; also used in the
            study name and the saved file names.
        hidden_sizes_theta_values: Candidate policy-network layouts.
        hidden_sizes_w_values: Candidate value-network layouts.
        dropout_layers: Forwarded unchanged to ``OptunaSearch``.
        gamma_values: Search range for ``gamma``.
        alpha_theta_values: Search range for the policy learning rate.
        alpha_w_values: Search range for the value learning rate.
        dropout_p_values: Search range for the dropout probability.
        episodes: Forwarded to ``OptunaSearch``.
        n_trials: Number of Optuna trials to run.

    Returns:
        None. Results are reported through stdout and the saved ``.pth`` files.

    Note:
        The defaults are the module-level objects as they were when this
        function was defined; re-run this definition after changing them.
    """
    # Make sure the output directory exists *before* the long search starts;
    # otherwise torch.save fails on a missing directory only after all the
    # trials have already been paid for.
    os.makedirs("pretrained_models", exist_ok=True)

    optuna_search = OptunaSearch(
        train_function=generalized_actor_critic,
        env_name=env_name,
        max_input_dim=max_input_dim,
        max_output_dim=max_output_dim,
        hidden_sizes_theta_values=hidden_sizes_theta_values,
        hidden_sizes_w_values=hidden_sizes_w_values,
        dropout_layers=dropout_layers,
        gamma_values=gamma_values,
        alpha_theta_values=alpha_theta_values,
        alpha_w_values=alpha_w_values,
        dropout_p_values=dropout_p_values,
        episodes=episodes,
    )
    # The study object is returned as well but is not needed here.
    best_policy, best_value, best_params, best_reward, _study = optuna_search.optuna_search_for_env(
        n_trials=n_trials,
        study_name=f"{env_name}_actor_critic_study",
    )

    print("\nDone! Best parameters found by Optuna:", best_params)
    # NOTE(review): in the recorded runs this value equals the episode count at
    # which the env was reported solved (or the episode budget when it never
    # was), so it looks like episodes-to-solve rather than a reward -- confirm
    # against OptunaSearch before relabelling the message.
    print("Best reward from Optuna:", best_reward)

    # save networks to pretrained_models
    torch.save(best_policy.state_dict(), f"pretrained_models/{env_name}_policy.pth")
    torch.save(best_value.state_dict(), f"pretrained_models/{env_name}_value.pth")

In [5]:
# Start with CartPole-v1, overriding the default episode budget with 1000.
run_experiment(env_name="CartPole-v1", episodes=1000)

[I 2024-12-29 18:22:42,453] A new study created in memory with name: CartPole-v1_actor_critic_study



[OPTUNA Trial 0] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 64, 32],
         gamma=0.99, dropout_p=0.4,
         alpha_theta=0.0006000000000000001, alpha_w=0.0008


Training: 100%|██████████| 1000/1000 [01:48<00:00,  9.22episode/s, Avg Reward(100)=309.83]
[I 2024-12-29 18:24:31,452] Trial 0 finished with value: 1000.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.99, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0008, 'dropout_p': 0.4}. Best is trial 0 with value: 1000.0.



[OPTUNA Trial 1] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 64, 32],
         gamma=0.96, dropout_p=0.2,
         alpha_theta=0.0008, alpha_w=0.0006000000000000001


Training: 100%|██████████| 1000/1000 [00:46<00:00, 21.65episode/s, Avg Reward(100)=233.25]
[I 2024-12-29 18:25:17,652] Trial 1 finished with value: 1000.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.96, 'alpha_theta': 0.0008, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.2}. Best is trial 0 with value: 1000.0.



[OPTUNA Trial 2] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 64, 32],
         gamma=0.99, dropout_p=0.2,
         alpha_theta=0.0005, alpha_w=0.0006000000000000001


Training:  86%|████████▋ | 863/1000 [01:31<00:14,  9.39episode/s, Avg Reward(100)=223.36]
[I 2024-12-29 18:26:49,531] Trial 2 finished with value: 864.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.99, 'alpha_theta': 0.0005, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.2}. Best is trial 2 with value: 864.0.


Solved CartPole-v1 in 864 episodes!

[OPTUNA Trial 3] Env=CartPole-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.97, dropout_p=0.4,
         alpha_theta=0.0008, alpha_w=0.0007


Training: 100%|██████████| 1000/1000 [01:17<00:00, 12.84episode/s, Avg Reward(100)=175.56]
[I 2024-12-29 18:28:07,396] Trial 3 finished with value: 1000.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.97, 'alpha_theta': 0.0008, 'alpha_w': 0.0007, 'dropout_p': 0.4}. Best is trial 2 with value: 864.0.



[OPTUNA Trial 4] Env=CartPole-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.98, dropout_p=0.30000000000000004,
         alpha_theta=0.0006000000000000001, alpha_w=0.0008


Training: 100%|██████████| 1000/1000 [01:02<00:00, 15.96episode/s, Avg Reward(100)=386.59]
[I 2024-12-29 18:29:10,068] Trial 4 finished with value: 1000.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.98, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}. Best is trial 2 with value: 864.0.



[OPTUNA Trial 5] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 64, 32],
         gamma=0.97, dropout_p=0.30000000000000004,
         alpha_theta=0.0007, alpha_w=0.0005


Training:  95%|█████████▍| 947/1000 [01:31<00:05, 10.33episode/s, Avg Reward(100)=384.07]
[I 2024-12-29 18:30:41,744] Trial 5 finished with value: 948.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.97, 'alpha_theta': 0.0007, 'alpha_w': 0.0005, 'dropout_p': 0.30000000000000004}. Best is trial 2 with value: 864.0.


Solved CartPole-v1 in 948 episodes!

[OPTUNA Trial 6] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.96, dropout_p=0.30000000000000004,
         alpha_theta=0.0007, alpha_w=0.0008


Training: 100%|██████████| 1000/1000 [00:22<00:00, 44.76episode/s, Avg Reward(100)=9.47]  
[I 2024-12-29 18:31:04,092] Trial 6 finished with value: 1000.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.96, 'alpha_theta': 0.0007, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}. Best is trial 2 with value: 864.0.



[OPTUNA Trial 7] Env=CartPole-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.98, dropout_p=0.4,
         alpha_theta=0.0007, alpha_w=0.0008


Training:  88%|████████▊ | 884/1000 [01:38<00:12,  8.99episode/s, Avg Reward(100)=225.84]
[I 2024-12-29 18:32:42,416] Trial 7 finished with value: 885.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.98, 'alpha_theta': 0.0007, 'alpha_w': 0.0008, 'dropout_p': 0.4}. Best is trial 2 with value: 864.0.


Solved CartPole-v1 in 885 episodes!

[OPTUNA Trial 8] Env=CartPole-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[16, 32, 16],
         gamma=0.98, dropout_p=0.2,
         alpha_theta=0.0006000000000000001, alpha_w=0.0006000000000000001


Training: 100%|██████████| 1000/1000 [01:33<00:00, 10.69episode/s, Avg Reward(100)=364.54]
[I 2024-12-29 18:34:15,972] Trial 8 finished with value: 1000.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.98, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.2}. Best is trial 2 with value: 864.0.



[OPTUNA Trial 9] Env=CartPole-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.98, dropout_p=0.2,
         alpha_theta=0.0008, alpha_w=0.0006000000000000001


Training:  76%|███████▌  | 759/1000 [01:00<00:19, 12.60episode/s, Avg Reward(100)=366.82]
[I 2024-12-29 18:35:16,218] Trial 9 finished with value: 760.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.98, 'alpha_theta': 0.0008, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.2}. Best is trial 9 with value: 760.0.


Solved CartPole-v1 in 760 episodes!

[OPTUNA] Best trial: trail 9
  Value (Reward): 760.00
  Params: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.98, 'alpha_theta': 0.0008, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.2}


Training: 100%|██████████| 1000/1000 [01:05<00:00, 15.21episode/s, Avg Reward(100)=174.44]


Total Optuna search time for CartPole-v1: 819.50s

Done! Best parameters found by Optuna: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.98, 'alpha_theta': 0.0008, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.2}
Best reward from Optuna: 760.0





In [6]:
# Same search on Acrobot-v1 with episodes=500.
run_experiment(env_name="Acrobot-v1", episodes=500)

[I 2024-12-29 18:36:21,985] A new study created in memory with name: Acrobot-v1_actor_critic_study



[OPTUNA Trial 0] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.99, dropout_p=0.30000000000000004,
         alpha_theta=0.0006000000000000001, alpha_w=0.0008


Training:  20%|██        | 100/500 [00:05<00:20, 19.48episode/s, Avg Reward(100)=-101.64]
[I 2024-12-29 18:36:27,128] Trial 0 finished with value: 101.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.99, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 101 episodes!

[OPTUNA Trial 1] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.96, dropout_p=0.4,
         alpha_theta=0.0006000000000000001, alpha_w=0.0007


Training:  21%|██▏       | 107/500 [00:06<00:23, 16.88episode/s, Avg Reward(100)=-118.61]
[I 2024-12-29 18:36:33,471] Trial 1 finished with value: 108.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.96, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0007, 'dropout_p': 0.4}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 108 episodes!

[OPTUNA Trial 2] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.98, dropout_p=0.2,
         alpha_theta=0.0006000000000000001, alpha_w=0.0005


Training: 100%|██████████| 500/500 [02:01<00:00,  4.10episode/s, Avg Reward(100)=-500.00]
[I 2024-12-29 18:38:35,313] Trial 2 finished with value: 500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.98, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0005, 'dropout_p': 0.2}. Best is trial 0 with value: 101.0.



[OPTUNA Trial 3] Env=Acrobot-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.96, dropout_p=0.2,
         alpha_theta=0.0006000000000000001, alpha_w=0.0006000000000000001


Training:  47%|████▋     | 235/500 [00:23<00:26,  9.94episode/s, Avg Reward(100)=-140.40]
[I 2024-12-29 18:38:58,959] Trial 3 finished with value: 236.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.96, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.2}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 236 episodes!

[OPTUNA Trial 4] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.95, dropout_p=0.5,
         alpha_theta=0.0008, alpha_w=0.0008


Training: 100%|██████████| 500/500 [02:02<00:00,  4.09episode/s, Avg Reward(100)=-500.00]
[I 2024-12-29 18:41:01,128] Trial 4 finished with value: 500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.95, 'alpha_theta': 0.0008, 'alpha_w': 0.0008, 'dropout_p': 0.5}. Best is trial 0 with value: 101.0.



[OPTUNA Trial 5] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 64, 32],
         gamma=0.98, dropout_p=0.2,
         alpha_theta=0.0007, alpha_w=0.0006000000000000001


Training:  21%|██        | 106/500 [00:06<00:25, 15.49episode/s, Avg Reward(100)=-128.25]
[I 2024-12-29 18:41:07,977] Trial 5 finished with value: 107.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.98, 'alpha_theta': 0.0007, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.2}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 107 episodes!

[OPTUNA Trial 6] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.98, dropout_p=0.30000000000000004,
         alpha_theta=0.0005, alpha_w=0.0007


Training:  49%|████▉     | 245/500 [00:38<00:40,  6.36episode/s, Avg Reward(100)=-254.41]
[I 2024-12-29 18:41:46,503] Trial 6 finished with value: 246.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.98, 'alpha_theta': 0.0005, 'alpha_w': 0.0007, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 246 episodes!

[OPTUNA Trial 7] Env=Acrobot-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.95, dropout_p=0.30000000000000004,
         alpha_theta=0.0008, alpha_w=0.0005


Training: 100%|██████████| 500/500 [01:59<00:00,  4.17episode/s, Avg Reward(100)=-500.00]
[I 2024-12-29 18:43:46,504] Trial 7 finished with value: 500.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.95, 'alpha_theta': 0.0008, 'alpha_w': 0.0005, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 101.0.



[OPTUNA Trial 8] Env=Acrobot-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[16, 32, 16],
         gamma=0.98, dropout_p=0.2,
         alpha_theta=0.0008, alpha_w=0.0007


Training:  24%|██▎       | 118/500 [00:07<00:23, 16.09episode/s, Avg Reward(100)=-133.23]
[I 2024-12-29 18:43:53,841] Trial 8 finished with value: 119.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.98, 'alpha_theta': 0.0008, 'alpha_w': 0.0007, 'dropout_p': 0.2}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 119 episodes!

[OPTUNA Trial 9] Env=Acrobot-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.99, dropout_p=0.2,
         alpha_theta=0.0007, alpha_w=0.0008


Training:  21%|██        | 105/500 [00:05<00:22, 17.68episode/s, Avg Reward(100)=-115.45]
[I 2024-12-29 18:43:59,781] Trial 9 finished with value: 106.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.99, 'alpha_theta': 0.0007, 'alpha_w': 0.0008, 'dropout_p': 0.2}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 106 episodes!

[OPTUNA] Best trial: trail 0
  Value (Reward): 101.00
  Params: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.99, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}


Training:  37%|███▋      | 185/500 [00:12<00:21, 14.71episode/s, Avg Reward(100)=-167.91]

Solved Acrobot-v1 in 186 episodes!

Total Optuna search time for Acrobot-v1: 470.38s

Done! Best parameters found by Optuna: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.99, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}
Best reward from Optuna: 101.0





In [7]:
# MountainCarContinuous-v0: episodes=1500 and its own candidate layouts for
# both networks instead of the shared defaults.
run_experiment(
    env_name="MountainCarContinuous-v0",
    episodes=1500,
    hidden_sizes_theta_values=["[32, 64, 32]", "[32, 128, 32]"],
    hidden_sizes_w_values=["[32, 64, 32]", "[32, 128, 32]"],
)


[I 2024-12-29 18:44:12,391] A new study created in memory with name: MountainCarContinuous-v0_actor_critic_study



[OPTUNA Trial 0] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[32, 128, 32], hidden_sizes_w=[32, 128, 32],
         gamma=0.99, dropout_p=0.4,
         alpha_theta=0.0007, alpha_w=0.0007


Training: 100%|██████████| 1500/1500 [13:23<00:00,  1.87episode/s, Avg Reward(100)=-19.80]
[I 2024-12-29 18:57:35,889] Trial 0 finished with value: 1500.0 and parameters: {'hidden_sizes_theta': '[32, 128, 32]', 'hidden_sizes_w': '[32, 128, 32]', 'gamma': 0.99, 'alpha_theta': 0.0007, 'alpha_w': 0.0007, 'dropout_p': 0.4}. Best is trial 0 with value: 1500.0.



[OPTUNA Trial 1] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 128, 32],
         gamma=0.95, dropout_p=0.4,
         alpha_theta=0.0006000000000000001, alpha_w=0.0007


Training: 100%|██████████| 1500/1500 [13:29<00:00,  1.85episode/s, Avg Reward(100)=-24.83]
[I 2024-12-29 19:11:05,422] Trial 1 finished with value: 1500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 128, 32]', 'gamma': 0.95, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0007, 'dropout_p': 0.4}. Best is trial 0 with value: 1500.0.



[OPTUNA Trial 2] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 64, 32],
         gamma=0.98, dropout_p=0.5,
         alpha_theta=0.0007, alpha_w=0.0007


Training: 100%|██████████| 1500/1500 [10:33<00:00,  2.37episode/s, Avg Reward(100)=-20.36]
[I 2024-12-29 19:21:38,555] Trial 2 finished with value: 1500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.98, 'alpha_theta': 0.0007, 'alpha_w': 0.0007, 'dropout_p': 0.5}. Best is trial 0 with value: 1500.0.



[OPTUNA Trial 3] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 128, 32],
         gamma=0.98, dropout_p=0.4,
         alpha_theta=0.0007, alpha_w=0.0006000000000000001


Training: 100%|██████████| 1500/1500 [13:16<00:00,  1.88episode/s, Avg Reward(100)=-19.47]
[I 2024-12-29 19:34:55,406] Trial 3 finished with value: 1500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 128, 32]', 'gamma': 0.98, 'alpha_theta': 0.0007, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.4}. Best is trial 0 with value: 1500.0.



[OPTUNA Trial 4] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 64, 32],
         gamma=0.96, dropout_p=0.2,
         alpha_theta=0.0007, alpha_w=0.0007


Training: 100%|██████████| 1500/1500 [10:28<00:00,  2.39episode/s, Avg Reward(100)=-22.38]
[I 2024-12-29 19:45:24,148] Trial 4 finished with value: 1500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.96, 'alpha_theta': 0.0007, 'alpha_w': 0.0007, 'dropout_p': 0.2}. Best is trial 0 with value: 1500.0.



[OPTUNA Trial 5] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[32, 128, 32], hidden_sizes_w=[32, 64, 32],
         gamma=0.98, dropout_p=0.30000000000000004,
         alpha_theta=0.0005, alpha_w=0.0008


Training: 100%|██████████| 1500/1500 [13:01<00:00,  1.92episode/s, Avg Reward(100)=-21.71]
[I 2024-12-29 19:58:25,197] Trial 5 finished with value: 1500.0 and parameters: {'hidden_sizes_theta': '[32, 128, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.98, 'alpha_theta': 0.0005, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 1500.0.



[OPTUNA Trial 6] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[32, 128, 32], hidden_sizes_w=[32, 128, 32],
         gamma=0.98, dropout_p=0.4,
         alpha_theta=0.0008, alpha_w=0.0008


Training: 100%|██████████| 1500/1500 [13:34<00:00,  1.84episode/s, Avg Reward(100)=-20.12]
[I 2024-12-29 20:11:59,960] Trial 6 finished with value: 1500.0 and parameters: {'hidden_sizes_theta': '[32, 128, 32]', 'hidden_sizes_w': '[32, 128, 32]', 'gamma': 0.98, 'alpha_theta': 0.0008, 'alpha_w': 0.0008, 'dropout_p': 0.4}. Best is trial 0 with value: 1500.0.



[OPTUNA Trial 7] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[32, 128, 32], hidden_sizes_w=[32, 64, 32],
         gamma=0.97, dropout_p=0.30000000000000004,
         alpha_theta=0.0005, alpha_w=0.0007


Training: 100%|██████████| 1500/1500 [13:13<00:00,  1.89episode/s, Avg Reward(100)=-20.87]
[I 2024-12-29 20:25:13,520] Trial 7 finished with value: 1500.0 and parameters: {'hidden_sizes_theta': '[32, 128, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.97, 'alpha_theta': 0.0005, 'alpha_w': 0.0007, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 1500.0.



[OPTUNA Trial 8] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 128, 32],
         gamma=0.96, dropout_p=0.2,
         alpha_theta=0.0005, alpha_w=0.0005


Training: 100%|██████████| 1500/1500 [13:14<00:00,  1.89episode/s, Avg Reward(100)=-22.89]
[I 2024-12-29 20:38:27,749] Trial 8 finished with value: 1500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 128, 32]', 'gamma': 0.96, 'alpha_theta': 0.0005, 'alpha_w': 0.0005, 'dropout_p': 0.2}. Best is trial 0 with value: 1500.0.



[OPTUNA Trial 9] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[32, 128, 32], hidden_sizes_w=[32, 128, 32],
         gamma=0.95, dropout_p=0.2,
         alpha_theta=0.0005, alpha_w=0.0008


Training: 100%|██████████| 1500/1500 [13:35<00:00,  1.84episode/s, Avg Reward(100)=-23.83]
[I 2024-12-29 20:52:02,837] Trial 9 finished with value: 1500.0 and parameters: {'hidden_sizes_theta': '[32, 128, 32]', 'hidden_sizes_w': '[32, 128, 32]', 'gamma': 0.95, 'alpha_theta': 0.0005, 'alpha_w': 0.0008, 'dropout_p': 0.2}. Best is trial 0 with value: 1500.0.



[OPTUNA] Best trial: trail 0
  Value (Reward): 1500.00
  Params: {'hidden_sizes_theta': '[32, 128, 32]', 'hidden_sizes_w': '[32, 128, 32]', 'gamma': 0.99, 'alpha_theta': 0.0007, 'alpha_w': 0.0007, 'dropout_p': 0.4}


Training: 100%|██████████| 1500/1500 [13:30<00:00,  1.85episode/s, Avg Reward(100)=-20.06]


Total Optuna search time for MountainCarContinuous-v0: 8480.48s

Done! Best parameters found by Optuna: {'hidden_sizes_theta': '[32, 128, 32]', 'hidden_sizes_w': '[32, 128, 32]', 'gamma': 0.99, 'alpha_theta': 0.0007, 'alpha_w': 0.0007, 'dropout_p': 0.4}
Best reward from Optuna: 1500.0



