In [1]:
import gymnasium as gym
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from training_loop import training_loop
from models import PolicyNetwork, ValueNetwork
from dim_alignment import ENV_ACT_DIM, max_output_dim, max_input_dim
from optuna_search import OptunaSearch
from hyper_params import StudyFloatParamRange, HyperParamsRanges, HyperParams
from device import get_device
from action_selector import ActionSelector

In [2]:
def generalized_actor_critic(
        env_name,
        input_dim,
        output_dim,
        episodes,
        hyper_params: HyperParams,
        log_dir="runs/actor_critic"
):
    """
    Train a policy and value network using Actor-Critic, with padded inputs/outputs.

    Creates the environment, a TensorBoard writer, both networks and one Adam
    optimizer per network, then delegates the training itself to
    ``training_loop``. The writer and the environment are closed on every exit
    path, including when setup or training raises.

    Args:
        env_name: Environment id passed to ``gym.make``. Also used as the key
            into ``ENV_ACT_DIM`` and as the suffix of the TensorBoard log
            directory.
        input_dim: Input size given to both networks (presumably the padded
            observation size shared across environments -- confirm with the
            caller).
        output_dim: Output size given to the policy network (presumably the
            padded action size -- confirm with the caller).
        episodes: Forwarded unchanged to ``training_loop``.
        hyper_params: Provides ``hidden_sizes_theta``, ``hidden_sizes_w``,
            ``alpha_theta``, ``alpha_w`` and ``gamma``.
        log_dir: Prefix of the TensorBoard log directory; the writer logs to
            ``f"{log_dir}_{env_name}"``.

    Returns:
        A tuple ``(policy_network, value_network, rewards_per_episode,
        train_time)``. ``rewards_per_episode`` is the list handed to
        ``training_loop`` (presumably filled in place there) and
        ``train_time`` is whatever ``training_loop`` returns.

    Raises:
        Any exception raised by ``gym.make``, the ``ENV_ACT_DIM`` lookup, the
        model/optimizer constructors or ``training_loop`` propagates unchanged
        after the resources have been released.
    """
    device = get_device()
    env = gym.make(env_name)
    try:
        writer = SummaryWriter(log_dir=f"{log_dir}_{env_name}")
        try:
            policy_network = PolicyNetwork(input_dim, hyper_params.hidden_sizes_theta, output_dim).to(device)
            value_network = ValueNetwork(input_dim, hyper_params.hidden_sizes_w).to(device)

            # Separate optimizers so actor and critic can use different learning rates.
            policy_optimizer = optim.Adam(policy_network.parameters(), lr=hyper_params.alpha_theta)
            value_optimizer = optim.Adam(value_network.parameters(), lr=hyper_params.alpha_w)

            action_selector = ActionSelector()

            rewards_per_episode = []

            # Identify the actual dimensionality for this env
            actual_act_dim = ENV_ACT_DIM[env_name]

            train_time = training_loop(
                input_dim=input_dim,
                actual_act_dim=actual_act_dim,
                policy_network=policy_network,
                value_network=value_network,
                policy_optimizer=policy_optimizer,
                value_optimizer=value_optimizer,
                env=env,
                env_name=env_name,
                episodes=episodes,
                gamma=hyper_params.gamma,
                writer=writer,
                rewards_per_episode=rewards_per_episode,
                action_selector=action_selector
            )
        finally:
            # Close the writer even if training failed or was interrupted.
            writer.close()
    finally:
        # Closed after the writer, matching the original shutdown order.
        env.close()

    return policy_network, value_network, rewards_per_episode, train_time

In [6]:
# Default budget used by run_experiment when the caller does not override it.
episodes = 2000
n_trials = 10

# Hidden-layer layouts offered to the search, kept as strings exactly as they
# are handed to HyperParamsRanges.
_hidden_layer_choices = ("[16, 32, 16]", "[32, 64, 32]")

# The policy and value learning rates are searched over the same grid.
_learning_rate_grid = dict(low=0.0005, high=0.0008, step=0.0001)

# Common hyperparameters for all environments.
# Each argument gets its own freshly built list / range object, so nothing is
# shared between the theta (policy) and w (value) settings.
hyper_params_default_ranges = HyperParamsRanges(
    hidden_sizes_theta_values=list(_hidden_layer_choices),
    hidden_sizes_w_values=list(_hidden_layer_choices),
    alpha_theta_values=StudyFloatParamRange(**_learning_rate_grid),
    alpha_w_values=StudyFloatParamRange(**_learning_rate_grid),
    gamma_values=StudyFloatParamRange(low=0.95, high=0.99, step=0.01),
)

In [7]:
def run_experiment(env_name,
                   episodes=episodes,
                   hyper_params_ranges=hyper_params_default_ranges,
                   n_trials=n_trials):
    """
    Run an Optuna hyperparameter search for one environment and print the winner.

    Args:
        env_name: Environment id; also used to build the study name
            ``f"{env_name}_actor_critic_study"``.
        episodes: Forwarded to ``OptunaSearch`` as ``episodes``.
        hyper_params_ranges: Search space forwarded to ``OptunaSearch``.
        n_trials: Number of trials requested from ``optuna_search_for_env``.

    Returns:
        None. The best parameters and best reward are only printed; the other
        values returned by the search are discarded.
    """
    searcher = OptunaSearch(
        train_function=generalized_actor_critic,
        env_name=env_name,
        max_input_dim=max_input_dim,
        max_output_dim=max_output_dim,
        episodes=episodes,
        hyper_params_ranges=hyper_params_ranges,
    )

    study_label = f"{env_name}_actor_critic_study"
    search_outcome = searcher.optuna_search_for_env(n_trials=n_trials, study_name=study_label)

    # The search yields five values; only the parameters and reward are reported.
    _policy, _value, winning_params, winning_reward, _study = search_outcome

    print("\nDone! Best parameters found by Optuna:", winning_params)
    print("Best reward from Optuna:", winning_reward)

In [8]:
# Launch the search on CartPole-v1: 1000 episodes per training run, with the
# default number of Optuna trials (n_trials).
run_experiment("CartPole-v1", episodes=1000)

[I 2025-01-04 14:20:42,999] A new study created in memory with name: CartPole-v1_actor_critic_study



[OPTUNA Trial 0] Env=CartPole-v1
hidden_sizes_theta=[32, 64, 32]  |  hidden_sizes_w=[16, 32, 16]
        gamma=0.9900
        alpha_theta=0.0008  |  alpha_w=0.0007


Training: 100%|██████████| 1000/1000 [06:25<00:00,  2.59episode/s, Avg Reward(100)=345.58]
[I 2025-01-04 14:27:08,618] Trial 0 finished with value: 345.58 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0008, 'alpha_w': 0.0007, 'gamma': 0.99}. Best is trial 0 with value: 345.58.



[OPTUNA Trial 1] Env=CartPole-v1
hidden_sizes_theta=[16, 32, 16]  |  hidden_sizes_w=[32, 64, 32]
        gamma=0.9700
        alpha_theta=0.0005  |  alpha_w=0.0005


Training: 100%|██████████| 1000/1000 [06:35<00:00,  2.53episode/s, Avg Reward(100)=287.83]
[I 2025-01-04 14:33:43,998] Trial 1 finished with value: 287.83 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'alpha_theta': 0.0005, 'alpha_w': 0.0005, 'gamma': 0.97}. Best is trial 0 with value: 345.58.



[OPTUNA Trial 2] Env=CartPole-v1
hidden_sizes_theta=[32, 64, 32]  |  hidden_sizes_w=[16, 32, 16]
        gamma=0.9600
        alpha_theta=0.0007  |  alpha_w=0.0008


Training:  50%|█████     | 502/1000 [03:56<03:54,  2.13episode/s, Avg Reward(100)=471.44]
[I 2025-01-04 14:37:40,060] Trial 2 finished with value: 475.35 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0007, 'alpha_w': 0.0008, 'gamma': 0.96}. Best is trial 2 with value: 475.35.


Solved CartPole-v1 in 503 episodes!

[OPTUNA Trial 3] Env=CartPole-v1
hidden_sizes_theta=[16, 32, 16]  |  hidden_sizes_w=[16, 32, 16]
        gamma=0.9900
        alpha_theta=0.0007  |  alpha_w=0.0008


Training: 100%|██████████| 1000/1000 [09:08<00:00,  1.82episode/s, Avg Reward(100)=419.57]
[I 2025-01-04 14:46:48,689] Trial 3 finished with value: 419.57 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0007, 'alpha_w': 0.0008, 'gamma': 0.99}. Best is trial 2 with value: 475.35.



[OPTUNA Trial 4] Env=CartPole-v1
hidden_sizes_theta=[32, 64, 32]  |  hidden_sizes_w=[16, 32, 16]
        gamma=0.9600
        alpha_theta=0.0008  |  alpha_w=0.0005


Training: 100%|██████████| 1000/1000 [00:27<00:00, 35.87episode/s, Avg Reward(100)=9.45]
[I 2025-01-04 14:47:16,572] Trial 4 finished with value: 9.45 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0008, 'alpha_w': 0.0005, 'gamma': 0.96}. Best is trial 2 with value: 475.35.



[OPTUNA Trial 5] Env=CartPole-v1
hidden_sizes_theta=[16, 32, 16]  |  hidden_sizes_w=[32, 64, 32]
        gamma=0.9500
        alpha_theta=0.0007  |  alpha_w=0.0006


Training:  75%|███████▍  | 748/1000 [06:24<02:09,  1.94episode/s, Avg Reward(100)=418.21]
[I 2025-01-04 14:53:41,415] Trial 5 finished with value: 475.83 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'alpha_theta': 0.0007, 'alpha_w': 0.0006000000000000001, 'gamma': 0.95}. Best is trial 5 with value: 475.83.


Solved CartPole-v1 in 749 episodes!

[OPTUNA Trial 6] Env=CartPole-v1
hidden_sizes_theta=[16, 32, 16]  |  hidden_sizes_w=[32, 64, 32]
        gamma=0.9800
        alpha_theta=0.0005  |  alpha_w=0.0007


Training: 100%|██████████| 1000/1000 [09:44<00:00,  1.71episode/s, Avg Reward(100)=407.74]
[I 2025-01-04 15:03:26,075] Trial 6 finished with value: 407.74 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'alpha_theta': 0.0005, 'alpha_w': 0.0007, 'gamma': 0.98}. Best is trial 5 with value: 475.83.



[OPTUNA Trial 7] Env=CartPole-v1
hidden_sizes_theta=[32, 64, 32]  |  hidden_sizes_w=[32, 64, 32]
        gamma=0.9800
        alpha_theta=0.0007  |  alpha_w=0.0008


Training: 100%|██████████| 1000/1000 [01:36<00:00, 10.35episode/s, Avg Reward(100)=24.60]
[I 2025-01-04 15:05:02,717] Trial 7 finished with value: 24.6 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'alpha_theta': 0.0007, 'alpha_w': 0.0008, 'gamma': 0.98}. Best is trial 5 with value: 475.83.



[OPTUNA Trial 8] Env=CartPole-v1
hidden_sizes_theta=[32, 64, 32]  |  hidden_sizes_w=[16, 32, 16]
        gamma=0.9600
        alpha_theta=0.0008  |  alpha_w=0.0008


Training: 100%|██████████| 1000/1000 [03:43<00:00,  4.48episode/s, Avg Reward(100)=9.35] 
[I 2025-01-04 15:08:45,775] Trial 8 finished with value: 9.35 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0008, 'alpha_w': 0.0008, 'gamma': 0.96}. Best is trial 5 with value: 475.83.



[OPTUNA Trial 9] Env=CartPole-v1
hidden_sizes_theta=[16, 32, 16]  |  hidden_sizes_w=[16, 32, 16]
        gamma=0.9500
        alpha_theta=0.0007  |  alpha_w=0.0007


Training: 100%|██████████| 1000/1000 [01:49<00:00,  9.11episode/s, Avg Reward(100)=20.62]
[I 2025-01-04 15:10:35,515] Trial 9 finished with value: 20.62 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0007, 'alpha_w': 0.0007, 'gamma': 0.95}. Best is trial 5 with value: 475.83.



[OPTUNA] Best trial: trail 5
  Value (Reward): 475.83
  Params: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'alpha_theta': 0.0007, 'alpha_w': 0.0006000000000000001, 'gamma': 0.95}


Training:  68%|██████▊   | 680/1000 [04:58<02:20,  2.28episode/s, Avg Reward(100)=269.32]

Solved CartPole-v1 in 681 episodes!

Total Optuna search time for CartPole-v1: 3291.04s

Done! Best parameters found by Optuna: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'alpha_theta': 0.0007, 'alpha_w': 0.0006000000000000001, 'gamma': 0.95}
Best reward from Optuna: 475.83





In [11]:
# Run a smaller search on Acrobot-v1: 500 episodes per training run, 3 Optuna trials.
run_experiment("Acrobot-v1", episodes=500, n_trials=3)

[I 2025-01-04 15:25:51,069] A new study created in memory with name: Acrobot-v1_actor_critic_study



[OPTUNA Trial 0] Env=Acrobot-v1
hidden_sizes_theta=[32, 64, 32]  |  hidden_sizes_w=[16, 32, 16]
        gamma=0.9500
        alpha_theta=0.0007  |  alpha_w=0.0005


Training: 100%|██████████| 500/500 [14:48<00:00,  1.78s/episode, Avg Reward(100)=-500.00]
[I 2025-01-04 15:40:39,712] Trial 0 finished with value: -500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0007, 'alpha_w': 0.0005, 'gamma': 0.95}. Best is trial 0 with value: -500.0.



[OPTUNA Trial 1] Env=Acrobot-v1
hidden_sizes_theta=[32, 64, 32]  |  hidden_sizes_w=[16, 32, 16]
        gamma=0.9800
        alpha_theta=0.0007  |  alpha_w=0.0006


Training:  21%|██        | 105/500 [00:52<03:17,  2.00episode/s, Avg Reward(100)=-124.69]
[I 2025-01-04 15:41:32,283] Trial 1 finished with value: -99.14 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0007, 'alpha_w': 0.0006000000000000001, 'gamma': 0.98}. Best is trial 1 with value: -99.14.


Solved Acrobot-v1 in 106 episodes!

[OPTUNA Trial 2] Env=Acrobot-v1
hidden_sizes_theta=[32, 64, 32]  |  hidden_sizes_w=[16, 32, 16]
        gamma=0.9700
        alpha_theta=0.0007  |  alpha_w=0.0006


Training: 100%|██████████| 500/500 [19:46<00:00,  2.37s/episode, Avg Reward(100)=-500.00]
[I 2025-01-04 16:01:18,958] Trial 2 finished with value: -500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0007, 'alpha_w': 0.0006000000000000001, 'gamma': 0.97}. Best is trial 1 with value: -99.14.



[OPTUNA] Best trial: trail 1
  Value (Reward): -99.14
  Params: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0007, 'alpha_w': 0.0006000000000000001, 'gamma': 0.98}


Training: 100%|██████████| 500/500 [13:18<00:00,  1.60s/episode, Avg Reward(100)=-500.00]


Total Optuna search time for Acrobot-v1: 2926.05s

Done! Best parameters found by Optuna: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0007, 'alpha_w': 0.0006000000000000001, 'gamma': 0.98}
Best reward from Optuna: -99.14



