In [1]:
import os

import gymnasium as gym
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from assignment3.training_loop import training_loop
from assignment3.models import PolicyNetwork, ValueNetwork
from assignment3.dim_alignment import ENV_ACT_DIM, max_output_dim, max_input_dim
from assignment3.optuna_search import OptunaSearch
from assignment3.hyper_params import StudyFloatParamRange, HyperParamsRanges, HyperParams
from assignment3.device import get_device
from assignment3.action_selector import ContinuousActionSelector, ActionSelector

In [2]:
def generalized_actor_critic(
        env_name,
        input_dim,
        output_dim,
        dropout_layers,
        episodes,
        hyper_params: HyperParams,
        log_dir="runs/actor_critic"
):
    """
    Train a policy and value network using Actor-Critic, with padded inputs/outputs.

    Args:
        env_name: Gymnasium environment id; also used as a key into ``ENV_ACT_DIM``
            and as a suffix of the TensorBoard log directory.
        input_dim: Input size given to both the policy and the value network.
        output_dim: Output size given to the policy network.
        dropout_layers: Forwarded unchanged to ``PolicyNetwork``.
        episodes: Episode budget forwarded to ``training_loop``.
        hyper_params: Hidden sizes, learning rates, gamma and dropout probability.
            The ``epsilon``, ``epsilon_decay``, ``min_noise_std`` and
            ``max_noise_std`` fields are only read for ``MountainCarContinuous-v0``.
        log_dir: Prefix of the TensorBoard log directory; the final directory is
            ``f"{log_dir}_{env_name}"``.

    Returns:
        Tuple ``(policy_network, value_network, rewards_per_episode, train_time)``.
        ``rewards_per_episode`` is the list handed to ``training_loop``
        (presumably filled in there - confirm in training_loop) and ``train_time``
        is whatever ``training_loop`` returns.

    Raises:
        KeyError: If ``env_name`` has no entry in ``ENV_ACT_DIM``.
    """
    device = get_device()
    env = gym.make(env_name)
    try:
        writer = SummaryWriter(log_dir=f"{log_dir}_{env_name}")
        try:
            policy_network = PolicyNetwork(
                input_dim,
                hyper_params.hidden_sizes_theta,
                output_dim,
                dropout_layers,
                hyper_params.dropout_p,
            ).to(device)
            value_network = ValueNetwork(input_dim, hyper_params.hidden_sizes_w).to(device)

            policy_optimizer = optim.Adam(policy_network.parameters(), lr=hyper_params.alpha_theta)
            value_optimizer = optim.Adam(value_network.parameters(), lr=hyper_params.alpha_w)

            # Only the continuous MountainCar task gets the noise/epsilon-driven selector;
            # every other environment uses the default selector.
            if env_name == "MountainCarContinuous-v0":
                action_selector = ContinuousActionSelector(
                    epsilon=hyper_params.epsilon,
                    epsilon_decay=hyper_params.epsilon_decay,
                    min_noise_std=hyper_params.min_noise_std,
                    max_noise_std=hyper_params.max_noise_std
                )
            else:
                action_selector = ActionSelector()

            rewards_per_episode = []

            # Identify the actual dimensionalities for this env
            actual_act_dim = ENV_ACT_DIM[env_name]

            train_time = training_loop(
                input_dim=input_dim,
                actual_act_dim=actual_act_dim,
                policy_network=policy_network,
                value_network=value_network,
                policy_optimizer=policy_optimizer,
                value_optimizer=value_optimizer,
                env=env,
                env_name=env_name,
                episodes=episodes,
                gamma=hyper_params.gamma,
                writer=writer,
                rewards_per_episode=rewards_per_episode,
                action_selector=action_selector
            )
        finally:
            # Close the writer even when training is interrupted or fails, so an
            # aborted trial does not leave the event writer open.
            writer.close()
    finally:
        # Release the environment on every exit path (success, error, Ctrl-C).
        env.close()

    return policy_network, value_network, rewards_per_episode, train_time

In [3]:
# Default search budget; individual run_experiment calls may override these.
episodes = 2000
n_trials = 10

# Forwarded to PolicyNetwork as its `dropout_layers` argument
# (presumably the indices of the layers that get dropout - confirm in models.py).
dropout_layers = [1]

# Common hyperparameters for all environments.
# Hidden-size candidates are written as strings, exactly as they appear in the Optuna logs.
hyper_params_default_ranges = HyperParamsRanges(
    # Network architectures (policy = theta, value = w)
    hidden_sizes_theta_values=["[16, 32, 16]", "[32, 64, 32]"],
    hidden_sizes_w_values=["[16, 32, 16]", "[32, 64, 32]"],
    # Learning rates
    alpha_theta_values=StudyFloatParamRange(low=5e-4, high=8e-4, step=1e-4),
    alpha_w_values=StudyFloatParamRange(low=5e-4, high=8e-4, step=1e-4),
    # Discount factor and dropout probability
    gamma_values=StudyFloatParamRange(low=0.95, high=0.99, step=0.01),
    dropout_p_values=StudyFloatParamRange(low=0.2, high=0.5, step=0.1),
)

In [4]:
def run_experiment(env_name,
                   dropout_layers=dropout_layers,
                   episodes=episodes,
                   hyper_params_ranges=hyper_params_default_ranges,
                   n_trials=n_trials):
    """
    Run an Optuna hyperparameter search for one environment and save the best networks.

    Args:
        env_name: Gymnasium environment id; also used in the study name and in the
            file names of the saved weights.
        dropout_layers: Forwarded to ``OptunaSearch``.
        episodes: Episode budget forwarded to ``OptunaSearch``.
        hyper_params_ranges: Search space forwarded to ``OptunaSearch``.
        n_trials: Number of Optuna trials to run.

    Side effects:
        Prints the best parameters and best objective value, creates the
        ``pretrained_models`` directory if it is missing, and writes
        ``pretrained_models/{env_name}_policy.pth`` and
        ``pretrained_models/{env_name}_value.pth``.

    Note:
        The defaults are bound when this cell is executed; re-run this cell after
        changing the module-level ``episodes`` / ``n_trials`` / ranges for the new
        values to take effect.
    """
    # Create the output directory before the search: a missing or unwritable
    # directory should fail immediately, not after minutes of training.
    models_dir = "pretrained_models"
    os.makedirs(models_dir, exist_ok=True)

    optuna_search = OptunaSearch(
        train_function=generalized_actor_critic,
        env_name=env_name,
        max_input_dim=max_input_dim,
        max_output_dim=max_output_dim,
        dropout_layers=dropout_layers,
        episodes=episodes,
        hyper_params_ranges=hyper_params_ranges
    )
    # The returned study object is not used here.
    best_policy, best_value, best_params, best_reward, _study = optuna_search.optuna_search_for_env(
        n_trials=n_trials,
        study_name=f"{env_name}_actor_critic_study",
    )

    print("\nDone! Best parameters found by Optuna:", best_params)
    print("Best reward from Optuna:", best_reward)

    # save networks to pretrained_models
    torch.save(best_policy.state_dict(), f"{models_dir}/{env_name}_policy.pth")
    torch.save(best_value.state_dict(), f"{models_dir}/{env_name}_value.pth")

In [5]:
# Hyperparameter search on CartPole-v1 with a reduced budget of 1000 episodes.
run_experiment(env_name="CartPole-v1", episodes=1000)

[I 2024-12-30 11:02:04,068] A new study created in memory with name: CartPole-v1_actor_critic_study



[OPTUNA Trial 0] Env=CartPole-v1

        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[16, 32, 16],
        gamma=0.98, dropout_p=0.4,
        alpha_theta=0.0005, alpha_w=0.0005
        


Training:  52%|█████▏    | 519/1000 [00:07<00:07, 66.04episode/s, Avg Reward(100)=63.86] 
[W 2024-12-30 11:02:12,449] Trial 0 failed with parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[16, 32, 16]', 'alpha_theta': 0.0005, 'alpha_w': 0.0005, 'gamma': 0.98, 'dropout_p': 0.4} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/anaconda3/envs/DRLCourse/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 111, in objective_wrapper
    return self.objective(
           ^^^^^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 74, in objective
    policy_network, value_network, rewards, train_time = self.train_function(**t

KeyboardInterrupt: 

In [6]:
# Hyperparameter search on Acrobot-v1 with a reduced budget of 500 episodes.
run_experiment(env_name="Acrobot-v1", episodes=500)

[I 2024-12-29 18:36:21,985] A new study created in memory with name: Acrobot-v1_actor_critic_study



[OPTUNA Trial 0] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.99, dropout_p=0.30000000000000004,
         alpha_theta=0.0006000000000000001, alpha_w=0.0008


Training:  20%|██        | 100/500 [00:05<00:20, 19.48episode/s, Avg Reward(100)=-101.64]
[I 2024-12-29 18:36:27,128] Trial 0 finished with value: 101.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.99, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 101 episodes!

[OPTUNA Trial 1] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.96, dropout_p=0.4,
         alpha_theta=0.0006000000000000001, alpha_w=0.0007


Training:  21%|██▏       | 107/500 [00:06<00:23, 16.88episode/s, Avg Reward(100)=-118.61]
[I 2024-12-29 18:36:33,471] Trial 1 finished with value: 108.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.96, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0007, 'dropout_p': 0.4}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 108 episodes!

[OPTUNA Trial 2] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.98, dropout_p=0.2,
         alpha_theta=0.0006000000000000001, alpha_w=0.0005


Training: 100%|██████████| 500/500 [02:01<00:00,  4.10episode/s, Avg Reward(100)=-500.00]
[I 2024-12-29 18:38:35,313] Trial 2 finished with value: 500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.98, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0005, 'dropout_p': 0.2}. Best is trial 0 with value: 101.0.



[OPTUNA Trial 3] Env=Acrobot-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.96, dropout_p=0.2,
         alpha_theta=0.0006000000000000001, alpha_w=0.0006000000000000001


Training:  47%|████▋     | 235/500 [00:23<00:26,  9.94episode/s, Avg Reward(100)=-140.40]
[I 2024-12-29 18:38:58,959] Trial 3 finished with value: 236.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.96, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.2}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 236 episodes!

[OPTUNA Trial 4] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.95, dropout_p=0.5,
         alpha_theta=0.0008, alpha_w=0.0008


Training: 100%|██████████| 500/500 [02:02<00:00,  4.09episode/s, Avg Reward(100)=-500.00]
[I 2024-12-29 18:41:01,128] Trial 4 finished with value: 500.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.95, 'alpha_theta': 0.0008, 'alpha_w': 0.0008, 'dropout_p': 0.5}. Best is trial 0 with value: 101.0.



[OPTUNA Trial 5] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[32, 64, 32],
         gamma=0.98, dropout_p=0.2,
         alpha_theta=0.0007, alpha_w=0.0006000000000000001


Training:  21%|██        | 106/500 [00:06<00:25, 15.49episode/s, Avg Reward(100)=-128.25]
[I 2024-12-29 18:41:07,977] Trial 5 finished with value: 107.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.98, 'alpha_theta': 0.0007, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.2}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 107 episodes!

[OPTUNA Trial 6] Env=Acrobot-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.98, dropout_p=0.30000000000000004,
         alpha_theta=0.0005, alpha_w=0.0007


Training:  49%|████▉     | 245/500 [00:38<00:40,  6.36episode/s, Avg Reward(100)=-254.41]
[I 2024-12-29 18:41:46,503] Trial 6 finished with value: 246.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.98, 'alpha_theta': 0.0005, 'alpha_w': 0.0007, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 246 episodes!

[OPTUNA Trial 7] Env=Acrobot-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.95, dropout_p=0.30000000000000004,
         alpha_theta=0.0008, alpha_w=0.0005


Training: 100%|██████████| 500/500 [01:59<00:00,  4.17episode/s, Avg Reward(100)=-500.00]
[I 2024-12-29 18:43:46,504] Trial 7 finished with value: 500.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.95, 'alpha_theta': 0.0008, 'alpha_w': 0.0005, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 101.0.



[OPTUNA Trial 8] Env=Acrobot-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[16, 32, 16],
         gamma=0.98, dropout_p=0.2,
         alpha_theta=0.0008, alpha_w=0.0007


Training:  24%|██▎       | 118/500 [00:07<00:23, 16.09episode/s, Avg Reward(100)=-133.23]
[I 2024-12-29 18:43:53,841] Trial 8 finished with value: 119.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.98, 'alpha_theta': 0.0008, 'alpha_w': 0.0007, 'dropout_p': 0.2}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 119 episodes!

[OPTUNA Trial 9] Env=Acrobot-v1:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.99, dropout_p=0.2,
         alpha_theta=0.0007, alpha_w=0.0008


Training:  21%|██        | 105/500 [00:05<00:22, 17.68episode/s, Avg Reward(100)=-115.45]
[I 2024-12-29 18:43:59,781] Trial 9 finished with value: 106.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.99, 'alpha_theta': 0.0007, 'alpha_w': 0.0008, 'dropout_p': 0.2}. Best is trial 0 with value: 101.0.


Solved Acrobot-v1 in 106 episodes!

[OPTUNA] Best trial: trail 0
  Value (Reward): 101.00
  Params: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.99, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}


Training:  37%|███▋      | 185/500 [00:12<00:21, 14.71episode/s, Avg Reward(100)=-167.91]

Solved Acrobot-v1 in 186 episodes!

Total Optuna search time for Acrobot-v1: 470.38s

Done! Best parameters found by Optuna: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.99, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}
Best reward from Optuna: 101.0





In [5]:
# MountainCarContinuous-v0 search space: the default ranges with wider hidden-layer
# candidates plus ranges for the epsilon / noise-std settings that only the
# continuous action selector reads.
hyper_params_ranges_mountain_car = hyper_params_default_ranges.copy(
    hidden_sizes_theta_values=["[32, 64, 32]", "[32, 128, 32]"],
    hidden_sizes_w_values=["[32, 64, 32]", "[32, 128, 32]"],
    epsilon_values=StudyFloatParamRange(low=0.9995, high=0.9999, step=0.0001),
    epsilon_decay_values=StudyFloatParamRange(low=0.9995, high=0.9999, step=0.0001),
    min_noise_std_values=StudyFloatParamRange(low=0.05, high=0.1, step=0.01),
    max_noise_std_values=StudyFloatParamRange(low=0.2, high=0.4, step=0.1),
)
run_experiment(
    "MountainCarContinuous-v0",
    episodes=1500,
    hyper_params_ranges=hyper_params_ranges_mountain_car,
)


[I 2024-12-30 11:24:14,160] A new study created in memory with name: MountainCarContinuous-v0_actor_critic_study



[OPTUNA Trial 0] Env=MountainCarContinuous-v0
hidden_sizes_theta=[32, 128, 32]  |  hidden_sizes_w=[32, 128, 32]
        gamma=0.9900  |  dropout_p=0.4000
        alpha_theta=0.0007  |  alpha_w=0.0008
            epsilon=0.9998  |  epsilon_decay=0.9995
            min_noise_std=0.1000  |  max_noise_std=0.3000


Training:   2%|▏         | 25/1500 [00:12<12:30,  1.97episode/s]
[W 2024-12-30 11:24:27,433] Trial 0 failed with parameters: {'hidden_sizes_theta': '[32, 128, 32]', 'hidden_sizes_w': '[32, 128, 32]', 'alpha_theta': 0.0007, 'alpha_w': 0.0008, 'gamma': 0.99, 'dropout_p': 0.4, 'epsilon': 0.9998, 'epsilon_decay': 0.9995, 'min_noise_std': 0.1, 'max_noise_std': 0.30000000000000004} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/anaconda3/envs/DRLCourse/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 111, in objective_wrapper
    return self.objective(
           ^^^^^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 74, in objective

KeyboardInterrupt: 