In [2]:
import os

import gymnasium as gym
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from assignment3.dim_alignment import ENV_ACT_DIM, max_output_dim, max_input_dim
from assignment3.models import PolicyNetwork, ValueNetwork
from assignment3.optuna_search import OptunaSearch
from assignment3.training_loop import training_loop

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def generalized_actor_critic(
        env_name,
        input_dim,
        output_dim,
        hidden_sizes_theta,
        hidden_sizes_w,
        alpha_theta=0.001,
        alpha_w=0.001,
        episodes=500,
        gamma=0.99,
        log_dir="runs/actor_critic"
):
    """
    Train a policy and value network using Actor-Critic, with padded inputs/outputs.

    Builds a ``PolicyNetwork`` and a ``ValueNetwork`` (each with its own Adam
    optimizer), hands them to ``training_loop`` together with a freshly created
    Gymnasium environment and a TensorBoard ``SummaryWriter``, and always closes
    the environment and the writer afterwards, even when training raises.

    Args:
        env_name: Gymnasium environment id. Also used as the key into
            ``ENV_ACT_DIM`` and as the suffix of the TensorBoard log directory.
        input_dim: Input size of both networks; also forwarded to ``training_loop``.
        output_dim: Output size of the policy network.
        hidden_sizes_theta: Hidden layer sizes passed to ``PolicyNetwork``.
        hidden_sizes_w: Hidden layer sizes passed to ``ValueNetwork``.
        alpha_theta: Learning rate of the policy optimizer.
        alpha_w: Learning rate of the value optimizer.
        episodes: Forwarded to ``training_loop`` (presumably the episode budget).
        gamma: Forwarded to ``training_loop`` (presumably the discount factor).
        log_dir: Prefix of the TensorBoard directory; the directory actually
            used is ``f"{log_dir}_{env_name}"``.

    Returns:
        Tuple ``(policy_network, value_network, rewards_per_episode, train_time)``.
        ``rewards_per_episode`` is the list created here and passed to
        ``training_loop`` (presumably filled in place there -- confirm against
        ``training_loop``); ``train_time`` is whatever ``training_loop`` returns.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rewards_per_episode = []

    env = gym.make(env_name)
    try:
        writer = SummaryWriter(log_dir=f"{log_dir}_{env_name}")
        try:
            policy_network = PolicyNetwork(input_dim, hidden_sizes_theta, output_dim).to(device)
            value_network = ValueNetwork(input_dim, hidden_sizes_w).to(device)

            policy_optimizer = optim.Adam(policy_network.parameters(), lr=alpha_theta)
            value_optimizer = optim.Adam(value_network.parameters(), lr=alpha_w)

            # Identify the actual action dimensionality for this env
            actual_act_dim = ENV_ACT_DIM[env_name]

            train_time = training_loop(
                input_dim=input_dim,
                actual_act_dim=actual_act_dim,
                policy_network=policy_network,
                value_network=value_network,
                policy_optimizer=policy_optimizer,
                value_optimizer=value_optimizer,
                env=env,
                env_name=env_name,
                episodes=episodes,
                gamma=gamma,
                device=device,
                writer=writer,
                rewards_per_episode=rewards_per_episode,
            )
        finally:
            # Runs on failure too, so a crashing trial does not leave an open
            # event file behind.
            writer.close()
    finally:
        # Same close order as before (writer first, then env), but now guaranteed.
        env.close()

    return policy_network, value_network, rewards_per_episode, train_time

In [4]:
# TODO: tune each environment separately, with its own ranges and parameters,
#       instead of sharing the settings below.

# Hidden layer sizes for the policy (theta) and value (w) networks.
hidden_sizes_theta = [16, 32, 16]
hidden_sizes_w = list(hidden_sizes_theta)  # same sizes, independent list object

# Candidate values handed to OptunaSearch.
gamma_values = [0.95, 0.99]
alpha_theta_values = [0.001, 0.0005]
alpha_w_values = list(alpha_theta_values)  # same candidates, independent list object

# Search budget: `episodes` goes to OptunaSearch, `n_trials` to optuna_search_for_env.
episodes, n_trials = 2000, 10

# Read by the final summary cell.
overall_results = dict()

In [5]:
def run_experiment(env_name):
    """
    Run the Optuna search for one environment and save the best networks.

    Builds an ``OptunaSearch`` around ``generalized_actor_critic`` using the
    notebook-level settings (hidden sizes, candidate values, ``episodes``), runs
    ``optuna_search_for_env`` for ``n_trials`` trials, prints the best
    parameters and reward, and writes the state dicts of the best policy and
    value networks to ``pretrained_models/{env_name}_policy.pth`` and
    ``pretrained_models/{env_name}_value.pth``.

    Args:
        env_name: Environment id forwarded to ``OptunaSearch`` and used in the
            names of the saved files.

    Returns:
        None. Results are printed and saved to disk.
    """
    optuna_search = OptunaSearch(
        train_function=generalized_actor_critic,
        env_name=env_name,
        max_input_dim=max_input_dim,
        max_output_dim=max_output_dim,
        hidden_sizes_theta=hidden_sizes_theta,
        hidden_sizes_w=hidden_sizes_w,
        gamma_values=gamma_values,
        alpha_theta_values=alpha_theta_values,
        alpha_w_values=alpha_w_values,
        episodes=episodes,
    )
    # The fifth element (the study object) is not used here.
    best_policy, best_value, best_params, best_reward, _ = optuna_search.optuna_search_for_env(n_trials=n_trials)

    print("\nDone! Best parameters found by Optuna:", best_params)
    print("Best reward from Optuna:", best_reward)

    # Save networks to pretrained_models. Create the directory first so a
    # finished search is not lost at the final step on a fresh checkout.
    os.makedirs("pretrained_models", exist_ok=True)
    torch.save(best_policy.state_dict(), f"pretrained_models/{env_name}_policy.pth")
    torch.save(best_value.state_dict(), f"pretrained_models/{env_name}_value.pth")

In [5]:
# Launch the search on CartPole-v1.
# TODO: investigate why this run fails: the recorded error shows the Categorical
#       action distribution receiving NaN probabilities.
run_experiment(env_name="CartPole-v1")

[I 2024-12-28 15:30:13,131] A new study created in memory with name: no-name-a30ec3c4-ab68-4446-b418-1243aab85191



[OPTUNA Trial] Env=CartPole-v1 | gamma=0.95, alpha_theta=0.0005, alpha_w=0.001
Episode 100: Reward=10.00, Avg(100)=15.23
Episode 200: Reward=16.00, Avg(100)=26.41
Episode 300: Reward=44.00, Avg(100)=56.46
Episode 400: Reward=64.00, Avg(100)=72.98
Episode 500: Reward=155.00, Avg(100)=126.16
Episode 600: Reward=124.00, Avg(100)=146.36
Episode 700: Reward=175.00, Avg(100)=141.31
Episode 800: Reward=65.00, Avg(100)=80.91
Episode 900: Reward=128.00, Avg(100)=101.47


[W 2024-12-28 15:30:51,428] Trial 0 failed with parameters: {'gamma': 0.95, 'alpha_theta': 0.0005, 'alpha_w': 0.001} because of the following error: ValueError('Expected parameter probs (Tensor of shape (2,)) of distribution Categorical(probs: torch.Size([2])) to satisfy the constraint Simplex(), but found invalid values:\ntensor([nan, nan], grad_fn=<DivBackward0>)').
Traceback (most recent call last):
  File "/opt/anaconda3/envs/DRLCourse/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 98, in objective_wrapper
    return self.objective(
           ^^^^^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 56, in objective
    policy_network, value_network, rewards, train_time = self

ValueError: Expected parameter probs (Tensor of shape (2,)) of distribution Categorical(probs: torch.Size([2])) to satisfy the constraint Simplex(), but found invalid values:
tensor([nan, nan], grad_fn=<DivBackward0>)

In [6]:
# Launch the search on Acrobot-v1.
run_experiment(env_name="Acrobot-v1")

[I 2024-12-28 15:31:35,344] A new study created in memory with name: no-name-37523063-bac5-4a73-92ff-f3e358741da1



[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.95, alpha_theta=0.001, alpha_w=0.001
Episode 100: Reward=-77.00, Avg(100)=-138.74
Episode 200: Reward=-500.00, Avg(100)=-280.03
Episode 300: Reward=-121.00, Avg(100)=-245.81


[I 2024-12-28 15:32:08,504] Trial 0 finished with value: -98.38 and parameters: {'gamma': 0.95, 'alpha_theta': 0.001, 'alpha_w': 0.001}. Best is trial 0 with value: -98.38.


Solved Acrobot-v1 in 338 episodes!

[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.95, alpha_theta=0.0005, alpha_w=0.0005
Episode 100: Reward=-500.00, Avg(100)=-500.00
Episode 200: Reward=-108.00, Avg(100)=-276.23


[I 2024-12-28 15:32:47,037] Trial 1 finished with value: -99.9 and parameters: {'gamma': 0.95, 'alpha_theta': 0.0005, 'alpha_w': 0.0005}. Best is trial 0 with value: -98.38.


Solved Acrobot-v1 in 247 episodes!

[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.99, alpha_theta=0.001, alpha_w=0.0005
Episode 100: Reward=-500.00, Avg(100)=-500.00
Episode 200: Reward=-500.00, Avg(100)=-500.00
Episode 300: Reward=-500.00, Avg(100)=-500.00
Episode 400: Reward=-500.00, Avg(100)=-500.00
Episode 500: Reward=-500.00, Avg(100)=-500.00
Episode 600: Reward=-500.00, Avg(100)=-500.00
Episode 700: Reward=-500.00, Avg(100)=-500.00
Episode 800: Reward=-500.00, Avg(100)=-500.00
Episode 900: Reward=-500.00, Avg(100)=-500.00
Episode 1000: Reward=-500.00, Avg(100)=-500.00
Episode 1100: Reward=-500.00, Avg(100)=-500.00
Episode 1200: Reward=-500.00, Avg(100)=-500.00
Episode 1300: Reward=-500.00, Avg(100)=-500.00
Episode 1400: Reward=-500.00, Avg(100)=-500.00
Episode 1500: Reward=-500.00, Avg(100)=-500.00
Episode 1600: Reward=-500.00, Avg(100)=-500.00
Episode 1700: Reward=-500.00, Avg(100)=-500.00
Episode 1800: Reward=-500.00, Avg(100)=-500.00
Episode 1900: Reward=-500.00, Avg(100)=-500.00


[I 2024-12-28 15:40:37,751] Trial 2 finished with value: -500.0 and parameters: {'gamma': 0.99, 'alpha_theta': 0.001, 'alpha_w': 0.0005}. Best is trial 0 with value: -98.38.


Episode 2000: Reward=-500.00, Avg(100)=-500.00

[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.99, alpha_theta=0.001, alpha_w=0.0005
Episode 100: Reward=-81.00, Avg(100)=-125.70


[I 2024-12-28 15:40:44,220] Trial 3 finished with value: -99.69 and parameters: {'gamma': 0.99, 'alpha_theta': 0.001, 'alpha_w': 0.0005}. Best is trial 0 with value: -98.38.


Solved Acrobot-v1 in 111 episodes!

[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.95, alpha_theta=0.0005, alpha_w=0.0005
Episode 100: Reward=-76.00, Avg(100)=-190.68
Episode 200: Reward=-96.00, Avg(100)=-103.22


[I 2024-12-28 15:40:59,565] Trial 4 finished with value: -99.77 and parameters: {'gamma': 0.95, 'alpha_theta': 0.0005, 'alpha_w': 0.0005}. Best is trial 0 with value: -98.38.


Solved Acrobot-v1 in 224 episodes!

[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.95, alpha_theta=0.0005, alpha_w=0.001
Episode 100: Reward=-500.00, Avg(100)=-175.16
Episode 200: Reward=-500.00, Avg(100)=-495.42
Episode 300: Reward=-500.00, Avg(100)=-500.00
Episode 400: Reward=-500.00, Avg(100)=-500.00
Episode 500: Reward=-500.00, Avg(100)=-500.00
Episode 600: Reward=-122.00, Avg(100)=-404.76
Episode 700: Reward=-72.00, Avg(100)=-160.81
Episode 800: Reward=-107.00, Avg(100)=-381.62


[I 2024-12-28 15:43:29,812] Trial 5 finished with value: -98.96 and parameters: {'gamma': 0.95, 'alpha_theta': 0.0005, 'alpha_w': 0.001}. Best is trial 0 with value: -98.38.


Solved Acrobot-v1 in 881 episodes!

[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.95, alpha_theta=0.0005, alpha_w=0.001
Episode 100: Reward=-89.00, Avg(100)=-221.24


[I 2024-12-28 15:43:43,979] Trial 6 finished with value: -99.26 and parameters: {'gamma': 0.95, 'alpha_theta': 0.0005, 'alpha_w': 0.001}. Best is trial 0 with value: -98.38.


Solved Acrobot-v1 in 184 episodes!

[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.99, alpha_theta=0.0005, alpha_w=0.001
Episode 100: Reward=-97.00, Avg(100)=-124.70


[I 2024-12-28 15:43:50,447] Trial 7 finished with value: -99.71 and parameters: {'gamma': 0.99, 'alpha_theta': 0.0005, 'alpha_w': 0.001}. Best is trial 0 with value: -98.38.


Solved Acrobot-v1 in 112 episodes!

[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.99, alpha_theta=0.001, alpha_w=0.0005
Episode 100: Reward=-155.00, Avg(100)=-422.46
Episode 200: Reward=-500.00, Avg(100)=-485.44
Episode 300: Reward=-500.00, Avg(100)=-481.27
Episode 400: Reward=-97.00, Avg(100)=-250.75


[I 2024-12-28 15:45:09,783] Trial 8 finished with value: -97.64 and parameters: {'gamma': 0.99, 'alpha_theta': 0.001, 'alpha_w': 0.0005}. Best is trial 8 with value: -97.64.


Solved Acrobot-v1 in 440 episodes!

[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.99, alpha_theta=0.001, alpha_w=0.0005
Episode 100: Reward=-500.00, Avg(100)=-500.00
Episode 200: Reward=-500.00, Avg(100)=-500.00
Episode 300: Reward=-500.00, Avg(100)=-500.00
Episode 400: Reward=-500.00, Avg(100)=-500.00
Episode 500: Reward=-500.00, Avg(100)=-499.20
Episode 600: Reward=-500.00, Avg(100)=-500.00
Episode 700: Reward=-500.00, Avg(100)=-500.00
Episode 800: Reward=-500.00, Avg(100)=-500.00
Episode 900: Reward=-500.00, Avg(100)=-500.00
Episode 1000: Reward=-500.00, Avg(100)=-500.00
Episode 1100: Reward=-500.00, Avg(100)=-500.00
Episode 1200: Reward=-500.00, Avg(100)=-500.00
Episode 1300: Reward=-500.00, Avg(100)=-500.00
Episode 1400: Reward=-500.00, Avg(100)=-500.00
Episode 1500: Reward=-500.00, Avg(100)=-500.00
Episode 1600: Reward=-500.00, Avg(100)=-500.00
Episode 1700: Reward=-500.00, Avg(100)=-500.00
Episode 1800: Reward=-500.00, Avg(100)=-500.00
Episode 1900: Reward=-500.00, Avg(100)=-500.00


[I 2024-12-28 15:53:02,569] Trial 9 finished with value: -500.0 and parameters: {'gamma': 0.99, 'alpha_theta': 0.001, 'alpha_w': 0.0005}. Best is trial 8 with value: -97.64.


Episode 2000: Reward=-500.00, Avg(100)=-500.00

[OPTUNA] Best trial:
  Value (Reward): -97.64
  Params: {'gamma': 0.99, 'alpha_theta': 0.001, 'alpha_w': 0.0005}
Episode 100: Reward=-94.00, Avg(100)=-141.07
Episode 200: Reward=-95.00, Avg(100)=-101.69
Episode 300: Reward=-80.00, Avg(100)=-108.48
Solved Acrobot-v1 in 345 episodes!

Total Optuna search time for Acrobot-v1: 1306.06s

Done! Best parameters found by Optuna: {'gamma': 0.99, 'alpha_theta': 0.001, 'alpha_w': 0.0005}
Best reward from Optuna: -97.64


In [6]:
# Launch the search on MountainCarContinuous-v0.
# TODO: investigate why this run fails with "'int' object is not subscriptable".
run_experiment(env_name="MountainCarContinuous-v0")


[I 2024-12-28 16:38:02,597] A new study created in memory with name: no-name-e2761128-682d-4f3c-83f1-0e455df0e4e5



[OPTUNA Trial] Env=MountainCarContinuous-v0 | gamma=0.99, alpha_theta=0.0005, alpha_w=0.0005


[W 2024-12-28 16:38:03,059] Trial 0 failed with parameters: {'gamma': 0.99, 'alpha_theta': 0.0005, 'alpha_w': 0.0005} because of the following error: TypeError("'int' object is not subscriptable").
Traceback (most recent call last):
  File "/opt/anaconda3/envs/DRLCourse/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 108, in objective_wrapper
    return self.objective(
           ^^^^^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 77, in objective
    policy_network, value_network, rewards, train_time = self.train_function(**train_params)
                                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/q7/94wmcf8921379lkml24q3wb

TypeError: 'int' object is not subscriptable

In [7]:
# Print final summary
# TODO: show statistics?
if not overall_results:
    # NOTE(review): no cell visible in this notebook writes to overall_results,
    # so say so explicitly instead of printing nothing at all.
    print("No results recorded in overall_results.")

for env_name, env_results in overall_results.items():
    print(f"\n=== {env_name} Final Results ===")
    print(f"  Best Params: {env_results['best_params']}")
    print(f"  Best Episodes: {env_results['best_episodes']}")
    print(f"  Best Avg Reward: {env_results['best_reward']:.2f}")
    print("  Grid Search Results (gamma, alpha_theta, alpha_w, episodes, avg_reward, train_time_s):")
    for res in env_results["grid_results"]:
        print(f"    {res}")