In [1]:
import time

import gymnasium
import optuna
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

from assignment3.Section1.CartPole_AcroBot.dim_alignment import max_input_dim, max_output_dim
from assignment3.Section1.CartPole_AcroBot.models import PolicyNetwork, ValueNetwork
from assignment3.Section1.CartPole_AcroBot.action_selector import ActionSelector
from assignment3.Section1.CartPole_AcroBot.device import get_device
from assignment3.Section1.CartPole_AcroBot.training_loop import training_loop

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Resolve the torch device once for the whole notebook. objective() moves both
# networks onto it and passes it as map_location when loading the checkpoint.
# (How get_device() picks the device is defined in the project module — not shown here.)
device = get_device()

In [3]:
def reinitialize_output_layer(model, out_features=3):
    """
    Re-initialize, in place, every ``nn.Linear`` layer of ``model`` whose
    output width equals ``out_features``.

    Matching layers get Xavier-uniform weights and, when they have a bias,
    an all-zero bias. All other modules are left untouched.

    Args:
        model: Any ``nn.Module``; it is searched recursively (the model
            itself is included in the search).
        out_features: Output width identifying the layer(s) to reset.
            Defaults to 3, the value this function previously hard-coded.

    Returns:
        The same ``model`` object, to allow call chaining.

    Note:
        Layers are matched by width only, so a hidden layer that happens to
        have ``out_features`` outputs is reset as well.
    """
    for module in model.modules():
        if isinstance(module, nn.Linear) and module.out_features == out_features:
            nn.init.xavier_uniform_(module.weight)
            # Layers built with bias=False expose bias as None.
            if module.bias is not None:
                nn.init.zeros_(module.bias)
    return model

In [4]:
# best hyperparameters
# NOTE(review): presumably carried over from an earlier tuning run — confirm the source.
# Of the names below, objective() only reads hidden_sizes_w; alpha_theta, alpha_w,
# gamma and episodes are re-assigned locally inside objective(), so these
# module-level values do not affect the Optuna search.
# Policy hidden layout, kept for reference (objective() hard-codes the same list):
# hidden_sizes_theta = [32, 64, 32]
hidden_sizes_w = [16, 32, 16]
alpha_theta = 0.0007
alpha_w = 0.0006000000000000001
gamma = 0.98

episodes = 1000

In [9]:
def objective(trial):
    """
    Objective function for Optuna hyperparameter optimization.
    Fine-tunes the pre-trained Acrobot policy network on the CartPole environment.

    For each trial this samples two learning rates and a discount factor,
    loads the saved Acrobot policy weights, re-initializes the policy's
    output layer, builds a fresh value network, and runs ``training_loop``
    on CartPole-v1 for up to 2000 episodes. TensorBoard logs go to a
    per-trial directory under ``runs/``.

    Args:
        trial: The ``optuna.trial.Trial`` supplying the hyperparameters.

    Returns:
        The second element of the pair returned by ``training_loop``,
        treated as the average reward to maximize.
        NOTE(review): presumably the trailing 100-episode average — confirm
        against ``training_loop``.
    """
    # Search space. suggest_float(..., log=True) replaces the deprecated
    # suggest_loguniform, and plain suggest_float replaces suggest_uniform;
    # parameter names and ranges are unchanged.
    alpha_theta = trial.suggest_float('alpha_theta', 1e-5, 1e-2, log=True)
    alpha_w = trial.suggest_float('alpha_w', 1e-5, 1e-2, log=True)
    gamma = trial.suggest_float('gamma', 0.90, 0.999)

    episodes = 2000  # You can also make this a hyperparameter if desired

    # Unique log directory so trials do not overwrite each other's event files
    log_dir = f"runs/fine_tuning_acrobot_to_cartpole_trial_{trial.number}"

    # Print the trial number and hyperparameters
    hyperparams = {
        'alpha_theta': alpha_theta,
        'alpha_w': alpha_w,
        'gamma': gamma,
    }
    hyperparams_str = ', '.join(f"{key}={value:.6f}" for key, value in hyperparams.items())
    print(f"Starting Trial {trial.number}: {hyperparams_str}")

    # Load the pre-trained Acrobot policy. weights_only=True restricts
    # unpickling to tensors and plain containers, which is all a state_dict
    # needs, and avoids executing arbitrary pickled code from the file.
    acrobot_policy_network = PolicyNetwork(max_input_dim, [32, 64, 32], max_output_dim).to(device)
    acrobot_policy_network.load_state_dict(torch.load(
        '../Section1/CartPole_AcroBot/models/Acrobot-v1/best/policy.pth',
        map_location=device,
        weights_only=True
    ))

    # Discard the Acrobot-specific action head; the hidden layers keep their
    # pre-trained weights.
    acrobot_policy_network = reinitialize_output_layer(acrobot_policy_network)
    acrobot_policy_network.train()

    # The value network is trained from scratch
    value_network = ValueNetwork(max_input_dim, hidden_sizes_w).to(device)
    value_network.train()

    # Initialize optimizers
    policy_optimizer = Adam(acrobot_policy_network.parameters(), lr=alpha_theta)
    value_optimizer = Adam(value_network.parameters(), lr=alpha_w)

    # Rewards list handed to training_loop
    rewards_per_episode = []

    # Initialize the action selector
    action_selector = ActionSelector()

    # Set up the CartPole environment and the TensorBoard writer. Both are
    # released in the finally block so a failing trial does not leak them.
    env = gymnasium.make("CartPole-v1")
    writer = None
    try:
        writer = SummaryWriter(log_dir=log_dir)

        # Start fine-tuning
        start_time = time.time()

        # training_loop's first return value is unused here; wall-clock time
        # is measured locally instead.
        _, avg_reward = training_loop(
            input_dim=max_input_dim,
            actual_act_dim=env.action_space.n,  # CartPole has 2 actions
            policy_network=acrobot_policy_network,
            value_network=value_network,
            policy_optimizer=policy_optimizer,
            value_optimizer=value_optimizer,
            env=env,
            env_name="CartPole-v1",
            episodes=episodes,
            gamma=gamma,
            writer=writer,
            rewards_per_episode=rewards_per_episode,
            action_selector=action_selector
        )

        elapsed_time = time.time() - start_time
        print(f"Trial {trial.number} completed in {elapsed_time:.2f} seconds with Average Reward {avg_reward:.2f}.")
    finally:
        # Close the writer and the environment to free resources
        if writer is not None:
            writer.close()
        env.close()

    # Return the metric to be maximized
    return avg_reward

In [10]:
def run_optuna_study(n_trials=50, timeout=3600, seed=42):
    """
    Runs an Optuna study to optimize hyperparameters for fine-tuning the Acrobot policy network.

    Args:
        n_trials: Maximum number of trials to run.
        timeout: Time budget in seconds passed to ``study.optimize``; the
            study stops once it is exhausted, even if fewer than
            ``n_trials`` trials have finished. Pass ``None`` for no limit.
            Defaults to 3600, the value previously hard-coded.
        seed: Seed for the TPE sampler. Defaults to 42, the value
            previously hard-coded. This seeds hyperparameter sampling only;
            nothing here seeds torch or the environment.

    Returns:
        The ``optuna.study.Study`` after optimization.
    """
    # Create a study object
    study = optuna.create_study(
        direction='maximize',  # We aim to maximize the average reward
        sampler=optuna.samplers.TPESampler(seed=seed)  # Seeded sampler for repeatable suggestions
    )

    # Optimize the objective; stops at n_trials or at the timeout, whichever comes first
    study.optimize(objective, n_trials=n_trials, timeout=timeout)

    # Print study statistics
    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    best = study.best_trial

    print(f"  Value (Average Reward): {best.value:.2f}")

    print("  Params: ")
    for key, value in best.params.items():
        print(f"    {key}: {value}")

    return study

In [11]:
# Launch the search with at most 30 trials. run_optuna_study also passes a
# time budget to study.optimize, so fewer trials may actually finish.
study = run_optuna_study(n_trials=30)

[I 2025-01-09 10:59:23,044] A new study created in memory with name: no-name-1ab27c33-41e7-458b-9944-dad90ff12eca
  alpha_theta = trial.suggest_loguniform('alpha_theta', 1e-5, 1e-2)
  alpha_w = trial.suggest_loguniform('alpha_w', 1e-5, 1e-2)
  gamma = trial.suggest_uniform('gamma', 0.90, 0.999)
  acrobot_policy_network.load_state_dict(torch.load(


Starting Trial 0: alpha_theta=0.000133, alpha_w=0.007114, gamma=0.972467


Training: 100%|██████████| 2000/2000 [17:02<00:00,  1.96episode/s, Avg Reward(100)=389.24]
[I 2025-01-09 11:16:25,248] Trial 0 finished with value: 389.24 and parameters: {'alpha_theta': 0.0001329291894316216, 'alpha_w': 0.0071144760093434225, 'gamma': 0.9724674002393291}. Best is trial 0 with value: 389.24.


Trial 0 completed in 1022.19 seconds with Average Reward 389.24.
Starting Trial 1: alpha_theta=0.000625, alpha_w=0.000029, gamma=0.915443


Training: 100%|██████████| 2000/2000 [01:17<00:00, 25.69episode/s, Avg Reward(100)=9.36]
[I 2025-01-09 11:17:43,110] Trial 1 finished with value: 9.36 and parameters: {'alpha_theta': 0.0006251373574521745, 'alpha_w': 2.9380279387035334e-05, 'gamma': 0.915443457513284}. Best is trial 0 with value: 389.24.


Trial 1 completed in 77.86 seconds with Average Reward 9.36.
Starting Trial 2: alpha_theta=0.000015, alpha_w=0.003968, gamma=0.959510


Training: 100%|██████████| 2000/2000 [21:12<00:00,  1.57episode/s, Avg Reward(100)=197.15]
[I 2025-01-09 11:38:55,958] Trial 2 finished with value: 197.15 and parameters: {'alpha_theta': 1.493656855461762e-05, 'alpha_w': 0.003967605077052989, 'gamma': 0.9595103861625777}. Best is trial 0 with value: 389.24.


Trial 2 completed in 1272.84 seconds with Average Reward 197.15.
Starting Trial 3: alpha_theta=0.001331, alpha_w=0.000012, gamma=0.996021


Training: 100%|██████████| 2000/2000 [01:17<00:00, 25.92episode/s, Avg Reward(100)=9.48]
[I 2025-01-09 11:40:13,131] Trial 3 finished with value: 9.48 and parameters: {'alpha_theta': 0.001331121608073689, 'alpha_w': 1.1527987128232396e-05, 'gamma': 0.9960210753640374}. Best is trial 0 with value: 389.24.


Trial 3 completed in 77.16 seconds with Average Reward 9.48.
Starting Trial 4: alpha_theta=0.003143, alpha_w=0.000043, gamma=0.918001


Training: 100%|██████████| 2000/2000 [01:15<00:00, 26.44episode/s, Avg Reward(100)=9.28]
[I 2025-01-09 11:41:28,788] Trial 4 finished with value: 9.28 and parameters: {'alpha_theta': 0.00314288089084011, 'alpha_w': 4.335281794951564e-05, 'gamma': 0.918000671753503}. Best is trial 0 with value: 389.24.


Trial 4 completed in 75.64 seconds with Average Reward 9.28.
Starting Trial 5: alpha_theta=0.000035, alpha_w=0.000082, gamma=0.951951


Training: 100%|██████████| 2000/2000 [01:13<00:00, 27.03episode/s, Avg Reward(100)=9.35]
[I 2025-01-09 11:42:42,780] Trial 5 finished with value: 9.35 and parameters: {'alpha_theta': 3.5498788321965036e-05, 'alpha_w': 8.17949947521167e-05, 'gamma': 0.9519508867315916}. Best is trial 0 with value: 389.24.


Trial 5 completed in 73.98 seconds with Average Reward 9.35.
Starting Trial 6: alpha_theta=0.000198, alpha_w=0.000075, gamma=0.960573


Training: 100%|██████████| 2000/2000 [01:11<00:00, 28.08episode/s, Avg Reward(100)=9.34]
[I 2025-01-09 11:43:54,013] Trial 6 finished with value: 9.34 and parameters: {'alpha_theta': 0.00019762189340280086, 'alpha_w': 7.476312062252303e-05, 'gamma': 0.9605734365775156}. Best is trial 0 with value: 389.24.


Trial 6 completed in 71.22 seconds with Average Reward 9.34.
Starting Trial 7: alpha_theta=0.000026, alpha_w=0.000075, gamma=0.936270


Training: 100%|██████████| 2000/2000 [01:17<00:00, 25.93episode/s, Avg Reward(100)=9.37]
[I 2025-01-09 11:45:11,164] Trial 7 finished with value: 9.37 and parameters: {'alpha_theta': 2.621087878265438e-05, 'alpha_w': 7.52374288453485e-05, 'gamma': 0.9362698224860755}. Best is trial 0 with value: 389.24.


Trial 7 completed in 77.14 seconds with Average Reward 9.37.
Starting Trial 8: alpha_theta=0.000233, alpha_w=0.002267, gamma=0.919768


Training: 100%|██████████| 2000/2000 [13:19<00:00,  2.50episode/s, Avg Reward(100)=9.40]  
[I 2025-01-09 11:58:30,527] Trial 8 finished with value: 9.4 and parameters: {'alpha_theta': 0.00023345864076016249, 'alpha_w': 0.0022673986523780395, 'gamma': 0.9197677044336776}. Best is trial 0 with value: 389.24.


Trial 8 completed in 799.35 seconds with Average Reward 9.40.
Starting Trial 9: alpha_theta=0.000349, alpha_w=0.000599, gamma=0.904599


Training:  82%|████████▏ | 1645/2000 [15:10<03:16,  1.81episode/s, Avg Reward(100)=366.85]
[I 2025-01-09 12:13:41,131] Trial 9 finished with value: 475.44 and parameters: {'alpha_theta': 0.0003489018845491386, 'alpha_w': 0.0005987474910461401, 'gamma': 0.9045985908592798}. Best is trial 9 with value: 475.44.


Solved CartPole-v1 in 1646 episodes!
Trial 9 completed in 910.59 seconds with Average Reward 475.44.
Number of finished trials:  10
Best trial:
  Value (Average Reward): 475.44
  Params: 
    alpha_theta: 0.0003489018845491386
    alpha_w: 0.0005987474910461401
    gamma: 0.9045985908592798
