In [42]:

import optuna
import torch
from torch import nn

from assignment3.Section1.CartPole_AcroBot.dim_alignment import max_input_dim, max_output_dim
from assignment3.Section1.CartPole_AcroBot.models import PolicyNetwork, ValueNetwork
from assignment3.Section1.CartPole_AcroBot.device import get_device
from assignment3.Section1.MountainCarContinuous.models import UnifiedPolicyNetwork
from assignment3.Section2.actor_critic_finetune import actor_critic_finetune

In [43]:
device = get_device()

# Define hidden sizes (reuse from CartPole if compatible)
hidden_sizes = [32, 64, 32]

In [44]:
def extract_hidden_layers(cartpole_model, unified_model):
    """
    Transfers the weights of the hidden layers from the CartPole model to the Unified model.
    Assumes that both models have the same architecture for hidden layers.
    """
    cartpole_layers = [module for module in cartpole_model.model]
    unified_layers = [module for module in unified_model.model]

    for cp_layer, un_layer in zip(cartpole_layers, unified_layers):
        if isinstance(cp_layer, nn.Linear) and isinstance(un_layer, nn.Linear):
            un_layer.weight.data = cp_layer.weight.data.clone()
            un_layer.bias.data = cp_layer.bias.data.clone()

    return unified_model

In [45]:
def objective(trial):

    # Hyperparameter search space
    alpha_theta = trial.suggest_loguniform('alpha_theta', 1e-5, 1e-2)
    alpha_w = trial.suggest_loguniform('alpha_w', 1e-5, 1e-2)
    gamma = trial.suggest_uniform('gamma', 0.90, 0.999)
    entropy_coeff = trial.suggest_uniform('entropy_coeff', 0.0, 0.1)
    start_noise_std = trial.suggest_uniform('start_noise_std', 0.05, 0.3)
    end_noise_std = trial.suggest_uniform('end_noise_std', 0.25, 0.5)
    noise_decay = trial.suggest_uniform('noise_decay', 0.90, 0.999)

    episodes = 1000  # You can also make this a hyperparameter if desired

    # Define the unique log directory for this trial
    log_dir = f"runs/cartPole-mountainCar_trial_{trial.number}"

    # Print the trial number and hyperparameters in red
    hyperparams = {
        'alpha_theta': alpha_theta,
        'alpha_w': alpha_w,
        'gamma': gamma,
        'entropy_coeff': entropy_coeff,
        'start_noise_std': start_noise_std,
        'end_noise_std': end_noise_std,
        'noise_decay': noise_decay
    }
    hyperparams_str = ', '.join([f"{key}={value:.6f}" for key, value in hyperparams.items()])
    print(f"Starting Trial {trial.number}: {hyperparams_str}")

    # Initialize networks (reuse the code from your setup)
    device = get_device()

    # Load the pre-trained CartPole model
    cartpole_policy_network = PolicyNetwork(max_input_dim, hidden_sizes, max_output_dim).to(device)
    cartpole_policy_network.load_state_dict(torch.load(
        '../Section1/CartPole_AcroBot/models/CartPole-v1/best/policy.pth',
        map_location=device
    ))
    cartpole_policy_network.eval()

    # Initialize the Unified Policy Network
    unified_policy_network = UnifiedPolicyNetwork(
        input_dim=max_input_dim,
        hidden_sizes=hidden_sizes,
        output_dim=max_output_dim
    ).to(device)

    # Transfer hidden layer weights from CartPole model
    unified_policy_network = extract_hidden_layers(cartpole_policy_network, unified_policy_network)
    unified_policy_network.train()  # Set to training mode

    # Initialize the Value Network
    value_network = ValueNetwork(
        input_dim=max_input_dim,
        hidden_sizes=[256],
    ).to(device)
    value_network.train()

    # Run the actor_critic training loop with pre-initialized networks
    policy_network_fine_tuned, value_network_fine_tuned, rewards_per_episode, train_time, best_avg_reward_50 = actor_critic_finetune(
        env_name="MountainCarContinuous-v0",
        input_dim=max_input_dim,
        output_dim=max_output_dim,
        alpha_theta=alpha_theta,
        alpha_w=alpha_w,
        episodes=episodes,
        gamma=gamma,
        entropy_coeff=entropy_coeff,
        start_noise_std=start_noise_std,
        end_noise_std=end_noise_std,
        noise_decay=noise_decay,
        log_dir=log_dir,  # Pass the unique log directory
        policy_network=unified_policy_network,  # Pass the pre-trained policy network
        value_network=value_network,  # Pass the pre-initialized value network
    )

    # Use the best average reward over the last 50 episodes as the metric
    return best_avg_reward_50

In [46]:
def run_optuna_study(n_trials=50):
    # Create a study object
    study = optuna.create_study(
        direction='maximize',  # We aim to maximize the average reward
        sampler=optuna.samplers.TPESampler(seed=42)  # You can set a seed for reproducibility
    )

    # Optimize the objective function
    study.optimize(objective, n_trials=n_trials, timeout=3600)  # e.g., 50 trials or 1 hour

    # Print study statistics
    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    return study

In [47]:
study = run_optuna_study(n_trials=10)

[I 2025-01-07 11:13:45,281] A new study created in memory with name: no-name-acbfe2f0-9e75-4601-a0de-e9598cd08919


Starting Trial 0: alpha_theta=0.000133, alpha_w=0.007114, gamma=0.972467, entropy_coeff=0.059866, start_noise_std=0.089005, end_noise_std=0.288999, noise_decay=0.905750


  alpha_theta = trial.suggest_loguniform('alpha_theta', 1e-5, 1e-2)
  alpha_w = trial.suggest_loguniform('alpha_w', 1e-5, 1e-2)
  gamma = trial.suggest_uniform('gamma', 0.90, 0.999)
  entropy_coeff = trial.suggest_uniform('entropy_coeff', 0.0, 0.1)
  start_noise_std = trial.suggest_uniform('start_noise_std', 0.05, 0.3)
  end_noise_std = trial.suggest_uniform('end_noise_std', 0.25, 0.5)
  noise_decay = trial.suggest_uniform('noise_decay', 0.90, 0.999)
  cartpole_policy_network.load_state_dict(torch.load(


Episode 1: Reward=-80.89, Avg(100)=-80.89, Avg(50)=-80.89, Noise STD=0.2890
Episode 2: Reward=-93.81, Avg(100)=-87.35, Avg(50)=-87.35, Noise STD=0.2890
Episode 3: Reward=18.75, Avg(100)=-51.98, Avg(50)=-51.98, Noise STD=0.2890
Episode 4: Reward=-92.88, Avg(100)=-62.21, Avg(50)=-62.21, Noise STD=0.2890
Episode 5: Reward=-93.04, Avg(100)=-68.37, Avg(50)=-68.37, Noise STD=0.2890
Episode 6: Reward=-93.28, Avg(100)=-72.52, Avg(50)=-72.52, Noise STD=0.2890
Episode 7: Reward=-91.92, Avg(100)=-75.30, Avg(50)=-75.30, Noise STD=0.2890
Episode 8: Reward=-92.73, Avg(100)=-77.47, Avg(50)=-77.47, Noise STD=0.2890
Episode 9: Reward=24.65, Avg(100)=-66.13, Avg(50)=-66.13, Noise STD=0.2890
Episode 10: Reward=-93.70, Avg(100)=-68.88, Avg(50)=-68.88, Noise STD=0.2890
Episode 11: Reward=8.11, Avg(100)=-61.88, Avg(50)=-61.88, Noise STD=0.2890
Episode 12: Reward=24.20, Avg(100)=-54.71, Avg(50)=-54.71, Noise STD=0.2890
Episode 13: Reward=-91.69, Avg(100)=-57.55, Avg(50)=-57.55, Noise STD=0.2890
Episode 14: R

[I 2025-01-07 11:57:04,568] Trial 0 finished with value: 10.644144751563351 and parameters: {'alpha_theta': 0.0001329291894316216, 'alpha_w': 0.0071144760093434225, 'gamma': 0.9724674002393291, 'entropy_coeff': 0.05986584841970366, 'start_noise_std': 0.08900466011060913, 'end_noise_std': 0.2889986300840507, 'noise_decay': 0.9057502776046518}. Best is trial 0 with value: 10.644144751563351.


Episode 710: Reward=66.87, Avg(100)=-1.93, Avg(50)=10.64, Noise STD=0.2890
New best model saved with Avg(50)=10.64
Solved MountainCarContinuous-v0 in 710 episodes!
Starting Trial 1: alpha_theta=0.003968, alpha_w=0.000636, gamma=0.970099, entropy_coeff=0.002058, start_noise_std=0.292477, end_noise_std=0.458111, noise_decay=0.921022
Episode 1: Reward=-81.41, Avg(100)=-81.41, Avg(50)=-81.41, Noise STD=0.4581
Episode 2: Reward=-91.88, Avg(100)=-86.64, Avg(50)=-86.64, Noise STD=0.4581
Episode 3: Reward=-80.96, Avg(100)=-84.75, Avg(50)=-84.75, Noise STD=0.4581
Episode 4: Reward=-92.84, Avg(100)=-86.77, Avg(50)=-86.77, Noise STD=0.4581
Episode 5: Reward=-91.29, Avg(100)=-87.67, Avg(50)=-87.67, Noise STD=0.4581
Episode 6: Reward=70.93, Avg(100)=-61.24, Avg(50)=-61.24, Noise STD=0.4581
Episode 7: Reward=-93.54, Avg(100)=-65.85, Avg(50)=-65.85, Noise STD=0.4581
Episode 8: Reward=-91.77, Avg(100)=-69.09, Avg(50)=-69.09, Noise STD=0.4581
Episode 9: Reward=-93.49, Avg(100)=-71.80, Avg(50)=-71.80, N

[I 2025-01-07 12:58:49,435] Trial 1 finished with value: -inf and parameters: {'alpha_theta': 0.003967605077052989, 'alpha_w': 0.0006358358856676254, 'gamma': 0.9700991852018085, 'entropy_coeff': 0.0020584494295802446, 'start_noise_std': 0.2924774630404986, 'end_noise_std': 0.45811066020010544, 'noise_decay': 0.9210215719571494}. Best is trial 0 with value: 10.644144751563351.


Episode 1000: Reward=-99.90, Avg(100)=-99.90, Avg(50)=-99.90, Noise STD=0.4581
Number of finished trials:  2
Best trial:
  Value: 10.644144751563351
  Params: 
    alpha_theta: 0.0001329291894316216
    alpha_w: 0.0071144760093434225
    gamma: 0.9724674002393291
    entropy_coeff: 0.05986584841970366
    start_noise_std: 0.08900466011060913
    end_noise_std: 0.2889986300840507
    noise_decay: 0.9057502776046518
