In [12]:
import os
import time
import gymnasium
import numpy as np
import optuna
import torch
from assignment3.Section1.MountainCarContinuous.actor_critic_car_loop import actor_critic

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [30]:
def objective(trial):
    # Define hyperparameter space
    env_name = "MountainCarContinuous-v0"
    log_dir = "runs/actor_critic"

    hidden_sizes = trial.suggest_categorical('hidden_sizes', [
        (64,), (128,), (256,), (128, 64), (256, 128)
    ])
    alpha = trial.suggest_float('alpha', 1e-5, 1e-2, log=True)
    episodes = 1000
    gamma = trial.suggest_float('gamma', 0.8, 0.99)
    entropy_coeff = trial.suggest_float('entropy_coeff', 1e-5, 1e-1, log=True)
    start_noise_std = trial.suggest_float('start_noise_std', 0.01, 0.3)
    end_noise_std = trial.suggest_float('end_noise_std', 0.01, 0.3)
    noise_decay = trial.suggest_float('noise_decay', 0.99, 0.9999)

    # Log the hyperparameters being used in this trial
    print(f"Trial {trial.number}:")
    print(f"  hidden_sizes: {hidden_sizes}")
    print(f"  alpha: {alpha}")
    print(f"  gamma: {gamma}")
    print(f"  entropy_coeff: {entropy_coeff}")
    print(f"  start_noise_std: {start_noise_std}")
    print(f"  end_noise_std: {end_noise_std}")
    print(f"  noise_decay: {noise_decay}")

    # Train the agent with these hyperparameters
    policy_net, value_net, rewards, train_time, avg_reward_50 = actor_critic(
        env_name=env_name,
        input_dim=6,  # MountainCarContinuous state dimension
        output_dim=3,  # Single action dimension
        hidden_sizes_theta=hidden_sizes,
        hidden_sizes_w=hidden_sizes,
        alpha_theta=alpha,
        alpha_w=alpha,
        episodes=episodes,
        gamma=gamma,
        entropy_coeff=entropy_coeff,  # Entropy bonus coefficient
        start_noise_std=start_noise_std,  # Initial noise
        end_noise_std=end_noise_std,  # Final noise
        noise_decay=noise_decay,  # Decay rate for noise
        log_dir=f"{log_dir}/{env_name}_g{gamma}_at{alpha}"
    )

    # Log the result of this trial
    print(f"  Average Reward: {avg_reward_50}")
    print(f"  Training Time: {train_time:.2f} seconds\n")

    return avg_reward_50

In [31]:
# Create a study object and specify the direction of optimization
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())

# Optimize the objective function over a number of trials
study.optimize(objective, n_trials=1000, timeout=60 * 60 * 20)  # e.g., 100 trials or 1 hour

# Print the best hyperparameters
print("Best Hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")
print(f"Best Average Reward: {study.best_value}")

[I 2025-01-02 21:43:18,867] A new study created in memory with name: no-name-64039150-2dc9-4912-b259-018a3a8915cb


Trial 0:
  hidden_sizes: (128,)
  alpha: 0.0024352069422591835
  gamma: 0.9532889056602523
  entropy_coeff: 3.558978004690944e-05
  start_noise_std: 0.1571416650213371
  end_noise_std: 0.058061693879282984
  noise_decay: 0.9983021693682942
Episode 1: Reward=-48.78, Avg(100)=-48.78, Avg(50)=-48.78, Noise STD=0.1571
Episode 2: Reward=-91.30, Avg(100)=-70.04, Avg(50)=-70.04, Noise STD=0.1569
Episode 3: Reward=-92.05, Avg(100)=-77.38, Avg(50)=-77.38, Noise STD=0.1566
Episode 4: Reward=-92.68, Avg(100)=-81.20, Avg(50)=-81.20, Noise STD=0.1563
Episode 5: Reward=-93.70, Avg(100)=-83.70, Avg(50)=-83.70, Noise STD=0.1561
Episode 6: Reward=23.33, Avg(100)=-65.86, Avg(50)=-65.86, Noise STD=0.1558
Episode 7: Reward=-91.04, Avg(100)=-69.46, Avg(50)=-69.46, Noise STD=0.1555
Episode 8: Reward=11.03, Avg(100)=-59.40, Avg(50)=-59.40, Noise STD=0.1553
Episode 9: Reward=-92.14, Avg(100)=-63.04, Avg(50)=-63.04, Noise STD=0.1550
Episode 10: Reward=-93.62, Avg(100)=-66.09, Avg(50)=-66.09, Noise STD=0.1548
E

[I 2025-01-02 22:38:00,770] Trial 0 finished with value: -inf and parameters: {'hidden_sizes': (128,), 'alpha': 0.0024352069422591835, 'gamma': 0.9532889056602523, 'entropy_coeff': 3.558978004690944e-05, 'start_noise_std': 0.1571416650213371, 'end_noise_std': 0.058061693879282984, 'noise_decay': 0.9983021693682942}. Best is trial 0 with value: -inf.


Episode 1000: Reward=-99.90, Avg(100)=-99.90, Avg(50)=-99.90, Noise STD=0.0581
  Average Reward: -inf
  Training Time: 3281.89 seconds

Trial 1:
  hidden_sizes: (128, 64)
  alpha: 1.1944975561737275e-05
  gamma: 0.8021239727993724
  entropy_coeff: 1.3591923129287288e-05
  start_noise_std: 0.129751623354225
  end_noise_std: 0.2336624923587489
  noise_decay: 0.9957693003665727
Episode 1: Reward=-33.61, Avg(100)=-33.61, Avg(50)=-33.61, Noise STD=0.2337
Episode 2: Reward=-33.61, Avg(100)=-33.61, Avg(50)=-33.61, Noise STD=0.2337
Episode 3: Reward=-33.89, Avg(100)=-33.70, Avg(50)=-33.70, Noise STD=0.2337
Episode 4: Reward=-33.50, Avg(100)=-33.65, Avg(50)=-33.65, Noise STD=0.2337
Episode 5: Reward=-36.15, Avg(100)=-34.15, Avg(50)=-34.15, Noise STD=0.2337
Episode 6: Reward=-32.36, Avg(100)=-33.85, Avg(50)=-33.85, Noise STD=0.2337
Episode 7: Reward=-33.36, Avg(100)=-33.78, Avg(50)=-33.78, Noise STD=0.2337
Episode 8: Reward=-34.15, Avg(100)=-33.83, Avg(50)=-33.83, Noise STD=0.2337
Episode 9: Rew

[I 2025-01-02 23:33:13,625] Trial 1 finished with value: -inf and parameters: {'hidden_sizes': (128, 64), 'alpha': 1.1944975561737275e-05, 'gamma': 0.8021239727993724, 'entropy_coeff': 1.3591923129287288e-05, 'start_noise_std': 0.129751623354225, 'end_noise_std': 0.2336624923587489, 'noise_decay': 0.9957693003665727}. Best is trial 0 with value: -inf.


Episode 1000: Reward=-9.20, Avg(100)=-9.14, Avg(50)=-9.26, Noise STD=0.2337
  Average Reward: -inf
  Training Time: 3312.85 seconds

Trial 2:
  hidden_sizes: (256,)
  alpha: 0.007320901737702005
  gamma: 0.8138976232076853
  entropy_coeff: 0.003583076135549497
  start_noise_std: 0.03981431604994735
  end_noise_std: 0.014850635852252752
  noise_decay: 0.9979575044965058
Episode 1: Reward=-89.47, Avg(100)=-89.47, Avg(50)=-89.47, Noise STD=0.0398
Episode 2: Reward=-93.46, Avg(100)=-91.46, Avg(50)=-91.46, Noise STD=0.0397
Episode 3: Reward=-91.40, Avg(100)=-91.44, Avg(50)=-91.44, Noise STD=0.0397
Episode 4: Reward=66.98, Avg(100)=-51.84, Avg(50)=-51.84, Noise STD=0.0396
Episode 5: Reward=-93.00, Avg(100)=-60.07, Avg(50)=-60.07, Noise STD=0.0395
Episode 6: Reward=-92.83, Avg(100)=-65.53, Avg(50)=-65.53, Noise STD=0.0394
Episode 7: Reward=-93.86, Avg(100)=-69.58, Avg(50)=-69.58, Noise STD=0.0393
Episode 8: Reward=-93.25, Avg(100)=-72.54, Avg(50)=-72.54, Noise STD=0.0392
Episode 9: Reward=-94

[I 2025-01-03 00:23:05,563] Trial 2 finished with value: -inf and parameters: {'hidden_sizes': (256,), 'alpha': 0.007320901737702005, 'gamma': 0.8138976232076853, 'entropy_coeff': 0.003583076135549497, 'start_noise_std': 0.03981431604994735, 'end_noise_std': 0.014850635852252752, 'noise_decay': 0.9979575044965058}. Best is trial 0 with value: -inf.


Episode 1000: Reward=-99.90, Avg(100)=-98.28, Avg(50)=-99.90, Noise STD=0.0149
  Average Reward: -inf
  Training Time: 2991.92 seconds

Trial 3:
  hidden_sizes: (64,)
  alpha: 0.002113430829073409
  gamma: 0.9604965665749237
  entropy_coeff: 0.008386038633947686
  start_noise_std: 0.2123850570574805
  end_noise_std: 0.12108599158026409
  noise_decay: 0.9958447130437753
Episode 1: Reward=-80.55, Avg(100)=-80.55, Avg(50)=-80.55, Noise STD=0.2124
Episode 2: Reward=-92.86, Avg(100)=-86.70, Avg(50)=-86.70, Noise STD=0.2115
Episode 3: Reward=-92.46, Avg(100)=-88.62, Avg(50)=-88.62, Noise STD=0.2106
Episode 4: Reward=-93.25, Avg(100)=-89.78, Avg(50)=-89.78, Noise STD=0.2097
Episode 5: Reward=-93.40, Avg(100)=-90.50, Avg(50)=-90.50, Noise STD=0.2089
Episode 6: Reward=-93.87, Avg(100)=-91.06, Avg(50)=-91.06, Noise STD=0.2080
Episode 7: Reward=-92.67, Avg(100)=-91.29, Avg(50)=-91.29, Noise STD=0.2071
Episode 8: Reward=-92.74, Avg(100)=-91.47, Avg(50)=-91.47, Noise STD=0.2063
Episode 9: Reward=-9

[I 2025-01-03 01:10:08,671] Trial 3 finished with value: -inf and parameters: {'hidden_sizes': (64,), 'alpha': 0.002113430829073409, 'gamma': 0.9604965665749237, 'entropy_coeff': 0.008386038633947686, 'start_noise_std': 0.2123850570574805, 'end_noise_std': 0.12108599158026409, 'noise_decay': 0.9958447130437753}. Best is trial 0 with value: -inf.


Episode 1000: Reward=-99.90, Avg(100)=-37.54, Avg(50)=-50.00, Noise STD=0.1211
  Average Reward: -inf
  Training Time: 2823.10 seconds

Trial 4:
  hidden_sizes: (128,)
  alpha: 7.107012421632678e-05
  gamma: 0.9816335492601699
  entropy_coeff: 0.0004064677527575657
  start_noise_std: 0.2520440670630389
  end_noise_std: 0.0376524049986823
  noise_decay: 0.9920684350956036
Episode 1: Reward=-34.61, Avg(100)=-34.61, Avg(50)=-34.61, Noise STD=0.2520
Episode 2: Reward=-31.67, Avg(100)=-33.14, Avg(50)=-33.14, Noise STD=0.2500
Episode 3: Reward=-28.36, Avg(100)=-31.55, Avg(50)=-31.55, Noise STD=0.2481
Episode 4: Reward=-25.85, Avg(100)=-30.12, Avg(50)=-30.12, Noise STD=0.2461
Episode 5: Reward=-23.59, Avg(100)=-28.82, Avg(50)=-28.82, Noise STD=0.2441
Episode 6: Reward=-19.37, Avg(100)=-27.24, Avg(50)=-27.24, Noise STD=0.2422
Episode 7: Reward=-17.72, Avg(100)=-25.88, Avg(50)=-25.88, Noise STD=0.2403
Episode 8: Reward=-14.27, Avg(100)=-24.43, Avg(50)=-24.43, Noise STD=0.2384
Episode 9: Reward=

[I 2025-01-03 02:00:37,217] Trial 4 finished with value: -inf and parameters: {'hidden_sizes': (128,), 'alpha': 7.107012421632678e-05, 'gamma': 0.9816335492601699, 'entropy_coeff': 0.0004064677527575657, 'start_noise_std': 0.2520440670630389, 'end_noise_std': 0.0376524049986823, 'noise_decay': 0.9920684350956036}. Best is trial 0 with value: -inf.


Episode 1000: Reward=-5.56, Avg(100)=-5.41, Avg(50)=-5.37, Noise STD=0.0377
  Average Reward: -inf
  Training Time: 3028.54 seconds

Trial 5:
  hidden_sizes: (256, 128)
  alpha: 2.4380495944665872e-05
  gamma: 0.9349519382411126
  entropy_coeff: 0.0006953484941681884
  start_noise_std: 0.2267593114481869
  end_noise_std: 0.034786835112575326
  noise_decay: 0.9942467290017412
Episode 1: Reward=-35.68, Avg(100)=-35.68, Avg(50)=-35.68, Noise STD=0.2268
Episode 2: Reward=-43.68, Avg(100)=-39.68, Avg(50)=-39.68, Noise STD=0.2255
Episode 3: Reward=-54.60, Avg(100)=-44.66, Avg(50)=-44.66, Noise STD=0.2242
Episode 4: Reward=-62.68, Avg(100)=-49.16, Avg(50)=-49.16, Noise STD=0.2229
Episode 5: Reward=-72.78, Avg(100)=-53.89, Avg(50)=-53.89, Noise STD=0.2216
Episode 6: Reward=-83.01, Avg(100)=-58.74, Avg(50)=-58.74, Noise STD=0.2203
Episode 7: Reward=-89.82, Avg(100)=-63.18, Avg(50)=-63.18, Noise STD=0.2190
Episode 8: Reward=-91.69, Avg(100)=-66.74, Avg(50)=-66.74, Noise STD=0.2178
Episode 9: Rew

[I 2025-01-03 02:55:51,310] Trial 5 finished with value: -inf and parameters: {'hidden_sizes': (256, 128), 'alpha': 2.4380495944665872e-05, 'gamma': 0.9349519382411126, 'entropy_coeff': 0.0006953484941681884, 'start_noise_std': 0.2267593114481869, 'end_noise_std': 0.034786835112575326, 'noise_decay': 0.9942467290017412}. Best is trial 0 with value: -inf.


Episode 1000: Reward=-94.00, Avg(100)=-72.32, Avg(50)=-58.90, Noise STD=0.0348
  Average Reward: -inf
  Training Time: 3314.09 seconds

Trial 6:
  hidden_sizes: (256, 128)
  alpha: 0.0001443231142926625
  gamma: 0.8685004456538515
  entropy_coeff: 0.002254280496908549
  start_noise_std: 0.013756056383943659
  end_noise_std: 0.20524853411885216
  noise_decay: 0.998468206213549
Episode 1: Reward=-71.39, Avg(100)=-71.39, Avg(50)=-71.39, Noise STD=0.2052
Episode 2: Reward=-93.12, Avg(100)=-82.26, Avg(50)=-82.26, Noise STD=0.2052
Episode 3: Reward=-93.48, Avg(100)=-86.00, Avg(50)=-86.00, Noise STD=0.2052
Episode 4: Reward=-92.48, Avg(100)=-87.62, Avg(50)=-87.62, Noise STD=0.2052
Episode 5: Reward=60.09, Avg(100)=-58.08, Avg(50)=-58.08, Noise STD=0.2052
Episode 6: Reward=-92.15, Avg(100)=-63.76, Avg(50)=-63.76, Noise STD=0.2052
Episode 7: Reward=-92.93, Avg(100)=-67.92, Avg(50)=-67.92, Noise STD=0.2052
Episode 8: Reward=-93.50, Avg(100)=-71.12, Avg(50)=-71.12, Noise STD=0.2052
Episode 9: Rew

[I 2025-01-03 03:51:26,259] Trial 6 finished with value: -inf and parameters: {'hidden_sizes': (256, 128), 'alpha': 0.0001443231142926625, 'gamma': 0.8685004456538515, 'entropy_coeff': 0.002254280496908549, 'start_noise_std': 0.013756056383943659, 'end_noise_std': 0.20524853411885216, 'noise_decay': 0.998468206213549}. Best is trial 0 with value: -inf.


Episode 1000: Reward=-93.67, Avg(100)=-78.32, Avg(50)=-80.96, Noise STD=0.2052
  Average Reward: -inf
  Training Time: 3334.94 seconds

Trial 7:
  hidden_sizes: (64,)
  alpha: 5.009258914482148e-05
  gamma: 0.9082243340031033
  entropy_coeff: 2.9551891289081892e-05
  start_noise_std: 0.17221426328630066
  end_noise_std: 0.054106024133763
  noise_decay: 0.996859548301482
Episode 1: Reward=-31.06, Avg(100)=-31.06, Avg(50)=-31.06, Noise STD=0.1722
Episode 2: Reward=-31.76, Avg(100)=-31.41, Avg(50)=-31.41, Noise STD=0.1717
Episode 3: Reward=-31.19, Avg(100)=-31.34, Avg(50)=-31.34, Noise STD=0.1711
Episode 4: Reward=-27.72, Avg(100)=-30.43, Avg(50)=-30.43, Noise STD=0.1706
Episode 5: Reward=-28.21, Avg(100)=-29.99, Avg(50)=-29.99, Noise STD=0.1701
Episode 6: Reward=-27.89, Avg(100)=-29.64, Avg(50)=-29.64, Noise STD=0.1695
Episode 7: Reward=-24.01, Avg(100)=-28.83, Avg(50)=-28.83, Noise STD=0.1690
Episode 8: Reward=-23.65, Avg(100)=-28.19, Avg(50)=-28.19, Noise STD=0.1685
Episode 9: Reward=-

[I 2025-01-03 04:42:02,555] Trial 7 finished with value: -inf and parameters: {'hidden_sizes': (64,), 'alpha': 5.009258914482148e-05, 'gamma': 0.9082243340031033, 'entropy_coeff': 2.9551891289081892e-05, 'start_noise_std': 0.17221426328630066, 'end_noise_std': 0.054106024133763, 'noise_decay': 0.996859548301482}. Best is trial 0 with value: -inf.


Episode 1000: Reward=-2.33, Avg(100)=-2.39, Avg(50)=-2.25, Noise STD=0.0541
  Average Reward: -inf
  Training Time: 3036.29 seconds

Trial 8:
  hidden_sizes: (128,)
  alpha: 4.937344578102489e-05
  gamma: 0.8926301413219918
  entropy_coeff: 1.815461093033405e-05
  start_noise_std: 0.19403293441801406
  end_noise_std: 0.174361331197508
  noise_decay: 0.9915460818293088
Episode 1: Reward=-33.03, Avg(100)=-33.03, Avg(50)=-33.03, Noise STD=0.1940
Episode 2: Reward=-33.22, Avg(100)=-33.13, Avg(50)=-33.13, Noise STD=0.1924
Episode 3: Reward=-31.30, Avg(100)=-32.52, Avg(50)=-32.52, Noise STD=0.1908
Episode 4: Reward=-30.07, Avg(100)=-31.90, Avg(50)=-31.90, Noise STD=0.1892
Episode 5: Reward=-30.50, Avg(100)=-31.62, Avg(50)=-31.62, Noise STD=0.1876
Episode 6: Reward=-29.50, Avg(100)=-31.27, Avg(50)=-31.27, Noise STD=0.1860
Episode 7: Reward=-26.19, Avg(100)=-30.54, Avg(50)=-30.54, Noise STD=0.1844
Episode 8: Reward=-26.94, Avg(100)=-30.09, Avg(50)=-30.09, Noise STD=0.1828
Episode 9: Reward=-26

[I 2025-01-03 05:32:38,180] Trial 8 finished with value: -inf and parameters: {'hidden_sizes': (128,), 'alpha': 4.937344578102489e-05, 'gamma': 0.8926301413219918, 'entropy_coeff': 1.815461093033405e-05, 'start_noise_std': 0.19403293441801406, 'end_noise_std': 0.174361331197508, 'noise_decay': 0.9915460818293088}. Best is trial 0 with value: -inf.


Episode 1000: Reward=-5.48, Avg(100)=-5.34, Avg(50)=-5.39, Noise STD=0.1744
  Average Reward: -inf
  Training Time: 3035.62 seconds

Trial 9:
  hidden_sizes: (256,)
  alpha: 0.000432860862913505
  gamma: 0.9571389501004702
  entropy_coeff: 0.0075942800574955
  start_noise_std: 0.12655319079849925
  end_noise_std: 0.2507795275354682
  noise_decay: 0.9952850823459259
Episode 1: Reward=-76.32, Avg(100)=-76.32, Avg(50)=-76.32, Noise STD=0.2508
Episode 2: Reward=-93.31, Avg(100)=-84.81, Avg(50)=-84.81, Noise STD=0.2508
Episode 3: Reward=20.72, Avg(100)=-49.63, Avg(50)=-49.63, Noise STD=0.2508
Episode 4: Reward=-91.35, Avg(100)=-60.06, Avg(50)=-60.06, Noise STD=0.2508
Episode 5: Reward=-91.88, Avg(100)=-66.43, Avg(50)=-66.43, Noise STD=0.2508
Episode 6: Reward=-93.06, Avg(100)=-70.87, Avg(50)=-70.87, Noise STD=0.2508
Episode 7: Reward=33.80, Avg(100)=-55.91, Avg(50)=-55.91, Noise STD=0.2508
Episode 8: Reward=-92.75, Avg(100)=-60.52, Avg(50)=-60.52, Noise STD=0.2508
Episode 9: Reward=-91.78, 

[I 2025-01-03 06:11:01,456] Trial 9 finished with value: 12.037644487601224 and parameters: {'hidden_sizes': (256,), 'alpha': 0.000432860862913505, 'gamma': 0.9571389501004702, 'entropy_coeff': 0.0075942800574955, 'start_noise_std': 0.12655319079849925, 'end_noise_std': 0.2507795275354682, 'noise_decay': 0.9952850823459259}. Best is trial 9 with value: 12.037644487601224.


Episode 879: Reward=47.18, Avg(100)=-26.44, Avg(50)=12.04, Noise STD=0.2508
New best model saved with Avg(50)=12.04
Solved MountainCarContinuous-v0 in 879 episodes!
  Average Reward: 12.037644487601224
  Training Time: 2303.27 seconds

Trial 10:
  hidden_sizes: (256,)
  alpha: 0.0005139448133815112
  gamma: 0.857464133939732
  entropy_coeff: 0.084081149403612
  start_noise_std: 0.09104791373866311
  end_noise_std: 0.2862164262336
  noise_decay: 0.9937713407109787
Episode 1: Reward=-90.86, Avg(100)=-90.86, Avg(50)=-90.86, Noise STD=0.2862
Episode 2: Reward=-91.69, Avg(100)=-91.27, Avg(50)=-91.27, Noise STD=0.2862
Episode 3: Reward=-93.72, Avg(100)=-92.09, Avg(50)=-92.09, Noise STD=0.2862
Episode 4: Reward=-93.04, Avg(100)=-92.33, Avg(50)=-92.33, Noise STD=0.2862
Episode 5: Reward=-93.35, Avg(100)=-92.53, Avg(50)=-92.53, Noise STD=0.2862
Episode 6: Reward=-92.97, Avg(100)=-92.60, Avg(50)=-92.60, Noise STD=0.2862
Episode 7: Reward=-90.72, Avg(100)=-92.34, Avg(50)=-92.34, Noise STD=0.2862


[I 2025-01-03 06:59:45,646] Trial 10 finished with value: -inf and parameters: {'hidden_sizes': (256,), 'alpha': 0.0005139448133815112, 'gamma': 0.857464133939732, 'entropy_coeff': 0.084081149403612, 'start_noise_std': 0.09104791373866311, 'end_noise_std': 0.2862164262336, 'noise_decay': 0.9937713407109787}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-92.39, Avg(100)=-69.81, Avg(50)=-68.58, Noise STD=0.2862
  Average Reward: -inf
  Training Time: 2924.17 seconds

Trial 11:
  hidden_sizes: (128,)
  alpha: 0.0007534058207437627
  gamma: 0.9485194011507966
  entropy_coeff: 0.00014151182412808685
  start_noise_std: 0.11896376379995859
  end_noise_std: 0.1215983225484151
  noise_decay: 0.9996028594406455
Episode 1: Reward=-29.13, Avg(100)=-29.13, Avg(50)=-29.13, Noise STD=0.1216
Episode 2: Reward=-20.72, Avg(100)=-24.92, Avg(50)=-24.92, Noise STD=0.1216
Episode 3: Reward=-12.40, Avg(100)=-20.75, Avg(50)=-20.75, Noise STD=0.1216
Episode 4: Reward=-15.28, Avg(100)=-19.38, Avg(50)=-19.38, Noise STD=0.1216
Episode 5: Reward=-6.36, Avg(100)=-16.78, Avg(50)=-16.78, Noise STD=0.1216
Episode 6: Reward=-6.86, Avg(100)=-15.13, Avg(50)=-15.13, Noise STD=0.1216
Episode 7: Reward=-9.90, Avg(100)=-14.38, Avg(50)=-14.38, Noise STD=0.1216
Episode 8: Reward=-9.02, Avg(100)=-13.71, Avg(50)=-13.71, Noise STD=0.1216
Episode 9: Reward=-

[I 2025-01-03 07:48:23,757] Trial 11 finished with value: -inf and parameters: {'hidden_sizes': (128,), 'alpha': 0.0007534058207437627, 'gamma': 0.9485194011507966, 'entropy_coeff': 0.00014151182412808685, 'start_noise_std': 0.11896376379995859, 'end_noise_std': 0.1215983225484151, 'noise_decay': 0.9996028594406455}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=33.63, Avg(100)=-66.42, Avg(50)=-63.58, Noise STD=0.1216
  Average Reward: -inf
  Training Time: 2918.10 seconds

Trial 12:
  hidden_sizes: (256,)
  alpha: 0.00207994918610889
  gamma: 0.982668469466086
  entropy_coeff: 0.027950360051149578
  start_noise_std: 0.08261208878420376
  end_noise_std: 0.28773508754592886
  noise_decay: 0.9998047933343307
Episode 1: Reward=-88.73, Avg(100)=-88.73, Avg(50)=-88.73, Noise STD=0.2877
Episode 2: Reward=-93.13, Avg(100)=-90.93, Avg(50)=-90.93, Noise STD=0.2877
Episode 3: Reward=-93.09, Avg(100)=-91.65, Avg(50)=-91.65, Noise STD=0.2877
Episode 4: Reward=-91.58, Avg(100)=-91.63, Avg(50)=-91.63, Noise STD=0.2877
Episode 5: Reward=-93.68, Avg(100)=-92.04, Avg(50)=-92.04, Noise STD=0.2877
Episode 6: Reward=-94.15, Avg(100)=-92.39, Avg(50)=-92.39, Noise STD=0.2877
Episode 7: Reward=34.00, Avg(100)=-74.34, Avg(50)=-74.34, Noise STD=0.2877
Episode 8: Reward=-96.40, Avg(100)=-77.09, Avg(50)=-77.09, Noise STD=0.2877
Episode 9: Reward=-95

[I 2025-01-03 08:36:10,798] Trial 12 finished with value: -inf and parameters: {'hidden_sizes': (256,), 'alpha': 0.00207994918610889, 'gamma': 0.982668469466086, 'entropy_coeff': 0.027950360051149578, 'start_noise_std': 0.08261208878420376, 'end_noise_std': 0.28773508754592886, 'noise_decay': 0.9998047933343307}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-99.17, Avg(100)=-37.20, Avg(50)=-51.97, Noise STD=0.2877
  Average Reward: -inf
  Training Time: 2867.01 seconds

Trial 13:
  hidden_sizes: (128, 64)
  alpha: 0.008718277148013329
  gamma: 0.9269852234276861
  entropy_coeff: 0.00012977143128575176
  start_noise_std: 0.14299804055992182
  end_noise_std: 0.09876084436356838
  noise_decay: 0.9933801759100406
Episode 1: Reward=-72.27, Avg(100)=-72.27, Avg(50)=-72.27, Noise STD=0.1430
Episode 2: Reward=-92.32, Avg(100)=-82.29, Avg(50)=-82.29, Noise STD=0.1421
Episode 3: Reward=-93.36, Avg(100)=-85.98, Avg(50)=-85.98, Noise STD=0.1411
Episode 4: Reward=-98.28, Avg(100)=-89.06, Avg(50)=-89.06, Noise STD=0.1402
Episode 5: Reward=-98.30, Avg(100)=-90.91, Avg(50)=-90.91, Noise STD=0.1392
Episode 6: Reward=-99.90, Avg(100)=-92.40, Avg(50)=-92.40, Noise STD=0.1383
Episode 7: Reward=-99.75, Avg(100)=-93.45, Avg(50)=-93.45, Noise STD=0.1374
Episode 8: Reward=-99.90, Avg(100)=-94.26, Avg(50)=-94.26, Noise STD=0.1365
Episode 9: R

[I 2025-01-03 09:30:44,073] Trial 13 finished with value: -inf and parameters: {'hidden_sizes': (128, 64), 'alpha': 0.008718277148013329, 'gamma': 0.9269852234276861, 'entropy_coeff': 0.00012977143128575176, 'start_noise_std': 0.14299804055992182, 'end_noise_std': 0.09876084436356838, 'noise_decay': 0.9933801759100406}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-99.90, Avg(100)=-99.90, Avg(50)=-99.90, Noise STD=0.0988
  Average Reward: -inf
  Training Time: 3273.26 seconds

Trial 14:
  hidden_sizes: (256,)
  alpha: 0.00162671655800421
  gamma: 0.9545450413604298
  entropy_coeff: 0.013449178155056647
  start_noise_std: 0.2841284640174996
  end_noise_std: 0.23893494392799575
  noise_decay: 0.990022701834733
Episode 1: Reward=-87.01, Avg(100)=-87.01, Avg(50)=-87.01, Noise STD=0.2841
Episode 2: Reward=-92.54, Avg(100)=-89.77, Avg(50)=-89.77, Noise STD=0.2813
Episode 3: Reward=43.31, Avg(100)=-45.41, Avg(50)=-45.41, Noise STD=0.2785
Episode 4: Reward=-93.24, Avg(100)=-57.37, Avg(50)=-57.37, Noise STD=0.2757
Episode 5: Reward=-92.48, Avg(100)=-64.39, Avg(50)=-64.39, Noise STD=0.2730
Episode 6: Reward=-90.99, Avg(100)=-68.82, Avg(50)=-68.82, Noise STD=0.2702
Episode 7: Reward=-94.08, Avg(100)=-72.43, Avg(50)=-72.43, Noise STD=0.2675
Episode 8: Reward=-92.14, Avg(100)=-74.90, Avg(50)=-74.90, Noise STD=0.2649
Episode 9: Reward=-92

[I 2025-01-03 10:20:16,521] Trial 14 finished with value: -inf and parameters: {'hidden_sizes': (256,), 'alpha': 0.00162671655800421, 'gamma': 0.9545450413604298, 'entropy_coeff': 0.013449178155056647, 'start_noise_std': 0.2841284640174996, 'end_noise_std': 0.23893494392799575, 'noise_decay': 0.990022701834733}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-99.90, Avg(100)=-99.90, Avg(50)=-99.90, Noise STD=0.2389
  Average Reward: -inf
  Training Time: 2972.43 seconds

Trial 15:
  hidden_sizes: (128,)
  alpha: 0.000231095162885529
  gamma: 0.9048987908934608
  entropy_coeff: 8.474935034569812e-05
  start_noise_std: 0.16859319065830763
  end_noise_std: 0.08751871727709001
  noise_decay: 0.9973671653953832
Episode 1: Reward=-33.53, Avg(100)=-33.53, Avg(50)=-33.53, Noise STD=0.1686
Episode 2: Reward=-45.02, Avg(100)=-39.27, Avg(50)=-39.27, Noise STD=0.1681
Episode 3: Reward=-70.32, Avg(100)=-49.62, Avg(50)=-49.62, Noise STD=0.1677
Episode 4: Reward=-90.66, Avg(100)=-59.88, Avg(50)=-59.88, Noise STD=0.1673
Episode 5: Reward=-88.05, Avg(100)=-65.52, Avg(50)=-65.52, Noise STD=0.1668
Episode 6: Reward=-90.72, Avg(100)=-69.72, Avg(50)=-69.72, Noise STD=0.1664
Episode 7: Reward=-92.53, Avg(100)=-72.98, Avg(50)=-72.98, Noise STD=0.1659
Episode 8: Reward=51.06, Avg(100)=-57.47, Avg(50)=-57.47, Noise STD=0.1655
Episode 9: Reward

[I 2025-01-03 11:09:41,319] Trial 15 finished with value: -inf and parameters: {'hidden_sizes': (128,), 'alpha': 0.000231095162885529, 'gamma': 0.9048987908934608, 'entropy_coeff': 8.474935034569812e-05, 'start_noise_std': 0.16859319065830763, 'end_noise_std': 0.08751871727709001, 'noise_decay': 0.9973671653953832}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-92.67, Avg(100)=-68.23, Avg(50)=-80.63, Noise STD=0.0875
  Average Reward: -inf
  Training Time: 2964.78 seconds

Trial 16:
  hidden_sizes: (256,)
  alpha: 0.0006709345994469306
  gamma: 0.9691721126467672
  entropy_coeff: 0.0029311281390429268
  start_noise_std: 0.07495898226958912
  end_noise_std: 0.15581780987492383
  noise_decay: 0.9955403647578134
Episode 1: Reward=-65.47, Avg(100)=-65.47, Avg(50)=-65.47, Noise STD=0.1558
Episode 2: Reward=-91.95, Avg(100)=-78.71, Avg(50)=-78.71, Noise STD=0.1558
Episode 3: Reward=-93.39, Avg(100)=-83.60, Avg(50)=-83.60, Noise STD=0.1558
Episode 4: Reward=-93.89, Avg(100)=-86.17, Avg(50)=-86.17, Noise STD=0.1558
Episode 5: Reward=-92.90, Avg(100)=-87.52, Avg(50)=-87.52, Noise STD=0.1558
Episode 6: Reward=-93.58, Avg(100)=-88.53, Avg(50)=-88.53, Noise STD=0.1558
Episode 7: Reward=-92.05, Avg(100)=-89.03, Avg(50)=-89.03, Noise STD=0.1558
Episode 8: Reward=-93.07, Avg(100)=-89.54, Avg(50)=-89.54, Noise STD=0.1558
Episode 9: Rewa

[I 2025-01-03 12:00:56,145] Trial 16 finished with value: -inf and parameters: {'hidden_sizes': (256,), 'alpha': 0.0006709345994469306, 'gamma': 0.9691721126467672, 'entropy_coeff': 0.0029311281390429268, 'start_noise_std': 0.07495898226958912, 'end_noise_std': 0.15581780987492383, 'noise_decay': 0.9955403647578134}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-99.90, Avg(100)=-99.90, Avg(50)=-99.90, Noise STD=0.1558
  Average Reward: -inf
  Training Time: 3074.81 seconds

Trial 17:
  hidden_sizes: (128,)
  alpha: 0.004242484896325117
  gamma: 0.9294700436935842
  entropy_coeff: 0.00032493199731181677
  start_noise_std: 0.10808384503363028
  end_noise_std: 0.1936384362364945
  noise_decay: 0.9966923414108511
Episode 1: Reward=-74.56, Avg(100)=-74.56, Avg(50)=-74.56, Noise STD=0.1936
Episode 2: Reward=-91.95, Avg(100)=-83.25, Avg(50)=-83.25, Noise STD=0.1936
Episode 3: Reward=-91.20, Avg(100)=-85.90, Avg(50)=-85.90, Noise STD=0.1936
Episode 4: Reward=-93.32, Avg(100)=-87.76, Avg(50)=-87.76, Noise STD=0.1936
Episode 5: Reward=-91.50, Avg(100)=-88.50, Avg(50)=-88.50, Noise STD=0.1936
Episode 6: Reward=-92.94, Avg(100)=-89.24, Avg(50)=-89.24, Noise STD=0.1936
Episode 7: Reward=-93.97, Avg(100)=-89.92, Avg(50)=-89.92, Noise STD=0.1936
Episode 8: Reward=-92.57, Avg(100)=-90.25, Avg(50)=-90.25, Noise STD=0.1936
Episode 9: Rewar

[I 2025-01-03 12:42:04,295] Trial 17 finished with value: -inf and parameters: {'hidden_sizes': (128,), 'alpha': 0.004242484896325117, 'gamma': 0.9294700436935842, 'entropy_coeff': 0.00032493199731181677, 'start_noise_std': 0.10808384503363028, 'end_noise_std': 0.1936384362364945, 'noise_decay': 0.9966923414108511}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=49.06, Avg(100)=-77.24, Avg(50)=-87.63, Noise STD=0.1936
  Average Reward: -inf
  Training Time: 2468.13 seconds

Trial 18:
  hidden_sizes: (128, 64)
  alpha: 0.0002997004089256374
  gamma: 0.987958246273201
  entropy_coeff: 0.0014000118065187425
  start_noise_std: 0.0596749079281258
  end_noise_std: 0.24956522928957411
  noise_decay: 0.998685310907457
Episode 1: Reward=-46.61, Avg(100)=-46.61, Avg(50)=-46.61, Noise STD=0.2496
Episode 2: Reward=-93.70, Avg(100)=-70.16, Avg(50)=-70.16, Noise STD=0.2496
Episode 3: Reward=-92.80, Avg(100)=-77.70, Avg(50)=-77.70, Noise STD=0.2496
Episode 4: Reward=-94.40, Avg(100)=-81.88, Avg(50)=-81.88, Noise STD=0.2496
Episode 5: Reward=-94.50, Avg(100)=-84.40, Avg(50)=-84.40, Noise STD=0.2496
Episode 6: Reward=-95.31, Avg(100)=-86.22, Avg(50)=-86.22, Noise STD=0.2496
Episode 7: Reward=51.64, Avg(100)=-66.53, Avg(50)=-66.53, Noise STD=0.2496
Episode 8: Reward=-94.19, Avg(100)=-69.98, Avg(50)=-69.98, Noise STD=0.2496
Episode 9: Reward

[I 2025-01-03 13:26:39,497] Trial 18 finished with value: 3.6964725151143396 and parameters: {'hidden_sizes': (128, 64), 'alpha': 0.0002997004089256374, 'gamma': 0.987958246273201, 'entropy_coeff': 0.0014000118065187425, 'start_noise_std': 0.0596749079281258, 'end_noise_std': 0.24956522928957411, 'noise_decay': 0.998685310907457}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=75.60, Avg(100)=-51.25, Avg(50)=-57.31, Noise STD=0.2496
  Average Reward: 3.6964725151143396
  Training Time: 2675.19 seconds

Trial 19:
  hidden_sizes: (128, 64)
  alpha: 0.00011636289354888924
  gamma: 0.9882933878021413
  entropy_coeff: 0.0016423138991324762
  start_noise_std: 0.04719415133864561
  end_noise_std: 0.25309273206735466
  noise_decay: 0.9927429861692667
Episode 1: Reward=-36.86, Avg(100)=-36.86, Avg(50)=-36.86, Noise STD=0.2531
Episode 2: Reward=-62.75, Avg(100)=-49.81, Avg(50)=-49.81, Noise STD=0.2531
Episode 3: Reward=-92.57, Avg(100)=-64.06, Avg(50)=-64.06, Noise STD=0.2531
Episode 4: Reward=-91.75, Avg(100)=-70.98, Avg(50)=-70.98, Noise STD=0.2531
Episode 5: Reward=-93.29, Avg(100)=-75.44, Avg(50)=-75.44, Noise STD=0.2531
Episode 6: Reward=19.71, Avg(100)=-59.58, Avg(50)=-59.58, Noise STD=0.2531
Episode 7: Reward=58.86, Avg(100)=-42.66, Avg(50)=-42.66, Noise STD=0.2531
Episode 8: Reward=41.94, Avg(100)=-32.09, Avg(50)=-32.09, Noise STD=0.2531
E

[I 2025-01-03 14:12:15,816] Trial 19 finished with value: -inf and parameters: {'hidden_sizes': (128, 64), 'alpha': 0.00011636289354888924, 'gamma': 0.9882933878021413, 'entropy_coeff': 0.0016423138991324762, 'start_noise_std': 0.04719415133864561, 'end_noise_std': 0.25309273206735466, 'noise_decay': 0.9927429861692667}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-99.75, Avg(100)=-81.50, Avg(50)=-86.63, Noise STD=0.2531
  Average Reward: -inf
  Training Time: 2736.30 seconds

Trial 20:
  hidden_sizes: (128, 64)
  alpha: 0.0002690325894797761
  gamma: 0.8738025731104107
  entropy_coeff: 0.006768465513579489
  start_noise_std: 0.05654852452075361
  end_noise_std: 0.26220184545263536
  noise_decay: 0.9946992586239005
Episode 1: Reward=-75.77, Avg(100)=-75.77, Avg(50)=-75.77, Noise STD=0.2622
Episode 2: Reward=-92.38, Avg(100)=-84.07, Avg(50)=-84.07, Noise STD=0.2622
Episode 3: Reward=23.84, Avg(100)=-48.10, Avg(50)=-48.10, Noise STD=0.2622
Episode 4: Reward=-91.47, Avg(100)=-58.94, Avg(50)=-58.94, Noise STD=0.2622
Episode 5: Reward=29.54, Avg(100)=-41.25, Avg(50)=-41.25, Noise STD=0.2622
Episode 6: Reward=-92.64, Avg(100)=-49.81, Avg(50)=-49.81, Noise STD=0.2622
Episode 7: Reward=-92.43, Avg(100)=-55.90, Avg(50)=-55.90, Noise STD=0.2622
Episode 8: Reward=-92.53, Avg(100)=-60.48, Avg(50)=-60.48, Noise STD=0.2622
Episode 9: Rewa

[I 2025-01-03 14:58:21,487] Trial 20 finished with value: -inf and parameters: {'hidden_sizes': (128, 64), 'alpha': 0.0002690325894797761, 'gamma': 0.8738025731104107, 'entropy_coeff': 0.006768465513579489, 'start_noise_std': 0.05654852452075361, 'end_noise_std': 0.26220184545263536, 'noise_decay': 0.9946992586239005}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-92.94, Avg(100)=-88.65, Avg(50)=-87.54, Noise STD=0.2622
  Average Reward: -inf
  Training Time: 2765.66 seconds

Trial 21:
  hidden_sizes: (128, 64)
  alpha: 0.0011592444108067966
  gamma: 0.9661686352418685
  entropy_coeff: 0.0008766210934038676
  start_noise_std: 0.15138423775272108
  end_noise_std: 0.21217393684219565
  noise_decay: 0.9985326627948221
Episode 1: Reward=-59.07, Avg(100)=-59.07, Avg(50)=-59.07, Noise STD=0.2122
Episode 2: Reward=26.97, Avg(100)=-16.05, Avg(50)=-16.05, Noise STD=0.2122
Episode 3: Reward=-94.76, Avg(100)=-42.29, Avg(50)=-42.29, Noise STD=0.2122
Episode 4: Reward=-93.22, Avg(100)=-55.02, Avg(50)=-55.02, Noise STD=0.2122
Episode 5: Reward=36.55, Avg(100)=-36.71, Avg(50)=-36.71, Noise STD=0.2122
Episode 6: Reward=-92.75, Avg(100)=-46.05, Avg(50)=-46.05, Noise STD=0.2122
Episode 7: Reward=-81.47, Avg(100)=-51.11, Avg(50)=-51.11, Noise STD=0.2122
Episode 8: Reward=16.55, Avg(100)=-42.65, Avg(50)=-42.65, Noise STD=0.2122
Episode 9: Rewa

[I 2025-01-03 15:46:25,813] Trial 21 finished with value: -inf and parameters: {'hidden_sizes': (128, 64), 'alpha': 0.0011592444108067966, 'gamma': 0.9661686352418685, 'entropy_coeff': 0.0008766210934038676, 'start_noise_std': 0.15138423775272108, 'end_noise_std': 0.21217393684219565, 'noise_decay': 0.9985326627948221}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-99.90, Avg(100)=-99.89, Avg(50)=-99.88, Noise STD=0.2122
  Average Reward: -inf
  Training Time: 2884.31 seconds

Trial 22:
  hidden_sizes: (256, 128)
  alpha: 0.0003955522894782996
  gamma: 0.9440159258951808
  entropy_coeff: 0.022519759152679345
  start_noise_std: 0.014468132675120066
  end_noise_std: 0.29885371656285825
  noise_decay: 0.9989256411775383
Episode 1: Reward=-88.02, Avg(100)=-88.02, Avg(50)=-88.02, Noise STD=0.2989
Episode 2: Reward=-92.83, Avg(100)=-90.43, Avg(50)=-90.43, Noise STD=0.2989
Episode 3: Reward=-93.63, Avg(100)=-91.50, Avg(50)=-91.50, Noise STD=0.2989
Episode 4: Reward=-92.89, Avg(100)=-91.84, Avg(50)=-91.84, Noise STD=0.2989
Episode 5: Reward=-91.13, Avg(100)=-91.70, Avg(50)=-91.70, Noise STD=0.2989
Episode 6: Reward=44.71, Avg(100)=-68.97, Avg(50)=-68.97, Noise STD=0.2989
Episode 7: Reward=-92.93, Avg(100)=-72.39, Avg(50)=-72.39, Noise STD=0.2989
Episode 8: Reward=-94.55, Avg(100)=-75.16, Avg(50)=-75.16, Noise STD=0.2989
Episode 9: R

[I 2025-01-03 16:35:17,431] Trial 22 finished with value: -inf and parameters: {'hidden_sizes': (256, 128), 'alpha': 0.0003955522894782996, 'gamma': 0.9440159258951808, 'entropy_coeff': 0.022519759152679345, 'start_noise_std': 0.014468132675120066, 'end_noise_std': 0.29885371656285825, 'noise_decay': 0.9989256411775383}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=59.60, Avg(100)=-84.37, Avg(50)=-86.24, Noise STD=0.2989
  Average Reward: -inf
  Training Time: 2931.60 seconds

Trial 23:
  hidden_sizes: (64,)
  alpha: 0.0036847767914245048
  gamma: 0.9714347497598761
  entropy_coeff: 5.610918332393168e-05
  start_noise_std: 0.1126742870573873
  end_noise_std: 0.16795132309230068
  noise_decay: 0.9978317466924482
Episode 1: Reward=-60.49, Avg(100)=-60.49, Avg(50)=-60.49, Noise STD=0.1680
Episode 2: Reward=35.58, Avg(100)=-12.46, Avg(50)=-12.46, Noise STD=0.1680
Episode 3: Reward=-94.48, Avg(100)=-39.80, Avg(50)=-39.80, Noise STD=0.1680
Episode 4: Reward=-92.84, Avg(100)=-53.06, Avg(50)=-53.06, Noise STD=0.1680
Episode 5: Reward=-93.21, Avg(100)=-61.09, Avg(50)=-61.09, Noise STD=0.1680
Episode 6: Reward=23.36, Avg(100)=-47.01, Avg(50)=-47.01, Noise STD=0.1680
Episode 7: Reward=-94.35, Avg(100)=-53.78, Avg(50)=-53.78, Noise STD=0.1680
Episode 8: Reward=55.22, Avg(100)=-40.15, Avg(50)=-40.15, Noise STD=0.1680
Episode 9: Reward=-94

[I 2025-01-03 17:20:47,570] Trial 23 finished with value: -inf and parameters: {'hidden_sizes': (64,), 'alpha': 0.0036847767914245048, 'gamma': 0.9714347497598761, 'entropy_coeff': 5.610918332393168e-05, 'start_noise_std': 0.1126742870573873, 'end_noise_std': 0.16795132309230068, 'noise_decay': 0.9978317466924482}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-99.90, Avg(100)=-99.90, Avg(50)=-99.90, Noise STD=0.1680
  Average Reward: -inf
  Training Time: 2730.12 seconds

Trial 24:
  hidden_sizes: (128, 64)
  alpha: 0.00013871319874410632
  gamma: 0.9191319288558385
  entropy_coeff: 0.0002692435286833037
  start_noise_std: 0.18077712135304377
  end_noise_std: 0.1316735914314744
  noise_decay: 0.9966354684861523
Episode 1: Reward=-49.39, Avg(100)=-49.39, Avg(50)=-49.39, Noise STD=0.1808
Episode 2: Reward=62.90, Avg(100)=6.75, Avg(50)=6.75, Noise STD=0.1802
Episode 3: Reward=-91.04, Avg(100)=-25.84, Avg(50)=-25.84, Noise STD=0.1796
Episode 4: Reward=-92.51, Avg(100)=-42.51, Avg(50)=-42.51, Noise STD=0.1790
Episode 5: Reward=-92.45, Avg(100)=-52.50, Avg(50)=-52.50, Noise STD=0.1784
Episode 6: Reward=-93.46, Avg(100)=-59.33, Avg(50)=-59.33, Noise STD=0.1778
Episode 7: Reward=-93.06, Avg(100)=-64.14, Avg(50)=-64.14, Noise STD=0.1772
Episode 8: Reward=-93.00, Avg(100)=-67.75, Avg(50)=-67.75, Noise STD=0.1766
Episode 9: Reward

[I 2025-01-03 18:11:34,420] Trial 24 finished with value: 6.754542688874871 and parameters: {'hidden_sizes': (128, 64), 'alpha': 0.00013871319874410632, 'gamma': 0.9191319288558385, 'entropy_coeff': 0.0002692435286833037, 'start_noise_std': 0.18077712135304377, 'end_noise_std': 0.1316735914314744, 'noise_decay': 0.9966354684861523}. Best is trial 9 with value: 12.037644487601224.


Episode 1000: Reward=-92.54, Avg(100)=-58.80, Avg(50)=-62.28, Noise STD=0.1317
  Average Reward: 6.754542688874871
  Training Time: 3046.84 seconds

Best Hyperparameters:
  hidden_sizes: (256,)
  alpha: 0.000432860862913505
  gamma: 0.9571389501004702
  entropy_coeff: 0.0075942800574955
  start_noise_std: 0.12655319079849925
  end_noise_std: 0.2507795275354682
  noise_decay: 0.9952850823459259
Best Average Reward: 12.037644487601224
