In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import pickle
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,mean_absolute_percentage_error
from tqdm import tqdm
import math
import optuna
import torch.optim as optim

In [4]:
with open('/home/f20222001/test-venv/UHI/Datasets_processed/inputs_36_extra_clean_10k.pkl', 'rb') as f, open('/home/f20222001/test-venv/UHI/Datasets_processed/targets_36_extra_clean_10k.pkl','rb') as t:
    inputs = pickle.load(f)
    targets = pickle.load(t)
# Shuffle before splitting (preserves alignment of X and y)
X_shuffled, y_shuffled = shuffle(inputs, targets, random_state=42)


In [5]:
# y_shuffled.shape
# new_y=y_shuffled[:,5]
# new_y=new_y.reshape(-1,)
# new_y
# y_shuffled=new_y

In [6]:

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X_shuffled, y_shuffled, test_size=0.2, random_state=42
)
# X_train, X_test, Y_train, Y_test = train_test_split(
#     X_train, Y_train, test_size=0.2, random_state=42
# )
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.25, random_state=42
)

In [7]:
# Indexes of features to normalize
norm_idx = [0, 1, 2, 3] 

# Flatten for scaling
X_train_flat = X_train.reshape(-1, X_train.shape[2])
X_val_flat   = X_val.reshape(-1, X_val.shape[2])
X_test_flat  = X_test.reshape(-1, X_test.shape[2])

# Initialize arrays to hold the scaled data
X_train_scaled = X_train_flat.copy()
X_val_scaled   = X_val_flat.copy()
X_test_scaled  = X_test_flat.copy()

scaler = StandardScaler()

# Fit on train, transform all
X_train_scaled[:, norm_idx] = scaler.fit_transform(X_train_flat[:, norm_idx])
X_val_scaled[:, norm_idx]   = scaler.transform(X_val_flat[:, norm_idx])
X_test_scaled[:, norm_idx]  = scaler.transform(X_test_flat[:, norm_idx])

# Reshape back to 3D
X_train = X_train_scaled.reshape(X_train.shape)
X_val   = X_val_scaled.reshape(X_val.shape)
X_test  = X_test_scaled.reshape(X_test.shape)

# Flatten y
y_train_flat = Y_train.reshape(-1, 1)
y_val_flat   = Y_val.reshape(-1, 1)
y_test_flat  = Y_test.reshape(-1, 1)

# Standardize y
standard_scaler_y = StandardScaler()
y_train_scaled = standard_scaler_y.fit_transform(y_train_flat)
y_val_scaled   = standard_scaler_y.transform(y_val_flat)
y_test_scaled  = standard_scaler_y.transform(y_test_flat)

# Reshape back to original shape
Y_train = y_train_scaled.reshape(Y_train.shape)
Y_val   = y_val_scaled.reshape(Y_val.shape)
Y_test  = y_test_scaled.reshape(Y_test.shape)


In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0)]


In [9]:
X_train.shape

(372642, 36, 17)

In [10]:
Y_train.shape

(372642,)

In [11]:
torch.cuda.set_device(0)  

In [12]:
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                              torch.tensor(Y_train, dtype=torch.float32))

val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                            torch.tensor(Y_val, dtype=torch.float32))

test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32),
                             torch.tensor(Y_test, dtype=torch.float32))

# Create loaders
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=1024, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [13]:
device = torch.device("cuda")
device

device(type='cuda')

In [14]:
class TransformerRegressor(nn.Module):
    def __init__(self, input_dim, d_model=256, nhead=4, num_layers=2, dim_feedforward=512, dropout=0.2):
        super().__init__()
        self.input_fc = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output_fc = nn.Linear(d_model, 1)  # regression output

    def forward(self, src):
        # src: (batch_size, seq_len, input_dim)
        src = src.permute(1, 0, 2)  # (seq_len, batch_size, input_dim)
        src = self.input_fc(src)   # (seq_len, batch_size, d_model)
        src = self.pos_encoder(src)
        transformer_out = self.transformer_encoder(src)  # (seq_len, batch_size, d_model)
        out = transformer_out[-1]  # take last time step
        return self.output_fc(out).squeeze(-1)


In [15]:
def objective(trial):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_dim = X_train.shape[2]  # Replace with your actual input_dim
    
    # Suggest hyperparameters
    d_model = trial.suggest_categorical('d_model', [128, 256, 512])
    nhead = trial.suggest_categorical('nhead', [2, 4, 8])
    num_layers = trial.suggest_int('num_layers', 1, 4)
    dim_feedforward = trial.suggest_categorical('dim_feedforward', [256, 512, 1024])
    dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    num_epochs = 30  # Reduce epochs for tuning speed


    # Initialize model
    model = TransformerRegressor(input_dim, d_model, nhead, num_layers, dim_feedforward, dropout)
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            preds = model(batch_x)
            loss = criterion(preds, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for val_x, val_y in val_loader:
            val_x, val_y = val_x.to(device), val_y.to(device)
            preds = model(val_x)
            y_true.extend(val_y.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
    y_true = np.array(y_true).reshape(-1, 1)
    y_pred = np.array(y_pred).reshape(-1, 1)
    y_true = standard_scaler_y.inverse_transform(y_true).flatten()
    y_pred = standard_scaler_y.inverse_transform(y_pred).flatten()

    mse = mean_squared_error(y_true, y_pred)
    print(f"Trial {trial.number}: MSE={mse:.4f}")
    return mse  # Optuna will minimize this


In [18]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30) 

# print("Best hyperparameters:", study.best_params)
# print("Best validation MSE:", study.best_value)


In [19]:
n_trials = 30
study = optuna.create_study(direction='minimize')

with tqdm(total=n_trials, desc="Optuna Trials") as pbar:
    for _ in range(n_trials):
        study.optimize(objective, n_trials=1, catch=(Exception,))
        pbar.update(1)

print("Best hyperparameters:", study.best_params)
print("Best validation MSE:", study.best_value)


[I 2025-05-28 20:58:08,357] A new study created in memory with name: no-name-b1e25b42-56af-459a-8091-2fb6c541d5a7
  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 21:07:16,332] Trial 0 finished with value: 56.64198303222656 and parameters: {'d_model': 512, 'nhead': 8, 'num_layers': 2, 'dim_feedforward': 256, 'dropout': 0.2554086457736864, 'lr': 0.00015916372190735877, 'weight_decay': 2.9109206730430406e-06, 'batch_size': 128}. Best is trial 0 with value: 56.64198303222656.
Optuna Trials:   3%|▎         | 1/30 [09:07<4:24:51, 547.97s/it]

Trial 0: MSE=56.6420


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 21:22:57,560] Trial 1 finished with value: 52.224708557128906 and parameters: {'d_model': 512, 'nhead': 4, 'num_layers': 3, 'dim_feedforward': 512, 'dropout': 0.20958299254839458, 'lr': 9.443116658864233e-05, 'weight_decay': 0.000814743707077596, 'batch_size': 128}. Best is trial 1 with value: 52.224708557128906.
Optuna Trials:   7%|▋         | 2/30 [24:49<6:03:40, 779.30s/it]

Trial 1: MSE=52.2247


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 21:27:50,423] Trial 2 finished with value: 24.484046936035156 and parameters: {'d_model': 512, 'nhead': 4, 'num_layers': 1, 'dim_feedforward': 256, 'dropout': 0.12871917243725797, 'lr': 3.992989484241028e-05, 'weight_decay': 0.0005779337026279974, 'batch_size': 128}. Best is trial 2 with value: 24.484046936035156.
Optuna Trials:  10%|█         | 3/30 [29:42<4:10:43, 557.18s/it]

Trial 2: MSE=24.4840


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 21:30:16,543] Trial 3 finished with value: 90.59085083007812 and parameters: {'d_model': 128, 'nhead': 2, 'num_layers': 2, 'dim_feedforward': 256, 'dropout': 0.2954250023736056, 'lr': 4.5149567309131256e-05, 'weight_decay': 0.00018496543539853853, 'batch_size': 128}. Best is trial 2 with value: 24.484046936035156.
Optuna Trials:  13%|█▎        | 4/30 [32:08<2:51:07, 394.90s/it]

Trial 3: MSE=90.5909


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 21:35:19,937] Trial 4 finished with value: 48.68867111206055 and parameters: {'d_model': 128, 'nhead': 4, 'num_layers': 4, 'dim_feedforward': 512, 'dropout': 0.25808108283209147, 'lr': 9.19101564578845e-05, 'weight_decay': 2.9810851163648228e-05, 'batch_size': 64}. Best is trial 2 with value: 24.484046936035156.
Optuna Trials:  17%|█▋        | 5/30 [37:11<2:30:47, 361.90s/it]

Trial 4: MSE=48.6887


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 21:40:00,406] Trial 5 finished with value: 105.54988098144531 and parameters: {'d_model': 256, 'nhead': 4, 'num_layers': 2, 'dim_feedforward': 512, 'dropout': 0.21943190232467724, 'lr': 5.881327004300395e-05, 'weight_decay': 9.908641744983585e-05, 'batch_size': 128}. Best is trial 2 with value: 24.484046936035156.
Optuna Trials:  20%|██        | 6/30 [41:52<2:13:41, 334.21s/it]

Trial 5: MSE=105.5499


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 21:44:15,780] Trial 6 finished with value: 9.051858901977539 and parameters: {'d_model': 256, 'nhead': 2, 'num_layers': 2, 'dim_feedforward': 256, 'dropout': 0.15097167184828136, 'lr': 0.000540375387791473, 'weight_decay': 3.8802465734811765e-06, 'batch_size': 64}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  23%|██▎       | 7/30 [46:07<1:58:14, 308.44s/it]

Trial 6: MSE=9.0519


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 21:54:46,397] Trial 7 finished with value: 48.17926788330078 and parameters: {'d_model': 512, 'nhead': 2, 'num_layers': 2, 'dim_feedforward': 512, 'dropout': 0.2718109702596575, 'lr': 0.0002509543476816877, 'weight_decay': 0.00040519370986238283, 'batch_size': 64}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  27%|██▋       | 8/30 [56:38<2:30:42, 411.01s/it]

Trial 7: MSE=48.1793


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 22:01:09,058] Trial 8 finished with value: 37.46918869018555 and parameters: {'d_model': 128, 'nhead': 2, 'num_layers': 4, 'dim_feedforward': 1024, 'dropout': 0.27286744212872116, 'lr': 0.0001554953318107687, 'weight_decay': 1.064619847278377e-06, 'batch_size': 64}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  30%|███       | 9/30 [1:03:00<2:20:45, 402.14s/it]

Trial 8: MSE=37.4692


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 22:11:42,116] Trial 9 finished with value: 26.178359985351562 and parameters: {'d_model': 512, 'nhead': 4, 'num_layers': 2, 'dim_feedforward': 512, 'dropout': 0.1738844882178489, 'lr': 0.0009050169529650548, 'weight_decay': 1.784150647919834e-06, 'batch_size': 32}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  33%|███▎      | 10/30 [1:13:33<2:37:48, 473.43s/it]

Trial 9: MSE=26.1784


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 22:15:01,752] Trial 10 finished with value: 33.78196334838867 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 1, 'dim_feedforward': 1024, 'dropout': 0.10319287449047611, 'lr': 1.0777173080350303e-05, 'weight_decay': 9.652579978072323e-06, 'batch_size': 32}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  37%|███▋      | 11/30 [1:16:53<2:03:23, 389.64s/it]

Trial 10: MSE=33.7820


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 22:17:23,181] Trial 11 finished with value: 26.589693069458008 and parameters: {'d_model': 256, 'nhead': 2, 'num_layers': 1, 'dim_feedforward': 256, 'dropout': 0.1310108329332455, 'lr': 1.9404998023235838e-05, 'weight_decay': 1.1020744881876011e-05, 'batch_size': 64}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  40%|████      | 12/30 [1:19:14<1:34:14, 314.13s/it]

Trial 11: MSE=26.5897


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 22:19:41,485] Trial 12 finished with value: 14.37835693359375 and parameters: {'d_model': 256, 'nhead': 4, 'num_layers': 1, 'dim_feedforward': 256, 'dropout': 0.1588430430298392, 'lr': 0.0009312776155776435, 'weight_decay': 6.34941902232709e-05, 'batch_size': 128}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  43%|████▎     | 13/30 [1:21:33<1:13:54, 260.87s/it]

Trial 12: MSE=14.3784


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 22:25:50,721] Trial 13 finished with value: 9.924542427062988 and parameters: {'d_model': 256, 'nhead': 2, 'num_layers': 3, 'dim_feedforward': 256, 'dropout': 0.17328857984446483, 'lr': 0.0009606050862282962, 'weight_decay': 5.010820318725442e-05, 'batch_size': 64}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  47%|████▋     | 14/30 [1:27:42<1:18:17, 293.60s/it]

Trial 13: MSE=9.9245


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 22:31:59,848] Trial 14 finished with value: 18.002689361572266 and parameters: {'d_model': 256, 'nhead': 2, 'num_layers': 3, 'dim_feedforward': 256, 'dropout': 0.1792786763023334, 'lr': 0.0004973942558497382, 'weight_decay': 6.206421727763395e-06, 'batch_size': 64}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  50%|█████     | 15/30 [1:33:51<1:19:05, 316.37s/it]

Trial 14: MSE=18.0027


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 22:38:09,067] Trial 15 finished with value: 17.062528610229492 and parameters: {'d_model': 256, 'nhead': 2, 'num_layers': 3, 'dim_feedforward': 256, 'dropout': 0.14906522662098273, 'lr': 0.00044510836199735884, 'weight_decay': 2.351825635889756e-05, 'batch_size': 64}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  53%|█████▎    | 16/30 [1:40:00<1:17:31, 332.27s/it]

Trial 15: MSE=17.0625


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 22:47:01,470] Trial 16 finished with value: 17.548349380493164 and parameters: {'d_model': 256, 'nhead': 2, 'num_layers': 3, 'dim_feedforward': 1024, 'dropout': 0.18546110362056278, 'lr': 0.00045633414448305245, 'weight_decay': 4.0143036186985015e-06, 'batch_size': 64}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  57%|█████▋    | 17/30 [1:48:53<1:25:01, 392.45s/it]

Trial 16: MSE=17.5483


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 22:55:04,367] Trial 17 finished with value: 17.794696807861328 and parameters: {'d_model': 256, 'nhead': 2, 'num_layers': 4, 'dim_feedforward': 256, 'dropout': 0.22477688006067423, 'lr': 0.0003099337110070001, 'weight_decay': 1.6464073430487426e-05, 'batch_size': 64}. Best is trial 6 with value: 9.051858901977539.
Optuna Trials:  60%|██████    | 18/30 [1:56:56<1:23:55, 419.63s/it]

Trial 17: MSE=17.7947


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 23:01:25,026] Trial 18 finished with value: 8.020761489868164 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 3, 'dim_feedforward': 256, 'dropout': 0.1003293115205969, 'lr': 0.0006331182245241827, 'weight_decay': 4.701444243150461e-05, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  63%|██████▎   | 19/30 [2:03:16<1:14:47, 407.93s/it]

Trial 18: MSE=8.0208


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 23:07:45,869] Trial 19 finished with value: 28.66414451599121 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 3, 'dim_feedforward': 256, 'dropout': 0.10344109978844584, 'lr': 0.0002571387149785743, 'weight_decay': 0.00015073195871026694, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  67%|██████▋   | 20/30 [2:09:37<1:06:37, 399.79s/it]

Trial 19: MSE=28.6641


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 23:11:33,859] Trial 20 finished with value: 12.688490867614746 and parameters: {'d_model': 128, 'nhead': 8, 'num_layers': 2, 'dim_feedforward': 1024, 'dropout': 0.13026217914375457, 'lr': 0.0006180902812005364, 'weight_decay': 0.0002807758357786887, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  70%|███████   | 21/30 [2:13:25<52:14, 348.22s/it]  

Trial 20: MSE=12.6885


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 23:17:54,503] Trial 21 finished with value: 14.40927505493164 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 3, 'dim_feedforward': 256, 'dropout': 0.15138055487136126, 'lr': 0.000923642489383506, 'weight_decay': 5.437124924566921e-05, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  73%|███████▎  | 22/30 [2:19:46<47:43, 357.95s/it]

Trial 21: MSE=14.4093


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 23:24:14,725] Trial 22 finished with value: 8.468600273132324 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 3, 'dim_feedforward': 256, 'dropout': 0.11849337432284562, 'lr': 0.0006735340355299766, 'weight_decay': 4.808581826718319e-05, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  77%|███████▋  | 23/30 [2:26:06<42:32, 364.64s/it]

Trial 22: MSE=8.4686


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 23:32:33,211] Trial 23 finished with value: 8.581242561340332 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 4, 'dim_feedforward': 256, 'dropout': 0.12164956588363374, 'lr': 0.0003265868986094045, 'weight_decay': 1.8555345782305862e-05, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  80%|████████  | 24/30 [2:34:24<40:28, 404.80s/it]

Trial 23: MSE=8.5812


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 23:40:51,875] Trial 24 finished with value: 23.482065200805664 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 4, 'dim_feedforward': 256, 'dropout': 0.11786741190248196, 'lr': 0.0003167526352724879, 'weight_decay': 3.4912234987319697e-05, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  83%|████████▎ | 25/30 [2:42:43<36:04, 432.96s/it]

Trial 24: MSE=23.4821


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-28 23:49:48,013] Trial 25 finished with value: 24.314678192138672 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 4, 'dim_feedforward': 256, 'dropout': 0.1163342394808921, 'lr': 0.00019235967480183853, 'weight_decay': 9.881464494266396e-05, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  87%|████████▋ | 26/30 [2:51:39<30:55, 463.92s/it]

Trial 25: MSE=24.3147


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-29 00:01:01,784] Trial 26 finished with value: 9.3746337890625 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 4, 'dim_feedforward': 256, 'dropout': 0.1007037543161998, 'lr': 0.0006527996627363542, 'weight_decay': 1.7957808649941023e-05, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  90%|█████████ | 27/30 [3:02:53<26:20, 526.88s/it]

Trial 26: MSE=9.3746


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-29 00:09:36,900] Trial 27 finished with value: 15.639519691467285 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 3, 'dim_feedforward': 256, 'dropout': 0.14039685816543507, 'lr': 0.0003438796175542121, 'weight_decay': 1.2281567093183449e-05, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  93%|█████████▎| 28/30 [3:11:28<17:26, 523.35s/it]

Trial 27: MSE=15.6395


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-29 00:19:13,286] Trial 28 finished with value: 8.93648624420166 and parameters: {'d_model': 128, 'nhead': 8, 'num_layers': 4, 'dim_feedforward': 1024, 'dropout': 0.11697867843000427, 'lr': 0.0007105675823413295, 'weight_decay': 3.916493496632049e-05, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials:  97%|█████████▋| 29/30 [3:21:04<08:59, 539.26s/it]

Trial 28: MSE=8.9365


  dropout = trial.suggest_uniform('dropout', 0.1, 0.3)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
[I 2025-05-29 00:27:48,410] Trial 29 finished with value: 38.75088119506836 and parameters: {'d_model': 256, 'nhead': 8, 'num_layers': 3, 'dim_feedforward': 256, 'dropout': 0.16451495911862787, 'lr': 0.00015191766333724508, 'weight_decay': 0.00012160349393283507, 'batch_size': 32}. Best is trial 18 with value: 8.020761489868164.
Optuna Trials: 100%|██████████| 30/30 [3:29:40<00:00, 419.34s/it]

Trial 29: MSE=38.7509
Best hyperparameters: {'d_model': 256, 'nhead': 8, 'num_layers': 3, 'dim_feedforward': 256, 'dropout': 0.1003293115205969, 'lr': 0.0006331182245241827, 'weight_decay': 4.701444243150461e-05, 'batch_size': 32}
Best validation MSE: 8.020761489868164



