In [1]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
chess_king_rook_vs_king_pawn = fetch_ucirepo(id=22) 
  
# data (as pandas dataframes) 
X = chess_king_rook_vs_king_pawn.data.features 
y = chess_king_rook_vs_king_pawn.data.targets   
  
# Combine X and y into a single DataFrame
df = pd.concat([X, y], axis=1)

# Rename the target column to 'Y'
df = df.rename(columns={df.columns[-1]: 'Y'})
df = df.dropna()

In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Dictionary of regression models with their parameter grids
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Ridge Regression': (Ridge(), {'alpha': [0.1, 1.0, 10.0]}),
    'Lasso Regression': (Lasso(), {'alpha': [0.1, 1.0, 10.0]}),
    'ElasticNet': (ElasticNet(), {'alpha': [0.1, 1.0, 10.0], 'l1_ratio': [0.2, 0.5, 0.8]}),
    'Decision Tree': (DecisionTreeRegressor(), {'max_depth': [3, 5, 7]}),
    'Random Forest': (RandomForestRegressor(), {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]}),
    'Gradient Boosting': (GradientBoostingRegressor(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.3]}),
    'XGBoost': (XGBRegressor(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.3]}),
    'LightGBM': (LGBMRegressor(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.3]}),
    'CatBoost': (CatBoostRegressor(verbose=0), {'iterations': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.3]})
}

# Dictionary to store results
results = {}

for name, (model, param_grid) in models.items():
    start_time = time.time()
    
    # Perform GridSearchCV
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    
    # Training time for best parameters
    training_start = time.time()
    best_model.fit(X_train_scaled, y_train)
    training_time = time.time() - training_start
    
    # Inference time for best parameters
    inference_start = time.time()
    y_pred = best_model.predict(X_test_scaled)
    inference_time = time.time() - inference_start
    
    # Compute metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    # Computation time (total run time)
    computation_time = time.time() - start_time
    
    results[name] = {
        'RMSE': rmse,
        'R-squared': r2,
        'Training Time': training_time,
        'Inference Time': inference_time,
        'Computation Time': computation_time,
        'Best Parameters': grid_search.best_params_
    }

# Convert results to a DataFrame
result = pd.DataFrame(results).T  # Transpose to have models as rows

# Display results
print(result)

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score -25.224351
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score -25.224351
                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.136773  0.999999      0.001988            0.0   
Ridge Regression     0.836741  0.999961      0.000997       0.000998   
Lasso Regression     0.207878  0.999998      0.000997            0.0   
ElasticNet           4.180667  0.999015      0.002025            0.0   
Decision Tree       74.185527  0.689995      0.000998            0.0   
Random Forest       72.050707   0.70758       0.27595        0.01502   
Gradient Boosting   65.093976  0.761322       0.16496            0.0   
XGBoost             56.539934   0.81993      0.065024       0.001979   
Li

In [14]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the MLP model and parameter grid
mlp = MLPRegressor(random_state=42, max_iter=1000)
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
}

# Perform GridSearchCV
grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Training time for best parameters
training_start = time.time()
best_model.fit(X_train_scaled, y_train)
training_time = time.time() - training_start

# Inference time for best parameters
inference_start = time.time()
y_pred = best_model.predict(X_test_scaled)
inference_time = time.time() - inference_start

# Compute metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

result.loc['MLP'] = [rmse, r2, training_time, inference_time, computation_time, grid_search.best_params_]

# Display results
print(result)

# Print the best parameters
print("\nBest Parameters:")
print(grid_search.best_params_)




                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   

                  Computation Time  \
Linear Regression         0.055115   
Ridge Regression          0.050876   
Lasso Regression     



In [15]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, RegressorMixin

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class DNN(nn.Module):
    def __init__(self, input_dim, hidden_dims):
        super(DNN, self).__init__()
        layers = []
        prev_dim = input_dim
        for dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, dim))
            layers.append(nn.ReLU())
            prev_dim = dim
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

class DNNRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, input_dim, hidden_dims, learning_rate=0.001, batch_size=32, num_epochs=100):
        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.model = DNN(input_dim, hidden_dims).to(device)
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def fit(self, X, y):
        train_dataset = TensorDataset(torch.FloatTensor(X).to(device), torch.FloatTensor(y).unsqueeze(1).to(device))
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        for epoch in range(self.num_epochs):
            for batch_X, batch_y in train_loader:
                self.optimizer.zero_grad()
                outputs = self.model(batch_X)
                loss = self.criterion(outputs, batch_y)
                loss.backward()
                self.optimizer.step()
        return self

    def predict(self, X):
        self.model.eval()
        with torch.no_grad():
            X_tensor = torch.FloatTensor(X).to(device)
            predictions = self.model(X_tensor)
        return predictions.cpu().numpy()

# Define the parameter grid for GridSearchCV
param_grid = {
    'hidden_dims': [[50], [100], [50, 50], [100, 50]],
    'learning_rate': [0.0001, 0.001, 0.01],
    'batch_size': [32, 64, 128],
    'num_epochs': [50, 100]
}

# Create the DNNRegressor instance
dnn_regressor = DNNRegressor(input_dim=X_train.shape[1], hidden_dims=[50])

# Perform GridSearchCV
grid_search = GridSearchCV(dnn_regressor, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Training time for best parameters
training_start = time.time()
best_model.fit(X_train_scaled, y_train)
training_time = time.time() - training_start

# Inference time for best parameters
inference_start = time.time()
y_pred = best_model.predict(X_test_scaled)
inference_time = time.time() - inference_start

# Compute metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Assuming result is your existing DataFrame
# Store results in the existing result DataFrame
result.loc['DNN'] = [rmse, r2, training_time, inference_time, computation_time, grid_search.best_params_]

print(result)

# Print the best parameters
print("\nBest Parameters:")
print(grid_search.best_params_)


Using device: cpu
                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   

                  Computation Time  \
Linear 

72 fits failed out of a total of 216.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Tommy\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Tommy\AppData\Local\Temp\ipykernel_6972\3484552703.py", line 64, in fit
ValueError: could not determine the shape of object type 'Series'

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


In [16]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class CrossLayer(nn.Module):
    def __init__(self, input_dim):
        super(CrossLayer, self).__init__()
        self.weight = nn.Parameter(torch.Tensor(input_dim, 1))
        self.bias = nn.Parameter(torch.Tensor(input_dim, 1))
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, x0, x):
        x = x.unsqueeze(2)
        x0 = x0.unsqueeze(2)
        interaction = torch.matmul(x0, torch.matmul(x.transpose(1, 2), self.weight))
        return x0.squeeze(2) + interaction.squeeze(2) + self.bias.T

class DCN(nn.Module):
    def __init__(self, input_dim, cross_layers, deep_layers):
        super(DCN, self).__init__()
        self.cross_layers = nn.ModuleList([CrossLayer(input_dim) for _ in range(cross_layers)])
        
        deep_dims = [input_dim] + deep_layers
        self.deep_layers = nn.ModuleList([nn.Linear(deep_dims[i], deep_dims[i+1]) for i in range(len(deep_layers))])
        self.deep_activation = nn.ReLU()
        
        self.final_layer = nn.Linear(deep_layers[-1] + input_dim, 1)

    def forward(self, x):
        x0 = x
        cross_out = x
        for layer in self.cross_layers:
            cross_out = layer(x0, cross_out)
        
        deep_out = x
        for layer in self.deep_layers:
            deep_out = self.deep_activation(layer(deep_out))
        
        combined = torch.cat([cross_out, deep_out], dim=1)
        return self.final_layer(combined)

def objective(trial):
    # Define hyperparameters to tune
    cross_layers = trial.suggest_int('cross_layers', 1, 5)
    deep_layers = [trial.suggest_int(f'deep_layer_{i}', 32, 256) for i in range(3)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the DCN model
    model = DCN(X_train.shape[1], cross_layers, deep_layers).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final DCN model with the best hyperparameters
best_model = DCN(X_train.shape[1], 
                 best_params['cross_layers'], 
                 [best_params[f'deep_layer_{i}'] for i in range(3)]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['DCN'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 08:08:56,710] A new study created in memory with name: no-name-182fff70-c98f-4851-b413-5a94b6c913be


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:08:57,040] Trial 0 finished with value: 9.475296020507812 and parameters: {'cross_layers': 4, 'deep_layer_0': 225, 'deep_layer_1': 76, 'deep_layer_2': 127, 'learning_rate': 0.03614504961249343, 'batch_size': 64, 'num_epochs': 42}. Best is trial 0 with value: 9.475296020507812.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:08:57,304] Trial 1 finished with value: 94.39893341064453 and parameters: {'cross_layers': 5, 'deep_layer_0': 36, 'deep_layer_1': 124, 'deep_layer_2': 55, 'learning_rate': 0.002292692342985842, 'batch_size': 256, 'num_epochs': 38}. Best is trial 0 with value: 9.475296020507812.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:08:57,832] Trial 2 finished with value: 67.55377197265625 and parameters: {'cross_layers': 5, 'deep_layer_0': 66, 'deep_layer_1': 210, 'deep_layer_2': 212, 'learning_rate': 0.0

                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   
DCN                   4.72361  0.996138      0.418879           

In [17]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class WideAndDeepNetwork(nn.Module):
    def __init__(self, input_dim, wide_dim, deep_dims):
        super(WideAndDeepNetwork, self).__init__()
        self.wide = nn.Linear(input_dim, wide_dim)
        
        deep_layers = []
        prev_dim = input_dim
        for dim in deep_dims:
            deep_layers.append(nn.Linear(prev_dim, dim))
            deep_layers.append(nn.ReLU())
            prev_dim = dim
        self.deep = nn.Sequential(*deep_layers)
        
        self.final = nn.Linear(wide_dim + deep_dims[-1], 1)

    def forward(self, x):
        wide_out = self.wide(x)
        deep_out = self.deep(x)
        combined = torch.cat([wide_out, deep_out], dim=1)
        return self.final(combined)

def objective(trial):
    # Define hyperparameters to tune
    wide_dim = trial.suggest_int('wide_dim', 8, 128)
    n_deep_layers = trial.suggest_int('n_deep_layers', 1, 5)
    deep_dims = [trial.suggest_int(f'deep_dim_{i}', 32, 256) for i in range(n_deep_layers)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Wide & Deep model
    model = WideAndDeepNetwork(X_train.shape[1], wide_dim, deep_dims).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final Wide & Deep model with the best hyperparameters
best_model = WideAndDeepNetwork(X_train.shape[1], 
                                best_params['wide_dim'], 
                                [best_params[f'deep_dim_{i}'] for i in range(best_params['n_deep_layers'])]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['Wide & Deep'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:10:15,159] A new study created in memory with name: no-name-c86b482c-8e21-4561-bf41-f15418b31c04


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:10:15,432] Trial 0 finished with value: 6.078895092010498 and parameters: {'wide_dim': 92, 'n_deep_layers': 3, 'deep_dim_0': 142, 'deep_dim_1': 37, 'deep_dim_2': 207, 'learning_rate': 0.0791549434665011, 'batch_size': 64, 'num_epochs': 53}. Best is trial 0 with value: 6.078895092010498.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:10:15,678] Trial 1 finished with value: 2.9665329456329346 and parameters: {'wide_dim': 114, 'n_deep_layers': 1, 'deep_dim_0': 205, 'learning_rate': 0.027147206811385417, 'batch_size': 128, 'num_epochs': 92}. Best is trial 1 with value: 2.9665329456329346.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:10:15,818] Trial 2 finished with value: 81.11412811279297 and parameters: {'wide_dim': 89, 'n_deep_layers': 1, 'deep_dim_0': 177, 'learning_rate': 0.005431418025440753, 'batch_size': 256, 

                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   
DCN                   4.72361  0.996138      0.418879           

In [18]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna
import xgboost as xgb

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dims):
        super(NeuralNetwork, self).__init__()
        layers = []
        prev_dim = input_dim
        for dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, dim))
            layers.append(nn.ReLU())
            prev_dim = dim
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for XGBoost
    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 200),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
        'subsample': trial.suggest_uniform('xgb_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('xgb_colsample_bytree', 0.5, 1.0)
    }

    # Train XGBoost model
    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(X_train_scaled, y_train)

    # Transform the data using the XGBoost model
    X_train_transformed = xgb_model.apply(X_train_scaled)
    X_test_transformed = xgb_model.apply(X_test_scaled)

    # Define hyperparameters to tune for Neural Network
    n_layers = trial.suggest_int('nn_n_layers', 1, 5)
    hidden_dims = [trial.suggest_int(f'nn_hidden_dim_{i}', 32, 256) for i in range(n_layers)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    nn_batch_size = trial.suggest_categorical('nn_batch_size', [32, 64, 128, 256])
    nn_num_epochs = trial.suggest_int('nn_num_epochs', 10, 100)

    # Create the Neural Network model
    nn_model = NeuralNetwork(X_train_transformed.shape[1], hidden_dims).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(nn_model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(torch.FloatTensor(X_train_transformed).to(device), y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=nn_batch_size, shuffle=True)

    # Training
    for epoch in range(nn_num_epochs):
        nn_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = nn_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    nn_model.eval()
    with torch.no_grad():
        y_pred = nn_model(torch.FloatTensor(X_test_transformed).to(device))
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final XGBoost model with the best hyperparameters
xgb_params = {
    'n_estimators': best_params['xgb_n_estimators'],
    'max_depth': best_params['xgb_max_depth'],
    'learning_rate': best_params['xgb_learning_rate'],
    'subsample': best_params['xgb_subsample'],
    'colsample_bytree': best_params['xgb_colsample_bytree']
}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)

# Transform the data using the XGBoost model
X_train_transformed = xgb_model.apply(X_train_scaled)
X_test_transformed = xgb_model.apply(X_test_scaled)

# Train the final Neural Network model with the best hyperparameters
nn_model = NeuralNetwork(X_train_transformed.shape[1], 
                         [best_params[f'nn_hidden_dim_{i}'] for i in range(best_params['nn_n_layers'])]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=best_params['nn_learning_rate'])

train_dataset = TensorDataset(torch.FloatTensor(X_train_transformed).to(device), y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['nn_batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['nn_num_epochs']):
    nn_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = nn_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
nn_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = nn_model(torch.FloatTensor(X_test_transformed).to(device))
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['XGBoost + NN'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:11:32,383] A new study created in memory with name: no-name-02df36d3-dd1e-4742-a6eb-fd488a87cb4a
  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('xgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('xgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:11:32,519] Trial 0 finished with value: 100.7371826171875 and parameters: {'xgb_n_estimators': 65, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.00012616736217413617, 'xgb_subsample': 0.6620903178422187, 'xgb_colsample_bytree': 0.6655663978982062, 'nn_n_layers': 3, 'nn_hidden_dim_0': 104, 'nn_hidden_dim_1': 191, 'nn_hidden_dim_2': 237, 'nn_learning_rate': 0.04983956435788168, 'nn_batch_size': 32, 'nn_num_epochs': 16}. Best is trial 0 with value: 100.7371826171875.


Using device: cpu


  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('xgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('xgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:11:32,937] Trial 1 finished with value: 57.06631088256836 and parameters: {'xgb_n_estimators': 81, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.031570465181887185, 'xgb_subsample': 0.6432306473108582, 'xgb_colsample_bytree': 0.6656993623987824, 'nn_n_layers': 3, 'nn_hidden_dim_0': 252, 'nn_hidden_dim_1': 103, 'nn_hidden_dim_2': 253, 'nn_learning_rate': 0.012292455085503839, 'nn_batch_size': 64, 'nn_num_epochs': 66}. Best is trial 1 with value: 57.06631088256836.
  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('xgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('xgb_colsample_bytree', 0.5, 1.

                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   
DCN                   4.72361  0.996138      0.418879           

In [20]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna
import lightgbm as lgb

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dims):
        super(NeuralNetwork, self).__init__()
        layers = []
        prev_dim = input_dim
        for dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, dim))
            layers.append(nn.ReLU())
            prev_dim = dim
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for LightGBM
    lgb_params = {
        'n_estimators': trial.suggest_int('lgb_n_estimators', 50, 200),
        'max_depth': trial.suggest_int('lgb_max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
        'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
    }

    # Train LightGBM model
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(X_train_scaled, y_train)

    # Transform the data using the LightGBM model (use raw predictions as features)
    X_train_transformed = lgb_model.predict(X_train_scaled, raw_score=True).reshape(-1, 1)
    X_test_transformed = lgb_model.predict(X_test_scaled, raw_score=True).reshape(-1, 1)

    # Define hyperparameters to tune for Neural Network
    n_layers = trial.suggest_int('nn_n_layers', 1, 5)
    hidden_dims = [trial.suggest_int(f'nn_hidden_dim_{i}', 32, 256) for i in range(n_layers)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    nn_batch_size = trial.suggest_categorical('nn_batch_size', [32, 64, 128, 256])
    nn_num_epochs = trial.suggest_int('nn_num_epochs', 10, 100)

    # Create the Neural Network model
    nn_model = NeuralNetwork(X_train_transformed.shape[1], hidden_dims).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(nn_model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(torch.FloatTensor(X_train_transformed).to(device), y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=nn_batch_size, shuffle=True)

    # Training
    for epoch in range(nn_num_epochs):
        nn_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = nn_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    nn_model.eval()
    with torch.no_grad():
        y_pred = nn_model(torch.FloatTensor(X_test_transformed).to(device))
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final LightGBM model with the best hyperparameters
lgb_params = {
    'n_estimators': best_params['lgb_n_estimators'],
    'max_depth': best_params['lgb_max_depth'],
    'learning_rate': best_params['lgb_learning_rate'],
    'subsample': best_params['lgb_subsample'],
    'colsample_bytree': best_params['lgb_colsample_bytree']
}
lgb_model = lgb.LGBMRegressor(**lgb_params)
lgb_model.fit(X_train_scaled, y_train)

# Transform the data using the LightGBM model (use raw predictions as features)
X_train_transformed = lgb_model.predict(X_train_scaled, raw_score=True).reshape(-1, 1)
X_test_transformed = lgb_model.predict(X_test_scaled, raw_score=True).reshape(-1, 1)

# Train the final Neural Network model with the best hyperparameters
nn_model = NeuralNetwork(X_train_transformed.shape[1], 
                         [best_params[f'nn_hidden_dim_{i}'] for i in range(best_params['nn_n_layers'])]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=best_params['nn_learning_rate'])

train_dataset = TensorDataset(torch.FloatTensor(X_train_transformed).to(device), y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['nn_batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['nn_num_epochs']):
    nn_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = nn_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
nn_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = nn_model(torch.FloatTensor(X_test_transformed).to(device))
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['LightGBM + NN'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:16:22,655] A new study created in memory with name: no-name-c8fd92fb-64cd-4842-99f7-ebcb28058e2d


Using device: cpu
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:22,993] Trial 0 finished with value: 104.6399154663086 and parameters: {'lgb_n_estimators': 54, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.00042179376500963453, 'lgb_subsample': 0.5163096251303236, 'lgb_colsample_bytree': 0.8761492359442916, 'nn_n_layers': 3, 'nn_hidden_dim_0': 208, 'nn_hidden_dim_1': 67, 'nn_hidden_dim_2': 236, 'nn_learning_rate': 0.00024512865277224473, 'nn_batch_size': 32, 'nn_num_epochs': 57}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5,

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:23,304] Trial 2 finished with value: 106.50305938720703 and parameters: {'lgb_n_estimators': 158, 'lgb_max_depth': 4, 'lgb_learning_rate': 0.038195058985550756, 'lgb_subsample': 0.646086635863193, 'lgb_colsample_bytree': 0.9555746204783562, 'nn_n_layers': 5, 'nn_hidden_dim_0': 218, 'nn_hidden_dim_1': 51, 'nn_hidden_dim_2': 65, 'nn_hidden_dim_3': 98, 'nn_hidden_dim_4': 209, 'nn_learning_rate': 0.017250623908633172, 'nn_batch_size': 128, 'nn_num_epochs': 27}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:23,830] Trial 3 finished with value: 105.12828826904297 and parameters: {'lgb_n_estimators': 86, 'lgb_max_depth': 7, 'lgb_learning_rate': 0.013106012580848403, 'lgb_subsample': 0.6567430671894787, 'lgb_colsample_bytree': 0.9371286341986755, 'nn_n_layers': 3, 'nn_hidden_dim_0': 215, 'nn_hidden_dim_1': 187, 'nn_hidden_dim_2': 238, 'nn_learning_rate': 0.0002732289643463954, 'nn_batch_size': 256, 'nn_num_epochs': 80}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:24,024] Trial 4 finished with value: 105.2466049194336 and parameters: {'lgb_n_estimators': 108, 'lgb_max_depth': 3, 'lgb_learning_rate': 0.004661992428285996, 'lgb_subsample': 0.950058910141466, 

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:24,700] Trial 5 finished with value: 105.04753112792969 and parameters: {'lgb_n_estimators': 77, 'lgb_max_depth': 4, 'lgb_learning_rate': 0.009509274033423549, 'lgb_subsample': 0.9132806481233388, 'lgb_colsample_bytree': 0.6762631949675548, 'nn_n_layers': 4, 'nn_hidden_dim_0': 150, 'nn_hidden_dim_1': 159, 'nn_hidden_dim_2': 108, 'nn_hidden_dim_3': 186, 'nn_learning_rate': 0.0004305671017741831, 'nn_batch_size': 64, 'nn_num_epochs': 89}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:24,950] Trial 6 finished with value: 106.21504974365234 and parameters: {'lgb_n_estimators': 102, 'lgb_max_depth': 8, 'lgb_learning_rate': 0.00563586056575545, 'lgb_subsample': 0.8219929662113465, 'lgb_colsample_bytree': 0.7174286525623176, 'nn_n_layers': 2, 'nn_hidden_dim_0': 181, 'nn_hidden_dim_1': 185, 'nn_learning_rate': 0.00015208131236318554, 'nn_batch_size': 64, 'nn_num_epochs': 49}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:25,098] Trial 7 finished with value: 110.00047302246094 and parameters: {'lgb_n_estimators': 143, 'lgb_max_depth': 3, 'lgb_learning_rate': 0.08875038268334944, 'lgb_subsample': 0.7356743450106915, 'lgb_colsample_bytree':

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:25,340] Trial 8 finished with value: 105.14608001708984 and parameters: {'lgb_n_estimators': 84, 'lgb_max_depth': 3, 'lgb_learning_rate': 0.011743006273115628, 'lgb_subsample': 0.5794933754157657, 'lgb_colsample_bytree': 0.5690603464598271, 'nn_n_layers': 2, 'nn_hidden_dim_0': 246, 'nn_hidden_dim_1': 151, 'nn_learning_rate': 0.0005565788062898846, 'nn_batch_size': 256, 'nn_num_epochs': 49}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:25,445] Trial 9 finished with value: 108.37620544433594 and parameters: {'lgb_n_estimators': 54, 'lgb_max_depth': 3, 'lgb_learning_rate': 0.00016299732737135215, 'lgb_subsample': 0.5093314129933125, 'lgb_colsample_bytree

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:25,732] Trial 10 finished with value: 105.21692657470703 and parameters: {'lgb_n_estimators': 183, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.000588145388421464, 'lgb_subsample': 0.5145257119691895, 'lgb_colsample_bytree': 0.8453881529267584, 'nn_n_layers': 1, 'nn_hidden_dim_0': 191, 'nn_learning_rate': 0.0021956309545749903, 'nn_batch_size': 32, 'nn_num_epochs': 72}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:26,810] Trial 11 finished with value: 105.10245513916016 and parameters: {'lgb_n_estimators': 65, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.0010459560092104098, 'lgb_subsample': 0.9902553105645449, 'lgb_colsample_bytree': 0.8332652358640823, 'nn_n_layers': 4, 'nn_hidden_dim_0': 37, 'nn_hidden_dim_1': 253, 'nn_hidden_dim_2': 254, 'nn_hidden_dim_3': 246, 'nn_learning_rate': 0.0007772421313441734, 'nn_batch_size': 64, 'nn_num_epochs': 99}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:27,365] Trial 12 finished with value: 106.71044158935547 and parameters: {'lgb_n_estimators': 52, 'lgb_max_depth': 5, 'lgb_learning_rate': 0.0011095277948729976, 'lgb_subsample': 0.8585157894818644, 'lgb_colsample_bytree': 0.8165537403467128, 'nn_n_layers': 4, 'nn_hidden_dim_0': 138, 'nn_hidden_dim_1': 114, 'nn_hidden_dim_2': 147, 'nn_hidden_dim_3': 38, 'nn_learning_rate': 0.00010207698377775588, 'nn_batch_size': 32, 'nn_num_epochs': 98}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:27,944] Trial 13 finished with value: 105.12277221679688 and parameters: {'lgb_n_estimators': 79, 'lgb_max_depth': 6, 'lgb_learning_rate': 0.00017701922956591832, 'lgb_subsample': 0.8806899517616948, 'lgb_colsample_bytree': 0.6523306985305392, 'nn_n_layers': 4, 'nn_hidden_dim_0': 255, 'nn_hidden_dim_1': 137, 'nn_hidden_dim_2': 187, 'nn_hidden_dim_3': 127, 'nn_learning_rate': 0.09362031497363987, 'nn_batch_size': 32, 'nn_num_epochs': 73}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:29,291] Trial 14 finished with value: 105.13893127441406 and parameters: {'lgb_n_estimators': 70, 'lgb_max_depth': 5, 'lgb_learning_rate': 0.001983509484350235, 'lgb_subsample': 0.7720644045106894, 'lgb_colsample_bytree': 0.8964295639465052, 'nn_n_layers': 3, 'nn_hidden_dim_0': 181, 'nn_hidden_dim_1': 183, 'nn_hidden_dim_2': 112, 'nn_learning_rate': 0.0003747110722546206, 'nn_batch_size': 64, 'nn_num_epochs': 86}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:29,716] Trial 15 finished with value: 105.62723541259766 and parameters: {'lgb_n_estimators': 104, 'lgb_max_depth': 9, 'lgb_learning_rate': 0.0003838774127723813, 'lgb_subsample': 0.9229009979675693, 'lgb_colsample_bytree': 0.6791212431723318, 'nn_n_layers': 5, 'nn_hidden_dim_0': 159, 'nn_hidden_dim_1': 230, 'nn_hidden_dim_2': 35, 'nn_hidden_dim_3': 252, 'nn_hidden_dim_4': 56, 'nn_learning_rate': 0.0020845990411855085, 'nn_batch_size': 32, 'nn_num_epochs': 38}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:29,942] Trial 16 finished with value: 105.21481323242188 and parameters: {'lgb_n_estimators': 54, 'lgb_max_depth': 6, 'lgb_learning_rate': 0.0024737

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:30,636] Trial 17 finished with value: 105.12393951416016 and parameters: {'lgb_n_estimators': 127, 'lgb_max_depth': 7, 'lgb_learning_rate': 0.00010187367730751595, 'lgb_subsample': 0.805789287203229, 'lgb_colsample_bytree': 0.6474286357732697, 'nn_n_layers': 4, 'nn_hidden_dim_0': 55, 'nn_hidden_dim_1': 106, 'nn_hidden_dim_2': 219, 'nn_hidden_dim_3': 189, 'nn_learning_rate': 0.0010260604262142354, 'nn_batch_size': 128, 'nn_num_epochs': 65}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:31,263] Trial 18 finished with value: 105.19861602783203 and parameters: {'lgb_n_estimators': 193, 'lgb_max_depth': 5, 'lgb_learning_rate': 0.02950064677236357, 'lgb_subsample': 0.5892538440795134, 'lgb_colsample_bytree': 0.9938046166870105, 'nn_n_layers': 3, 'nn_hidden_dim_0': 139, 'nn_hidden_dim_1': 150, 'nn_hidden_dim_2': 100, 'nn_learning_rate': 0.0003105457886348364, 'nn_batch_size': 64, 'nn_num_epochs': 89}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:31,552] Trial 19 finished with value: 104.715576171875 and parameters: {'lgb_n_estimators': 93, 'lgb_max_depth': 9, 'lgb_learning_rate': 0.00033586287424902005, 'lgb_subsample': 0.579356215026103, 'lgb_colsample_bytree': 0.886211703520994, 'nn_n_layers': 2, 'nn_hidden_dim_0': 108, 'nn_hidden_dim_1': 207, 'nn_learning_rate': 0.0015550629932259642, 'nn_batch_size': 32, 'nn_num_epochs': 38}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:31,844] Trial 20 finished with value: 104.82938385009766 and parameters: {'lgb_n_estimators': 95, 'lgb_max_depth': 9, 'lgb_learning_rate': 0.00035842515421034474, 'lgb_subsample': 0.5653014733744197, 'lgb_colsample_bytree': 0.8967352991630179, 'nn_n_layers': 2, 'nn_hidden_dim_0': 82, 'nn_hidden_dim_1': 207, 'nn_learning_rate': 0.010625811633286245, 'nn_batch_size': 32, 'nn_num_epochs': 39}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:32,181] Trial 21 finished with value: 104.8520278930664 and parameters: {'lgb_n_estimators': 91, 'lgb_max_depth': 9, 'lgb_learning_rate': 0.00036807342162744526, 'lgb_subsample': 0.5883233944252878, 'lgb_colsample_bytree': 0.8695428646638181, 'nn_n_layers': 2, 'nn_hidden_dim_0': 80, 'nn_hidden_dim_1': 227, 'nn_learning_rate': 0.009730423176239449, 'nn_batch_size': 32, 'nn_num_epochs': 40}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:32,482] Trial 22 finished with value: 105.05113983154297 and parameters: {'lgb_n_estimators': 120, 'lgb_max_depth': 9, 'lgb_learning_rate': 0.00031053057674986726, 'lgb_subsample': 0.5518060640135853, 'lgb_colsample_bytree': 0.9127553501274046, 'nn_n_layers': 2, 'nn_hidden_dim_0': 108, 'nn_hidden_dim_1': 213, 'nn_learning_rate': 0.043299244091483106, 'nn_batch_size': 32, 'nn_num_epochs': 40}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:32,665] Trial 23 finished with value: 105.84293365478516 and parameters: {'lgb_n_estimators': 97, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.0008251884685813018, 'lgb_subsample': 0.6216379851692604, 'lgb_colsample_byt

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:33,131] Trial 24 finished with value: 105.17169952392578 and parameters: {'lgb_n_estimators': 69, 'lgb_max_depth': 8, 'lgb_learning_rate': 0.0002143282844388477, 'lgb_subsample': 0.5401500567451343, 'lgb_colsample_bytree': 0.9849840715230123, 'nn_n_layers': 3, 'nn_hidden_dim_0': 122, 'nn_hidden_dim_1': 204, 'nn_hidden_dim_2': 196, 'nn_learning_rate': 0.0016163637485276314, 'nn_batch_size': 32, 'nn_num_epochs': 58}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:33,385] Trial 25 finished with value: 105.14616394042969 and parameters: {'lgb_n_estimators': 143, 'lgb_max_depth': 9

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:33,633] Trial 26 finished with value: 105.39818572998047 and parameters: {'lgb_n_estimators': 111, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.0014708902138905384, 'lgb_subsample': 0.5012606053692553, 'lgb_colsample_bytree': 0.9297397211270766, 'nn_n_layers': 1, 'nn_hidden_dim_0': 87, 'nn_learning_rate': 0.022932779563372932, 'nn_batch_size': 32, 'nn_num_epochs': 43}. Best is trial 0 with value: 104.6399154663086.


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:33,851] Trial 27 finished with value: 105.83656311035156 and parameters: {'lgb_n_estimators': 135, 'lgb_max_depth': 8, 'lgb_learning_rate': 0.00010353888546192184, 'lgb_subsample': 0.6203610203721386, 'lgb_colsample_bytree': 0.7990879676935866, 'nn_n_layers': 3, 'nn_hidden_dim_0': 97, 'nn_hidden_dim_1': 167, 'nn_hidden_dim_2': 154, 'nn_learning_rate': 0.005310597878511717, 'nn_batch_size': 32, 'nn_num_epochs': 20}. Best is trial 0 with value: 104.6399154663086.


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:35,047] Trial 28 finished with value: 105.11751556396484 and parameters: {'lgb_n_estimators': 163, 'lgb_max_depth': 9, 'lgb_learning_rate': 0.00042753121777514523, 'lgb_subsample': 0.5595733654037651, 'lgb_colsample_bytree': 0.8589458273030148, 'nn_n_layers': 2, 'nn_hidden_dim_0': 51, 'nn_hidden_dim_1': 126, 'nn_learning_rate': 0.01310236927213674, 'nn_batch_size': 32, 'nn_num_epochs': 57}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:35,327] Trial 29 finished with value: 105.71556854248047 and parameters: {'lgb_n_estimators': 114, 'lgb_max_depth': 7, 'lgb_learning_rate': 0.00023800937302895507, 'lgb_subsample': 0.6146100915018151, 'lgb_colsample_bytree': 0.9531206864059752, 'nn_n_layers': 2, 'nn_hidden_dim_0': 130, 'nn_hidden_dim_1': 203, 'nn_learning_rate': 0.035532979080029066, 'nn_batch_size': 256, 'nn_num_epochs': 33}. Best is trial 0 with value: 104.6399154663086.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:35,555] Trial 30 finished with value: 104.99534606933594 and parameters: {'lgb_n_estimators': 60, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.000660327142659241, 'lgb_subsample': 0.7040221734701735, 'lgb_colsample_byt

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:35,716] Trial 31 finished with value: 104.5672607421875 and parameters: {'lgb_n_estimators': 90, 'lgb_max_depth': 9, 'lgb_learning_rate': 0.000380258536824582, 'lgb_subsample': 0.594032821135376, 'lgb_colsample_bytree': 0.8997484019295007, 'nn_n_layers': 2, 'nn_hidden_dim_0': 33, 'nn_hidden_dim_1': 226, 'nn_learning_rate': 0.006743188645866281, 'nn_batch_size': 32, 'nn_num_epochs': 21}. Best is trial 31 with value: 104.5672607421875.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:36,264] Trial 32 finished with value: 105.16194915771484 and parameters: {'lgb_n_estimators': 92, 'lgb_max_depth': 8, 'lgb_learning_rate': 0.0005203837173892043, 'lgb_subsample': 0.5284351717927248, 'lgb_colsample_bytree': 0.8894114521178983, 'nn_n_layers': 3, 'nn_hidden_dim_0': 40, 'nn_hidden_dim_1': 232, 'nn_hidden_dim_2': 223, 'nn_learning_rate': 0.006935316879649457, 'nn_batch_size': 32, 'nn_num_epochs': 16}. Best is trial 31 with value: 104.5672607421875.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:36,493] Trial 33 finished with value: 106.59446716308594 and parameters: {'lgb_n_estimators': 76, 'lgb_max_depth': 9, 'lgb_learning_rate': 0.00022846886852720478, 'lgb_subsample': 0.64480717352178

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:36,809] Trial 34 finished with value: 105.74711608886719 and parameters: {'lgb_n_estimators': 95, 'lgb_max_depth': 8, 'lgb_learning_rate': 0.00014011719148488084, 'lgb_subsample': 0.5660912588931056, 'lgb_colsample_bytree': 0.9694800860514312, 'nn_n_layers': 3, 'nn_hidden_dim_0': 70, 'nn_hidden_dim_1': 173, 'nn_hidden_dim_2': 210, 'nn_learning_rate': 0.0013793096719111155, 'nn_batch_size': 256, 'nn_num_epochs': 28}. Best is trial 31 with value: 104.5672607421875.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:37,156] Trial 35 finished with value: 105.09170532226562 and parameters: {'lgb_n_estimators': 85, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.003719832745464487, 'lgb_subsample': 0.6848121298252398, 'lgb_colsample_bytree': 0.9553914207254042, 'nn_n_layers': 2, 'nn_hidden_dim_0': 94, 'nn_hidden_dim_1': 245, 'nn_learning_rate': 0.0026950869807247976, 'nn_batch_size': 32, 'nn_num_epochs': 61}. Best is trial 31 with value: 104.5672607421875.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:37,517] Trial 36 finished with value: 105.19314575195312 and parameters: {'lgb_n_estimators': 124, 'lgb_max_depth': 9, 'lgb_learning_rate': 0.000310304788250096, 'lgb_subsample': 0.6008229591745248, 'lgb_colsample_bytree': 0.8987182740738502, 'nn_n_layers': 3, 'nn_hidden_dim_0': 33, 'nn_hidden_dim_1': 196, 'nn_hidden_dim_2': 256, 'nn_learning_rate': 0.03001748430612727, 'nn_batch_size': 128, 'nn_num_epochs': 52}. Best is trial 31 with value: 104.5672607421875.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:37,812] Trial 37 finished with value: 106.72945404052734 and parameters: {'lgb_n_estimators': 101, 'lgb_max_depth': 9, 'lgb_learning_rate': 0.0014993135187394897, 'lgb_subsample': 0.65227355469724

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:38,028] Trial 38 finished with value: 105.24493408203125 and parameters: {'lgb_n_estimators': 75, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.0008299034749961804, 'lgb_subsample': 0.5359718088471037, 'lgb_colsample_bytree': 0.9411754094332717, 'nn_n_layers': 1, 'nn_hidden_dim_0': 170, 'nn_learning_rate': 0.004601582644459463, 'nn_batch_size': 256, 'nn_num_epochs': 46}. Best is trial 31 with value: 104.5672607421875.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:38,195] Trial 39 finished with value: 107.5615005493164 and parameters: {'lgb_n_estimators': 113, 'lgb_max_depth': 7, 'lgb_learning_rate': 0.007823924063453218, 'lgb_subsample': 0.6386356782357735, 'lgb_colsample_bytree': 0.8614585057075129, 'nn_n_layers': 2, 'nn_hidden_dim_0': 122, 'nn_hidden_dim_1': 33, 'nn_learning_rate': 0.0004812114916245511, 'nn_batch_size': 32, 'nn_num_epochs': 22}. Best is trial 31 with value: 104.5672607421875.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:38,379] Trial 40 finished with value: 103.45968627929688 and parameters: {'lgb_n_estimators': 84, 'lgb_max_depth': 8, 'lgb_learning_rate': 0.00027848775760416306, 'lgb_subsample': 0.6744402577794993, 'lgb_colsample_bytree': 0.7622452542593923, 'nn_n_layers': 2, 'nn_hidden_dim_0': 204, 'nn_hidden_dim_1': 215, 'nn_learning_rate': 0.05799908673567021, 'nn_batch_size': 32, 'nn_num_epochs': 10}. Best is trial 40 with value: 103.45968627929688.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:38,559] Trial 41 finished with value: 107.35567474365234 and parameters: {'lgb_n_estimators': 85, 'lgb_max_depth': 8, 'lgb_learning_rate': 0

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:38,749] Trial 42 finished with value: 104.81035614013672 and parameters: {'lgb_n_estimators': 62, 'lgb_max_depth': 8, 'lgb_learning_rate': 0.00013779746459518765, 'lgb_subsample': 0.5750806056710004, 'lgb_colsample_bytree': 0.9120795264699705, 'nn_n_layers': 2, 'nn_hidden_dim_0': 228, 'nn_hidden_dim_1': 234, 'nn_learning_rate': 0.013322216353318438, 'nn_batch_size': 32, 'nn_num_epochs': 17}. Best is trial 40 with value: 103.45968627929688.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:39,806] Trial 43 finished with value: 106.35819244384766 and parameters: {'lgb_n_estimators': 61, 'lgb_max_depth': 8, 'lgb_learning_rate': 0.00013865197575545172, 'lgb_subsample': 0.6058086407488158, 'lgb_colsample_bytree': 0.7539000253869703, 'nn_n_layers': 2, 'nn_hidden_dim_0': 231, 'nn_hidden_dim_1': 242, 'nn_learning_rate': 0.016660063263505068, 'nn_batch_size': 32, 'nn_num_epochs': 17}. Best is trial 40 with value: 103.45968627929688.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:39,979] Trial 44 finished with value: 104.08866119384766 and parameters: {'lgb_n_estimators': 63, 'lgb_max_depth': 8, 'lgb_learning_rate': 

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


[I 2024-07-26 08:16:40,135] Trial 45 finished with value: 104.52873992919922 and parameters: {'lgb_n_estimators': 50, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.0002462895811311937, 'lgb_subsample': 0.5218013897838257, 'lgb_colsample_bytree': 0.842212370767243, 'nn_n_layers': 1, 'nn_hidden_dim_0': 202, 'nn_learning_rate': 0.039624238933723646, 'nn_batch_size': 32, 'nn_num_epochs': 26}. Best is trial 40 with value: 103.45968627929688.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:40,255] Trial 46 finished with value: 104.9756851196289 and parameters: {'lgb_n_estimators': 50, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.0002183632727940508, 'lgb_subsample': 0.5187690167825391, 'lgb_colsample_bytree': 0.7109449739173778,

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:40,626] Trial 47 finished with value: 106.18160247802734 and parameters: {'lgb_n_estimators': 70, 'lgb_max_depth': 7, 'lgb_learning_rate': 0.0004827780794728424, 'lgb_subsample': 0.5020010487241862, 'lgb_colsample_bytree': 0.8083391200005406, 'nn_n_layers': 1, 'nn_hidden_dim_0': 202, 'nn_learning_rate': 0.05752637116376999, 'nn_batch_size': 32, 'nn_num_epochs': 23}. Best is trial 40 with value: 103.45968627929688.


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:40,843] Trial 48 finished with value: 101.91624450683594 and parameters: {'lgb_n_estimators': 57, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.0009649370536817564, 'lgb_subsample': 0.5330759057920634, 'lgb_colsample_bytree': 0.8463168041687263, 'nn_n_layers': 1, 'nn_hidden_dim_0': 172, 'nn_learning_rate': 0.060956933751015685, 'nn_batch_size': 256, 'nn_num_epochs': 10}. Best is trial 48 with value: 101.91624450683594.


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.5, 1.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:40,958] Trial 49 finished with value: 103.74559020996094 and parameters: {'lgb_n_estimators': 56, 'lgb_max_depth': 10, 'lgb_learning_rate': 0.000703869871932133, 'lgb_subsample': 0.5442127536467646, 'lgb_colsample_bytree': 0.7667767786096507, 'nn_n_layers': 1, 'nn_hidden_dim_0': 181, 'nn_learning_rate': 0.07111929774337371, 'nn_batch_size': 256, 'nn_num_epochs': 10}. Best is trial 48 with value: 101.91624450683594.


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 24, number of used features: 0
[LightGBM] [Info] Start training from score 7.791087
                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.99

In [21]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna
from catboost import CatBoostRegressor, Pool

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dims):
        super(NeuralNetwork, self).__init__()
        layers = []
        prev_dim = input_dim
        for dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, dim))
            layers.append(nn.ReLU())
            prev_dim = dim
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for CatBoost
    cat_params = {
        'iterations': trial.suggest_int('cat_iterations', 100, 1000),
        'depth': trial.suggest_int('cat_depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('cat_learning_rate', 1e-3, 1),
        'l2_leaf_reg': trial.suggest_loguniform('cat_l2_leaf_reg', 1e-8, 10),
        'border_count': trial.suggest_int('cat_border_count', 32, 255),
        'bagging_temperature': trial.suggest_loguniform('cat_bagging_temperature', 0.01, 100.0)
    }

    # Train CatBoost model
    cat_model = CatBoostRegressor(**cat_params, verbose=False)
    cat_model.fit(X_train_scaled, y_train)

    # Transform the data using the CatBoost model
    X_train_transformed = cat_model.calc_leaf_indexes(X_train_scaled)
    X_test_transformed = cat_model.calc_leaf_indexes(X_test_scaled)

    # Define hyperparameters to tune for Neural Network
    n_layers = trial.suggest_int('nn_n_layers', 1, 5)
    hidden_dims = [trial.suggest_int(f'nn_hidden_dim_{i}', 32, 256) for i in range(n_layers)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    nn_batch_size = trial.suggest_categorical('nn_batch_size', [32, 64, 128, 256])
    nn_num_epochs = trial.suggest_int('nn_num_epochs', 10, 100)

    # Create the Neural Network model
    nn_model = NeuralNetwork(X_train_transformed.shape[1], hidden_dims).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(nn_model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(torch.FloatTensor(X_train_transformed).to(device), y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=nn_batch_size, shuffle=True)

    # Training
    for epoch in range(nn_num_epochs):
        nn_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = nn_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    nn_model.eval()
    with torch.no_grad():
        y_pred = nn_model(torch.FloatTensor(X_test_transformed).to(device))
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final CatBoost model with the best hyperparameters
cat_params = {
    'iterations': best_params['cat_iterations'],
    'depth': best_params['cat_depth'],
    'learning_rate': best_params['cat_learning_rate'],
    'l2_leaf_reg': best_params['cat_l2_leaf_reg'],
    'border_count': best_params['cat_border_count'],
    'bagging_temperature': best_params['cat_bagging_temperature']
}
cat_model = CatBoostRegressor(**cat_params, verbose=False)
cat_model.fit(X_train_scaled, y_train)

# Transform the data using the CatBoost model
X_train_transformed = cat_model.calc_leaf_indexes(X_train_scaled)
X_test_transformed = cat_model.calc_leaf_indexes(X_test_scaled)

# Train the final Neural Network model with the best hyperparameters
nn_model = NeuralNetwork(X_train_transformed.shape[1], 
                         [best_params[f'nn_hidden_dim_{i}'] for i in range(best_params['nn_n_layers'])]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=best_params['nn_learning_rate'])

train_dataset = TensorDataset(torch.FloatTensor(X_train_transformed).to(device), y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['nn_batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['nn_num_epochs']):
    nn_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = nn_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
nn_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = nn_model(torch.FloatTensor(X_test_transformed).to(device))
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['CatBoost + NN'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:16:57,374] A new study created in memory with name: no-name-add0a9c7-a0a3-48c5-81ab-2b05f7fdf1b2


Using device: cpu


  'learning_rate': trial.suggest_loguniform('cat_learning_rate', 1e-3, 1),
  'l2_leaf_reg': trial.suggest_loguniform('cat_l2_leaf_reg', 1e-8, 10),
  'bagging_temperature': trial.suggest_loguniform('cat_bagging_temperature', 0.01, 100.0)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:16:58,723] Trial 0 finished with value: 68.5884017944336 and parameters: {'cat_iterations': 808, 'cat_depth': 9, 'cat_learning_rate': 0.006298568706135879, 'cat_l2_leaf_reg': 0.06032636615429546, 'cat_border_count': 55, 'cat_bagging_temperature': 0.24724308918778018, 'nn_n_layers': 2, 'nn_hidden_dim_0': 56, 'nn_hidden_dim_1': 96, 'nn_learning_rate': 0.000115418945612421, 'nn_batch_size': 32, 'nn_num_epochs': 49}. Best is trial 0 with value: 68.5884017944336.
  'learning_rate': trial.suggest_loguniform('cat_learning_rate', 1e-3, 1),
  'l2_leaf_reg': trial.suggest_loguniform('cat_l2_leaf_reg', 1e-8, 10),
  'bagging_temperature': trial.suggest_loguniform('cat_bag

                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   
DCN                   4.72361  0.996138      0.418879           

In [26]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class AutoInt(nn.Module):
    def __init__(self, input_dim, embedding_dim, num_heads, num_layers, dropout):
        super(AutoInt, self).__init__()
        self.embedding = nn.Linear(input_dim, embedding_dim)
        self.multi_head_attentions = nn.ModuleList([
            nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embedding_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(0)  # Add sequence dimension
        for attention in self.multi_head_attentions:
            x, _ = attention(x, x, x)
        x = x.squeeze(0)  # Remove sequence dimension
        x = self.dropout(x)
        return self.fc(x)

def objective(trial):
    # Define hyperparameters to tune
    num_heads = trial.suggest_int('num_heads', 1, 8)
    embedding_dim = trial.suggest_int('embedding_dim', num_heads, 256, step=num_heads)  # Ensure divisibility
    num_layers = trial.suggest_int('num_layers', 1, 5)
    dropout = trial.suggest_float('dropout', 0.0, 0.5)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the AutoInt model
    model = AutoInt(X_train.shape[1], embedding_dim, num_heads, num_layers, dropout).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final AutoInt model with the best hyperparameters
best_model = AutoInt(X_train.shape[1], 
                     best_params['embedding_dim'], 
                     best_params['num_heads'], 
                     best_params['num_layers'], 
                     best_params['dropout']).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['AutoInt'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:23:37,592] A new study created in memory with name: no-name-72a72849-e5aa-409c-9409-fb80cc019d2b


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:23:39,866] Trial 0 finished with value: 53.65874481201172 and parameters: {'num_heads': 8, 'embedding_dim': 216, 'num_layers': 5, 'dropout': 0.33285250940195116, 'learning_rate': 0.00016484901891047237, 'batch_size': 128, 'num_epochs': 83}. Best is trial 0 with value: 53.65874481201172.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:23:40,821] Trial 1 finished with value: 29091127296.0 and parameters: {'num_heads': 7, 'embedding_dim': 217, 'num_layers': 5, 'dropout': 0.4136746604752212, 'learning_rate': 0.08854690893073805, 'batch_size': 128, 'num_epochs': 35}. Best is trial 0 with value: 53.65874481201172.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:23:41,323] Trial 2 finished with value: 74.18865966796875 and parameters: {'num_heads': 3, 'embedding_dim': 138, 'num_layers': 1, 'dropout': 0.46379974863399825, 'le

                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   
DCN                   4.72361  0.996138      0.418879           

In [25]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class FTTransformer(nn.Module):
    def __init__(self, input_dim, num_tokens, dim, depth, heads, mlp_dim, dropout=0.1):
        super(FTTransformer, self).__init__()
        self.num_tokens = num_tokens
        self.input_dim = input_dim
        self.token_embedding = nn.Embedding(num_tokens * input_dim, dim)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=mlp_dim, dropout=dropout),
            num_layers=depth
        )
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, 1)
        )

    def forward(self, x):
        # Tokenize the input
        x = (x * (self.num_tokens - 1)).long()
        x = torch.clamp(x, 0, self.num_tokens - 1)  # Ensure indices are within range
        x = x + torch.arange(x.shape[1], device=x.device) * self.num_tokens
        x = self.token_embedding(x)
        
        # Apply transformer
        x = self.transformer(x.permute(1, 0, 2)).permute(1, 0, 2)
        
        # Pool and predict
        x = x.mean(dim=1)
        return self.mlp_head(x)

def objective(trial):
    # Define hyperparameters to tune
    num_tokens = trial.suggest_int('num_tokens', 8, 256)
    heads = trial.suggest_int('heads', 1, 8)
    dim = trial.suggest_int('dim', heads, 256, step=heads)  # Ensure dim is divisible by heads
    depth = trial.suggest_int('depth', 1, 6)
    mlp_dim = trial.suggest_int('mlp_dim', 32, 256)
    dropout = trial.suggest_float('dropout', 0.0, 0.5)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the FT Transformer model
    model = FTTransformer(X_train.shape[1], num_tokens, dim, depth, heads, mlp_dim, dropout).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final FT Transformer model with the best hyperparameters
best_model = FTTransformer(X_train.shape[1], 
                           best_params['num_tokens'], 
                           best_params['dim'], 
                           best_params['depth'], 
                           best_params['heads'], 
                           best_params['mlp_dim'], 
                           best_params['dropout']).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['FT-Transformer'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:21:41,082] A new study created in memory with name: no-name-c47986e0-ae22-4f5f-baba-0d189ed3e6a9


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:21:41,622] Trial 0 finished with value: 103.96244812011719 and parameters: {'num_tokens': 58, 'heads': 4, 'dim': 52, 'depth': 1, 'mlp_dim': 90, 'dropout': 0.4294006564307398, 'learning_rate': 0.0018117934028063763, 'batch_size': 64, 'num_epochs': 45}. Best is trial 0 with value: 103.96244812011719.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:21:42,882] Trial 1 finished with value: 107.64453887939453 and parameters: {'num_tokens': 93, 'heads': 1, 'dim': 145, 'depth': 4, 'mlp_dim': 222, 'dropout': 0.43217741185569525, 'learning_rate': 0.009877085966135644, 'batch_size': 128, 'num_epochs': 27}. Best is trial 0 with value: 103.96244812011719.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:21:44,394] Trial 2 finished with value: 92.91849517822266 and parameters: {'num_tokens': 164, 'heads': 7, 'dim': 196, 'depth': 3, 

                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   
DCN                   4.72361  0.996138      0.418879           

In [27]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class DynamicNet(nn.Module):
    def __init__(self, input_dim, layer_sizes, activations):
        super(DynamicNet, self).__init__()
        layers = []
        prev_dim = input_dim
        for size, activation in zip(layer_sizes, activations):
            layers.append(nn.Linear(prev_dim, size))
            if activation == 'relu':
                layers.append(nn.ReLU())
            elif activation == 'tanh':
                layers.append(nn.Tanh())
            elif activation == 'sigmoid':
                layers.append(nn.Sigmoid())
            prev_dim = size
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune
    n_layers = trial.suggest_int('n_layers', 1, 5)
    layer_sizes = [trial.suggest_int(f'n_units_l{i}', 32, 256) for i in range(n_layers)]
    activations = [trial.suggest_categorical(f'activation_l{i}', ['relu', 'tanh', 'sigmoid']) for i in range(n_layers)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the model
    model = DynamicNet(X_train.shape[1], layer_sizes, activations).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
layer_sizes = [best_params[f'n_units_l{i}'] for i in range(best_params['n_layers'])]
activations = [best_params[f'activation_l{i}'] for i in range(best_params['n_layers'])]
best_model = DynamicNet(X_train.shape[1], layer_sizes, activations).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['NAS'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:26:00,400] A new study created in memory with name: no-name-3e541fa6-fe18-4b91-b923-1b72e5976d50


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:26:00,616] Trial 0 finished with value: 87.46815490722656 and parameters: {'n_layers': 5, 'n_units_l0': 201, 'n_units_l1': 170, 'n_units_l2': 158, 'n_units_l3': 145, 'n_units_l4': 221, 'activation_l0': 'tanh', 'activation_l1': 'sigmoid', 'activation_l2': 'relu', 'activation_l3': 'sigmoid', 'activation_l4': 'sigmoid', 'learning_rate': 0.010669005980628142, 'batch_size': 64, 'num_epochs': 18}. Best is trial 0 with value: 87.46815490722656.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:26:00,673] Trial 1 finished with value: 109.96450805664062 and parameters: {'n_layers': 1, 'n_units_l0': 203, 'activation_l0': 'relu', 'learning_rate': 0.0012974182240801542, 'batch_size': 64, 'num_epochs': 16}. Best is trial 0 with value: 87.46815490722656.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:26:00,759] Trial 2 finished with

                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   
DCN                   4.72361  0.996138      0.418879           

In [38]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torchdiffeq import odeint_adjoint as odeint
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class ODEFunc(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, activation):
        super(ODEFunc, self).__init__()
        layers = []
        prev_dim = input_dim
        for _ in range(num_layers):
            layers.append(nn.Linear(prev_dim, hidden_dim))
            if activation == 'relu':
                layers.append(nn.ReLU())
            elif activation == 'tanh':
                layers.append(nn.Tanh())
            elif activation == 'sigmoid':
                layers.append(nn.Sigmoid())
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, input_dim))
        self.network = nn.Sequential(*layers)

    def forward(self, t, x):
        return self.network(x)

class NODE(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, activation):
        super(NODE, self).__init__()
        self.odefunc = ODEFunc(input_dim, hidden_dim, num_layers, activation)
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        t = torch.tensor([0, 1], dtype=torch.float32).to(x.device)
        out = odeint(self.odefunc, x, t, method='rk4')[-1]
        return self.fc(out)

def objective(trial):
    # Define hyperparameters to tune
    hidden_dim = trial.suggest_int('hidden_dim', 32, 128)
    num_layers = trial.suggest_int('num_layers', 1, 3)
    activation = trial.suggest_categorical('activation', ['relu', 'tanh', 'sigmoid'])
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    num_epochs = trial.suggest_int('num_epochs', 10, 30)

    # Create the NODE model
    model = NODE(X_train.shape[1], hidden_dim, num_layers, activation).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # Reduced number of trials for faster tuning

# Get the best hyperparameters
best_params = study.best_params

# Train the final NODE model with the best hyperparameters
best_model = NODE(X_train.shape[1], 
                  best_params['hidden_dim'], 
                  best_params['num_layers'], 
                  best_params['activation']).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['NODE'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:47:05,834] A new study created in memory with name: no-name-357476c1-4ae8-493f-94fd-34b393b5fb8b


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
[I 2024-07-26 08:47:06,483] Trial 0 finished with value: 110.83451080322266 and parameters: {'hidden_dim': 96, 'num_layers': 3, 'activation': 'sigmoid', 'learning_rate': 0.00019913092117633768, 'batch_size': 32, 'num_epochs': 25}. Best is trial 0 with value: 110.83451080322266.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
[I 2024-07-26 08:47:06,730] Trial 1 finished with value: 110.6949462890625 and parameters: {'hidden_dim': 123, 'num_layers': 3, 'activation': 'sigmoid', 'learning_rate': 0.0003890037162662886, 'batch_size': 64, 'num_epochs': 15}. Best is trial 1 with value: 110.6949462890625.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
[I 2024-07-26 08:47:06,916] Trial 2 finished with value: 104.4683837890625 and parameters: {'hidden_dim': 97, 'num_layers': 3, 'activation': 'sigmoid', 'learning_rate': 0.008446207350313182, 'batch_size': 32, 'num_epochs': 13}. 

                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   
DCN                   4.72361  0.996138      0.418879           

In [35]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna
from pytorch_tabnet.tab_model import TabNetRegressor

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape y_train and y_test to 2D arrays
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

def objective(trial):
    # Define hyperparameters to tune
    n_d = trial.suggest_int('n_d', 8, 64)
    n_a = trial.suggest_int('n_a', 8, 64)
    n_steps = trial.suggest_int('n_steps', 3, 10)
    gamma = trial.suggest_float('gamma', 1.0, 2.0)
    lambda_sparse = trial.suggest_float('lambda_sparse', 1e-6, 1e-3)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the TabNet model
    model = TabNetRegressor(
        n_d=n_d, n_a=n_a, n_steps=n_steps, gamma=gamma, lambda_sparse=lambda_sparse,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=learning_rate),
        scheduler_params={"step_size":50, "gamma":0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax'
    )

    # Training
    model.fit(
        X_train_scaled, y_train,
        eval_set=[(X_test_scaled, y_test)],
        max_epochs=num_epochs,
        patience=5,
        batch_size=batch_size,
        virtual_batch_size=batch_size // 4,
        num_workers=0,
        drop_last=False
    )

    # Evaluation
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final TabNet model with the best hyperparameters
best_model = TabNetRegressor(
    n_d=best_params['n_d'],
    n_a=best_params['n_a'],
    n_steps=best_params['n_steps'],
    gamma=best_params['gamma'],
    lambda_sparse=best_params['lambda_sparse'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=best_params['learning_rate']),
    scheduler_params={"step_size":50, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='entmax'
)

training_start_time = time.time()
best_model.fit(
    X_train_scaled, y_train,
    eval_set=[(X_test_scaled, y_test)],
    max_epochs=best_params['num_epochs'],
    patience=5,
    batch_size=best_params['batch_size'],
    virtual_batch_size=best_params['batch_size'] // 4,
    num_workers=0,
    drop_last=False
)
training_time = time.time() - training_start_time

# Evaluation
inference_start_time = time.time()
y_pred = best_model.predict(X_test_scaled)
inference_time = time.time() - inference_start_time

# Compute metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['TabNet'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:39:52,078] A new study created in memory with name: no-name-c7a724b3-ec54-4626-8671-f94974c00f33


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8256.16309| val_0_mse: 23110.13626|  0:00:00s
epoch 1  | loss: 8055.23438| val_0_mse: 20637.32674|  0:00:00s
epoch 2  | loss: 7819.08838| val_0_mse: 17622.58602|  0:00:00s
epoch 3  | loss: 7459.15283| val_0_mse: 14274.93994|  0:00:00s
epoch 4  | loss: 7402.29443| val_0_mse: 13705.05766|  0:00:00s
epoch 5  | loss: 7144.45703| val_0_mse: 13284.58937|  0:00:00s
epoch 6  | loss: 6884.3125| val_0_mse: 12814.47813|  0:00:00s
epoch 7  | loss: 6592.36719| val_0_mse: 12400.12692|  0:00:01s
epoch 8  | loss: 6331.91016| val_0_mse: 12183.93229|  0:00:01s
epoch 9  | loss: 6030.80127| val_0_mse: 11871.92538|  0:00:01s
epoch 10 | loss: 5773.92188| val_0_mse: 11227.09959|  0:00:01s
epoch 11 | loss: 5508.97852| val_0_mse: 11209.23243|  0:00:01s
epoch 12 | loss: 5292.21338| val_0_mse: 11170.52595|  0:00:01s
epoch 13 | loss: 5037.39404| val_0_mse: 10963.63377|  0:00:01s
epoch 14 | loss: 4804.24707| val_0_mse: 10890.63483|  0:00:01s
epoch 15 | loss: 4662.29053| val_0_mse: 10832.59968|  0:

[I 2024-07-26 08:39:54,465] Trial 0 finished with value: 103.30708498687908 and parameters: {'n_d': 63, 'n_a': 47, 'n_steps': 6, 'gamma': 1.3616047743555921, 'lambda_sparse': 0.0002613536321293988, 'learning_rate': 0.0029466101498928417, 'batch_size': 256, 'num_epochs': 17}. Best is trial 0 with value: 103.30708498687908.


epoch 16 | loss: 4477.21826| val_0_mse: 10672.35381|  0:00:02s
Stop training because you reached max_epochs = 17 with best_epoch = 16 and best_val_0_mse = 10672.35381


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8447.09277| val_0_mse: 12040.16196|  0:00:00s
epoch 1  | loss: 8486.91113| val_0_mse: 12033.37028|  0:00:00s
epoch 2  | loss: 8473.91699| val_0_mse: 12009.84318|  0:00:00s
epoch 3  | loss: 8443.22461| val_0_mse: 11987.46981|  0:00:00s
epoch 4  | loss: 8411.21875| val_0_mse: 11970.59511|  0:00:00s
epoch 5  | loss: 8369.91016| val_0_mse: 11939.05906|  0:00:00s
epoch 6  | loss: 8327.69238| val_0_mse: 11914.73228|  0:00:00s
epoch 7  | loss: 8321.55762| val_0_mse: 11892.34383|  0:00:00s
epoch 8  | loss: 8257.35938| val_0_mse: 11886.39601|  0:00:00s


[I 2024-07-26 08:39:55,403] Trial 1 finished with value: 108.99574901420779 and parameters: {'n_d': 17, 'n_a': 8, 'n_steps': 5, 'gamma': 1.6306151831579347, 'lambda_sparse': 0.0005263353861934946, 'learning_rate': 0.001930107001570886, 'batch_size': 128, 'num_epochs': 10}. Best is trial 0 with value: 103.30708498687908.


epoch 9  | loss: 8255.83301| val_0_mse: 11880.0733|  0:00:00s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_0_mse = 11880.0733
epoch 0  | loss: 8721.83594| val_0_mse: 12236.80553|  0:00:00s


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 1  | loss: 8718.60547| val_0_mse: 12141.47293|  0:00:00s
epoch 2  | loss: 8691.29492| val_0_mse: 12580.76302|  0:00:00s
epoch 3  | loss: 8656.21484| val_0_mse: 12692.31055|  0:00:00s
epoch 4  | loss: 8622.55566| val_0_mse: 12508.62363|  0:00:00s
epoch 5  | loss: 8621.91309| val_0_mse: 12490.57482|  0:00:00s
epoch 6  | loss: 8511.34766| val_0_mse: 12422.76036|  0:00:00s

Early stopping occurred at epoch 6 with best_epoch = 1 and best_val_0_mse = 12141.47293


[I 2024-07-26 08:39:56,189] Trial 2 finished with value: 110.18835204111991 and parameters: {'n_d': 56, 'n_a': 9, 'n_steps': 5, 'gamma': 1.0751051664224747, 'lambda_sparse': 0.0008356128793584141, 'learning_rate': 0.0011677641538001243, 'batch_size': 32, 'num_epochs': 12}. Best is trial 0 with value: 103.30708498687908.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8762.39941| val_0_mse: 14167.0732|  0:00:00s
epoch 1  | loss: 8494.30957| val_0_mse: 13616.13619|  0:00:00s
epoch 2  | loss: 8389.37207| val_0_mse: 13074.23848|  0:00:00s
epoch 3  | loss: 8329.07324| val_0_mse: 12572.67852|  0:00:00s
epoch 4  | loss: 8244.71582| val_0_mse: 11785.38729|  0:00:00s
epoch 5  | loss: 8085.28418| val_0_mse: 11689.03825|  0:00:01s
epoch 6  | loss: 7941.13867| val_0_mse: 11619.87173|  0:00:01s
epoch 7  | loss: 7886.42676| val_0_mse: 11226.8241|  0:00:01s
epoch 8  | loss: 7795.61426| val_0_mse: 11482.0034|  0:00:01s
epoch 9  | loss: 7510.39746| val_0_mse: 11344.4644|  0:00:01s
epoch 10 | loss: 7806.18066| val_0_mse: 11356.17586|  0:00:01s
epoch 11 | loss: 7375.67285| val_0_mse: 11151.73966|  0:00:02s
epoch 12 | loss: 7237.68018| val_0_mse: 10922.44476|  0:00:02s
epoch 13 | loss: 7340.42432| val_0_mse: 10700.24652|  0:00:02s
epoch 14 | loss: 7070.95312| val_0_mse: 10546.18326|  0:00:02s
epoch 15 | loss: 7164.77588| val_0_mse: 10297.10055|  0:00:

[I 2024-07-26 08:40:00,861] Trial 3 finished with value: 98.82440267473338 and parameters: {'n_d': 57, 'n_a': 49, 'n_steps': 9, 'gamma': 1.568857219419653, 'lambda_sparse': 0.0007881207158268259, 'learning_rate': 0.00514873933645679, 'batch_size': 32, 'num_epochs': 66}. Best is trial 3 with value: 98.82440267473338.


epoch 24 | loss: 6356.10547| val_0_mse: 10379.56495|  0:00:04s

Early stopping occurred at epoch 24 with best_epoch = 19 and best_val_0_mse = 9766.26256


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8512.85547| val_0_mse: 11306.63029|  0:00:00s
epoch 1  | loss: 8344.90137| val_0_mse: 11035.73472|  0:00:00s
epoch 2  | loss: 8215.30176| val_0_mse: 10874.33832|  0:00:00s
epoch 3  | loss: 8060.75391| val_0_mse: 10778.93539|  0:00:00s
epoch 4  | loss: 7915.10352| val_0_mse: 10620.3018|  0:00:00s
epoch 5  | loss: 7785.61963| val_0_mse: 10244.85773|  0:00:00s
epoch 6  | loss: 7644.0708| val_0_mse: 10089.66333|  0:00:00s
epoch 7  | loss: 7529.88135| val_0_mse: 10000.2191|  0:00:00s
epoch 8  | loss: 7379.44092| val_0_mse: 9913.56042|  0:00:00s
epoch 9  | loss: 7278.05078| val_0_mse: 9698.06843|  0:00:01s
epoch 10 | loss: 7139.39404| val_0_mse: 9409.78731|  0:00:01s
epoch 11 | loss: 7007.36084| val_0_mse: 9077.4135|  0:00:01s
epoch 12 | loss: 6904.41357| val_0_mse: 8850.79072|  0:00:01s
epoch 13 | loss: 6778.73193| val_0_mse: 8613.46412|  0:00:01s
epoch 14 | loss: 6646.71729| val_0_mse: 8456.08498|  0:00:01s
epoch 15 | loss: 6506.62305| val_0_mse: 8310.29217|  0:00:02s
epoc

[I 2024-07-26 08:40:07,504] Trial 4 finished with value: 27.649455942998415 and parameters: {'n_d': 21, 'n_a': 15, 'n_steps': 6, 'gamma': 1.1563402160779976, 'lambda_sparse': 4.92728781179181e-05, 'learning_rate': 0.00732366909994917, 'batch_size': 256, 'num_epochs': 82}. Best is trial 4 with value: 27.649455942998415.


epoch 68 | loss: 114.92049| val_0_mse: 809.06785|  0:00:06s

Early stopping occurred at epoch 68 with best_epoch = 63 and best_val_0_mse = 764.49241
epoch 0  | loss: 8401.36035| val_0_mse: 10956.0983|  0:00:00s


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 1  | loss: 8424.4541| val_0_mse: 10832.04422|  0:00:00s
epoch 2  | loss: 8296.53613| val_0_mse: 10690.29723|  0:00:00s
epoch 3  | loss: 8346.74512| val_0_mse: 10937.42856|  0:00:00s
epoch 4  | loss: 8300.81543| val_0_mse: 10785.23161|  0:00:00s
epoch 5  | loss: 8221.76562| val_0_mse: 10677.48368|  0:00:00s
epoch 6  | loss: 8112.52539| val_0_mse: 10756.70303|  0:00:00s
epoch 7  | loss: 8155.13867| val_0_mse: 10668.20769|  0:00:00s
epoch 8  | loss: 7962.42578| val_0_mse: 10138.99787|  0:00:00s
epoch 9  | loss: 8010.38525| val_0_mse: 10119.21585|  0:00:01s
epoch 10 | loss: 7801.07568| val_0_mse: 10008.58214|  0:00:01s
epoch 11 | loss: 7845.75 | val_0_mse: 9890.54286|  0:00:01s
epoch 12 | loss: 7731.3833| val_0_mse: 9727.28594|  0:00:01s
epoch 13 | loss: 7767.06885| val_0_mse: 9677.57137|  0:00:01s
epoch 14 | loss: 7721.44873| val_0_mse: 9736.5516|  0:00:01s
epoch 15 | loss: 7865.17969| val_0_mse: 10405.46784|  0:00:01s
epoch 16 | loss: 7452.96924| val_0_mse: 10481.80226|  0:00:01s


[I 2024-07-26 08:40:09,468] Trial 5 finished with value: 98.37464799316282 and parameters: {'n_d': 9, 'n_a': 59, 'n_steps': 6, 'gamma': 1.7745717377848533, 'lambda_sparse': 0.00015995084229871053, 'learning_rate': 0.010309222764741771, 'batch_size': 32, 'num_epochs': 83}. Best is trial 4 with value: 27.649455942998415.


epoch 17 | loss: 7507.26221| val_0_mse: 10412.31426|  0:00:01s
epoch 18 | loss: 7249.41846| val_0_mse: 10308.10437|  0:00:01s

Early stopping occurred at epoch 18 with best_epoch = 13 and best_val_0_mse = 9677.57137


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8647.73828| val_0_mse: 8165.95436|  0:00:00s
epoch 1  | loss: 8717.12988| val_0_mse: 8754.86121|  0:00:00s
epoch 2  | loss: 8693.1084| val_0_mse: 8750.04019|  0:00:00s
epoch 3  | loss: 8547.6748| val_0_mse: 9316.04376|  0:00:00s


[I 2024-07-26 08:40:10,491] Trial 6 finished with value: 90.36567027498232 and parameters: {'n_d': 20, 'n_a': 58, 'n_steps': 10, 'gamma': 1.4554309266617094, 'lambda_sparse': 4.549304793959321e-05, 'learning_rate': 0.0001363839100393466, 'batch_size': 32, 'num_epochs': 93}. Best is trial 4 with value: 27.649455942998415.


epoch 4  | loss: 8565.22168| val_0_mse: 9974.06982|  0:00:00s
epoch 5  | loss: 8527.91504| val_0_mse: 9976.38421|  0:00:00s

Early stopping occurred at epoch 5 with best_epoch = 0 and best_val_0_mse = 8165.95436


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8417.88184| val_0_mse: 13349.37306|  0:00:00s
epoch 1  | loss: 8106.97168| val_0_mse: 11542.97814|  0:00:00s
epoch 2  | loss: 7832.44629| val_0_mse: 11850.71225|  0:00:00s
epoch 3  | loss: 7611.81104| val_0_mse: 12422.78975|  0:00:00s
epoch 4  | loss: 7193.55762| val_0_mse: 13089.49233|  0:00:00s
epoch 5  | loss: 7097.72559| val_0_mse: 11446.08326|  0:00:00s
epoch 6  | loss: 6644.7168| val_0_mse: 10332.67524|  0:00:00s
epoch 7  | loss: 6097.85205| val_0_mse: 8640.22926|  0:00:00s
epoch 8  | loss: 5728.21143| val_0_mse: 6792.77256|  0:00:00s
epoch 9  | loss: 5556.45898| val_0_mse: 4237.99897|  0:00:00s
epoch 10 | loss: 5001.85352| val_0_mse: 4181.14304|  0:00:00s
epoch 11 | loss: 4870.01855| val_0_mse: 4392.91384|  0:00:00s
epoch 12 | loss: 4173.75586| val_0_mse: 4984.08078|  0:00:00s
epoch 13 | loss: 4129.77148| val_0_mse: 5879.81956|  0:00:00s
epoch 14 | loss: 3037.05371| val_0_mse: 11055.47178|  0:00:01s
epoch 15 | loss: 3297.3833| val_0_mse: 17241.28624|  0:00:01s



[I 2024-07-26 08:40:11,661] Trial 7 finished with value: 64.6617587388766 and parameters: {'n_d': 27, 'n_a': 56, 'n_steps': 4, 'gamma': 1.9356130686768203, 'lambda_sparse': 0.0006750188745237965, 'learning_rate': 0.02340030399656239, 'batch_size': 64, 'num_epochs': 90}. Best is trial 4 with value: 27.649455942998415.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8459.98145| val_0_mse: 13362.60305|  0:00:00s
epoch 1  | loss: 8406.10449| val_0_mse: 13117.14256|  0:00:00s
epoch 2  | loss: 8319.35254| val_0_mse: 13075.89923|  0:00:00s
epoch 3  | loss: 8230.79492| val_0_mse: 13025.22159|  0:00:00s
epoch 4  | loss: 8164.08398| val_0_mse: 13024.63646|  0:00:00s
epoch 5  | loss: 8112.86768| val_0_mse: 13067.19502|  0:00:00s
epoch 6  | loss: 8054.97656| val_0_mse: 13080.03873|  0:00:00s
epoch 7  | loss: 7993.62109| val_0_mse: 13095.32582|  0:00:00s
epoch 8  | loss: 7947.22949| val_0_mse: 13101.84963|  0:00:00s
epoch 9  | loss: 7884.82861| val_0_mse: 12373.37184|  0:00:00s
epoch 10 | loss: 7825.28174| val_0_mse: 13107.80332|  0:00:00s
epoch 11 | loss: 7762.11719| val_0_mse: 13484.4894|  0:00:00s
epoch 12 | loss: 7754.479| val_0_mse: 12542.22396|  0:00:00s
epoch 13 | loss: 7661.52002| val_0_mse: 12427.89825|  0:00:00s
epoch 14 | loss: 7649.26562| val_0_mse: 12465.82038|  0:00:01s


[I 2024-07-26 08:40:12,772] Trial 8 finished with value: 111.23565902368959 and parameters: {'n_d': 9, 'n_a': 47, 'n_steps': 5, 'gamma': 1.4884660272398018, 'lambda_sparse': 0.0004609322731393331, 'learning_rate': 0.002701315731345864, 'batch_size': 128, 'num_epochs': 58}. Best is trial 4 with value: 27.649455942998415.



Early stopping occurred at epoch 14 with best_epoch = 9 and best_val_0_mse = 12373.37184


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8434.80371| val_0_mse: 10547.79266|  0:00:02s
epoch 1  | loss: 8158.34131| val_0_mse: 12745.97085|  0:00:02s
epoch 2  | loss: 7911.22705| val_0_mse: 10976.70784|  0:00:02s
epoch 3  | loss: 7855.74658| val_0_mse: 11533.42978|  0:00:02s


[I 2024-07-26 08:40:16,137] Trial 9 finished with value: 102.70244721953317 and parameters: {'n_d': 28, 'n_a': 17, 'n_steps': 9, 'gamma': 1.6146435157821433, 'lambda_sparse': 0.000306150795524749, 'learning_rate': 0.016132475371065286, 'batch_size': 256, 'num_epochs': 29}. Best is trial 4 with value: 27.649455942998415.


epoch 4  | loss: 7550.71436| val_0_mse: 11297.02554|  0:00:03s
epoch 5  | loss: 7543.32178| val_0_mse: 11076.71253|  0:00:03s

Early stopping occurred at epoch 5 with best_epoch = 0 and best_val_0_mse = 10547.79266


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8478.72949| val_0_mse: 10122.652|  0:00:00s
epoch 1  | loss: 7673.36084| val_0_mse: 16488.38675|  0:00:00s
epoch 2  | loss: 6335.55859| val_0_mse: 33507.81995|  0:00:00s
epoch 3  | loss: 4785.95557| val_0_mse: 28763.55291|  0:00:00s
epoch 4  | loss: 3199.34277| val_0_mse: 53987.92402|  0:00:00s


[I 2024-07-26 08:40:16,546] Trial 10 finished with value: 100.61139098155233 and parameters: {'n_d': 42, 'n_a': 30, 'n_steps': 3, 'gamma': 1.0522916095225474, 'lambda_sparse': 1.6775133152212835e-05, 'learning_rate': 0.09044593082214665, 'batch_size': 256, 'num_epochs': 42}. Best is trial 4 with value: 27.649455942998415.


epoch 5  | loss: 1831.31189| val_0_mse: 163710.41369|  0:00:00s

Early stopping occurred at epoch 5 with best_epoch = 0 and best_val_0_mse = 10122.652
epoch 0  | loss: 8348.31152| val_0_mse: 16343.96324|  0:00:00s


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 1  | loss: 7894.73438| val_0_mse: 12546.3436|  0:00:00s
epoch 2  | loss: 6921.69971| val_0_mse: 9374.81437|  0:00:00s
epoch 3  | loss: 6281.90283| val_0_mse: 5436.85223|  0:00:00s
epoch 4  | loss: 5544.02637| val_0_mse: 6782.4344|  0:00:00s


[I 2024-07-26 08:40:17,187] Trial 11 finished with value: 73.73501362737736 and parameters: {'n_d': 35, 'n_a': 29, 'n_steps': 3, 'gamma': 1.9355043278729966, 'lambda_sparse': 0.0005840956170830762, 'learning_rate': 0.046149488002888435, 'batch_size': 64, 'num_epochs': 100}. Best is trial 4 with value: 27.649455942998415.


epoch 5  | loss: 5324.3374| val_0_mse: 9767.09567|  0:00:00s
epoch 6  | loss: 4530.66846| val_0_mse: 13007.69635|  0:00:00s
epoch 7  | loss: 3656.68188| val_0_mse: 14173.7584|  0:00:00s
epoch 8  | loss: 2670.19287| val_0_mse: 19871.60028|  0:00:00s

Early stopping occurred at epoch 8 with best_epoch = 3 and best_val_0_mse = 5436.85223


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8517.91895| val_0_mse: 12002.75445|  0:00:00s
epoch 1  | loss: 8182.55566| val_0_mse: 12467.76104|  0:00:00s
epoch 2  | loss: 7917.96436| val_0_mse: 11646.99842|  0:00:00s
epoch 3  | loss: 7738.11035| val_0_mse: 11704.67132|  0:00:00s
epoch 4  | loss: 7386.04736| val_0_mse: 12631.79629|  0:00:00s
epoch 5  | loss: 7166.44824| val_0_mse: 12140.38302|  0:00:00s


[I 2024-07-26 08:40:18,325] Trial 12 finished with value: 107.92126030070835 and parameters: {'n_d': 27, 'n_a': 21, 'n_steps': 7, 'gamma': 1.2594794457044132, 'lambda_sparse': 0.0009814253235156639, 'learning_rate': 0.022514823550334916, 'batch_size': 64, 'num_epochs': 77}. Best is trial 4 with value: 27.649455942998415.


epoch 6  | loss: 6835.06934| val_0_mse: 11864.60449|  0:00:00s
epoch 7  | loss: 6246.25049| val_0_mse: 12339.21175|  0:00:00s

Early stopping occurred at epoch 7 with best_epoch = 2 and best_val_0_mse = 11646.99842


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8439.87793| val_0_mse: 12504.84436|  0:00:00s
epoch 1  | loss: 8540.82324| val_0_mse: 13582.63158|  0:00:00s
epoch 2  | loss: 8536.84863| val_0_mse: 12953.7427|  0:00:00s
epoch 3  | loss: 8530.77246| val_0_mse: 12917.94193|  0:00:00s


[I 2024-07-26 08:40:18,850] Trial 13 finished with value: 111.82506139957556 and parameters: {'n_d': 42, 'n_a': 38, 'n_steps': 4, 'gamma': 1.9761419056839338, 'lambda_sparse': 0.0006867909143118176, 'learning_rate': 0.000577877890462599, 'batch_size': 64, 'num_epochs': 77}. Best is trial 4 with value: 27.649455942998415.


epoch 4  | loss: 8441.04297| val_0_mse: 12985.39373|  0:00:00s
epoch 5  | loss: 8522.66016| val_0_mse: 12721.1228|  0:00:00s

Early stopping occurred at epoch 5 with best_epoch = 0 and best_val_0_mse = 12504.84436


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8098.49268| val_0_mse: 10442.88745|  0:00:00s
epoch 1  | loss: 7982.875| val_0_mse: 11865.3432|  0:00:00s
epoch 2  | loss: 7723.03809| val_0_mse: 12024.59961|  0:00:00s
epoch 3  | loss: 7638.67773| val_0_mse: 11188.25584|  0:00:00s
epoch 4  | loss: 7348.64307| val_0_mse: 10162.92335|  0:00:00s
epoch 5  | loss: 7017.71729| val_0_mse: 12896.19634|  0:00:00s
epoch 6  | loss: 7022.67236| val_0_mse: 13877.40786|  0:00:00s
epoch 7  | loss: 6913.0791| val_0_mse: 16112.94954|  0:00:00s
epoch 8  | loss: 6475.12744| val_0_mse: 11924.54515|  0:00:00s
epoch 9  | loss: 6218.85547| val_0_mse: 9809.30984|  0:00:01s
epoch 10 | loss: 6027.46729| val_0_mse: 11102.93594|  0:00:01s
epoch 11 | loss: 5693.11523| val_0_mse: 10578.60835|  0:00:01s
epoch 12 | loss: 5258.85205| val_0_mse: 10408.06044|  0:00:01s
epoch 13 | loss: 5088.94922| val_0_mse: 10161.0425|  0:00:01s
epoch 14 | loss: 4567.67383| val_0_mse: 9570.65062|  0:00:01s
epoch 15 | loss: 5096.55957| val_0_mse: 8998.85723|  0:00:01s


[I 2024-07-26 08:40:22,888] Trial 14 finished with value: 58.30003454807836 and parameters: {'n_d': 23, 'n_a': 64, 'n_steps': 7, 'gamma': 1.173499615297842, 'lambda_sparse': 0.00038472899568851393, 'learning_rate': 0.008206684245246607, 'batch_size': 64, 'num_epochs': 88}. Best is trial 4 with value: 27.649455942998415.


epoch 32 | loss: 652.87177| val_0_mse: 3910.77863|  0:00:03s
epoch 33 | loss: 1030.03394| val_0_mse: 3946.37408|  0:00:03s

Early stopping occurred at epoch 33 with best_epoch = 28 and best_val_0_mse = 3398.89403


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8475.87695| val_0_mse: 12999.25196|  0:00:00s
epoch 1  | loss: 8337.9541| val_0_mse: 11814.80009|  0:00:00s
epoch 2  | loss: 8134.41846| val_0_mse: 11894.96521|  0:00:00s
epoch 3  | loss: 8002.27197| val_0_mse: 10980.08327|  0:00:00s
epoch 4  | loss: 7807.4248| val_0_mse: 11064.37763|  0:00:00s
epoch 5  | loss: 7660.98291| val_0_mse: 10881.39763|  0:00:00s
epoch 6  | loss: 7514.87402| val_0_mse: 10784.56832|  0:00:00s
epoch 7  | loss: 7345.81055| val_0_mse: 10873.50094|  0:00:00s
epoch 8  | loss: 7169.50586| val_0_mse: 10924.5151|  0:00:00s
epoch 9  | loss: 7005.66064| val_0_mse: 11148.84828|  0:00:00s


[I 2024-07-26 08:40:24,159] Trial 15 finished with value: 103.84877619155472 and parameters: {'n_d': 17, 'n_a': 37, 'n_steps': 7, 'gamma': 1.2129856033019195, 'lambda_sparse': 0.00036112893330574313, 'learning_rate': 0.007433476232629437, 'batch_size': 256, 'num_epochs': 68}. Best is trial 4 with value: 27.649455942998415.


epoch 10 | loss: 6853.87939| val_0_mse: 11109.14483|  0:00:01s
epoch 11 | loss: 6696.84766| val_0_mse: 11043.80542|  0:00:01s

Early stopping occurred at epoch 11 with best_epoch = 6 and best_val_0_mse = 10784.56832


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8534.08887| val_0_mse: 14846.39928|  0:00:00s
epoch 1  | loss: 8494.70996| val_0_mse: 13753.67167|  0:00:00s
epoch 2  | loss: 8404.25781| val_0_mse: 15706.88417|  0:00:00s
epoch 3  | loss: 8364.69727| val_0_mse: 14624.85772|  0:00:00s
epoch 4  | loss: 8285.87598| val_0_mse: 15030.30744|  0:00:00s


[I 2024-07-26 08:40:25,152] Trial 16 finished with value: 117.27604899465608 and parameters: {'n_d': 37, 'n_a': 63, 'n_steps': 8, 'gamma': 1.1518595354921382, 'lambda_sparse': 0.00016400907510827903, 'learning_rate': 0.000628170794608913, 'batch_size': 256, 'num_epochs': 46}. Best is trial 4 with value: 27.649455942998415.


epoch 5  | loss: 8200.27441| val_0_mse: 14522.65178|  0:00:00s
epoch 6  | loss: 8137.50537| val_0_mse: 14078.73237|  0:00:00s

Early stopping occurred at epoch 6 with best_epoch = 1 and best_val_0_mse = 13753.67167


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8759.14258| val_0_mse: 9408.953|  0:00:00s
epoch 1  | loss: 8722.23145| val_0_mse: 8739.94878|  0:00:00s
epoch 2  | loss: 8530.28125| val_0_mse: 9171.18396|  0:00:00s
epoch 3  | loss: 8416.44141| val_0_mse: 9300.8242|  0:00:00s
epoch 4  | loss: 8307.04688| val_0_mse: 8610.6008|  0:00:00s
epoch 5  | loss: 8275.26074| val_0_mse: 8710.49568|  0:00:00s
epoch 6  | loss: 8265.41504| val_0_mse: 8668.76834|  0:00:00s
epoch 7  | loss: 8073.14453| val_0_mse: 8712.9013|  0:00:00s
epoch 8  | loss: 8064.89453| val_0_mse: 8463.80609|  0:00:00s
epoch 9  | loss: 7979.64697| val_0_mse: 8506.39666|  0:00:00s
epoch 10 | loss: 7820.58398| val_0_mse: 8988.5091|  0:00:00s
epoch 11 | loss: 7786.22363| val_0_mse: 8875.68708|  0:00:01s
epoch 12 | loss: 7685.01074| val_0_mse: 8935.4048|  0:00:01s
epoch 13 | loss: 7572.24463| val_0_mse: 8906.39879|  0:00:01s

Early stopping occurred at epoch 13 with best_epoch = 8 and best_val_0_mse = 8463.80609


[I 2024-07-26 08:40:26,542] Trial 17 finished with value: 91.99894614917595 and parameters: {'n_d': 21, 'n_a': 18, 'n_steps': 8, 'gamma': 1.3227342254236643, 'lambda_sparse': 0.00042919447217360933, 'learning_rate': 0.005238643860091839, 'batch_size': 64, 'num_epochs': 88}. Best is trial 4 with value: 27.649455942998415.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8333.87402| val_0_mse: 10212.7298|  0:00:00s
epoch 1  | loss: 7897.12891| val_0_mse: 11096.3134|  0:00:00s
epoch 2  | loss: 7553.37354| val_0_mse: 10529.96485|  0:00:00s
epoch 3  | loss: 7039.26318| val_0_mse: 10054.43549|  0:00:00s
epoch 4  | loss: 6655.29297| val_0_mse: 10193.36486|  0:00:00s
epoch 5  | loss: 6438.22217| val_0_mse: 10919.96788|  0:00:00s
epoch 6  | loss: 6081.28369| val_0_mse: 11376.89273|  0:00:00s
epoch 7  | loss: 5729.15381| val_0_mse: 12208.12455|  0:00:00s
epoch 8  | loss: 5532.33643| val_0_mse: 11842.12394|  0:00:00s


[I 2024-07-26 08:40:27,590] Trial 18 finished with value: 100.27180807221315 and parameters: {'n_d': 31, 'n_a': 27, 'n_steps': 8, 'gamma': 1.1352576120017848, 'lambda_sparse': 0.00017286614235118532, 'learning_rate': 0.011387622196006392, 'batch_size': 128, 'num_epochs': 69}. Best is trial 4 with value: 27.649455942998415.



Early stopping occurred at epoch 8 with best_epoch = 3 and best_val_0_mse = 10054.43549


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8700.98438| val_0_mse: 13489.32815|  0:00:00s
epoch 1  | loss: 8456.41309| val_0_mse: 13215.3966|  0:00:00s
epoch 2  | loss: 8301.95996| val_0_mse: 11168.41757|  0:00:00s
epoch 3  | loss: 8041.729| val_0_mse: 9682.64917|  0:00:00s
epoch 4  | loss: 7808.14453| val_0_mse: 10575.48904|  0:00:00s
epoch 5  | loss: 7490.77881| val_0_mse: 10954.46787|  0:00:00s
epoch 6  | loss: 7087.11279| val_0_mse: 10572.47979|  0:00:00s


[I 2024-07-26 08:40:28,714] Trial 19 finished with value: 98.40045308221238 and parameters: {'n_d': 14, 'n_a': 41, 'n_steps': 7, 'gamma': 1.3931488648915051, 'lambda_sparse': 0.0002382459694088816, 'learning_rate': 0.03644843868541327, 'batch_size': 64, 'num_epochs': 97}. Best is trial 4 with value: 27.649455942998415.


epoch 7  | loss: 7324.38281| val_0_mse: 10072.18798|  0:00:00s
epoch 8  | loss: 6820.21875| val_0_mse: 10828.68965|  0:00:00s

Early stopping occurred at epoch 8 with best_epoch = 3 and best_val_0_mse = 9682.64917


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8348.66016| val_0_mse: 13817.53519|  0:00:00s
epoch 1  | loss: 8249.56934| val_0_mse: 13613.38777|  0:00:00s
epoch 2  | loss: 8176.41064| val_0_mse: 13253.21967|  0:00:00s
epoch 3  | loss: 8105.86572| val_0_mse: 13362.83593|  0:00:00s
epoch 4  | loss: 8054.76807| val_0_mse: 13538.32029|  0:00:00s
epoch 5  | loss: 8004.17041| val_0_mse: 13569.88317|  0:00:00s
epoch 6  | loss: 7958.80859| val_0_mse: 13438.54999|  0:00:00s


[I 2024-07-26 08:40:29,409] Trial 20 finished with value: 115.12262882125101 and parameters: {'n_d': 23, 'n_a': 13, 'n_steps': 6, 'gamma': 1.0033150726666615, 'lambda_sparse': 7.494233009174575e-05, 'learning_rate': 0.0010251628167877867, 'batch_size': 256, 'num_epochs': 53}. Best is trial 4 with value: 27.649455942998415.


epoch 7  | loss: 7905.51025| val_0_mse: 13359.51107|  0:00:00s

Early stopping occurred at epoch 7 with best_epoch = 2 and best_val_0_mse = 13253.21967
epoch 0  | loss: 8364.0166| val_0_mse: 14710.9107|  0:00:00s


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 1  | loss: 8116.69385| val_0_mse: 12718.47163|  0:00:00s
epoch 2  | loss: 7941.78613| val_0_mse: 12697.38232|  0:00:00s
epoch 3  | loss: 7557.5415| val_0_mse: 13547.19845|  0:00:00s
epoch 4  | loss: 7288.2334| val_0_mse: 13778.12605|  0:00:00s
epoch 5  | loss: 6679.53809| val_0_mse: 12928.76855|  0:00:00s
epoch 6  | loss: 6351.2207| val_0_mse: 7367.17947|  0:00:00s
epoch 7  | loss: 6258.1958| val_0_mse: 6670.97295|  0:00:00s
epoch 8  | loss: 5338.7959| val_0_mse: 6103.93674|  0:00:00s
epoch 9  | loss: 4717.07764| val_0_mse: 6060.35834|  0:00:00s
epoch 10 | loss: 4201.81055| val_0_mse: 5914.01781|  0:00:00s
epoch 11 | loss: 3316.05469| val_0_mse: 5937.27532|  0:00:00s
epoch 12 | loss: 3170.19238| val_0_mse: 7141.56569|  0:00:01s
epoch 13 | loss: 2339.31738| val_0_mse: 10842.36123|  0:00:01s


[I 2024-07-26 08:40:30,779] Trial 21 finished with value: 76.90265148771591 and parameters: {'n_d': 28, 'n_a': 54, 'n_steps': 4, 'gamma': 1.8423628654311677, 'lambda_sparse': 0.0006138658556408038, 'learning_rate': 0.03158831546202484, 'batch_size': 64, 'num_epochs': 84}. Best is trial 4 with value: 27.649455942998415.


epoch 14 | loss: 2058.63843| val_0_mse: 13555.09677|  0:00:01s
epoch 15 | loss: 2697.16528| val_0_mse: 16921.1855|  0:00:01s

Early stopping occurred at epoch 15 with best_epoch = 10 and best_val_0_mse = 5914.01781


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8603.8252| val_0_mse: 16446.10364|  0:00:01s
epoch 1  | loss: 8010.84033| val_0_mse: 17027.98676|  0:00:01s
epoch 2  | loss: 7360.90723| val_0_mse: 7857.58894|  0:00:01s
epoch 3  | loss: 6562.45947| val_0_mse: 19892.83817|  0:00:01s
epoch 4  | loss: 5898.22754| val_0_mse: 27635.84816|  0:00:01s
epoch 5  | loss: 5069.37646| val_0_mse: 35588.78001|  0:00:01s


[I 2024-07-26 08:40:33,086] Trial 22 finished with value: 88.64304224554238 and parameters: {'n_d': 24, 'n_a': 64, 'n_steps': 4, 'gamma': 1.7525437034029765, 'lambda_sparse': 0.0007371531572096128, 'learning_rate': 0.07922193575912555, 'batch_size': 64, 'num_epochs': 89}. Best is trial 4 with value: 27.649455942998415.


epoch 6  | loss: 4088.83887| val_0_mse: 45452.48631|  0:00:01s
epoch 7  | loss: 3578.79541| val_0_mse: 62352.48314|  0:00:02s

Early stopping occurred at epoch 7 with best_epoch = 2 and best_val_0_mse = 7857.58894


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8633.33496| val_0_mse: 6839.38934|  0:00:00s
epoch 1  | loss: 8273.11816| val_0_mse: 7378.03251|  0:00:00s
epoch 2  | loss: 7855.49609| val_0_mse: 7280.71715|  0:00:00s
epoch 3  | loss: 7432.57324| val_0_mse: 8492.0348|  0:00:00s
epoch 4  | loss: 7011.39258| val_0_mse: 8269.32929|  0:00:00s
epoch 5  | loss: 6462.71191| val_0_mse: 2769.09869|  0:00:00s
epoch 6  | loss: 6330.91406| val_0_mse: 2407.87857|  0:00:00s
epoch 7  | loss: 5603.52783| val_0_mse: 1757.27259|  0:00:00s
epoch 8  | loss: 5215.52246| val_0_mse: 1056.32327|  0:00:00s
epoch 9  | loss: 5177.76025| val_0_mse: 816.5443|  0:00:01s
epoch 10 | loss: 4581.8584| val_0_mse: 1285.6773|  0:00:01s
epoch 11 | loss: 4421.73877| val_0_mse: 1041.33952|  0:00:01s
epoch 12 | loss: 4181.3125| val_0_mse: 1351.37448|  0:00:01s
epoch 13 | loss: 3395.61475| val_0_mse: 1016.18194|  0:00:01s
epoch 14 | loss: 3319.2395| val_0_mse: 435.68885|  0:00:01s
epoch 15 | loss: 2820.70264| val_0_mse: 617.68171|  0:00:01s
epoch 16 | loss: 

[I 2024-07-26 08:40:35,193] Trial 23 finished with value: 20.873160944553998 and parameters: {'n_d': 34, 'n_a': 56, 'n_steps': 5, 'gamma': 1.2462081377166927, 'lambda_sparse': 0.0009268419160555327, 'learning_rate': 0.015156122967260719, 'batch_size': 64, 'num_epochs': 77}. Best is trial 23 with value: 20.873160944553998.


epoch 18 | loss: 1928.90674| val_0_mse: 1884.98449|  0:00:01s
epoch 19 | loss: 1884.50793| val_0_mse: 3271.46177|  0:00:01s

Early stopping occurred at epoch 19 with best_epoch = 14 and best_val_0_mse = 435.68885


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8253.48047| val_0_mse: 7164.36276|  0:00:00s
epoch 1  | loss: 8151.28369| val_0_mse: 7353.36826|  0:00:00s
epoch 2  | loss: 7929.4917| val_0_mse: 7382.36032|  0:00:00s
epoch 3  | loss: 7793.46631| val_0_mse: 7316.62926|  0:00:00s
epoch 4  | loss: 7492.88135| val_0_mse: 8387.2165|  0:00:00s
epoch 5  | loss: 7278.59326| val_0_mse: 8695.66873|  0:00:00s

Early stopping occurred at epoch 5 with best_epoch = 0 and best_val_0_mse = 7164.36276


[I 2024-07-26 08:40:36,038] Trial 24 finished with value: 84.64255881116017 and parameters: {'n_d': 34, 'n_a': 51, 'n_steps': 6, 'gamma': 1.2017805829449242, 'lambda_sparse': 0.0009872445593222686, 'learning_rate': 0.00813544942415561, 'batch_size': 64, 'num_epochs': 76}. Best is trial 23 with value: 20.873160944553998.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8632.00684| val_0_mse: 6979.04378|  0:00:00s
epoch 1  | loss: 8570.22852| val_0_mse: 7975.48361|  0:00:00s
epoch 2  | loss: 8332.11426| val_0_mse: 8565.93149|  0:00:00s
epoch 3  | loss: 8215.37598| val_0_mse: 8875.68118|  0:00:00s
epoch 4  | loss: 8077.62109| val_0_mse: 9284.09412|  0:00:00s


[I 2024-07-26 08:40:37,186] Trial 25 finished with value: 83.54067142390585 and parameters: {'n_d': 41, 'n_a': 43, 'n_steps': 5, 'gamma': 1.2851053539324238, 'lambda_sparse': 0.00037477666707584183, 'learning_rate': 0.0049163194650405926, 'batch_size': 64, 'num_epochs': 61}. Best is trial 23 with value: 20.873160944553998.


epoch 5  | loss: 7855.12012| val_0_mse: 9008.48052|  0:00:00s

Early stopping occurred at epoch 5 with best_epoch = 0 and best_val_0_mse = 6979.04378


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8573.23047| val_0_mse: 13732.16397|  0:00:00s
epoch 1  | loss: 7742.34668| val_0_mse: 10668.11821|  0:00:00s
epoch 2  | loss: 6763.96582| val_0_mse: 8420.10499|  0:00:00s
epoch 3  | loss: 5722.22217| val_0_mse: 4699.55229|  0:00:00s
epoch 4  | loss: 4815.81299| val_0_mse: 3540.52068|  0:00:00s
epoch 5  | loss: 3898.35913| val_0_mse: 2835.86132|  0:00:00s
epoch 6  | loss: 3115.88574| val_0_mse: 3736.51201|  0:00:00s
epoch 7  | loss: 2501.09863| val_0_mse: 6237.95018|  0:00:01s
epoch 8  | loss: 1874.93433| val_0_mse: 6757.67669|  0:00:01s
epoch 9  | loss: 1362.80969| val_0_mse: 9358.23623|  0:00:01s
epoch 10 | loss: 956.88953| val_0_mse: 9237.17687|  0:00:01s


[I 2024-07-26 08:40:38,705] Trial 26 finished with value: 53.25280571058216 and parameters: {'n_d': 48, 'n_a': 60, 'n_steps': 7, 'gamma': 1.1340984120771724, 'lambda_sparse': 0.000883182523177946, 'learning_rate': 0.015313022521698838, 'batch_size': 256, 'num_epochs': 81}. Best is trial 23 with value: 20.873160944553998.



Early stopping occurred at epoch 10 with best_epoch = 5 and best_val_0_mse = 2835.86132


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8462.97266| val_0_mse: 10635.43365|  0:00:00s
epoch 1  | loss: 7918.49268| val_0_mse: 10637.20026|  0:00:00s
epoch 2  | loss: 7619.87354| val_0_mse: 8550.70157|  0:00:00s
epoch 3  | loss: 6891.22119| val_0_mse: 6509.61407|  0:00:00s
epoch 4  | loss: 6120.9624| val_0_mse: 2925.95262|  0:00:00s
epoch 5  | loss: 5484.12988| val_0_mse: 1440.36669|  0:00:01s
epoch 6  | loss: 4787.46143| val_0_mse: 2722.61423|  0:00:01s
epoch 7  | loss: 4160.9126| val_0_mse: 2431.02254|  0:00:01s
epoch 8  | loss: 3561.90332| val_0_mse: 5439.26076|  0:00:01s
epoch 9  | loss: 2973.95679| val_0_mse: 19138.52414|  0:00:01s
epoch 10 | loss: 2418.08789| val_0_mse: 28673.04774|  0:00:01s

Early stopping occurred at epoch 10 with best_epoch = 5 and best_val_0_mse = 1440.36669


[I 2024-07-26 08:40:40,875] Trial 27 finished with value: 37.952163217155665 and parameters: {'n_d': 48, 'n_a': 52, 'n_steps': 6, 'gamma': 1.0899367738630545, 'lambda_sparse': 0.0008931543831694979, 'learning_rate': 0.01582417103070356, 'batch_size': 256, 'num_epochs': 77}. Best is trial 23 with value: 20.873160944553998.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8412.11523| val_0_mse: 6288.88784|  0:00:00s
epoch 1  | loss: 7003.77051| val_0_mse: 3025.08195|  0:00:00s
epoch 2  | loss: 4869.94922| val_0_mse: 5453.51049|  0:00:00s
epoch 3  | loss: 3060.41138| val_0_mse: 121129.07259|  0:00:00s
epoch 4  | loss: 1729.38904| val_0_mse: 167639.61171|  0:00:00s


[I 2024-07-26 08:40:41,888] Trial 28 finished with value: 55.0007449821262 and parameters: {'n_d': 51, 'n_a': 33, 'n_steps': 5, 'gamma': 1.0732020293566589, 'lambda_sparse': 0.0009145053614118248, 'learning_rate': 0.063095489565229, 'batch_size': 256, 'num_epochs': 72}. Best is trial 23 with value: 20.873160944553998.


epoch 5  | loss: 774.93433| val_0_mse: 266359.8791|  0:00:00s
epoch 6  | loss: 240.93373| val_0_mse: 265120.88546|  0:00:00s

Early stopping occurred at epoch 6 with best_epoch = 1 and best_val_0_mse = 3025.08195


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8754.36133| val_0_mse: 10756.38726|  0:00:00s
epoch 1  | loss: 8588.54297| val_0_mse: 11379.01842|  0:00:00s
epoch 2  | loss: 8375.8916| val_0_mse: 11002.0112|  0:00:00s
epoch 3  | loss: 8150.87695| val_0_mse: 10294.03961|  0:00:00s
epoch 4  | loss: 7953.40674| val_0_mse: 10387.83364|  0:00:00s
epoch 5  | loss: 7775.61377| val_0_mse: 10331.52622|  0:00:00s
epoch 6  | loss: 7500.56445| val_0_mse: 10188.2733|  0:00:00s
epoch 7  | loss: 7254.16406| val_0_mse: 10030.92646|  0:00:00s
epoch 8  | loss: 7028.37793| val_0_mse: 8755.81387|  0:00:01s
epoch 9  | loss: 6779.64062| val_0_mse: 8654.41025|  0:00:01s
epoch 10 | loss: 6532.85596| val_0_mse: 8772.00134|  0:00:01s
epoch 11 | loss: 6272.84033| val_0_mse: 8259.51024|  0:00:01s
epoch 12 | loss: 6034.20557| val_0_mse: 8700.4995|  0:00:01s
epoch 13 | loss: 5799.96875| val_0_mse: 8484.71711|  0:00:01s
epoch 14 | loss: 5571.93994| val_0_mse: 8066.84574|  0:00:01s
epoch 15 | loss: 5359.46973| val_0_mse: 7667.0361|  0:00:01s
epoch

[I 2024-07-26 08:40:46,308] Trial 29 finished with value: 55.03999553087951 and parameters: {'n_d': 63, 'n_a': 44, 'n_steps': 6, 'gamma': 1.3711231025377473, 'lambda_sparse': 0.0009201161212385012, 'learning_rate': 0.002896361774723192, 'batch_size': 256, 'num_epochs': 50}. Best is trial 23 with value: 20.873160944553998.


epoch 31 | loss: 2427.52197| val_0_mse: 3080.45881|  0:00:04s

Early stopping occurred at epoch 31 with best_epoch = 26 and best_val_0_mse = 3029.40111


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8625.7041| val_0_mse: 14609.63871|  0:00:00s
epoch 1  | loss: 8102.56836| val_0_mse: 14960.03002|  0:00:00s
epoch 2  | loss: 7484.67676| val_0_mse: 18471.27518|  0:00:00s
epoch 3  | loss: 6820.81836| val_0_mse: 14518.06632|  0:00:00s
epoch 4  | loss: 6201.49414| val_0_mse: 13616.95813|  0:00:00s
epoch 5  | loss: 5583.68555| val_0_mse: 13542.20558|  0:00:00s
epoch 6  | loss: 5042.46826| val_0_mse: 6066.62889|  0:00:00s
epoch 7  | loss: 4503.33691| val_0_mse: 5658.53284|  0:00:00s
epoch 8  | loss: 4002.89136| val_0_mse: 5462.93253|  0:00:00s
epoch 9  | loss: 3500.6416| val_0_mse: 6038.54955|  0:00:01s
epoch 10 | loss: 3027.53662| val_0_mse: 12894.79689|  0:00:01s
epoch 11 | loss: 2754.49219| val_0_mse: 19007.6954|  0:00:01s
epoch 12 | loss: 2236.40161| val_0_mse: 19849.59378|  0:00:01s


[I 2024-07-26 08:40:48,310] Trial 30 finished with value: 73.91165351689088 and parameters: {'n_d': 47, 'n_a': 52, 'n_steps': 5, 'gamma': 1.2672448427349126, 'lambda_sparse': 0.0007931804146531453, 'learning_rate': 0.01599693065635172, 'batch_size': 256, 'num_epochs': 29}. Best is trial 23 with value: 20.873160944553998.


epoch 13 | loss: 1920.23987| val_0_mse: 22110.88912|  0:00:01s

Early stopping occurred at epoch 13 with best_epoch = 8 and best_val_0_mse = 5462.93253


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8860.92578| val_0_mse: 8275.72205|  0:00:00s
epoch 1  | loss: 7869.1416| val_0_mse: 6877.26272|  0:00:00s
epoch 2  | loss: 7132.50439| val_0_mse: 10935.65351|  0:00:00s
epoch 3  | loss: 6411.44287| val_0_mse: 7596.59278|  0:00:00s
epoch 4  | loss: 5754.06299| val_0_mse: 2952.19092|  0:00:00s
epoch 5  | loss: 4976.2373| val_0_mse: 2171.94766|  0:00:00s
epoch 6  | loss: 4335.896| val_0_mse: 2916.75901|  0:00:00s
epoch 7  | loss: 3574.52051| val_0_mse: 1742.28951|  0:00:01s
epoch 8  | loss: 2998.16992| val_0_mse: 1720.01305|  0:00:01s
epoch 9  | loss: 2428.53784| val_0_mse: 1461.53771|  0:00:01s
epoch 10 | loss: 1894.4043| val_0_mse: 9646.40203|  0:00:01s
epoch 11 | loss: 1426.32507| val_0_mse: 12614.76745|  0:00:01s
epoch 12 | loss: 1026.06665| val_0_mse: 16888.33817|  0:00:01s


[I 2024-07-26 08:40:50,631] Trial 31 finished with value: 38.23006289117313 and parameters: {'n_d': 48, 'n_a': 61, 'n_steps': 7, 'gamma': 1.099131613159436, 'lambda_sparse': 0.0009028817404990103, 'learning_rate': 0.014115693344120127, 'batch_size': 256, 'num_epochs': 79}. Best is trial 23 with value: 20.873160944553998.


epoch 13 | loss: 680.66376| val_0_mse: 18191.91005|  0:00:01s
epoch 14 | loss: 405.68607| val_0_mse: 30329.40495|  0:00:01s

Early stopping occurred at epoch 14 with best_epoch = 9 and best_val_0_mse = 1461.53771


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8552.54492| val_0_mse: 12103.2638|  0:00:00s
epoch 1  | loss: 7551.36865| val_0_mse: 9084.35388|  0:00:00s
epoch 2  | loss: 6228.06787| val_0_mse: 6957.18739|  0:00:00s
epoch 3  | loss: 4844.90283| val_0_mse: 4773.16083|  0:00:00s
epoch 4  | loss: 3718.83594| val_0_mse: 4943.02276|  0:00:00s
epoch 5  | loss: 2785.12769| val_0_mse: 5426.45675|  0:00:01s
epoch 6  | loss: 1952.92419| val_0_mse: 11880.66934|  0:00:01s
epoch 7  | loss: 1271.77759| val_0_mse: 14678.82983|  0:00:01s
epoch 8  | loss: 736.11053| val_0_mse: 17239.54458|  0:00:01s

Early stopping occurred at epoch 8 with best_epoch = 3 and best_val_0_mse = 4773.16083


[I 2024-07-26 08:40:52,442] Trial 32 finished with value: 69.08806575457692 and parameters: {'n_d': 52, 'n_a': 55, 'n_steps': 6, 'gamma': 1.0051308958156209, 'lambda_sparse': 0.0008533005184101107, 'learning_rate': 0.023759814466703024, 'batch_size': 256, 'num_epochs': 62}. Best is trial 23 with value: 20.873160944553998.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8237.55566| val_0_mse: 5446.6578|  0:00:00s
epoch 1  | loss: 7840.15332| val_0_mse: 6397.65811|  0:00:00s
epoch 2  | loss: 7565.14795| val_0_mse: 6007.47341|  0:00:00s
epoch 3  | loss: 7317.13232| val_0_mse: 5925.99805|  0:00:00s


[I 2024-07-26 08:40:53,292] Trial 33 finished with value: 73.80147561652501 and parameters: {'n_d': 45, 'n_a': 47, 'n_steps': 6, 'gamma': 1.1097080965780892, 'lambda_sparse': 0.0007962216002552824, 'learning_rate': 0.004222662022431126, 'batch_size': 256, 'num_epochs': 73}. Best is trial 23 with value: 20.873160944553998.


epoch 4  | loss: 7006.89404| val_0_mse: 6085.49343|  0:00:00s
epoch 5  | loss: 6748.66699| val_0_mse: 6536.25658|  0:00:00s

Early stopping occurred at epoch 5 with best_epoch = 0 and best_val_0_mse = 5446.6578


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8306.72266| val_0_mse: 13593.30672|  0:00:00s
epoch 1  | loss: 7866.22559| val_0_mse: 12923.1597|  0:00:00s
epoch 2  | loss: 7726.06885| val_0_mse: 12449.39886|  0:00:00s
epoch 3  | loss: 7463.77197| val_0_mse: 11942.77288|  0:00:00s
epoch 4  | loss: 7219.08691| val_0_mse: 11560.8532|  0:00:00s
epoch 5  | loss: 7031.68701| val_0_mse: 11136.54226|  0:00:03s
epoch 6  | loss: 6741.6084| val_0_mse: 10026.75426|  0:00:03s
epoch 7  | loss: 6514.14697| val_0_mse: 9716.46926|  0:00:03s
epoch 8  | loss: 6303.89111| val_0_mse: 9711.19029|  0:00:03s
epoch 9  | loss: 6092.84082| val_0_mse: 9668.05568|  0:00:03s
epoch 10 | loss: 5889.98096| val_0_mse: 9727.01506|  0:00:04s
epoch 11 | loss: 5691.50635| val_0_mse: 9649.12877|  0:00:04s
epoch 12 | loss: 5486.3208| val_0_mse: 9578.00132|  0:00:04s
epoch 13 | loss: 5285.22754| val_0_mse: 9480.45259|  0:00:04s
epoch 14 | loss: 5082.31445| val_0_mse: 9357.15998|  0:00:04s
epoch 15 | loss: 4882.68408| val_0_mse: 9217.50399|  0:00:04s
epoch

[I 2024-07-26 08:41:10,698] Trial 34 finished with value: 45.922281127079216 and parameters: {'n_d': 58, 'n_a': 23, 'n_steps': 8, 'gamma': 1.078475871959083, 'lambda_sparse': 0.0009397838555029976, 'learning_rate': 0.0017092337561769259, 'batch_size': 128, 'num_epochs': 82}. Best is trial 23 with value: 20.873160944553998.


epoch 81 | loss: 11.68875| val_0_mse: 2108.8559|  0:00:17s
Stop training because you reached max_epochs = 82 with best_epoch = 81 and best_val_0_mse = 2108.8559


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8386.4209| val_0_mse: 11483.10936|  0:00:00s
epoch 1  | loss: 7901.47949| val_0_mse: 13716.57217|  0:00:00s
epoch 2  | loss: 7288.05957| val_0_mse: 12632.98742|  0:00:00s
epoch 3  | loss: 6875.58008| val_0_mse: 10978.08551|  0:00:00s
epoch 4  | loss: 6310.00439| val_0_mse: 10221.04327|  0:00:00s
epoch 5  | loss: 5966.84766| val_0_mse: 9818.94095|  0:00:00s
epoch 6  | loss: 5168.75342| val_0_mse: 8976.50541|  0:00:01s
epoch 7  | loss: 4700.07129| val_0_mse: 7379.33203|  0:00:01s
epoch 8  | loss: 4189.94922| val_0_mse: 7841.8222|  0:00:01s
epoch 9  | loss: 3717.63354| val_0_mse: 15675.99591|  0:00:01s
epoch 10 | loss: 3284.33203| val_0_mse: 8912.13679|  0:00:01s
epoch 11 | loss: 2862.40771| val_0_mse: 10207.22712|  0:00:01s
epoch 12 | loss: 2429.57959| val_0_mse: 4717.53826|  0:00:01s
epoch 13 | loss: 2039.02759| val_0_mse: 3220.91902|  0:00:01s
epoch 14 | loss: 1676.32764| val_0_mse: 3720.47285|  0:00:01s
epoch 15 | loss: 1337.54956| val_0_mse: 4885.82259|  0:00:02s
epo

[I 2024-07-26 08:41:13,705] Trial 35 finished with value: 56.753141060714185 and parameters: {'n_d': 38, 'n_a': 59, 'n_steps': 6, 'gamma': 1.2334923484710107, 'lambda_sparse': 0.0007431679157062545, 'learning_rate': 0.01260500927418449, 'batch_size': 256, 'num_epochs': 65}. Best is trial 23 with value: 20.873160944553998.


epoch 18 | loss: 569.19952| val_0_mse: 10956.78722|  0:00:02s

Early stopping occurred at epoch 18 with best_epoch = 13 and best_val_0_mse = 3220.91902


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8551.14551| val_0_mse: 11360.80746|  0:00:00s
epoch 1  | loss: 8135.37158| val_0_mse: 9567.86933|  0:00:00s
epoch 2  | loss: 7488.93848| val_0_mse: 10507.93666|  0:00:00s
epoch 3  | loss: 6767.31689| val_0_mse: 2422.18363|  0:00:00s
epoch 4  | loss: 6053.44971| val_0_mse: 4493.18912|  0:00:00s
epoch 5  | loss: 5123.00732| val_0_mse: 18946.65196|  0:00:00s
epoch 6  | loss: 3863.54443| val_0_mse: 26418.73575|  0:00:00s


[I 2024-07-26 08:41:15,069] Trial 36 finished with value: 49.215684782467875 and parameters: {'n_d': 53, 'n_a': 50, 'n_steps': 5, 'gamma': 1.3402833334154456, 'lambda_sparse': 0.0005429298042020002, 'learning_rate': 0.050707047364063576, 'batch_size': 32, 'num_epochs': 93}. Best is trial 23 with value: 20.873160944553998.


epoch 7  | loss: 3200.00732| val_0_mse: 58973.80562|  0:00:01s
epoch 8  | loss: 2263.62842| val_0_mse: 80513.71241|  0:00:01s

Early stopping occurred at epoch 8 with best_epoch = 3 and best_val_0_mse = 2422.18363


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8317.14062| val_0_mse: 12521.48659|  0:00:00s
epoch 1  | loss: 8210.30469| val_0_mse: 12521.32448|  0:00:00s
epoch 2  | loss: 7900.07227| val_0_mse: 12334.84626|  0:00:00s
epoch 3  | loss: 7624.08594| val_0_mse: 11426.92976|  0:00:00s
epoch 4  | loss: 7280.1958| val_0_mse: 14744.66417|  0:00:00s
epoch 5  | loss: 7059.44727| val_0_mse: 16531.01815|  0:00:00s
epoch 6  | loss: 6679.71143| val_0_mse: 16077.35882|  0:00:00s
epoch 7  | loss: 6391.32715| val_0_mse: 9180.617|  0:00:01s
epoch 8  | loss: 5936.96729| val_0_mse: 7680.34933|  0:00:01s
epoch 9  | loss: 5561.53955| val_0_mse: 14623.52454|  0:00:01s
epoch 10 | loss: 5220.25977| val_0_mse: 13083.60701|  0:00:01s
epoch 11 | loss: 4856.34473| val_0_mse: 5530.55654|  0:00:01s
epoch 12 | loss: 4457.72119| val_0_mse: 5145.7789|  0:00:01s
epoch 13 | loss: 4108.51562| val_0_mse: 4527.13437|  0:00:01s
epoch 14 | loss: 3766.03735| val_0_mse: 4095.46555|  0:00:02s
epoch 15 | loss: 3426.68213| val_0_mse: 4566.59699|  0:00:02s
epo

[I 2024-07-26 08:41:17,997] Trial 37 finished with value: 63.99582448360724 and parameters: {'n_d': 32, 'n_a': 61, 'n_steps': 7, 'gamma': 1.410611178318575, 'lambda_sparse': 0.0008458443106048299, 'learning_rate': 0.007172895657782093, 'batch_size': 256, 'num_epochs': 83}. Best is trial 23 with value: 20.873160944553998.


epoch 19 | loss: 2129.40479| val_0_mse: 20662.92509|  0:00:02s

Early stopping occurred at epoch 19 with best_epoch = 14 and best_val_0_mse = 4095.46555


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8425.61719| val_0_mse: 12944.69835|  0:00:00s
epoch 1  | loss: 7322.01807| val_0_mse: 9314.64281|  0:00:00s
epoch 2  | loss: 6263.61816| val_0_mse: 7997.68843|  0:00:01s
epoch 3  | loss: 5386.40723| val_0_mse: 5365.41701|  0:00:01s
epoch 4  | loss: 4258.91699| val_0_mse: 3777.32355|  0:00:01s
epoch 5  | loss: 3349.86304| val_0_mse: 3176.179|  0:00:02s
epoch 6  | loss: 3204.79126| val_0_mse: 4131.91714|  0:00:02s
epoch 7  | loss: 1779.92114| val_0_mse: 3753.4809|  0:00:02s
epoch 8  | loss: 1527.93005| val_0_mse: 6386.67193|  0:00:02s
epoch 9  | loss: 1295.95288| val_0_mse: 8407.2764|  0:00:02s


[I 2024-07-26 08:41:21,118] Trial 38 finished with value: 56.357599292097326 and parameters: {'n_d': 60, 'n_a': 56, 'n_steps': 9, 'gamma': 1.1852187431368035, 'lambda_sparse': 0.0006902389674819551, 'learning_rate': 0.022975924925499096, 'batch_size': 32, 'num_epochs': 74}. Best is trial 23 with value: 20.873160944553998.


epoch 10 | loss: 920.3772| val_0_mse: 8326.18219|  0:00:02s

Early stopping occurred at epoch 10 with best_epoch = 5 and best_val_0_mse = 3176.179


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8548.01758| val_0_mse: 14714.04787|  0:00:00s
epoch 1  | loss: 8522.44531| val_0_mse: 14543.32728|  0:00:00s
epoch 2  | loss: 8507.3916| val_0_mse: 14444.19517|  0:00:00s
epoch 3  | loss: 8496.91113| val_0_mse: 14278.8817|  0:00:00s
epoch 4  | loss: 8487.85352| val_0_mse: 14121.00491|  0:00:00s
epoch 5  | loss: 8478.31934| val_0_mse: 14025.89476|  0:00:00s
epoch 6  | loss: 8468.57031| val_0_mse: 13901.30229|  0:00:00s
epoch 7  | loss: 8458.47266| val_0_mse: 13823.7482|  0:00:00s
epoch 8  | loss: 8447.81641| val_0_mse: 13745.9951|  0:00:00s
epoch 9  | loss: 8436.83398| val_0_mse: 13677.08089|  0:00:00s
epoch 10 | loss: 8425.87207| val_0_mse: 13617.39014|  0:00:00s
epoch 11 | loss: 8415.25 | val_0_mse: 13565.46377|  0:00:01s
epoch 12 | loss: 8405.125| val_0_mse: 13518.34271|  0:00:01s
epoch 13 | loss: 8395.31641| val_0_mse: 13471.05939|  0:00:01s
epoch 14 | loss: 8385.54492| val_0_mse: 13425.92625|  0:00:01s
epoch 15 | loss: 8375.79102| val_0_mse: 13384.65617|  0:00:01s


[I 2024-07-26 08:41:28,374] Trial 39 finished with value: 112.23721793467367 and parameters: {'n_d': 45, 'n_a': 9, 'n_steps': 4, 'gamma': 1.0451311703058161, 'lambda_sparse': 0.0009930492560720047, 'learning_rate': 0.00014450478287569922, 'batch_size': 128, 'num_epochs': 69}. Best is trial 23 with value: 20.873160944553998.


Stop training because you reached max_epochs = 69 with best_epoch = 68 and best_val_0_mse = 12597.19309
epoch 0  | loss: 8473.35254| val_0_mse: 7121.36044|  0:00:00s


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 1  | loss: 8314.01953| val_0_mse: 7239.76313|  0:00:00s
epoch 2  | loss: 8164.79883| val_0_mse: 7523.19155|  0:00:00s
epoch 3  | loss: 8005.32422| val_0_mse: 7692.12436|  0:00:00s
epoch 4  | loss: 7813.81152| val_0_mse: 7724.19352|  0:00:00s


[I 2024-07-26 08:41:28,962] Trial 40 finished with value: 84.38815343286088 and parameters: {'n_d': 14, 'n_a': 53, 'n_steps': 5, 'gamma': 1.5731477260566833, 'lambda_sparse': 0.0008792038462802367, 'learning_rate': 0.003667845742508077, 'batch_size': 256, 'num_epochs': 93}. Best is trial 23 with value: 20.873160944553998.


epoch 5  | loss: 7707.76807| val_0_mse: 10158.42951|  0:00:00s

Early stopping occurred at epoch 5 with best_epoch = 0 and best_val_0_mse = 7121.36044


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8372.73145| val_0_mse: 9693.44444|  0:00:00s
epoch 1  | loss: 8347.53711| val_0_mse: 9291.15022|  0:00:00s
epoch 2  | loss: 8203.45508| val_0_mse: 9559.06754|  0:00:00s
epoch 3  | loss: 7972.2085| val_0_mse: 9489.38998|  0:00:00s
epoch 4  | loss: 7950.07129| val_0_mse: 9747.65965|  0:00:00s
epoch 5  | loss: 7753.12061| val_0_mse: 9272.9204|  0:00:00s
epoch 6  | loss: 7625.46582| val_0_mse: 9889.38005|  0:00:00s
epoch 7  | loss: 7437.77051| val_0_mse: 8205.76411|  0:00:00s
epoch 8  | loss: 7284.59863| val_0_mse: 9784.80625|  0:00:01s
epoch 9  | loss: 7119.95654| val_0_mse: 8842.1658|  0:00:01s
epoch 10 | loss: 6979.00488| val_0_mse: 9951.00377|  0:00:01s
epoch 11 | loss: 6813.35205| val_0_mse: 9885.02243|  0:00:01s


[I 2024-07-26 08:41:30,608] Trial 41 finished with value: 90.58567276903717 and parameters: {'n_d': 58, 'n_a': 20, 'n_steps': 8, 'gamma': 1.0922642700553533, 'lambda_sparse': 0.0009342625036619423, 'learning_rate': 0.0017549513373475798, 'batch_size': 128, 'num_epochs': 80}. Best is trial 23 with value: 20.873160944553998.


epoch 12 | loss: 6635.0376| val_0_mse: 9712.85953|  0:00:01s

Early stopping occurred at epoch 12 with best_epoch = 7 and best_val_0_mse = 8205.76411


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8208.46289| val_0_mse: 12261.92595|  0:00:00s
epoch 1  | loss: 8078.63428| val_0_mse: 11127.96133|  0:00:00s
epoch 2  | loss: 7785.43066| val_0_mse: 14742.45948|  0:00:00s
epoch 3  | loss: 7725.04199| val_0_mse: 11395.73438|  0:00:00s
epoch 4  | loss: 7508.46582| val_0_mse: 11283.25216|  0:00:00s


[I 2024-07-26 08:41:31,535] Trial 42 finished with value: 105.48915266725105 and parameters: {'n_d': 55, 'n_a': 24, 'n_steps': 8, 'gamma': 1.0604609662216795, 'lambda_sparse': 0.0009432214537631839, 'learning_rate': 0.0019736803316609034, 'batch_size': 128, 'num_epochs': 85}. Best is trial 23 with value: 20.873160944553998.


epoch 5  | loss: 7326.21826| val_0_mse: 11659.43654|  0:00:00s
epoch 6  | loss: 7136.82861| val_0_mse: 11248.6974|  0:00:00s

Early stopping occurred at epoch 6 with best_epoch = 1 and best_val_0_mse = 11127.96133


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8247.88672| val_0_mse: 15099.06547|  0:00:00s
epoch 1  | loss: 8117.5376| val_0_mse: 14090.49162|  0:00:00s
epoch 2  | loss: 8037.46777| val_0_mse: 13503.84086|  0:00:00s
epoch 3  | loss: 7865.08887| val_0_mse: 13420.68171|  0:00:00s
epoch 4  | loss: 7663.7915| val_0_mse: 13419.44044|  0:00:00s
epoch 5  | loss: 7507.56494| val_0_mse: 13092.88806|  0:00:01s
epoch 6  | loss: 7348.37598| val_0_mse: 13144.3836|  0:00:02s
epoch 7  | loss: 7174.28027| val_0_mse: 12917.16229|  0:00:02s
epoch 8  | loss: 7023.86816| val_0_mse: 12622.77939|  0:00:02s
epoch 9  | loss: 7071.2666| val_0_mse: 12534.29479|  0:00:02s
epoch 10 | loss: 6844.28613| val_0_mse: 12485.56507|  0:00:02s
epoch 11 | loss: 6679.94189| val_0_mse: 12343.08665|  0:00:03s
epoch 12 | loss: 6539.98926| val_0_mse: 12189.56578|  0:00:03s
epoch 13 | loss: 6416.22217| val_0_mse: 11957.03612|  0:00:03s
epoch 14 | loss: 6276.00635| val_0_mse: 11894.41291|  0:00:03s
epoch 15 | loss: 6155.3667| val_0_mse: 11839.5701|  0:00:03

[I 2024-07-26 08:41:44,758] Trial 43 finished with value: 63.42012132220404 and parameters: {'n_d': 64, 'n_a': 13, 'n_steps': 9, 'gamma': 1.1186087250640335, 'lambda_sparse': 0.0008268526075009608, 'learning_rate': 0.0012000172839192395, 'batch_size': 128, 'num_epochs': 80}. Best is trial 23 with value: 20.873160944553998.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8578.6582| val_0_mse: 10516.09246|  0:00:00s
epoch 1  | loss: 8473.75684| val_0_mse: 10356.12176|  0:00:00s
epoch 2  | loss: 8128.8501| val_0_mse: 10192.3672|  0:00:00s
epoch 3  | loss: 7868.57715| val_0_mse: 9563.09009|  0:00:00s
epoch 4  | loss: 7544.38721| val_0_mse: 9074.24542|  0:00:00s
epoch 5  | loss: 7444.22559| val_0_mse: 11012.50166|  0:00:00s
epoch 6  | loss: 7303.72559| val_0_mse: 10674.21014|  0:00:00s
epoch 7  | loss: 6981.70215| val_0_mse: 10731.19903|  0:00:01s
epoch 8  | loss: 6770.92871| val_0_mse: 10723.94826|  0:00:01s
epoch 9  | loss: 6529.0332| val_0_mse: 10650.85821|  0:00:01s

Early stopping occurred at epoch 9 with best_epoch = 4 and best_val_0_mse = 9074.24542


[I 2024-07-26 08:41:46,309] Trial 44 finished with value: 95.25883379028986 and parameters: {'n_d': 49, 'n_a': 23, 'n_steps': 10, 'gamma': 1.2975813150280087, 'lambda_sparse': 0.0007407678357806717, 'learning_rate': 0.006194482200904681, 'batch_size': 128, 'num_epochs': 96}. Best is trial 23 with value: 20.873160944553998.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8609.1875| val_0_mse: 9548.21465|  0:00:00s
epoch 1  | loss: 8078.28271| val_0_mse: 8665.74289|  0:00:00s
epoch 2  | loss: 7821.70557| val_0_mse: 9250.2609|  0:00:00s
epoch 3  | loss: 7635.89209| val_0_mse: 9399.69039|  0:00:00s
epoch 4  | loss: 7329.5498| val_0_mse: 9041.65559|  0:00:00s


[I 2024-07-26 08:41:47,211] Trial 45 finished with value: 93.08997202596397 and parameters: {'n_d': 60, 'n_a': 14, 'n_steps': 7, 'gamma': 1.1692021283036722, 'lambda_sparse': 0.0009563178299581709, 'learning_rate': 0.012543558942659705, 'batch_size': 32, 'num_epochs': 57}. Best is trial 23 with value: 20.873160944553998.


epoch 5  | loss: 6761.85107| val_0_mse: 9094.42145|  0:00:00s
epoch 6  | loss: 6346.97705| val_0_mse: 9077.40492|  0:00:00s

Early stopping occurred at epoch 6 with best_epoch = 1 and best_val_0_mse = 8665.74289


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8556.85547| val_0_mse: 13390.70274|  0:00:00s
epoch 1  | loss: 8267.03711| val_0_mse: 12414.25416|  0:00:00s
epoch 2  | loss: 7621.33447| val_0_mse: 12268.57089|  0:00:00s
epoch 3  | loss: 7041.17529| val_0_mse: 11469.37095|  0:00:00s
epoch 4  | loss: 6440.3999| val_0_mse: 11254.63788|  0:00:00s
epoch 5  | loss: 5875.55811| val_0_mse: 10524.21426|  0:00:00s
epoch 6  | loss: 5340.99658| val_0_mse: 9571.25612|  0:00:00s
epoch 7  | loss: 4872.00146| val_0_mse: 9543.97607|  0:00:00s
epoch 8  | loss: 4419.04688| val_0_mse: 9182.25565|  0:00:00s
epoch 9  | loss: 3990.94897| val_0_mse: 8406.4853|  0:00:01s
epoch 10 | loss: 3580.94067| val_0_mse: 4927.95549|  0:00:01s
epoch 11 | loss: 3189.74902| val_0_mse: 6429.39374|  0:00:01s
epoch 12 | loss: 2826.24634| val_0_mse: 5335.39991|  0:00:01s
epoch 13 | loss: 2476.02588| val_0_mse: 5009.65954|  0:00:01s
epoch 14 | loss: 2149.65137| val_0_mse: 5766.44209|  0:00:01s
epoch 15 | loss: 1843.37903| val_0_mse: 6748.97862|  0:00:01s

Ear

[I 2024-07-26 08:41:49,013] Trial 46 finished with value: 70.19939804730274 and parameters: {'n_d': 39, 'n_a': 57, 'n_steps': 6, 'gamma': 1.0261263322231047, 'lambda_sparse': 0.000895646202660909, 'learning_rate': 0.009770135770208308, 'batch_size': 256, 'num_epochs': 78}. Best is trial 23 with value: 20.873160944553998.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8414.08984| val_0_mse: 13292.00981|  0:00:00s
epoch 1  | loss: 6944.19043| val_0_mse: 3093.82785|  0:00:00s
epoch 2  | loss: 5298.78125| val_0_mse: 3511.40451|  0:00:00s
epoch 3  | loss: 3872.47314| val_0_mse: 24654.26501|  0:00:00s
epoch 4  | loss: 2498.27002| val_0_mse: 45182.44317|  0:00:00s


[I 2024-07-26 08:41:49,972] Trial 47 finished with value: 55.62218848101857 and parameters: {'n_d': 54, 'n_a': 46, 'n_steps': 7, 'gamma': 1.0795042561881187, 'lambda_sparse': 0.0006331012510364267, 'learning_rate': 0.029996232930594217, 'batch_size': 128, 'num_epochs': 85}. Best is trial 23 with value: 20.873160944553998.


epoch 5  | loss: 1348.40125| val_0_mse: 101050.60802|  0:00:00s
epoch 6  | loss: 613.2735| val_0_mse: 233394.96182|  0:00:00s

Early stopping occurred at epoch 6 with best_epoch = 1 and best_val_0_mse = 3093.82785


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8469.82227| val_0_mse: 6562.53158|  0:00:00s
epoch 1  | loss: 7880.1543| val_0_mse: 9368.74111|  0:00:00s
epoch 2  | loss: 6920.66846| val_0_mse: 5331.89077|  0:00:00s
epoch 3  | loss: 6167.62744| val_0_mse: 8387.6256|  0:00:00s
epoch 4  | loss: 5456.9668| val_0_mse: 4233.53098|  0:00:00s
epoch 5  | loss: 4704.30713| val_0_mse: 3983.07991|  0:00:00s
epoch 6  | loss: 3767.64014| val_0_mse: 7924.55165|  0:00:00s
epoch 7  | loss: 3351.23853| val_0_mse: 8938.55131|  0:00:00s
epoch 8  | loss: 2611.43652| val_0_mse: 5303.9157|  0:00:01s


[I 2024-07-26 08:41:51,429] Trial 48 finished with value: 63.11164642087494 and parameters: {'n_d': 44, 'n_a': 33, 'n_steps': 8, 'gamma': 1.2299870568440276, 'lambda_sparse': 0.0005078887895404565, 'learning_rate': 0.019036255680937857, 'batch_size': 256, 'num_epochs': 71}. Best is trial 23 with value: 20.873160944553998.


epoch 9  | loss: 1932.27917| val_0_mse: 5589.64427|  0:00:01s
epoch 10 | loss: 1361.82776| val_0_mse: 5204.45158|  0:00:01s

Early stopping occurred at epoch 10 with best_epoch = 5 and best_val_0_mse = 3983.07991


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8665.9375| val_0_mse: 13189.87302|  0:00:00s
epoch 1  | loss: 8659.88184| val_0_mse: 13155.67646|  0:00:00s
epoch 2  | loss: 8609.62305| val_0_mse: 13243.7367|  0:00:00s
epoch 3  | loss: 8567.88184| val_0_mse: 13284.06595|  0:00:00s
epoch 4  | loss: 8532.4707| val_0_mse: 13093.53628|  0:00:01s
epoch 5  | loss: 8505.46387| val_0_mse: 13353.36838|  0:00:01s
epoch 6  | loss: 8473.07129| val_0_mse: 13217.03202|  0:00:01s
epoch 7  | loss: 8441.77441| val_0_mse: 12863.75926|  0:00:01s
epoch 8  | loss: 8422.89453| val_0_mse: 12743.47572|  0:00:01s
epoch 9  | loss: 8395.17871| val_0_mse: 12710.85796|  0:00:01s
epoch 10 | loss: 8369.17676| val_0_mse: 12718.80167|  0:00:01s
epoch 11 | loss: 8344.41797| val_0_mse: 12683.13186|  0:00:01s
epoch 12 | loss: 8324.32422| val_0_mse: 12825.18767|  0:00:01s
epoch 13 | loss: 8303.06543| val_0_mse: 12730.47161|  0:00:01s
epoch 14 | loss: 8279.33789| val_0_mse: 12634.37808|  0:00:01s
epoch 15 | loss: 8254.62305| val_0_mse: 12546.46949|  0:00

[I 2024-07-26 08:41:56,097] Trial 49 finished with value: 109.54779040742179 and parameters: {'n_d': 57, 'n_a': 17, 'n_steps': 5, 'gamma': 1.4793420463559992, 'lambda_sparse': 0.0008450546734865454, 'learning_rate': 0.0002658764086456467, 'batch_size': 256, 'num_epochs': 64}. Best is trial 23 with value: 20.873160944553998.


epoch 44 | loss: 7735.33984| val_0_mse: 12085.09503|  0:00:04s

Early stopping occurred at epoch 44 with best_epoch = 39 and best_val_0_mse = 12000.71838
epoch 0  | loss: 8633.33496| val_0_mse: 6839.38934|  0:00:00s




epoch 1  | loss: 8273.11816| val_0_mse: 7378.03251|  0:00:00s
epoch 2  | loss: 7855.49609| val_0_mse: 7280.71715|  0:00:00s
epoch 3  | loss: 7432.57324| val_0_mse: 8492.0348|  0:00:00s
epoch 4  | loss: 7011.39258| val_0_mse: 8269.32929|  0:00:00s
epoch 5  | loss: 6462.71191| val_0_mse: 2769.09869|  0:00:00s
epoch 6  | loss: 6330.91406| val_0_mse: 2407.87857|  0:00:00s
epoch 7  | loss: 5603.52783| val_0_mse: 1757.27259|  0:00:00s
epoch 8  | loss: 5215.52246| val_0_mse: 1056.32327|  0:00:00s
epoch 9  | loss: 5177.76025| val_0_mse: 816.5443|  0:00:00s
epoch 10 | loss: 4581.8584| val_0_mse: 1285.6773|  0:00:00s
epoch 11 | loss: 4421.73877| val_0_mse: 1041.33952|  0:00:01s
epoch 12 | loss: 4181.3125| val_0_mse: 1351.37448|  0:00:01s
epoch 13 | loss: 3395.61475| val_0_mse: 1016.18194|  0:00:01s
epoch 14 | loss: 3319.2395| val_0_mse: 435.68885|  0:00:01s
epoch 15 | loss: 2820.70264| val_0_mse: 617.68171|  0:00:01s
epoch 16 | loss: 2626.62671| val_0_mse: 729.00361|  0:00:01s
epoch 17 | loss: 2



In [30]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class KAN(nn.Module):
    def __init__(self, input_dim, inner_dim, outer_dim):
        super(KAN, self).__init__()
        self.inner_functions = nn.ModuleList([nn.Sequential(
            nn.Linear(1, inner_dim),
            nn.ReLU(),
            nn.Linear(inner_dim, 1)
        ) for _ in range(input_dim)])
        
        self.outer_function = nn.Sequential(
            nn.Linear(input_dim, outer_dim),
            nn.ReLU(),
            nn.Linear(outer_dim, 1)
        )

    def forward(self, x):
        inner_outputs = [f(x[:, i].unsqueeze(1)) for i, f in enumerate(self.inner_functions)]
        inner_concat = torch.cat(inner_outputs, dim=1)
        return self.outer_function(inner_concat)

def objective(trial):
    # Define hyperparameters to tune
    inner_dim = trial.suggest_int('inner_dim', 8, 64)
    outer_dim = trial.suggest_int('outer_dim', 8, 64)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the KAN model
    model = KAN(X_train.shape[1], inner_dim, outer_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final KAN model with the best hyperparameters
best_model = KAN(X_train.shape[1], best_params['inner_dim'], best_params['outer_dim']).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['KAN'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:35:27,756] A new study created in memory with name: no-name-96957bfb-727e-4ca1-9508-7751ece5b423


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:35:28,116] Trial 0 finished with value: 47.61610412597656 and parameters: {'inner_dim': 16, 'outer_dim': 47, 'learning_rate': 0.024741840873254223, 'batch_size': 128, 'num_epochs': 35}. Best is trial 0 with value: 47.61610412597656.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:35:28,573] Trial 1 finished with value: 33.1011848449707 and parameters: {'inner_dim': 41, 'outer_dim': 29, 'learning_rate': 0.015935626011767175, 'batch_size': 32, 'num_epochs': 44}. Best is trial 1 with value: 33.1011848449707.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:35:29,026] Trial 2 finished with value: 28.40443229675293 and parameters: {'inner_dim': 21, 'outer_dim': 30, 'learning_rate': 0.011506847037759158, 'batch_size': 128, 'num_epochs': 56}. Best is trial 2 with value: 28.40443229675293.
  learning_rate = trial.suggest_logun

                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   
DCN                   4.72361  0.996138      0.418879           

In [4]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class SAINT(nn.Module):
    def __init__(self, input_dim, embed_dim, num_heads, num_layers, dropout):
        super(SAINT, self).__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout),
            num_layers=num_layers
        )
        self.fc = nn.Linear(embed_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(0)  # Add sequence dimension
        x = self.transformer(x)
        x = x.squeeze(0)  # Remove sequence dimension
        x = self.dropout(x)
        return self.fc(x)

def objective(trial):
    # Define hyperparameters to tune
    num_heads = trial.suggest_int('num_heads', 1, 8)
    embed_dim = trial.suggest_int('embed_dim', num_heads, 256, step=num_heads)  # Ensure divisibility
    num_layers = trial.suggest_int('num_layers', 1, 6)
    dropout = trial.suggest_float('dropout', 0.0, 0.5)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the SAINT model
    model = SAINT(X_train.shape[1], embed_dim, num_heads, num_layers, dropout).to(device)

    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final SAINT model with the best hyperparameters
best_model = SAINT(X_train.shape[1], 
                   best_params['embed_dim'], 
                   best_params['num_heads'], 
                   best_params['num_layers'], 
                   best_params['dropout']).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['SAINT'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:50:32,296] A new study created in memory with name: no-name-6270cf44-c4c4-4bea-a86c-6125d3d0accf


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:50:37,669] Trial 0 finished with value: 89.4405746459961 and parameters: {'num_heads': 6, 'embed_dim': 18, 'num_layers': 3, 'dropout': 0.3546954209666189, 'learning_rate': 0.0029695035123851275, 'batch_size': 64, 'num_epochs': 38}. Best is trial 0 with value: 89.4405746459961.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:50:39,868] Trial 1 finished with value: 108.74946594238281 and parameters: {'num_heads': 7, 'embed_dim': 147, 'num_layers': 2, 'dropout': 0.39053803731951175, 'learning_rate': 0.013605052614586608, 'batch_size': 256, 'num_epochs': 46}. Best is trial 0 with value: 89.4405746459961.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 08:50:43,417] Trial 2 finished with value: 88.06293487548828 and parameters: {'num_heads': 8, 'embed_dim': 136, 'num_layers': 4, 'dropout': 0.4101466384136102, 'learning_rate':

KeyboardInterrupt: 

In [32]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is your initial DataFrame
# Split data
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1).to(device)

class VIME(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(VIME, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        self.mask_predictor = nn.Linear(hidden_dim, input_dim)
        self.feature_predictor = nn.Linear(hidden_dim, input_dim)
        self.predictor = nn.Linear(hidden_dim, 1)

    def forward(self, x, mask=None):
        if mask is None:
            mask = torch.bernoulli(torch.ones_like(x) * 0.8)
        x_masked = x * mask
        h = self.encoder(x_masked)
        mask_pred = torch.sigmoid(self.mask_predictor(h))
        feature_pred = self.feature_predictor(h)
        y_pred = self.predictor(h)
        return y_pred, mask_pred, feature_pred

def vime_loss(y_true, y_pred, mask_true, mask_pred, feature_true, feature_pred, alpha=1.0, beta=1.0):
    prediction_loss = nn.MSELoss()(y_pred, y_true)
    mask_loss = nn.BCELoss()(mask_pred, mask_true)
    feature_loss = nn.MSELoss()(feature_pred, feature_true)
    return prediction_loss + alpha * mask_loss + beta * feature_loss

def objective(trial):
    # Define hyperparameters to tune
    hidden_dim = trial.suggest_int('hidden_dim', 32, 256)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)
    alpha = trial.suggest_loguniform('alpha', 0.1, 10.0)
    beta = trial.suggest_loguniform('beta', 0.1, 10.0)

    # Create the VIME model
    model = VIME(X_train.shape[1], hidden_dim).to(device)

    # Define optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            mask = torch.bernoulli(torch.ones_like(batch_X) * 0.8)
            y_pred, mask_pred, feature_pred = model(batch_X, mask)
            loss = vime_loss(batch_y, y_pred, mask, mask_pred, batch_X, feature_pred, alpha, beta)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred, _, _ = model(X_test_tensor)
        mse = mean_squared_error(y_test_tensor.cpu().numpy(), y_pred.cpu().numpy())
        rmse = np.sqrt(mse)

    return rmse

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final VIME model with the best hyperparameters
best_model = VIME(X_train.shape[1], best_params['hidden_dim']).to(device)
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        mask = torch.bernoulli(torch.ones_like(batch_X) * 0.8)
        y_pred, mask_pred, feature_pred = best_model(batch_X, mask)
        loss = vime_loss(batch_y, y_pred, mask, mask_pred, batch_X, feature_pred, best_params['alpha'], best_params['beta'])
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    y_pred, _, _ = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['VIME'] = [rmse, r2, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


[I 2024-07-26 08:36:20,514] A new study created in memory with name: no-name-42b9754c-a810-47f1-ad68-e644121d8d81


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
  alpha = trial.suggest_loguniform('alpha', 0.1, 10.0)
  beta = trial.suggest_loguniform('beta', 0.1, 10.0)
[I 2024-07-26 08:36:21,003] Trial 0 finished with value: 109.2194595336914 and parameters: {'hidden_dim': 60, 'learning_rate': 0.0005548669955034724, 'batch_size': 64, 'num_epochs': 65, 'alpha': 4.828197004000649, 'beta': 4.270061549002121}. Best is trial 0 with value: 109.2194595336914.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
  alpha = trial.suggest_loguniform('alpha', 0.1, 10.0)
  beta = trial.suggest_loguniform('beta', 0.1, 10.0)
[I 2024-07-26 08:36:21,534] Trial 1 finished with value: 34.86105728149414 and parameters: {'hidden_dim': 254, 'learning_rate': 0.0016483977508461843, 'batch_size': 128, 'num_epochs': 93, 'alpha': 1.5974197870695082, 'beta': 0.27458449894138787}. Best is trial 1 with value: 34.86105728149414.
  learning_rate = trial.suggest_loguniform('learning_rate

                         RMSE R-squared Training Time Inference Time  \
Linear Regression    0.211628  0.999992      0.001003            0.0   
Ridge Regression     0.905454  0.999858      0.000999            0.0   
Lasso Regression     0.383952  0.999974           0.0            0.0   
ElasticNet           4.404609  0.996642      0.001995            0.0   
Decision Tree       36.807797    0.7655      0.000998            0.0   
Random Forest       37.345571  0.758598      0.391953       0.023935   
Gradient Boosting   32.551013  0.816603      0.060839       0.000996   
XGBoost             47.556448  0.608545      0.066822       0.000997   
LightGBM           105.104923 -0.912095      0.005985       0.000997   
CatBoost             61.82527    0.3384       0.16257       0.001988   
MLP                  7.094997  0.991287      0.594412       0.000998   
DNN                109.746297 -1.084697      0.063799            0.0   
DCN                   4.72361  0.996138      0.418879           