In [47]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Generate a synthetic dataset
X, y = make_classification(n_samples=40, n_features=5, n_informative=3, n_redundant=0, n_clusters_per_class=1, n_classes=3)

# Convert the data to a DataFrame for easier handling
df_features = pd.DataFrame(X, columns=[f'Feature_{i+1}' for i in range(5)])
df_target = pd.DataFrame(y, columns=['Y'])

# Concatenate features and target into one DataFrame
df = pd.concat([df_features, df_target], axis=1)

print(df)


    Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Y
0    0.821209  -0.249454   1.697793  -1.175892   0.064107  2
1   -2.119163  -0.375135  -1.737217  -1.923423  -0.123764  0
2    0.131719  -1.151343   1.912180  -0.953912   0.132853  2
3   -1.847853   1.828564  -1.739899   0.305378   0.821996  1
4   -1.410634  -1.016693  -0.528842  -1.621271   0.010901  0
5    1.654569  -1.862979   1.088969  -1.594433  -1.249609  2
6   -1.576386  -0.779648  -1.571243   0.570999   0.315930  1
7   -1.394641   0.872065  -1.586451   0.404680   1.675884  1
8   -0.836367  -0.753035   0.692772  -3.515997  -1.324955  2
9   -0.206201   0.731854  -0.506389   1.166799  -0.879734  1
10  -1.090315  -0.440489  -0.858923   1.146621   1.331481  1
11  -0.363141  -0.782933  -0.193642  -0.583964   1.762350  0
12  -0.671268  -0.183820  -1.067632   0.890163   0.417564  1
13  -0.654774  -0.371809  -0.952730  -0.801694  -0.409717  0
14   2.057538  -0.469427   1.393917   0.439180  -0.236643  2
15   0.406995  -1.343292

In [48]:
import pandas as pd
import numpy as np

Y_column = df['Y'].copy()
df.drop('Y', axis=1, inplace=True)

# Identify categorical data (change this based on your actual data)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Standardize only the continuous (non-categorical) columns
continuous_cols = df.columns.difference(categorical_cols)  # Gets the difference, i.e., continuous cols
df[continuous_cols] = (df[continuous_cols] - df[continuous_cols].mean()) / df[continuous_cols].std()

# Filter out outliers in continuous data (|z-score| > 5)
mask = (np.abs(df[continuous_cols]) < 5).all(axis=1)
df = df[mask]

# Reattach the target variable 'Y' to the DataFrame
df['Y'] = Y_column[mask]

In [49]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()

# Apply Label Encoding to each categorical column
for column in categorical_cols:
    # Ensure the column is of type object (string) or category
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        df[column] = le.fit_transform(df[column])
        
df['Y'], unique = pd.factorize(df['Y'])

In [50]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Dictionary of models and their reduced hyperparameter grids
models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000), {
        'C': [0.01, 0.1, 1],
        'solver': ['liblinear', 'lbfgs']
    }),
    'KNN': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    }),
    'Decision Tree': (DecisionTreeClassifier(), {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    }),
    'Random Forest': (RandomForestClassifier(), {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5]
    }),
    'Gradient Boosting': (GradientBoostingClassifier(), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 4]
    }),
    'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 4]
    }),
    'LightGBM': (LGBMClassifier(), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 50]
    }),
    'CatBoost': (CatBoostClassifier(verbose=0), {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'depth': [4, 6]
    })
}

# Dictionary to store results
results = {}

for name, (model, param_grid) in models.items():
    start_time = time.time()
    
    # Perform GridSearchCV
    grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    
    # Get the best model
    best_model = grid_search.best_estimator_

    # Measure training time for best parameters
    best_param_train_start = time.time()
    best_model.fit(X_train_scaled, y_train)
    best_param_train_time = time.time() - best_param_train_start

    # Measure inference time for best parameters
    inference_start_time = time.time()
    y_pred = best_model.predict(X_test_scaled)
    inference_time = time.time() - inference_start_time
    
    # Calculate total computation time
    computation_time = time.time() - start_time
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    if len(np.unique(y)) == 2:  # Binary classification
        auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])
    else:  # Multiclass classification
        auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled), multi_class='ovr', average='macro')

    results[name] = {
        'Accuracy': accuracy,
        'AUC Score': auc,
        'Training Time (Best Params)': best_param_train_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': grid_search.best_params_
    }

# Convert results to a DataFrame
result = pd.DataFrame(results).T

print(result)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.268511
[LightGBM] [Info] Start training from score -0.980829
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.268511
[LightGBM] [Info] Start training from score -0.980829
                    Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression     0.75  0.927778                    0.002991   
KNN                     0.75       1.0                    0.000996   
Decision Tree          0.875  0.931746                       0.001   
Random Forest           0.75  0.933333                    0.118717   
Gradient Boosting      0.875  0.920635                    0.247005   
XGBoost         

In [51]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neural_network import MLPClassifier

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the MLP model and its hyperparameter grid
mlp = MLPClassifier(max_iter=1000, random_state=42)
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}

# Perform GridSearchCV with StratifiedKFold
start_time = time.time()
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(mlp, param_grid=param_grid, cv=skf, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Measure training time for best parameters
best_param_train_start = time.time()
best_model.fit(X_train_scaled, y_train)
training_time = time.time() - best_param_train_start

# Measure inference time for best parameters
inference_start_time = time.time()
y_pred = best_model.predict(X_test_scaled)
inference_time = time.time() - inference_start_time

# Calculate total computation time
computation_time = time.time() - start_time

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
if len(np.unique(y)) == 2:  # Binary classification
    auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])
else:  # Multiclass classification
    auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled), multi_class='ovr', average='macro')

# Store results in the existing result DataFrame
result.loc['MLP'] = [accuracy, auc, training_time, inference_time, computation_time, grid_search.best_params_]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in grid_search.best_params_.items():
    print(f"{param}: {value}")

                    Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression     0.75  0.927778                    0.002991   
KNN                     0.75       1.0                    0.000996   
Decision Tree          0.875  0.931746                       0.001   
Random Forest           0.75  0.933333                    0.118717   
Gradient Boosting      0.875  0.920635                    0.247005   
XGBoost                 0.75  0.805556                    0.082778   
LightGBM               0.125       0.5                    0.010972   
CatBoost                0.75  0.977778                    0.119442   
MLP                     0.75       1.0                    0.513627   

                    Inference Time (Best Params) Computation Time (Total)  \
Logistic Regression                     0.002992                 5.321193   
KNN                                     0.002262                 0.203027   
Decision Tree                                0.0                 0.0

In [52]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

# Define the DNN model
class DNN(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim):
        super(DNN, self).__init__()
        layers = []
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            input_dim = hidden_dim
        layers.append(nn.Linear(input_dim, output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

# Define the objective function for Optuna
def objective(trial):
    # Define hyperparameters to tune
    hidden_dims = [trial.suggest_int(f'hidden_dim_{i}', 32, 256) for i in range(3)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = DNN(input_dim, hidden_dims, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Start timing the entire process
start_time = time.time()

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = DNN(input_dim, [best_params[f'hidden_dim_{i}'] for i in range(3)], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['DNN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:46:16,958] A new study created in memory with name: no-name-7560ba0e-7c0e-41f7-a212-760c6068857a


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:46:17,319] Trial 0 finished with value: 0.875 and parameters: {'hidden_dim_0': 188, 'hidden_dim_1': 168, 'hidden_dim_2': 143, 'learning_rate': 0.0025503930381602113, 'batch_size': 64, 'num_epochs': 65}. Best is trial 0 with value: 0.875.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:46:17,609] Trial 1 finished with value: 0.875 and parameters: {'hidden_dim_0': 211, 'hidden_dim_1': 167, 'hidden_dim_2': 102, 'learning_rate': 0.0047179236236254045, 'batch_size': 64, 'num_epochs': 60}. Best is trial 0 with value: 0.875.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:46:17,689] Trial 2 finished with value: 0.75 and parameters: {'hidden_dim_0': 238, 'hidden_dim_1': 213, 'hidden_dim_2': 132, 'learning_rate': 0.001214410015281299, 'batch_size': 256, 'num_epochs': 11}. Best is trial 0 with value: 0.875.
  learning_rate = tr

                    Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression     0.75  0.927778                    0.002991   
KNN                     0.75       1.0                    0.000996   
Decision Tree          0.875  0.931746                       0.001   
Random Forest           0.75  0.933333                    0.118717   
Gradient Boosting      0.875  0.920635                    0.247005   
XGBoost                 0.75  0.805556                    0.082778   
LightGBM               0.125       0.5                    0.010972   
CatBoost                0.75  0.977778                    0.119442   
MLP                     0.75       1.0                    0.513627   
DNN                     0.75  0.922222                     0.23946   

                    Inference Time (Best Params) Computation Time (Total)  \
Logistic Regression                     0.002992                 5.321193   
KNN                                     0.002262                 0.203027  

In [53]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class CrossLayer(nn.Module):
    def __init__(self, input_dim):
        super(CrossLayer, self).__init__()
        self.weight = nn.Parameter(torch.Tensor(input_dim, 1))
        self.bias = nn.Parameter(torch.Tensor(input_dim, 1))
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, x0, x):
        x = x.unsqueeze(2)
        x0 = x0.unsqueeze(2)
        interaction = torch.matmul(x0, torch.matmul(x.transpose(1, 2), self.weight))
        return x0.squeeze(2) + interaction.squeeze(2) + self.bias.T

class DCN(nn.Module):
    def __init__(self, input_dim, cross_layers, hidden_layers, output_dim):
        super(DCN, self).__init__()
        self.cross_layers = nn.ModuleList([CrossLayer(input_dim) for _ in range(cross_layers)])
        
        deep_layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                deep_layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                deep_layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            deep_layers.append(nn.ReLU())
        self.deep_net = nn.Sequential(*deep_layers)
        
        self.final_layer = nn.Linear(input_dim + hidden_layers[-1], output_dim)

    def forward(self, x):
        cross_out = x
        for layer in self.cross_layers:
            cross_out = layer(x, cross_out)
        deep_out = self.deep_net(x)
        concat_out = torch.cat([cross_out, deep_out], dim=1)
        return self.final_layer(concat_out)

def objective(trial):
    # Define hyperparameters to tune
    cross_layers = trial.suggest_int('cross_layers', 1, 5)
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = DCN(input_dim, cross_layers, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Start timing the entire process
start_time = time.time()

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = DCN(input_dim, best_params['cross_layers'], 
                 [best_params[f'hidden_layer_{i}'] for i in range(3)], 
                 output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['DCN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:46:39,311] A new study created in memory with name: no-name-3900da39-de46-45da-b04d-8140173eabe5


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:46:39,632] Trial 0 finished with value: 0.875 and parameters: {'cross_layers': 4, 'hidden_layer_0': 184, 'hidden_layer_1': 169, 'hidden_layer_2': 100, 'learning_rate': 0.0003024897105741653, 'batch_size': 128, 'num_epochs': 36}. Best is trial 0 with value: 0.875.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:46:39,943] Trial 1 finished with value: 0.5 and parameters: {'cross_layers': 1, 'hidden_layer_0': 235, 'hidden_layer_1': 47, 'hidden_layer_2': 73, 'learning_rate': 0.08362348124405034, 'batch_size': 128, 'num_epochs': 58}. Best is trial 0 with value: 0.875.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:46:40,342] Trial 2 finished with value: 0.625 and parameters: {'cross_layers': 4, 'hidden_layer_0': 57, 'hidden_layer_1': 141, 'hidden_layer_2': 243, 'learning_rate': 0.00010340647121060388, 'batch_size': 256, '

                    Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression     0.75  0.927778                    0.002991   
KNN                     0.75       1.0                    0.000996   
Decision Tree          0.875  0.931746                       0.001   
Random Forest           0.75  0.933333                    0.118717   
Gradient Boosting      0.875  0.920635                    0.247005   
XGBoost                 0.75  0.805556                    0.082778   
LightGBM               0.125       0.5                    0.010972   
CatBoost                0.75  0.977778                    0.119442   
MLP                     0.75       1.0                    0.513627   
DNN                     0.75  0.922222                     0.23946   
DCN                    0.875  0.927778                    0.297203   

                    Inference Time (Best Params) Computation Time (Total)  \
Logistic Regression                     0.002992                 5.321193   
KNN  

In [54]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class WideAndDeepNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(WideAndDeepNetwork, self).__init__()
        
        # Wide part
        self.wide = nn.Linear(input_dim, output_dim)
        
        # Deep part
        deep_layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                deep_layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                deep_layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            deep_layers.append(nn.ReLU())
        deep_layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.deep = nn.Sequential(*deep_layers)

    def forward(self, x):
        wide_out = self.wide(x)
        deep_out = self.deep(x)
        return wide_out + deep_out

def objective(trial):
    # Define hyperparameters to tune
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = WideAndDeepNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Start timing the entire process
start_time = time.time()

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = WideAndDeepNetwork(input_dim, 
                                [best_params[f'hidden_layer_{i}'] for i in range(3)], 
                                output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['Wide_and_Deep'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:47:03,528] A new study created in memory with name: no-name-e4818d4b-c71f-436d-bd5b-0e49d9597479


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:47:03,937] Trial 0 finished with value: 0.875 and parameters: {'hidden_layer_0': 56, 'hidden_layer_1': 227, 'hidden_layer_2': 118, 'learning_rate': 0.0041430798801775125, 'batch_size': 128, 'num_epochs': 84}. Best is trial 0 with value: 0.875.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:47:03,997] Trial 1 finished with value: 0.625 and parameters: {'hidden_layer_0': 190, 'hidden_layer_1': 197, 'hidden_layer_2': 167, 'learning_rate': 0.05923346005014561, 'batch_size': 32, 'num_epochs': 10}. Best is trial 0 with value: 0.875.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:47:04,352] Trial 2 finished with value: 0.75 and parameters: {'hidden_layer_0': 123, 'hidden_layer_1': 108, 'hidden_layer_2': 148, 'learning_rate': 0.0011046238994844699, 'batch_size': 32, 'num_epochs': 78}. Best is trial 0 with value: 0.875.
  le

                    Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression     0.75  0.927778                    0.002991   
KNN                     0.75       1.0                    0.000996   
Decision Tree          0.875  0.931746                       0.001   
Random Forest           0.75  0.933333                    0.118717   
Gradient Boosting      0.875  0.920635                    0.247005   
XGBoost                 0.75  0.805556                    0.082778   
LightGBM               0.125       0.5                    0.010972   
CatBoost                0.75  0.977778                    0.119442   
MLP                     0.75       1.0                    0.513627   
DNN                     0.75  0.922222                     0.23946   
DCN                    0.875  0.927778                    0.297203   
Wide_and_Deep          0.875  0.830556                    0.449732   

                    Inference Time (Best Params) Computation Time (Total)  \
Logistic Reg

In [55]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for XGBoost
    xgb_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }

    # Train XGBoost model
    xgb_model = XGBClassifier(**xgb_params, use_label_encoder=False, eval_metric='mlogloss')
    xgb_model.fit(X_train_scaled, y_train)
    
    # Extract features using XGBoost
    X_train_transformed = xgb_model.apply(X_train_scaled)
    X_test_transformed = xgb_model.apply(X_test_scaled)
    
    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    y_train_tensor = torch.LongTensor(y_train.values).to(device)
    y_test_tensor = torch.LongTensor(y_test.values).to(device)
    
    # Define hyperparameters to tune for Neural Network
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train_transformed.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_transformed_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final XGBoost model with the best hyperparameters
xgb_best_params = {
    'n_estimators': best_params['n_estimators'],
    'max_depth': best_params['max_depth'],
    'learning_rate': best_params['xgb_learning_rate'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree']
}
xgb_model = XGBClassifier(**xgb_best_params, use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_scaled, y_train)

# Extract features using XGBoost
X_train_transformed = xgb_model.apply(X_train_scaled)
X_test_transformed = xgb_model.apply(X_test_scaled)

# Convert to PyTorch tensors
X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

# Train the final Neural Network model with the best hyperparameters
nn_best_params = {
    'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
    'learning_rate': best_params['nn_learning_rate'],
    'batch_size': best_params['batch_size'],
    'num_epochs': best_params['num_epochs']
}
input_dim = X_train_transformed.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers'], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(nn_best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_transformed_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['XGBoost + NN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:47:26,601] A new study created in memory with name: no-name-14cde900-b97d-4cbb-adbc-5872cc99c59a


Using device: cpu


  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
Parameters: { "use_label_encoder" } are not used.

  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:47:27,143] Trial 0 finished with value: 0.875 and parameters: {'n_estimators': 204, 'max_depth': 7, 'xgb_learning_rate': 0.0010371340988060615, 'subsample': 0.6175172180456624, 'colsample_bytree': 0.7451200216773843, 'hidden_layer_0': 127, 'hidden_layer_1': 88, 'hidden_layer_2': 92, 'nn_learning_rate': 0.0041130999314882115, 'batch_size': 256, 'num_epochs': 77}. Best is trial 0 with value: 0.875.
  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
Parameters: { "use_label_encoder" } are not used.

  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:47:28,255] Trial 1 finished with value: 0.875 and parameters: {'n_estimators': 197, 'max_depth': 4, 'xgb_learning_rate': 0.0009457318477613953, 'subsamp

                    Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression     0.75  0.927778                    0.002991   
KNN                     0.75       1.0                    0.000996   
Decision Tree          0.875  0.931746                       0.001   
Random Forest           0.75  0.933333                    0.118717   
Gradient Boosting      0.875  0.920635                    0.247005   
XGBoost                 0.75  0.805556                    0.082778   
LightGBM               0.125       0.5                    0.010972   
CatBoost                0.75  0.977778                    0.119442   
MLP                     0.75       1.0                    0.513627   
DNN                     0.75  0.922222                     0.23946   
DCN                    0.875  0.927778                    0.297203   
Wide_and_Deep          0.875  0.830556                    0.449732   
XGBoost + NN           0.875  0.888889                    0.364206   

                   

In [56]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from lightgbm import LGBMClassifier
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for LightGBM
    lgb_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }

    # Train LightGBM model
    lgb_model = LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train_scaled, y_train)
    
    # Extract features using LightGBM
    X_train_transformed = lgb_model.predict_proba(X_train_scaled)
    X_test_transformed = lgb_model.predict_proba(X_test_scaled)
    
    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    y_train_tensor = torch.LongTensor(y_train.values).to(device)
    y_test_tensor = torch.LongTensor(y_test.values).to(device)
    
    # Define hyperparameters to tune for Neural Network
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train_transformed.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_transformed_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final LightGBM model with the best hyperparameters
lgb_best_params = {
    'n_estimators': best_params['n_estimators'],
    'max_depth': best_params['max_depth'],
    'learning_rate': best_params['lgb_learning_rate'],
    'num_leaves': best_params['num_leaves'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree']
}
lgb_model = LGBMClassifier(**lgb_best_params)
lgb_model.fit(X_train_scaled, y_train)

# Extract features using LightGBM
X_train_transformed = lgb_model.predict_proba(X_train_scaled)
X_test_transformed = lgb_model.predict_proba(X_test_scaled)

# Convert to PyTorch tensors
X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

# Train the final Neural Network model with the best hyperparameters
nn_best_params = {
    'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
    'learning_rate': best_params['nn_learning_rate'],
    'batch_size': best_params['batch_size'],
    'num_epochs': best_params['num_epochs']
}
input_dim = X_train_transformed.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers'], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(nn_best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_transformed_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['LightGBM + NN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:48:08,387] A new study created in memory with name: no-name-643bb24c-82bd-4848-b6be-2c32307b6582


Using device: cpu
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:08,639] Trial 0 finished with value: 0.375 and parameters: {'n_estimators': 135, 'max_depth': 6, 'lgb_learning_rate': 0.004344106798222012, 'num_leaves': 87, 'subsample': 0.627133286041994, 'colsample_bytree': 0.678357634226493, 'hidden_layer_0': 102, 'hidden_layer_1': 177, 'hidden_layer_2': 180, 'nn_learning_rate': 0.035613521983106015, 'batch_size': 128, 'num_epochs': 45}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:08,703] Trial 1 finished with value: 0.25 and parameters: {'n_estimators': 282, 'max_depth': 3, 'lgb_learning_rate': 0.0008018380661104405, 'num_leaves': 27, 'subsample': 0.910903275217358, 'colsample_bytree': 0.7231024885289279, 'hidd

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:09,011] Trial 2 finished with value: 0.25 and parameters: {'n_estimators': 293, 'max_depth': 6, 'lgb_learning_rate': 0.0177700758435059, 'num_leaves': 31, 'subsample': 0.7018695501799963, 'colsample_bytree': 0.6981637430902841, 'hidden_layer_0': 152, 'hidden_layer_1': 244, 'hidden_layer_2': 233, 'nn_learning_rate': 0.007261364537486744, 'batch_size': 64, 'num_epochs': 51}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:09,422] Trial 3 finished with value: 0.25 and parameters: {'n_estimators': 152, 'max_depth': 9, 'lgb_learning_rate': 0.00238015967610395, 'num_leaves': 42, 'subsample': 0.9489529461407711, 'colsample_bytree': 0.7700332527321114, 'hidden_layer_0': 179, 'hidden_layer_1': 59, 'hidden_layer_2': 181, 'nn_learning_rate': 0.0002920280837952565, 'batch_size': 256, 'num_epochs': 76}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:09,635] Trial 4 finished with value: 0.25 and parameters: {'n_estimators': 156, 'max_depth': 9, 'lgb_learning_rate': 0.001961888163322222, 'num_leaves': 82, 'subsample': 0.5804702300187159, 'colsample_bytree': 0.6964346630985988, 'hidden_layer_0': 229, 'hidden_layer_1': 210, 'hidden_layer_2': 145, 'nn_learning_rate': 0.014180753814713726, 'batch_size': 256, 'num_epochs': 36}. Best is tria

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:09,842] Trial 5 finished with value: 0.375 and parameters: {'n_estimators': 249, 'max_depth': 10, 'lgb_learning_rate': 0.002367778417159049, 'num_leaves': 90, 'subsample': 0.5884822282084095, 'colsample_bytree': 0.7017338008742362, 'hidden_layer_0': 63, 'hidden_layer_1': 37, 'hidden_layer_2': 221, 'nn_learning_rate': 0.07588812610256784, 'batch_size': 256, 'num_epochs': 31}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:10,377] Trial 6 finished with value: 0.375 and parameters: {'n_estimators': 218, 'max_depth': 4, 'lgb_learning_rate': 0.04282815445966122, 'num_leaves': 96, 'subsample': 0.5183653187149606, 'colsample_bytree': 0.8270552101547505, 'hidden_layer_0': 146, 'hidden_layer_1': 159, 'hidden_layer_2': 167, 'nn_learning_rate': 0.002326052319484496, 'batch_size': 256, 'num_epochs': 96}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:10,522] Trial 7 finished with value: 0.375 and parameters: {'n_estimators': 209, 'max_depth': 5, 'lgb_learning_rate': 0.00045632340763066004, 'num_leaves': 68, 'subsample': 0.5367865414493158, 'colsample_bytree': 0.7978401108060664, 'hidden_layer_0': 234, 'hidden_layer_1': 188, 'hidden_layer_2': 132, 'nn_learning_rate': 0.0002505476925759534, 'batch_size': 128, 'num_epochs': 25}. Best is

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:10,727] Trial 8 finished with value: 0.375 and parameters: {'n_estimators': 52, 'max_depth': 3, 'lgb_learning_rate': 0.03603541675597387, 'num_leaves': 70, 'subsample': 0.5975970688301913, 'colsample_bytree': 0.7130265047251779, 'hidden_layer_0': 130, 'hidden_layer_1': 140, 'hidden_layer_2': 193, 'nn_learning_rate': 0.00046447336244987716, 'batch_size': 128, 'num_epochs': 38}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:10,994] Trial 9 finished with value: 0.375 and parameters: {'n_estimators': 256, 'max_depth': 9, 'lgb_learning_rate': 0.0005482527019910094, 'num_leaves': 96, 'subsample': 0.5256009648916088, 'colsample_bytree': 0.9713035113771805, 'hidden_layer_0': 203, 'hidden_layer_1': 137, 'hidden_layer_2': 247, 'nn_learning_rate': 0.0010384555266015557, 'batch_size': 32, 'num_epochs': 42}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:11,375] Trial 10 finished with value: 0.25 and parameters: {'n_estimators': 95, 'max_depth': 7, 'lgb_learning_rate': 0.00010007883783423802, 'num_leaves': 51, 'subsample': 0.7728659838818752, 'colsample_bytree': 0.5128506987562584, 'hidden_layer_0': 32, 'hidden_layer_1': 97, 'hidden_layer_2': 73, 'nn_learning_rate': 0.09372371688506563, 'batch_size': 128, 'num_epochs': 68}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:11,585] Trial 11 finished with value: 0.375 and parameters: {'n_estimators': 127, 'max_depth': 7, 'lgb_learning_rate': 0.007173957459639556, 'num_leaves': 82, 'subsample': 0.6800614040408063, 'colsample_bytree': 0.5852411397718272, 'hidden_layer_0': 66, 'hidden_layer_1': 98, 'hidden_layer_2': 207, 'nn_learning_rate': 0.0808821429728747, 'batch_size': 32, 'num_epochs': 22}. Best is trial 0 

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:11,958] Trial 12 finished with value: 0.25 and parameters: {'n_estimators': 205, 'max_depth': 10, 'lgb_learning_rate': 0.007375551736030586, 'num_leaves': 84, 'subsample': 0.6487233058281614, 'colsample_bytree': 0.6217810508935546, 'hidden_layer_0': 80, 'hidden_layer_1': 88, 'hidden_layer_2': 103, 'nn_learning_rate': 0.029165665687270748, 'batch_size': 256, 'num_epochs': 61}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:12,298] Trial 13 finished with value: 0.25 and parameters: {'n_estimators': 102, 'max_depth': 6, 'lgb_learning_rate': 0.0069973479433327705, 'num_leaves': 99, 'subsample': 0.7846070490509167, 'colsample_bytree': 0.8852422958695563, 'hidden_layer_0': 90, 'hidden_layer_1': 183, 'hidden_layer_2': 40, 'nn_learning_rate': 0.03208166245568093, 'batch_size': 128, 'num_epochs': 50}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:12,548] Trial 14 finished with value: 0.375 and parameters: {'n_estimators': 238, 'max_depth': 8, 'lgb_learning_rate': 0.0011242032746784252, 'num_leaves': 67, 'subsample': 0.8485218077810216, 'colsample_bytree': 0.6224805854020403, 'hidden_layer_0': 32, 'hidden_layer_1': 241, 'hidden_layer_2': 209, 'nn_learning_rate': 0.034601170956689, 'batch_size': 128, 'num_epochs': 27}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:13,033] Trial 15 finished with value: 0.25 and parameters: {'n_estimators': 181, 'max_depth': 5, 'lgb_learning_rate': 0.00017256247355594112, 'num_leaves': 87, 'subsample': 0.6348558741520534, 'colsample_bytree': 0.6413250387156253, 'hidden_layer_0': 112, 'hidden_layer_1': 120, 'hidden_layer_2': 155, 'nn_learning_rate': 0.012224772258098211, 'batch_size': 256, 'num_epochs': 84}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:13,222] Trial 16 finished with value: 0.375 and parameters: {'n_estimators': 177, 'max_depth': 10, 'lgb_learning_rate': 0.004443500948286934, 'num_leaves': 56, 'subsample': 0.7266110656589441, 'colsample_bytree': 0.5423412262260742, 'hidden_layer_0': 73, 'hidden_layer_1': 175, 'hidden_layer_2': 115, 'nn_learning_rate': 0.0027547684074587745, 'batch_size': 64, 'num_epochs': 13}. Best i

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:13,663] Trial 17 finished with value: 0.25 and parameters: {'n_estimators': 253, 'max_depth': 8, 'lgb_learning_rate': 0.013773222895002905, 'num_leaves': 75, 'subsample': 0.5993610945891761, 'colsample_bytree': 0.8657673039509162, 'hidden_layer_0': 103, 'hidden_layer_1': 214, 'hidden_layer_2': 223, 'nn_learning_rate': 0.05905866980041131, 'batch_size': 32, 'num_epochs': 59}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:13,965] Trial 18 finished with value: 0.25 and parameters: {'n_estimators': 68, 'max_depth': 5, 'lgb_learning_rate': 0.0016348641122665728, 'num_leaves': 89, 'subsample': 0.647447960839574, 'colsample_bytree': 0.6596803184938218, 'hidden_layer_0': 62, 'hidden_layer_1': 74, 'hidden_layer_2': 191, 'nn_learning_rate': 0.018051811251142417, 'batch_size': 256, 'num_epochs': 44}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:14,229] Trial 19 finished with value: 0.375 and parameters: {'n_estimators': 126, 'max_depth': 8, 'lgb_learning_rate': 0.09623445755228234, 'num_leaves': 76, 'subsample': 0.5601377385802258, 'colsample_bytree': 0.5701265132024069, 'hidden_layer_0': 51, 'hidden_layer_1': 120, 'hidden_layer_2': 171, 'nn_learning_rate': 0.005214117881031498, 'batch_size': 128, 'num_epochs': 32}. Best is trial

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:14,439] Trial 20 finished with value: 0.375 and parameters: {'n_estimators': 148, 'max_depth': 7, 'lgb_learning_rate': 0.00040685396267511263, 'num_leaves': 61, 'subsample': 0.8274949808552886, 'colsample_bytree': 0.7499437763657953, 'hidden_layer_0': 165, 'hidden_layer_1': 37, 'hidden_layer_2': 227, 'nn_learning_rate': 0.04934587842896551, 'batch_size': 128, 'num_epochs': 21}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:15,096] Trial 21 finished with value: 0.375 and parameters: {'n_estimators': 218, 'max_depth': 4, 'lgb_learning_rate': 0.08100348534347629, 'num_leaves': 92, 'subsample': 0.5113265994474822, 'colsample_bytree': 0.8306041706714573, 'hidden_layer_0': 139, 'hidden_layer_1': 162, 'hidden_layer_2': 169, 'nn_learning_rate': 0.0023980118111536016, 'batch_size': 256, 'num_epochs': 98}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:16,337] Trial 22 finished with value: 0.375 and parameters: {'n_estimators': 233, 'max_depth': 4, 'lgb_learning_rate': 0.026264400508879036, 'num_leaves': 99, 'subsample': 0.6032177781665184, 'colsample_bytree': 0.9023133981917331, 'hidden_layer_0': 99, 'hidden_layer_1': 158, 'hidden_layer_2': 156, 'nn_learning_rate': 0.000985211637536405, 'batch_size': 256, 'num_epochs': 100}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:16,926] Trial 23 finished with value: 0.375 and parameters: {'n_estimators': 266, 'max_depth': 4, 'lgb_learning_rate': 0.0037874425990602045, 'num_leaves': 92, 'subsample': 0.5530022979578257, 'colsample_bytree': 0.806616985822967, 'hidden_layer_0': 188, 'hidden_layer_1': 213, 'hidden_layer_2': 202, 'nn_learning_rate': 0.0013489152142408846, 'batch_size': 256, 'num_epochs': 80}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:17,485] Trial 24 finished with value: 0.375 and parameters: {'n_estimators': 198, 'max_depth': 6, 'lgb_learning_rate': 0.05627674682876316, 'num_leaves': 78, 'subsample': 0.5081371731998021, 'colsample_bytree': 0.9285771213132018, 'hidden_layer_0': 150, 'hidden_layer_1': 121, 'hidden_layer_2': 133, 'nn_learning_rate': 0.0037463722583148083, 'batch_size': 256, 'num_epochs': 89}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:17,924] Trial 25 finished with value: 0.375 and parameters: {'n_estimators': 230, 'max_depth': 5, 'lgb_learning_rate': 0.012872367172915575, 'num_leaves': 93, 'subsample': 0.6823161550928606, 'colsample_bytree': 0.7542547127199513, 'hidden_layer_0': 121, 'hidden_layer_1': 192, 'hidden_layer_2': 176, 'nn_learning_rate': 0.022292990306568634, 'batch_size': 256, 'num_epochs': 69}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:18,315] Trial 26 finished with value: 0.375 and parameters: {'n_estimators': 189, 'max_depth': 3, 'lgb_learning_rate': 0.003826991189587347, 'num_leaves': 99, 'subsample': 0.6223541681959035, 'colsample_bytree': 0.8405257595228213, 'hidden_layer_0': 54, 'hidden_layer_1': 160, 'hidden_layer_2': 219, 'nn_learning_rate': 0.010560736767986188, 'batch_size': 64, 'num_epochs': 47}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:18,707] Trial 27 finished with value: 0.25 and parameters: {'n_estimators': 275, 'max_depth': 4, 'lgb_learning_rate': 0.03146212803629289, 'num_leaves': 87, 'subsample': 0.5634001113993131, 'colsample_bytree': 0.6620829990544586, 'hidden_layer_0': 89, 'hidden_layer_1': 150, 'hidden_layer_2': 89, 'nn_learning_rate': 0.00010650994726848133, 'batch_size': 32, 'num_epochs': 56}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:19,171] Trial 28 finished with value: 0.25 and parameters: {'n_estimators': 299, 'max_depth': 6, 'lgb_learning_rate': 0.0012151031840681894, 'num_leaves': 72, 'subsample': 0.7312862626169214, 'colsample_bytree': 0.7786584615292467, 'hidden_layer_0': 109, 'hidden_layer_1': 171, 'hidden_layer_2': 191, 'nn_learning_rate': 0.038400102710106285, 'batch_size': 256, 'num_epochs': 65}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:19,355] Trial 29 finished with value: 0.375 and parameters: {'n_estimators': 244, 'max_depth': 7, 'lgb_learning_rate': 0.0007368192932976072, 'num_leaves': 26, 'subsample': 0.6690871714654332, 'colsample_bytree': 0.7233253880240917, 'hidden_layer_0': 128, 'hidden_layer_1': 54, 'hidden_layer_2': 244, 'nn_learning_rate': 0.007608384456651124, 'batch_size': 64, 'num_epochs': 12}. Best is 

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:19,682] Trial 30 finished with value: 0.375 and parameters: {'n_estimators': 162, 'max_depth': 3, 'lgb_learning_rate': 0.00021216953666813447, 'num_leaves': 46, 'subsample': 0.5008683340969677, 'colsample_bytree': 0.6809161467810023, 'hidden_layer_0': 158, 'hidden_layer_1': 199, 'hidden_layer_2': 256, 'nn_learning_rate': 0.002006023395398832, 'batch_size': 128, 'num_epochs': 32}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:19,961] Trial 31 finished with value: 0.375 and parameters: {'n_estimators': 214, 'max_depth': 5, 'lgb_learning_rate': 0.000343179411863547, 'num_leaves': 66, 'subsample': 0.5463551817097777, 'colsample_bytree': 0.7977677387987683, 'hidden_layer_0': 239, 'hidden_layer_1': 227, 'hidden_layer_2': 136, 'nn_learning_rate': 0.00018125854729103007, 'batch_size': 128, 'num_epochs': 21}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:20,238] Trial 32 finished with value: 0.375 and parameters: {'n_estimators': 219, 'max_depth': 5, 'lgb_learning_rate': 0.0010062931152423045, 'num_leaves': 35, 'subsample': 0.5321661074259103, 'colsample_bytree': 0.734297981760575, 'hidden_layer_0': 246, 'hidden_layer_1': 199, 'hidden_layer_2': 119, 'nn_learning_rate': 0.00037597730437771194, 'batch_size': 128, 'num_epochs': 27}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:20,647] Trial 33 finished with value: 0.375 and parameters: {'n_estimators': 281, 'max_depth': 6, 'lgb_learning_rate': 0.0022739476740252446, 'num_leaves': 80, 'subsample': 0.5754572323182072, 'colsample_bytree': 0.8292215951426952, 'hidden_layer_0': 211, 'hidden_layer_1': 185, 'hidden_layer_2': 154, 'nn_learning_rate': 0.0006860353154845884, 'batch_size': 128, 'num_epochs': 39}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:21,053] Trial 34 finished with value: 0.25 and parameters: {'n_estimators': 133, 'max_depth': 4, 'lgb_learning_rate': 0.010820769092394555, 'num_leaves': 60, 'subsample': 0.6166606748195517, 'colsample_bytree': 0.7869344533241622, 'hidden_layer_0': 220, 'hidden_layer_1': 132, 'hidden_layer_2': 165, 'nn_learning_rate': 0.00010488415639743368, 'batch_size': 128, 'num_epochs': 52}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:21,368] Trial 35 finished with value: 0.375 and parameters: {'n_estimators': 197, 'max_depth': 5, 'lgb_learning_rate': 0.05013273737699614, 'num_leaves': 84, 'subsample': 0.5799185685913761, 'colsample_bytree': 0.7029464761883252, 'hidden_layer_0': 256, 'hidden_layer_1': 168, 'hidden_layer_2': 125, 'nn_learning_rate': 0.00019978203959116072, 'batch_size': 256, 'num_epochs': 33}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:21,647] Trial 36 finished with value: 0.25 and parameters: {'n_estimators': 165, 'max_depth': 6, 'lgb_learning_rate': 0.018836368910118435, 'num_leaves': 94, 'subsample': 0.9297435284203068, 'colsample_bytree': 0.7542459059767115, 'hidden_layer_0': 173, 'hidden_layer_1': 252, 'hidden_layer_2': 182, 'nn_learning_rate': 0.004286876758570333, 'batch_size': 64, 'num_epochs': 26}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:21,871] Trial 37 finished with value: 0.25 and parameters: {'n_estimators': 263, 'max_depth': 9, 'lgb_learning_rate': 0.0006255045316460203, 'num_leaves': 89, 'subsample': 0.5386684312286147, 'colsample_bytree': 0.6859197197988874, 'hidden_layer_0': 193, 'hidden_layer_1': 150, 'hidden_layer_2': 145, 'nn_learning_rate': 0.0005939751603566032, 'batch_size': 128, 'num_epochs': 16}. Best is 

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:22,988] Trial 38 finished with value: 0.25 and parameters: {'n_estimators': 113, 'max_depth': 3, 'lgb_learning_rate': 0.0002734230942658305, 'num_leaves': 71, 'subsample': 0.578430591951557, 'colsample_bytree': 0.862748204526536, 'hidden_layer_0': 136, 'hidden_layer_1': 222, 'hidden_layer_2': 106, 'nn_learning_rate': 0.06609061470582059, 'batch_size': 256, 'num_epochs': 41}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:23,271] Trial 39 finished with value: 0.375 and parameters: {'n_estimators': 148, 'max_depth': 4, 'lgb_learning_rate': 0.00012076837938316001, 'num_leaves': 36, 'subsample': 0.5317332021081886, 'colsample_bytree': 0.9809981267194415, 'hidden_layer_0': 46, 'hidden_layer_1': 53, 'hidden_layer_2': 240, 'nn_learning_rate': 0.0015526126560255018, 'batch_size': 32, 'num_epochs': 36}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:23,848] Trial 40 finished with value: 0.375 and parameters: {'n_estimators': 208, 'max_depth': 7, 'lgb_learning_rate': 0.0029107482286498316, 'num_leaves': 96, 'subsample': 0.995846217098215, 'colsample_bytree': 0.9448711857392207, 'hidden_layer_0': 83, 'hidden_layer_1': 181, 'hidden_layer_2': 213, 'nn_learning_rate': 0.09112920604470012, 'batch_size': 128, 'num_epochs': 76}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:24,227] Trial 41 finished with value: 0.375 and parameters: {'n_estimators': 95, 'max_depth': 3, 'lgb_learning_rate': 0.033588902532500964, 'num_leaves': 67, 'subsample': 0.6025954597777767, 'colsample_bytree': 0.710703339424055, 'hidden_layer_0': 122, 'hidden_layer_1': 138, 'hidden_layer_2': 194, 'nn_learning_rate': 0.00042296477768859425, 'batch_size': 128, 'num_epochs': 48}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:24,587] Trial 42 finished with value: 0.25 and parameters: {'n_estimators': 56, 'max_depth': 3, 'lgb_learning_rate': 0.02007526191476051, 'num_leaves': 51, 'subsample': 0.6607437692380673, 'colsample_bytree': 0.5969696251649694, 'hidden_layer_0': 95, 'hidden_layer_1': 109, 'hidden_layer_2': 181, 'nn_learning_rate': 0.0007700353197788623, 'batch_size': 128, 'num_epochs': 37}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:24,912] Trial 43 finished with value: 0.25 and parameters: {'n_estimators': 73, 'max_depth': 4, 'lgb_learning_rate': 0.006120527814637326, 'num_leaves': 83, 'subsample': 0.7033946988672379, 'colsample_bytree': 0.8097961969534013, 'hidden_layer_0': 76, 'hidden_layer_1': 86, 'hidden_layer_2': 207, 'nn_learning_rate': 0.00019955550449302057, 'batch_size': 128, 'num_epochs': 43}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:25,163] Trial 44 finished with value: 0.25 and parameters: {'n_estimators': 225, 'max_depth': 10, 'lgb_learning_rate': 0.05345931173303852, 'num_leaves': 72, 'subsample': 0.5965127058571011, 'colsample_bytree': 0.6692052215874396, 'hidden_layer_0': 111, 'hidden_layer_1': 69, 'hidden_layer_2': 197, 'nn_learning_rate': 0.0002886067105674396, 'batch_size': 128, 'num_epochs': 28}. Best is tr

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:25,416] Trial 45 finished with value: 0.25 and parameters: {'n_estimators': 245, 'max_depth': 5, 'lgb_learning_rate': 0.0004464498944941391, 'num_leaves': 59, 'subsample': 0.6298482280940679, 'colsample_bytree': 0.6418510113846818, 'hidden_layer_0': 134, 'hidden_layer_1': 205, 'hidden_layer_2': 234, 'nn_learning_rate': 0.00048174942636793777, 'batch_size': 256, 'num_epochs': 17}. Best is trial 0 with value: 0.375.


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:25,790] Trial 46 finished with value: 0.25 and parameters: {'n_estimators': 78, 'max_depth': 6, 'lgb_learning_rate': 0.009510801583720377, 'num_leaves': 64, 'subsample': 0.5211628185977476, 'colsample_bytree': 0.7730863366001575, 'hidden_layer_0': 169, 'hidden_layer_1': 145, 'hidden_layer_2': 45, 'nn_learning_rate': 0.00029318280039266936, 'batch_size': 128, 'num_epochs': 52}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


[I 2024-07-26 07:48:26,683] Trial 47 finished with value: 0.25 and parameters: {'n_estimators': 179, 'max_depth': 9, 'lgb_learning_rate': 0.0016778100197626022, 'num_leaves': 54, 'subsample': 0.6945726339776863, 'colsample_bytree': 0.7340095710074102, 'hidden_layer_0': 146, 'hidden_layer_1': 192, 'hidden_layer_2': 164, 'nn_learning_rate': 0.022579975826739977, 'batch_size': 32, 'num_epochs': 45}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:27,634] Trial 48 finished with value: 0.375 and parameters: {'n_estimators': 51, 'max_depth': 3, 'lgb_learning_rate': 0.04350613731361228, 'num_leaves': 87, 'subsample': 0.647892252543757, 'colsample_bytree': 0.6212991141149389, 'hidden_layer_0': 68, 'hidden_layer_1': 233, 'hidden_layer_2': 185, 'nn_learning_rate': 0.007821446893751976, 'batch_size': 256, 'num_epochs': 94}. Best is trial 0 with value: 0.375.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:27,904] Trial 49 finished with value: 0.375 and parameters: {'n_estimators': 113, 'max_depth': 4, 'lgb_learning_rate': 0.005865934273537366, 'num_leaves': 20, 'subsample': 0.8588704621929674, 'colsample_bytree': 0.7078553378402648, 'hidden_layer_0': 40, 'hidden_layer_1': 177, 'hidden_layer_2': 158, 'nn_learning_r

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.067841
                    Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression     0.75  0.927778                    0.002991   
KNN                     0.75       1.0                    0.000996   
Decision Tree          0.875  0.931746                       0.001   
Random Forest           0.75  0.933333                    0.118717   
Gradient Boosting      0.875  0.920635                    0.247005   
XGBoost         

In [57]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for CatBoost
    catboost_params = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('catboost_learning_rate', 1e-4, 1e-1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-4, 1e-1),
        'border_count': trial.suggest_int('border_count', 32, 255)
    }

    # Train CatBoost model
    catboost_model = CatBoostClassifier(**catboost_params, verbose=0)
    catboost_model.fit(X_train_scaled, y_train)
    
    # Extract features using CatBoost
    X_train_transformed = catboost_model.predict_proba(X_train_scaled)
    X_test_transformed = catboost_model.predict_proba(X_test_scaled)
    
    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    y_train_tensor = torch.LongTensor(y_train.values).to(device)
    y_test_tensor = torch.LongTensor(y_test.values).to(device)
    
    # Define hyperparameters to tune for Neural Network
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train_transformed.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_transformed_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final CatBoost model with the best hyperparameters
catboost_best_params = {
    'iterations': best_params['iterations'],
    'depth': best_params['depth'],
    'learning_rate': best_params['catboost_learning_rate'],
    'l2_leaf_reg': best_params['l2_leaf_reg'],
    'border_count': best_params['border_count']
}
catboost_model = CatBoostClassifier(**catboost_best_params, verbose=0)
catboost_model.fit(X_train_scaled, y_train)

# Extract features using CatBoost
X_train_transformed = catboost_model.predict_proba(X_train_scaled)
X_test_transformed = catboost_model.predict_proba(X_test_scaled)

# Convert to PyTorch tensors
X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

# Train the final Neural Network model with the best hyperparameters
nn_best_params = {
    'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
    'learning_rate': best_params['nn_learning_rate'],
    'batch_size': best_params['batch_size'],
    'num_epochs': best_params['num_epochs']
}
input_dim = X_train_transformed.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers'], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(nn_best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_transformed_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['CatBoost + NN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:48:28,289] A new study created in memory with name: no-name-c0bbadbb-c617-46f0-8765-0f91504a61c6


Using device: cpu


  'learning_rate': trial.suggest_loguniform('catboost_learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:34,396] Trial 0 finished with value: 0.625 and parameters: {'iterations': 153, 'depth': 9, 'catboost_learning_rate': 0.01773245000576407, 'l2_leaf_reg': 0.003396809801524944, 'border_count': 108, 'hidden_layer_0': 133, 'hidden_layer_1': 104, 'hidden_layer_2': 81, 'nn_learning_rate': 0.03713133597993742, 'batch_size': 64, 'num_epochs': 100}. Best is trial 0 with value: 0.625.
  'learning_rate': trial.suggest_loguniform('catboost_learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:48:35,377] Trial 1 finished with value: 0.75 and parameters: {'iterations': 85, 'depth': 8, 'catboost_learning_rate': 0.052

                    Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression     0.75  0.927778                    0.002991   
KNN                     0.75       1.0                    0.000996   
Decision Tree          0.875  0.931746                       0.001   
Random Forest           0.75  0.933333                    0.118717   
Gradient Boosting      0.875  0.920635                    0.247005   
XGBoost                 0.75  0.805556                    0.082778   
LightGBM               0.125       0.5                    0.010972   
CatBoost                0.75  0.977778                    0.119442   
MLP                     0.75       1.0                    0.513627   
DNN                     0.75  0.922222                     0.23946   
DCN                    0.875  0.927778                    0.297203   
Wide_and_Deep          0.875  0.830556                    0.449732   
XGBoost + NN           0.875  0.888889                    0.364206   
LightGBM + NN       

In [58]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class AutoInt(nn.Module):
    def __init__(self, input_dim, embedding_dim, num_heads, num_layers):
        super(AutoInt, self).__init__()
        self.embedding = nn.Linear(input_dim, embedding_dim)
        self.attention_layers = nn.ModuleList([
            nn.MultiheadAttention(embedding_dim, num_heads) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)
        for attn_layer in self.attention_layers:
            x, _ = attn_layer(x, x, x)
        x = x.squeeze(1)
        x = self.fc(x)
        return x

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for AutoInt
    num_heads = trial.suggest_int('num_heads', 1, 8)
    embedding_dim = trial.suggest_int('embedding_dim', num_heads, 64, step=num_heads)
    num_layers = trial.suggest_int('num_layers', 1, 3)
    
    # Train AutoInt model
    autoint_model = AutoInt(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
    optimizer = optim.Adam(autoint_model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor.float().unsqueeze(1))
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    for epoch in range(10):  # Fixed number of epochs for AutoInt
        autoint_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = autoint_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    
    # Extract features using AutoInt
    autoint_model.eval()
    with torch.no_grad():
        X_train_transformed = autoint_model.embedding(X_train_tensor).cpu().numpy()
        X_test_transformed = autoint_model.embedding(X_test_tensor).cpu().numpy()
    
    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    
    # Define hyperparameters to tune for Neural Network
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train_transformed.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_transformed_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final AutoInt model with the best hyperparameters
embedding_dim = best_params['embedding_dim']
num_heads = best_params['num_heads']
num_layers = best_params['num_layers']
autoint_model = AutoInt(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
optimizer = optim.Adam(autoint_model.parameters(), lr=0.001)
criterion = nn.MSELoss()

train_dataset = TensorDataset(X_train_tensor, y_train_tensor.float().unsqueeze(1))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

for epoch in range(10):  # Fixed number of epochs for AutoInt
    autoint_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = autoint_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

# Extract features using AutoInt
autoint_model.eval()
with torch.no_grad():
    X_train_transformed = autoint_model.embedding(X_train_tensor).cpu().numpy()
    X_test_transformed = autoint_model.embedding(X_test_tensor).cpu().numpy()

# Convert to PyTorch tensors
X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)

# Train the final Neural Network model with the best hyperparameters
nn_best_params = {
    'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
    'learning_rate': best_params['nn_learning_rate'],
    'batch_size': best_params['batch_size'],
    'num_epochs': best_params['num_epochs']
}
input_dim = X_train_transformed.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers'], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(nn_best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_transformed_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['AutoInt'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:49:17,041] A new study created in memory with name: no-name-f2c415c6-28ee-431f-8031-586e30dec0b2


Using device: cpu


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:49:17,308] Trial 0 finished with value: 0.625 and parameters: {'num_heads': 1, 'embedding_dim': 60, 'num_layers': 3, 'hidden_layer_0': 59, 'hidden_layer_1': 248, 'hidden_layer_2': 107, 'nn_learning_rate': 0.00014720027560375466, 'batch_size': 32, 'num_epochs': 17}. Best is trial 0 with value: 0.625.
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:49:17,858] Trial 1 finished with value: 0.875 and parameters: {'num_heads': 3, 'embedding_dim': 39, 'num_layers': 1, 'hidden_layer_0': 176, 'hidden_layer_1': 147, 'hidden_layer_2': 66, 'nn_learning_rate': 0.04480925362642859, 'batch_size': 256, 'num_epochs': 97}. Best is trial 1 with value: 0.875.
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:49:18,646] Trial 2 finished with value: 0.75 and parameters: {'num_heads': 7, 'embedding_dim': 42, 'num_layers': 2,

                    Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression     0.75  0.927778                    0.002991   
KNN                     0.75       1.0                    0.000996   
Decision Tree          0.875  0.931746                       0.001   
Random Forest           0.75  0.933333                    0.118717   
Gradient Boosting      0.875  0.920635                    0.247005   
XGBoost                 0.75  0.805556                    0.082778   
LightGBM               0.125       0.5                    0.010972   
CatBoost                0.75  0.977778                    0.119442   
MLP                     0.75       1.0                    0.513627   
DNN                     0.75  0.922222                     0.23946   
DCN                    0.875  0.927778                    0.297203   
Wide_and_Deep          0.875  0.830556                    0.449732   
XGBoost + NN           0.875  0.888889                    0.364206   
LightGBM + NN       

In [59]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class FTTransformer(nn.Module):
    def __init__(self, input_dim, embedding_dim, num_heads, num_layers):
        super(FTTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, embedding_dim)
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embedding_dim, input_dim)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)
        for transformer_layer in self.transformer_layers:
            x = transformer_layer(x)
        x = x.squeeze(1)
        x = self.fc(x)
        return x

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for FT-Transformer
    num_heads = trial.suggest_int('num_heads', 1, 8)
    embedding_dim = trial.suggest_int('embedding_dim', num_heads, 64, step=num_heads)
    num_layers = trial.suggest_int('num_layers', 1, 3)
    
    # Train FT-Transformer model
    ft_transformer_model = FTTransformer(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
    optimizer = optim.Adam(ft_transformer_model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor.float().unsqueeze(1))
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    for epoch in range(10):  # Fixed number of epochs for FT-Transformer
        ft_transformer_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = ft_transformer_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    
    # Extract features using FT-Transformer
    ft_transformer_model.eval()
    with torch.no_grad():
        X_train_transformed = ft_transformer_model.embedding(X_train_tensor).cpu().numpy()
        X_test_transformed = ft_transformer_model.embedding(X_test_tensor).cpu().numpy()
    
    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    
    # Define hyperparameters to tune for Neural Network
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train_transformed.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_transformed_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final FT-Transformer model with the best hyperparameters
embedding_dim = best_params['embedding_dim']
num_heads = best_params['num_heads']
num_layers = best_params['num_layers']
ft_transformer_model = FTTransformer(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
optimizer = optim.Adam(ft_transformer_model.parameters(), lr=0.001)
criterion = nn.MSELoss()

train_dataset = TensorDataset(X_train_tensor, y_train_tensor.float().unsqueeze(1))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

for epoch in range(10):  # Fixed number of epochs for FT-Transformer
    ft_transformer_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = ft_transformer_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

# Extract features using FT-Transformer
ft_transformer_model.eval()
with torch.no_grad():
    X_train_transformed = ft_transformer_model.embedding(X_train_tensor).cpu().numpy()
    X_test_transformed = ft_transformer_model.embedding(X_test_tensor).cpu().numpy()

# Convert to PyTorch tensors
X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)

# Train the final Neural Network model with the best hyperparameters
nn_best_params = {
    'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
    'learning_rate': best_params['nn_learning_rate'],
    'batch_size': best_params['batch_size'],
    'num_epochs': best_params['num_epochs']
}
input_dim = X_train_transformed.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers'], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(nn_best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_transformed_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['FT-Transformer'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:49:51,798] A new study created in memory with name: no-name-14fa1cd7-5559-4abd-8519-080704ddae98


Using device: cpu


  return F.mse_loss(input, target, reduction=self.reduction)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:50:00,084] Trial 0 finished with value: 0.875 and parameters: {'num_heads': 5, 'embedding_dim': 50, 'num_layers': 2, 'hidden_layer_0': 89, 'hidden_layer_1': 188, 'hidden_layer_2': 116, 'nn_learning_rate': 0.019592167641062218, 'batch_size': 64, 'num_epochs': 51}. Best is trial 0 with value: 0.875.
  return F.mse_loss(input, target, reduction=self.reduction)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:50:04,942] Trial 1 finished with value: 0.875 and parameters: {'num_heads': 5, 'embedding_dim': 20, 'num_layers': 3, 'hidden_layer_0': 39, 'hidden_layer_1': 216, 'hidden_layer_2': 42, 'nn_learning_rate': 0.008545646708488109, 'batch_size': 64, 'num_epochs': 95}. Best is trial 0 with value: 0.875.
  return F.mse_loss(input, target, reduction=self.reduction)
  nn_learning_rate = trial.sug

                    Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression     0.75  0.927778                    0.002991   
KNN                     0.75       1.0                    0.000996   
Decision Tree          0.875  0.931746                       0.001   
Random Forest           0.75  0.933333                    0.118717   
Gradient Boosting      0.875  0.920635                    0.247005   
XGBoost                 0.75  0.805556                    0.082778   
LightGBM               0.125       0.5                    0.010972   
CatBoost                0.75  0.977778                    0.119442   
MLP                     0.75       1.0                    0.513627   
DNN                     0.75  0.922222                     0.23946   
DCN                    0.875  0.927778                    0.297203   
Wide_and_Deep          0.875  0.830556                    0.449732   
XGBoost + NN           0.875  0.888889                    0.364206   
LightGBM + NN       

In [60]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for Neural Network
    num_layers = trial.suggest_int('num_layers', 1, 5)
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(num_layers)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final Neural Network model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, 
                           [best_params[f'hidden_layer_{i}'] for i in range(best_params['num_layers'])], 
                           output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['Neural Architecture Search'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:50:36,225] A new study created in memory with name: no-name-ff75fcac-0335-43eb-b5f4-33102e0bea4b


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:50:37,197] Trial 0 finished with value: 0.75 and parameters: {'num_layers': 1, 'hidden_layer_0': 82, 'learning_rate': 0.009076696335475115, 'batch_size': 64, 'num_epochs': 59}. Best is trial 0 with value: 0.75.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:50:37,425] Trial 1 finished with value: 0.625 and parameters: {'num_layers': 2, 'hidden_layer_0': 72, 'hidden_layer_1': 64, 'learning_rate': 0.00045258260109776665, 'batch_size': 32, 'num_epochs': 72}. Best is trial 0 with value: 0.75.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:50:37,651] Trial 2 finished with value: 0.75 and parameters: {'num_layers': 5, 'hidden_layer_0': 148, 'hidden_layer_1': 208, 'hidden_layer_2': 140, 'hidden_layer_3': 51, 'hidden_layer_4': 250, 'learning_rate': 0.03738912571197548, 'batch_size': 128, 'num_epochs': 33}. Best is trial 0 w

                           Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression            0.75  0.927778                    0.002991   
KNN                            0.75       1.0                    0.000996   
Decision Tree                 0.875  0.931746                       0.001   
Random Forest                  0.75  0.933333                    0.118717   
Gradient Boosting             0.875  0.920635                    0.247005   
XGBoost                        0.75  0.805556                    0.082778   
LightGBM                      0.125       0.5                    0.010972   
CatBoost                       0.75  0.977778                    0.119442   
MLP                            0.75       1.0                    0.513627   
DNN                            0.75  0.922222                     0.23946   
DCN                           0.875  0.927778                    0.297203   
Wide_and_Deep                 0.875  0.830556                    0.449732   

In [61]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class NODE(nn.Module):
    def __init__(self, input_dim, num_layers, num_trees, tree_dim, output_dim):
        super(NODE, self).__init__()
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            layer = nn.ModuleList()
            for _ in range(num_trees):
                tree = nn.Sequential(
                    nn.Linear(input_dim, tree_dim),
                    nn.ReLU(),
                    nn.Linear(tree_dim, 1)
                )
                layer.append(tree)
            self.layers.append(layer)
        self.output = nn.Linear(num_layers * num_trees, output_dim)

    def forward(self, x):
        tree_outputs = []
        for layer in self.layers:
            layer_outputs = []
            for tree in layer:
                layer_outputs.append(tree(x))
            layer_output = torch.cat(layer_outputs, dim=1)
            tree_outputs.append(layer_output)
        x = torch.cat(tree_outputs, dim=1)
        return self.output(x)

def objective(trial):
    # Define hyperparameters to tune for NODE
    num_layers = trial.suggest_int('num_layers', 1, 5)
    num_trees = trial.suggest_int('num_trees', 1, 10)
    tree_dim = trial.suggest_int('tree_dim', 8, 64)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the NODE model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = NODE(input_dim, num_layers, num_trees, tree_dim, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final NODE model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = NODE(input_dim, 
                  best_params['num_layers'], 
                  best_params['num_trees'], 
                  best_params['tree_dim'], 
                  output_dim).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    
    _, predicted = torch.max(outputs, 1)
    y_pred = predicted.cpu().numpy()
    proba = torch.softmax(outputs, dim=1).cpu().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        auc = roc_auc_score(y_true, proba[:, 1])
    else:  # Multi-class classification
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['NODE'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:50:53,824] A new study created in memory with name: no-name-cb598485-e337-47ff-a458-342b40d7bac2


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:50:54,275] Trial 0 finished with value: 0.75 and parameters: {'num_layers': 4, 'num_trees': 5, 'tree_dim': 59, 'learning_rate': 0.006039884739845829, 'batch_size': 32, 'num_epochs': 21}. Best is trial 0 with value: 0.75.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:50:56,201] Trial 1 finished with value: 0.75 and parameters: {'num_layers': 5, 'num_trees': 7, 'tree_dim': 51, 'learning_rate': 0.005037138471410539, 'batch_size': 256, 'num_epochs': 83}. Best is trial 0 with value: 0.75.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:50:56,733] Trial 2 finished with value: 0.75 and parameters: {'num_layers': 3, 'num_trees': 2, 'tree_dim': 43, 'learning_rate': 0.014640401333604208, 'batch_size': 32, 'num_epochs': 94}. Best is trial 0 with value: 0.75.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-

                           Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression            0.75  0.927778                    0.002991   
KNN                            0.75       1.0                    0.000996   
Decision Tree                 0.875  0.931746                       0.001   
Random Forest                  0.75  0.933333                    0.118717   
Gradient Boosting             0.875  0.920635                    0.247005   
XGBoost                        0.75  0.805556                    0.082778   
LightGBM                      0.125       0.5                    0.010972   
CatBoost                       0.75  0.977778                    0.119442   
MLP                            0.75       1.0                    0.513627   
DNN                            0.75  0.922222                     0.23946   
DCN                           0.875  0.927778                    0.297203   
Wide_and_Deep                 0.875  0.830556                    0.449732   

In [62]:
import pandas as pd
import numpy as np
import time
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from pytorch_tabnet.tab_model import TabNetClassifier
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

def objective(trial):
    # Define hyperparameters to tune for TabNet
    n_d = trial.suggest_int('n_d', 8, 64)
    n_a = trial.suggest_int('n_a', 8, 64)
    n_steps = trial.suggest_int('n_steps', 3, 10)
    gamma = trial.suggest_float('gamma', 1.0, 2.0)
    lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the TabNet model
    model = TabNetClassifier(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        lambda_sparse=lambda_sparse,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=learning_rate),
        device_name=device
    )

    # Training
    model.fit(
        X_train=X_train_scaled, y_train=y_train.values,
        eval_set=[(X_test_scaled, y_test.values)],
        eval_name=['val'],
        eval_metric=['accuracy'],
        max_epochs=num_epochs,
        patience=10,
        batch_size=batch_size,
        virtual_batch_size=batch_size // 2,
        num_workers=0,
        drop_last=False
    )

    # Evaluation
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final TabNet model with the best hyperparameters
best_model = TabNetClassifier(
    n_d=best_params['n_d'],
    n_a=best_params['n_a'],
    n_steps=best_params['n_steps'],
    gamma=best_params['gamma'],
    lambda_sparse=best_params['lambda_sparse'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=best_params['learning_rate']),
    device_name=device
)

training_start_time = time.time()
best_model.fit(
    X_train=X_train_scaled, y_train=y_train.values,
    eval_set=[(X_test_scaled, y_test.values)],
    eval_name=['val'],
    eval_metric=['accuracy'],
    max_epochs=best_params['num_epochs'],
    patience=10,
    batch_size=best_params['batch_size'],
    virtual_batch_size=best_params['batch_size'] // 2,
    num_workers=0,
    drop_last=False
)
training_time = time.time() - training_start_time

# Evaluation
y_pred = best_model.predict(X_test_scaled)
inference_start_time = time.time()
y_pred_proba = best_model.predict_proba(X_test_scaled)
inference_time = time.time() - inference_start_time

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
if len(np.unique(y)) == 2:  # Binary classification
    auc = roc_auc_score(y_test, y_pred_proba[:, 1])
else:  # Multiclass classification
    auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['TabNet'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:51:28,267] A new study created in memory with name: no-name-65d91aa5-2e31-4c38-9900-2149d78c7c9e


Using device: cpu


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.03484 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.53832 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 2.39976 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 2.10387 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 1.88961 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 1.67937 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 1.53497 | val_accuracy: 0.375   |  0:00:00s
epoch 7  | loss: 1.47155 | val_accuracy: 0.375   |  0:00:00s
epoch 8  | loss: 1.25748 | val_accuracy: 0.5     |  0:00:00s
epoch 9  | loss: 1.15941 | val_accuracy: 0.375   |  0:00:00s
epoch 10 | loss: 1.01234 | val_accuracy: 0.375   |  0:00:00s
epoch 11 | loss: 0.85431 | val_accuracy: 0.375   |  0:00:00s
epoch 12 | loss: 0.7321  | val_accuracy: 0.375   |  0:00:00s
epoch 13 | loss: 0.63131 | val_accuracy: 0.25    |  0:00:01s
epoch 14 | loss: 0.53783 | val_accuracy: 0.25    |  0:00:01s
epoch 15 | loss: 0.45452 | val_accuracy: 0.25    |  0:00:01s
epoch 16 | loss: 0.39329

[I 2024-07-26 07:51:29,788] Trial 0 finished with value: 0.5 and parameters: {'n_d': 19, 'n_a': 30, 'n_steps': 5, 'gamma': 1.556564525825605, 'lambda_sparse': 3.1173888195380574e-05, 'learning_rate': 0.002232208624417936, 'batch_size': 256, 'num_epochs': 96}. Best is trial 0 with value: 0.5.


epoch 18 | loss: 0.2822  | val_accuracy: 0.5     |  0:00:01s

Early stopping occurred at epoch 18 with best_epoch = 8 and best_val_accuracy = 0.5


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 5.61421 | val_accuracy: 0.125   |  0:00:00s
epoch 1  | loss: 5.22528 | val_accuracy: 0.125   |  0:00:00s
epoch 2  | loss: 5.49748 | val_accuracy: 0.125   |  0:00:00s
epoch 3  | loss: 4.9106  | val_accuracy: 0.125   |  0:00:00s
epoch 4  | loss: 4.98214 | val_accuracy: 0.125   |  0:00:00s
epoch 5  | loss: 4.70704 | val_accuracy: 0.125   |  0:00:00s
epoch 6  | loss: 4.17931 | val_accuracy: 0.25    |  0:00:00s
epoch 7  | loss: 3.97757 | val_accuracy: 0.125   |  0:00:01s
epoch 8  | loss: 3.72314 | val_accuracy: 0.125   |  0:00:01s
epoch 9  | loss: 3.44004 | val_accuracy: 0.25    |  0:00:01s
epoch 10 | loss: 2.91679 | val_accuracy: 0.25    |  0:00:01s
epoch 11 | loss: 2.9444  | val_accuracy: 0.25    |  0:00:01s
epoch 12 | loss: 2.45651 | val_accuracy: 0.25    |  0:00:01s
epoch 13 | loss: 2.21151 | val_accuracy: 0.125   |  0:00:01s
epoch 14 | loss: 2.38589 | val_accuracy: 0.125   |  0:00:01s
epoch 15 | loss: 2.23473 | val_accuracy: 0.25    |  0:00:01s


[I 2024-07-26 07:51:32,031] Trial 1 finished with value: 0.25 and parameters: {'n_d': 31, 'n_a': 59, 'n_steps': 9, 'gamma': 1.9087487269211894, 'lambda_sparse': 7.258238302276175e-05, 'learning_rate': 0.0007502316990846142, 'batch_size': 128, 'num_epochs': 48}. Best is trial 0 with value: 0.5.


epoch 16 | loss: 2.12412 | val_accuracy: 0.25    |  0:00:02s

Early stopping occurred at epoch 16 with best_epoch = 6 and best_val_accuracy = 0.25


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.31908 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.2873  | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 1.03215 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.72689 | val_accuracy: 0.75    |  0:00:00s
epoch 4  | loss: 0.31858 | val_accuracy: 0.75    |  0:00:00s
epoch 5  | loss: 0.24203 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.22771 | val_accuracy: 0.75    |  0:00:01s
epoch 7  | loss: 0.29577 | val_accuracy: 0.75    |  0:00:01s
epoch 8  | loss: 0.05836 | val_accuracy: 0.75    |  0:00:01s
epoch 9  | loss: 0.22151 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.28188 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.1801  | val_accuracy: 0.5     |  0:00:01s


[I 2024-07-26 07:51:33,970] Trial 2 finished with value: 0.75 and parameters: {'n_d': 39, 'n_a': 21, 'n_steps': 8, 'gamma': 1.3028252142102472, 'lambda_sparse': 6.617813696848974e-06, 'learning_rate': 0.010491828327412765, 'batch_size': 32, 'num_epochs': 22}. Best is trial 2 with value: 0.75.


epoch 12 | loss: 0.2458  | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.31854 | val_accuracy: 0.625   |  0:00:01s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.82017 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.59947 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 0.58559 | val_accuracy: 0.875   |  0:00:00s
epoch 3  | loss: 0.30334 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.35251 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 0.1936  | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.20486 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.06346 | val_accuracy: 0.75    |  0:00:00s
epoch 8  | loss: 0.10369 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.19772 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 0.01403 | val_accuracy: 0.5     |  0:00:01s


[I 2024-07-26 07:51:35,642] Trial 3 finished with value: 0.875 and parameters: {'n_d': 53, 'n_a': 22, 'n_steps': 9, 'gamma': 1.8647734943223964, 'lambda_sparse': 8.733634015881521e-05, 'learning_rate': 0.018643588068473157, 'batch_size': 128, 'num_epochs': 70}. Best is trial 3 with value: 0.875.


epoch 11 | loss: 0.06089 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.00304 | val_accuracy: 0.75    |  0:00:01s

Early stopping occurred at epoch 12 with best_epoch = 2 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.7942  | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 0.84023 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.69361 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 0.57125 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 0.68997 | val_accuracy: 0.75    |  0:00:00s
epoch 5  | loss: 0.43867 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.41765 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.2317  | val_accuracy: 0.5     |  0:00:00s
epoch 8  | loss: 0.30155 | val_accuracy: 0.625   |  0:00:00s
epoch 9  | loss: 0.32288 | val_accuracy: 0.5     |  0:00:00s
epoch 10 | loss: 0.12804 | val_accuracy: 0.25    |  0:00:00s
epoch 11 | loss: 0.11992 | val_accuracy: 0.25    |  0:00:00s
epoch 12 | loss: 0.05517 | val_accuracy: 0.25    |  0:00:00s
epoch 13 | loss: 0.02834 | val_accuracy: 0.375   |  0:00:00s
epoch 14 | loss: 0.11071 | val_accuracy: 0.625   |  0:00:00s

Early stopping occurred at epoch 14 with best_epoch = 4 and best_val_accuracy = 0.75

[I 2024-07-26 07:51:36,731] Trial 4 finished with value: 0.75 and parameters: {'n_d': 46, 'n_a': 23, 'n_steps': 4, 'gamma': 1.93500193295748, 'lambda_sparse': 0.0005362243130477063, 'learning_rate': 0.011396648187614777, 'batch_size': 32, 'num_epochs': 39}. Best is trial 3 with value: 0.875.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 4.25718 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 3.87252 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 2.70745 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 1.71801 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 0.83841 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 0.77875 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 0.43579 | val_accuracy: 0.25    |  0:00:00s
epoch 7  | loss: 0.22863 | val_accuracy: 0.25    |  0:00:00s
epoch 8  | loss: 0.12777 | val_accuracy: 0.375   |  0:00:00s
epoch 9  | loss: 0.06096 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 0.05203 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.04027 | val_accuracy: 0.5     |  0:00:01s
epoch 12 | loss: 0.02368 | val_accuracy: 0.5     |  0:00:01s
epoch 13 | loss: 0.01491 | val_accuracy: 0.375   |  0:00:01s
epoch 14 | loss: 0.01019 | val_accuracy: 0.375   |  0:00:01s
epoch 15 | loss: 0.0071  | val_accuracy: 0.5     |  0:00:01s
epoch 16 | loss: 0.00469

[I 2024-07-26 07:51:40,100] Trial 5 finished with value: 0.625 and parameters: {'n_d': 30, 'n_a': 54, 'n_steps': 8, 'gamma': 1.0932220007623157, 'lambda_sparse': 5.37579987029907e-06, 'learning_rate': 0.003877140174416138, 'batch_size': 256, 'num_epochs': 42}. Best is trial 3 with value: 0.875.


epoch 30 | loss: 0.00033 | val_accuracy: 0.5     |  0:00:03s

Early stopping occurred at epoch 30 with best_epoch = 20 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.70753 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 2.51915 | val_accuracy: 0.75    |  0:00:00s
epoch 2  | loss: 0.415   | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.85367 | val_accuracy: 0.75    |  0:00:00s
epoch 4  | loss: 0.33942 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.10524 | val_accuracy: 0.625   |  0:00:01s
epoch 6  | loss: 0.22911 | val_accuracy: 0.625   |  0:00:01s
epoch 7  | loss: 0.32055 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.05427 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.16377 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.01161 | val_accuracy: 0.5     |  0:00:02s
epoch 11 | loss: 0.00219 | val_accuracy: 0.5     |  0:00:02s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.75


[I 2024-07-26 07:51:42,621] Trial 6 finished with value: 0.75 and parameters: {'n_d': 59, 'n_a': 45, 'n_steps': 8, 'gamma': 1.2112221688064326, 'lambda_sparse': 7.955328501488355e-05, 'learning_rate': 0.038786769053035866, 'batch_size': 64, 'num_epochs': 38}. Best is trial 3 with value: 0.875.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.67921 | val_accuracy: 0.0     |  0:00:00s
epoch 1  | loss: 2.58568 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 1.93807 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 1.46323 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.95907 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 1.46162 | val_accuracy: 0.625   |  0:00:01s
epoch 6  | loss: 1.0283  | val_accuracy: 0.875   |  0:00:01s
epoch 7  | loss: 0.95073 | val_accuracy: 0.875   |  0:00:01s
epoch 8  | loss: 0.90242 | val_accuracy: 0.875   |  0:00:01s
epoch 9  | loss: 0.89346 | val_accuracy: 0.875   |  0:00:01s
epoch 10 | loss: 1.30834 | val_accuracy: 0.875   |  0:00:01s
epoch 11 | loss: 0.55264 | val_accuracy: 0.875   |  0:00:02s
epoch 12 | loss: 0.68679 | val_accuracy: 1.0     |  0:00:02s
epoch 13 | loss: 0.79067 | val_accuracy: 1.0     |  0:00:02s
epoch 14 | loss: 0.68402 | val_accuracy: 0.75    |  0:00:02s
epoch 15 | loss: 0.37625 | val_accuracy: 0.75    |  0:00:02s
epoch 16 | loss: 0.74194

[I 2024-07-26 07:51:46,600] Trial 7 finished with value: 1.0 and parameters: {'n_d': 57, 'n_a': 17, 'n_steps': 9, 'gamma': 1.3034245899800996, 'lambda_sparse': 7.612386327078212e-06, 'learning_rate': 0.0016123969024413201, 'batch_size': 32, 'num_epochs': 38}. Best is trial 7 with value: 1.0.


epoch 22 | loss: 0.41653 | val_accuracy: 0.625   |  0:00:03s

Early stopping occurred at epoch 22 with best_epoch = 12 and best_val_accuracy = 1.0


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.08827 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 2.94747 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 2.97685 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 2.88588 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 2.80256 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 2.72652 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 2.67119 | val_accuracy: 0.375   |  0:00:00s
epoch 7  | loss: 2.59109 | val_accuracy: 0.375   |  0:00:00s
epoch 8  | loss: 2.54188 | val_accuracy: 0.375   |  0:00:00s
epoch 9  | loss: 2.50543 | val_accuracy: 0.25    |  0:00:00s
epoch 10 | loss: 2.46434 | val_accuracy: 0.375   |  0:00:00s
epoch 11 | loss: 2.42132 | val_accuracy: 0.375   |  0:00:00s


[I 2024-07-26 07:51:47,485] Trial 8 finished with value: 0.375 and parameters: {'n_d': 18, 'n_a': 17, 'n_steps': 6, 'gamma': 1.1197562112490531, 'lambda_sparse': 7.865725547242261e-06, 'learning_rate': 0.00021391281369107567, 'batch_size': 128, 'num_epochs': 43}. Best is trial 7 with value: 1.0.


epoch 12 | loss: 2.3676  | val_accuracy: 0.375   |  0:00:00s
epoch 13 | loss: 2.3576  | val_accuracy: 0.25    |  0:00:00s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.375


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 4.60874 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 3.4687  | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 2.82399 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 2.69998 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 2.36756 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 1.77063 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 1.35233 | val_accuracy: 0.75    |  0:00:00s
epoch 7  | loss: 0.66564 | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.45678 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.53094 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.34905 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.37621 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.16669 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.16314 | val_accuracy: 0.75    |  0:00:01s
epoch 14 | loss: 0.10188 | val_accuracy: 0.75    |  0:00:01s
epoch 15 | loss: 0.06926 | val_accuracy: 0.75    |  0:00:01s
epoch 16 | loss: 0.05127

[I 2024-07-26 07:51:50,624] Trial 9 finished with value: 0.875 and parameters: {'n_d': 29, 'n_a': 53, 'n_steps': 8, 'gamma': 1.6091211852136809, 'lambda_sparse': 0.0004124409399027308, 'learning_rate': 0.003478145022163928, 'batch_size': 256, 'num_epochs': 49}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 5.28729 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 6.00497 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 5.75246 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 4.8725  | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 6.37624 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 5.94951 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 4.82025 | val_accuracy: 0.375   |  0:00:01s
epoch 7  | loss: 5.34019 | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 5.28923 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 5.02494 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 5.44393 | val_accuracy: 0.375   |  0:00:01s
epoch 11 | loss: 5.40111 | val_accuracy: 0.375   |  0:00:01s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.5


[I 2024-07-26 07:51:52,666] Trial 10 finished with value: 0.5 and parameters: {'n_d': 63, 'n_a': 37, 'n_steps': 10, 'gamma': 1.370490145223286, 'lambda_sparse': 2.7312909894349267e-06, 'learning_rate': 0.00011007196691242702, 'batch_size': 32, 'num_epochs': 12}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.62699 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 2.42588 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 0.66821 | val_accuracy: 0.625   |  0:00:01s
epoch 3  | loss: 0.63924 | val_accuracy: 0.75    |  0:00:01s
epoch 4  | loss: 0.91726 | val_accuracy: 0.75    |  0:00:01s
epoch 5  | loss: 1.67687 | val_accuracy: 0.625   |  0:00:01s
epoch 6  | loss: 0.7964  | val_accuracy: 0.5     |  0:00:01s
epoch 7  | loss: 0.44735 | val_accuracy: 0.875   |  0:00:01s
epoch 8  | loss: 0.52781 | val_accuracy: 0.75    |  0:00:01s
epoch 9  | loss: 0.74244 | val_accuracy: 0.625   |  0:00:02s
epoch 10 | loss: 0.33466 | val_accuracy: 0.625   |  0:00:02s
epoch 11 | loss: 0.04602 | val_accuracy: 0.625   |  0:00:02s
epoch 12 | loss: 0.00491 | val_accuracy: 0.5     |  0:00:02s
epoch 13 | loss: 0.30945 | val_accuracy: 0.625   |  0:00:02s
epoch 14 | loss: 0.36166 | val_accuracy: 0.625   |  0:00:02s
epoch 15 | loss: 0.10807 | val_accuracy: 0.75    |  0:00:02s
epoch 16 | loss: 0.0043 

[I 2024-07-26 07:51:56,164] Trial 11 finished with value: 0.875 and parameters: {'n_d': 50, 'n_a': 8, 'n_steps': 10, 'gamma': 1.8176929918863265, 'lambda_sparse': 1.0772402330953988e-06, 'learning_rate': 0.07057026510783988, 'batch_size': 128, 'num_epochs': 76}. Best is trial 7 with value: 1.0.



Early stopping occurred at epoch 17 with best_epoch = 7 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.71505 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.6601  | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 1.71494 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 1.75709 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 1.48737 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 1.46083 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 1.23015 | val_accuracy: 0.375   |  0:00:00s
epoch 7  | loss: 1.16298 | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 1.09208 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 1.02651 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 1.02277 | val_accuracy: 0.375   |  0:00:01s
epoch 11 | loss: 1.04077 | val_accuracy: 0.25    |  0:00:01s


[I 2024-07-26 07:51:57,984] Trial 12 finished with value: 0.5 and parameters: {'n_d': 53, 'n_a': 9, 'n_steps': 7, 'gamma': 1.7506342516499096, 'lambda_sparse': 2.6794340864588764e-05, 'learning_rate': 0.0008013126652704683, 'batch_size': 64, 'num_epochs': 70}. Best is trial 7 with value: 1.0.


epoch 12 | loss: 0.83019 | val_accuracy: 0.25    |  0:00:01s
epoch 13 | loss: 0.76777 | val_accuracy: 0.25    |  0:00:01s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.5


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.92818 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.17947 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 0.57006 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.54434 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.41692 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.33407 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 0.09777 | val_accuracy: 0.5     |  0:00:01s
epoch 7  | loss: 0.01912 | val_accuracy: 0.625   |  0:00:01s
epoch 8  | loss: 0.27756 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.22    | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.18488 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.011   | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.01699 | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.01931 | val_accuracy: 0.625   |  0:00:02s
epoch 14 | loss: 0.0041  | val_accuracy: 0.625   |  0:00:02s
epoch 15 | loss: 0.00429 | val_accuracy: 0.625   |  0:00:02s

Early stopping occurred

[I 2024-07-26 07:52:00,477] Trial 13 finished with value: 0.625 and parameters: {'n_d': 42, 'n_a': 32, 'n_steps': 10, 'gamma': 1.4293380668114093, 'lambda_sparse': 0.00014271626679335613, 'learning_rate': 0.017185852121093637, 'batch_size': 128, 'num_epochs': 65}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.17872 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.88954 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 2.67798 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 1.71385 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 2.09457 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 1.91457 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 1.99209 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 1.72819 | val_accuracy: 0.5     |  0:00:00s
epoch 8  | loss: 1.7439  | val_accuracy: 0.625   |  0:00:00s
epoch 9  | loss: 1.1715  | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 1.59925 | val_accuracy: 0.875   |  0:00:01s
epoch 11 | loss: 1.40371 | val_accuracy: 0.75    |  0:00:01s
epoch 12 | loss: 1.65445 | val_accuracy: 0.875   |  0:00:01s
epoch 13 | loss: 1.29405 | val_accuracy: 0.875   |  0:00:01s
epoch 14 | loss: 1.47996 | val_accuracy: 0.75    |  0:00:01s
epoch 15 | loss: 1.25678 | val_accuracy: 0.625   |  0:00:01s
epoch 16 | loss: 1.04543

[I 2024-07-26 07:52:02,756] Trial 14 finished with value: 0.875 and parameters: {'n_d': 56, 'n_a': 17, 'n_steps': 6, 'gamma': 1.664340522279427, 'lambda_sparse': 1.7659617696551092e-05, 'learning_rate': 0.0009069182415799667, 'batch_size': 32, 'num_epochs': 87}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.73375 | val_accuracy: 0.75    |  0:00:00s
epoch 1  | loss: 0.7706  | val_accuracy: 0.625   |  0:00:00s
epoch 2  | loss: 0.38852 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 0.14676 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.05365 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.02394 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 0.22996 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.19586 | val_accuracy: 0.5     |  0:00:00s
epoch 8  | loss: 0.03438 | val_accuracy: 0.375   |  0:00:00s


[I 2024-07-26 07:52:03,483] Trial 15 finished with value: 0.75 and parameters: {'n_d': 64, 'n_a': 26, 'n_steps': 3, 'gamma': 1.471874723678618, 'lambda_sparse': 0.00016518329868424274, 'learning_rate': 0.018406685124564573, 'batch_size': 128, 'num_epochs': 62}. Best is trial 7 with value: 1.0.


epoch 9  | loss: 0.01615 | val_accuracy: 0.375   |  0:00:00s
epoch 10 | loss: 0.01313 | val_accuracy: 0.375   |  0:00:00s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 4.01481 | val_accuracy: 0.125   |  0:00:00s
epoch 1  | loss: 2.01307 | val_accuracy: 0.125   |  0:00:00s
epoch 2  | loss: 0.90863 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.2217  | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 0.2983  | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.18222 | val_accuracy: 0.75    |  0:00:01s
epoch 6  | loss: 0.11671 | val_accuracy: 0.75    |  0:00:01s
epoch 7  | loss: 0.09933 | val_accuracy: 0.625   |  0:00:01s
epoch 8  | loss: 0.07166 | val_accuracy: 0.75    |  0:00:01s
epoch 9  | loss: 0.1104  | val_accuracy: 0.75    |  0:00:01s
epoch 10 | loss: 0.07476 | val_accuracy: 0.75    |  0:00:01s
epoch 11 | loss: 0.13674 | val_accuracy: 0.875   |  0:00:01s
epoch 12 | loss: 0.05318 | val_accuracy: 0.875   |  0:00:02s
epoch 13 | loss: 0.09581 | val_accuracy: 0.875   |  0:00:02s
epoch 14 | loss: 0.02542 | val_accuracy: 0.875   |  0:00:02s
epoch 15 | loss: 0.03673 | val_accuracy: 0.75    |  0:00:02s
epoch 16 | loss: 0.06489

[I 2024-07-26 07:52:06,969] Trial 16 finished with value: 0.875 and parameters: {'n_d': 53, 'n_a': 39, 'n_steps': 9, 'gamma': 1.0066330727174586, 'lambda_sparse': 1.4583266652296632e-05, 'learning_rate': 0.006704314033719113, 'batch_size': 32, 'num_epochs': 24}. Best is trial 7 with value: 1.0.


epoch 21 | loss: 0.04478 | val_accuracy: 0.75    |  0:00:03s

Early stopping occurred at epoch 21 with best_epoch = 11 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 6.59057 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 2.49338 | val_accuracy: 0.625   |  0:00:00s
epoch 2  | loss: 0.84202 | val_accuracy: 0.75    |  0:00:00s
epoch 3  | loss: 0.81008 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 0.51199 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.33289 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 0.44866 | val_accuracy: 0.875   |  0:00:00s
epoch 7  | loss: 0.14166 | val_accuracy: 0.75    |  0:00:01s
epoch 8  | loss: 0.027   | val_accuracy: 0.75    |  0:00:01s
epoch 9  | loss: 0.0755  | val_accuracy: 0.75    |  0:00:01s
epoch 10 | loss: 0.11924 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.00571 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.05179 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.00189 | val_accuracy: 0.625   |  0:00:01s
epoch 14 | loss: 0.15819 | val_accuracy: 0.625   |  0:00:01s
epoch 15 | loss: 0.00259 | val_accuracy: 0.625   |  0:00:02s


[I 2024-07-26 07:52:09,287] Trial 17 finished with value: 0.875 and parameters: {'n_d': 46, 'n_a': 14, 'n_steps': 9, 'gamma': 1.2774293943692865, 'lambda_sparse': 5.878636232830251e-05, 'learning_rate': 0.03823478571651991, 'batch_size': 64, 'num_epochs': 80}. Best is trial 7 with value: 1.0.


epoch 16 | loss: 0.00293 | val_accuracy: 0.625   |  0:00:02s

Early stopping occurred at epoch 16 with best_epoch = 6 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.18282 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.30465 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 2.19266 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 2.30917 | val_accuracy: 0.5     |  0:00:01s
epoch 4  | loss: 2.25482 | val_accuracy: 0.375   |  0:00:01s
epoch 5  | loss: 2.25161 | val_accuracy: 0.375   |  0:00:01s
epoch 6  | loss: 2.14871 | val_accuracy: 0.375   |  0:00:02s
epoch 7  | loss: 2.2743  | val_accuracy: 0.25    |  0:00:02s
epoch 8  | loss: 2.17142 | val_accuracy: 0.25    |  0:00:02s
epoch 9  | loss: 2.19935 | val_accuracy: 0.375   |  0:00:02s
epoch 10 | loss: 2.06765 | val_accuracy: 0.375   |  0:00:02s
epoch 11 | loss: 2.21652 | val_accuracy: 0.375   |  0:00:02s
epoch 12 | loss: 2.11195 | val_accuracy: 0.375   |  0:00:02s
epoch 13 | loss: 2.13165 | val_accuracy: 0.25    |  0:00:02s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.5


[I 2024-07-26 07:52:12,369] Trial 18 finished with value: 0.5 and parameters: {'n_d': 12, 'n_a': 27, 'n_steps': 7, 'gamma': 1.770242394528392, 'lambda_sparse': 2.1052703485076523e-06, 'learning_rate': 0.00039554843784019635, 'batch_size': 128, 'num_epochs': 57}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.61122 | val_accuracy: 0.0     |  0:00:00s
epoch 1  | loss: 2.09753 | val_accuracy: 0.0     |  0:00:00s
epoch 2  | loss: 1.77585 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 1.91886 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 1.26659 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 1.31263 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 1.74004 | val_accuracy: 0.25    |  0:00:01s
epoch 7  | loss: 1.3632  | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 1.30278 | val_accuracy: 0.25    |  0:00:01s
epoch 9  | loss: 1.58022 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 1.16553 | val_accuracy: 0.375   |  0:00:01s
epoch 11 | loss: 0.73444 | val_accuracy: 0.375   |  0:00:01s
epoch 12 | loss: 0.876   | val_accuracy: 0.375   |  0:00:01s
epoch 13 | loss: 1.08539 | val_accuracy: 0.25    |  0:00:01s
epoch 14 | loss: 1.00669 | val_accuracy: 0.25    |  0:00:02s
epoch 15 | loss: 0.48352 | val_accuracy: 0.25    |  0:00:02s
epoch 16 | loss: 0.51201

[I 2024-07-26 07:52:15,278] Trial 19 finished with value: 0.5 and parameters: {'n_d': 60, 'n_a': 43, 'n_steps': 9, 'gamma': 1.9985500692918534, 'lambda_sparse': 0.0002505328443623411, 'learning_rate': 0.0015361400779721618, 'batch_size': 32, 'num_epochs': 28}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.98261 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.31205 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 1.70144 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 1.80403 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 1.2485  | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 1.02459 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 1.29502 | val_accuracy: 0.375   |  0:00:00s
epoch 7  | loss: 0.55924 | val_accuracy: 0.125   |  0:00:00s
epoch 8  | loss: 0.78842 | val_accuracy: 0.375   |  0:00:00s
epoch 9  | loss: 1.40273 | val_accuracy: 0.375   |  0:00:00s


[I 2024-07-26 07:52:16,614] Trial 20 finished with value: 0.5 and parameters: {'n_d': 36, 'n_a': 14, 'n_steps': 7, 'gamma': 1.6615679765897018, 'lambda_sparse': 1.329836744785016e-05, 'learning_rate': 0.005642610785196317, 'batch_size': 32, 'num_epochs': 98}. Best is trial 7 with value: 1.0.


epoch 10 | loss: 0.67516 | val_accuracy: 0.375   |  0:00:01s
epoch 11 | loss: 0.75441 | val_accuracy: 0.375   |  0:00:01s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.5


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 5.20448 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 4.81812 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 4.14604 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 3.86926 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 3.20738 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 3.61815 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 3.4245  | val_accuracy: 0.625   |  0:00:01s
epoch 7  | loss: 2.80176 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 2.98066 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 2.92229 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 2.56568 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 2.20845 | val_accuracy: 0.5     |  0:00:01s
epoch 12 | loss: 1.95458 | val_accuracy: 0.5     |  0:00:02s


[I 2024-07-26 07:52:18,983] Trial 21 finished with value: 0.625 and parameters: {'n_d': 27, 'n_a': 52, 'n_steps': 8, 'gamma': 1.5660068535545482, 'lambda_sparse': 0.00019010641457810245, 'learning_rate': 0.0017352455857720394, 'batch_size': 256, 'num_epochs': 53}. Best is trial 7 with value: 1.0.


epoch 13 | loss: 1.76727 | val_accuracy: 0.5     |  0:00:02s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.08751 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 1.70684 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 1.40799 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 1.24302 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 0.98645 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 0.91391 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 0.83267 | val_accuracy: 0.375   |  0:00:01s
epoch 7  | loss: 0.8329  | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 0.70898 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.45448 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 0.39284 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.23079 | val_accuracy: 0.5     |  0:00:01s
epoch 12 | loss: 0.15603 | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.10677 | val_accuracy: 0.75    |  0:00:01s
epoch 14 | loss: 0.13571 | val_accuracy: 0.5     |  0:00:02s
epoch 15 | loss: 0.10789 | val_accuracy: 0.5     |  0:00:02s
epoch 16 | loss: 0.19593

[I 2024-07-26 07:52:22,275] Trial 22 finished with value: 0.75 and parameters: {'n_d': 24, 'n_a': 50, 'n_steps': 9, 'gamma': 1.6197556849657164, 'lambda_sparse': 0.0005996580319049698, 'learning_rate': 0.0036467289090384423, 'batch_size': 256, 'num_epochs': 33}. Best is trial 7 with value: 1.0.


epoch 22 | loss: 0.03691 | val_accuracy: 0.625   |  0:00:02s
epoch 23 | loss: 0.03776 | val_accuracy: 0.625   |  0:00:03s

Early stopping occurred at epoch 23 with best_epoch = 13 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.50954 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 3.52993 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.5343  | val_accuracy: 0.75    |  0:00:00s
epoch 3  | loss: 0.19195 | val_accuracy: 0.75    |  0:00:00s
epoch 4  | loss: 0.38341 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.68579 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 1.04034 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.29411 | val_accuracy: 0.875   |  0:00:00s
epoch 8  | loss: 0.20413 | val_accuracy: 0.875   |  0:00:01s
epoch 9  | loss: 0.43365 | val_accuracy: 0.75    |  0:00:01s
epoch 10 | loss: 0.11341 | val_accuracy: 1.0     |  0:00:01s
epoch 11 | loss: 0.00174 | val_accuracy: 0.75    |  0:00:01s
epoch 12 | loss: 0.00292 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.02788 | val_accuracy: 0.5     |  0:00:01s
epoch 14 | loss: 0.15153 | val_accuracy: 0.75    |  0:00:01s
epoch 15 | loss: 0.0078  | val_accuracy: 0.625   |  0:00:01s
epoch 16 | loss: 0.00702

[I 2024-07-26 07:52:24,821] Trial 23 finished with value: 1.0 and parameters: {'n_d': 34, 'n_a': 62, 'n_steps': 8, 'gamma': 1.4000309104710151, 'lambda_sparse': 0.0003778996182318105, 'learning_rate': 0.08209135334861486, 'batch_size': 256, 'num_epochs': 51}. Best is trial 7 with value: 1.0.


epoch 19 | loss: 0.00217 | val_accuracy: 0.75    |  0:00:02s
epoch 20 | loss: 0.00233 | val_accuracy: 0.75    |  0:00:02s

Early stopping occurred at epoch 20 with best_epoch = 10 and best_val_accuracy = 1.0


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.81028 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 2.14348 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 2.06397 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 2.32671 | val_accuracy: 0.75    |  0:00:00s
epoch 4  | loss: 1.46931 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 2.95948 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 1.08031 | val_accuracy: 0.5     |  0:00:01s
epoch 7  | loss: 0.45538 | val_accuracy: 0.75    |  0:00:01s
epoch 8  | loss: 0.20974 | val_accuracy: 0.75    |  0:00:01s
epoch 9  | loss: 0.27192 | val_accuracy: 0.75    |  0:00:01s
epoch 10 | loss: 0.22217 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.03842 | val_accuracy: 0.875   |  0:00:01s
epoch 12 | loss: 0.00451 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.02419 | val_accuracy: 0.625   |  0:00:02s
epoch 14 | loss: 0.00891 | val_accuracy: 0.75    |  0:00:02s
epoch 15 | loss: 0.00884 | val_accuracy: 0.75    |  0:00:02s
epoch 16 | loss: 0.09389

[I 2024-07-26 07:52:28,292] Trial 24 finished with value: 0.875 and parameters: {'n_d': 35, 'n_a': 63, 'n_steps': 10, 'gamma': 1.3892455490696232, 'lambda_sparse': 0.00035834842756476206, 'learning_rate': 0.0929160914636805, 'batch_size': 256, 'num_epochs': 58}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.94252 | val_accuracy: 0.75    |  0:00:03s
epoch 1  | loss: 2.92783 | val_accuracy: 0.625   |  0:00:14s
epoch 2  | loss: 1.8776  | val_accuracy: 0.5     |  0:00:14s
epoch 3  | loss: 0.39668 | val_accuracy: 0.875   |  0:00:14s
epoch 4  | loss: 0.8244  | val_accuracy: 0.875   |  0:00:15s
epoch 5  | loss: 0.13624 | val_accuracy: 0.75    |  0:00:16s
epoch 6  | loss: 0.00661 | val_accuracy: 0.625   |  0:00:16s
epoch 7  | loss: 0.00992 | val_accuracy: 0.5     |  0:00:16s
epoch 8  | loss: 0.02594 | val_accuracy: 0.5     |  0:00:16s
epoch 9  | loss: 0.02778 | val_accuracy: 0.625   |  0:00:17s
epoch 10 | loss: 0.00368 | val_accuracy: 0.75    |  0:00:18s
epoch 11 | loss: 0.00249 | val_accuracy: 0.5     |  0:00:18s
epoch 12 | loss: 0.00579 | val_accuracy: 0.625   |  0:00:19s
epoch 13 | loss: 0.00225 | val_accuracy: 0.5     |  0:00:21s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.875


[I 2024-07-26 07:52:50,195] Trial 25 finished with value: 0.875 and parameters: {'n_d': 46, 'n_a': 31, 'n_steps': 9, 'gamma': 1.2162827155261575, 'lambda_sparse': 0.0009224467293671625, 'learning_rate': 0.04218319001712552, 'batch_size': 256, 'num_epochs': 71}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.05701 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 0.98471 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.37495 | val_accuracy: 0.5     |  0:00:02s
epoch 3  | loss: 0.18087 | val_accuracy: 0.75    |  0:00:07s
epoch 4  | loss: 0.04718 | val_accuracy: 0.75    |  0:00:10s
epoch 5  | loss: 0.02249 | val_accuracy: 0.625   |  0:00:10s
epoch 6  | loss: 0.01301 | val_accuracy: 0.625   |  0:00:10s
epoch 7  | loss: 0.00582 | val_accuracy: 0.625   |  0:00:10s
epoch 8  | loss: 0.0029  | val_accuracy: 0.625   |  0:00:11s
epoch 9  | loss: 0.00123 | val_accuracy: 0.625   |  0:00:11s
epoch 10 | loss: 0.00076 | val_accuracy: 0.625   |  0:00:11s
epoch 11 | loss: 0.00063 | val_accuracy: 0.625   |  0:00:11s


[I 2024-07-26 07:53:02,302] Trial 26 finished with value: 0.75 and parameters: {'n_d': 42, 'n_a': 20, 'n_steps': 7, 'gamma': 1.4995597462009012, 'lambda_sparse': 0.00010417417307943548, 'learning_rate': 0.027300145768136823, 'batch_size': 128, 'num_epochs': 83}. Best is trial 7 with value: 1.0.


epoch 12 | loss: 0.0005  | val_accuracy: 0.625   |  0:00:11s
epoch 13 | loss: 0.00027 | val_accuracy: 0.625   |  0:00:11s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 8.58302 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 2.60218 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 1.49392 | val_accuracy: 0.625   |  0:00:01s
epoch 3  | loss: 0.14777 | val_accuracy: 0.625   |  0:00:01s
epoch 4  | loss: 1.68633 | val_accuracy: 0.75    |  0:00:01s
epoch 5  | loss: 0.38713 | val_accuracy: 0.875   |  0:00:01s
epoch 6  | loss: 0.04107 | val_accuracy: 0.875   |  0:00:01s
epoch 7  | loss: 0.16581 | val_accuracy: 0.75    |  0:00:01s
epoch 8  | loss: 0.0012  | val_accuracy: 0.625   |  0:00:02s
epoch 9  | loss: 0.00023 | val_accuracy: 0.75    |  0:00:02s
epoch 10 | loss: 0.00123 | val_accuracy: 0.75    |  0:00:02s
epoch 11 | loss: 0.00619 | val_accuracy: 0.625   |  0:00:02s
epoch 12 | loss: 0.01992 | val_accuracy: 0.625   |  0:00:02s
epoch 13 | loss: 0.00416 | val_accuracy: 0.625   |  0:00:02s


[I 2024-07-26 07:53:05,376] Trial 27 finished with value: 0.875 and parameters: {'n_d': 51, 'n_a': 64, 'n_steps': 8, 'gamma': 1.3314472931647428, 'lambda_sparse': 4.8103950060965734e-05, 'learning_rate': 0.06366478644846081, 'batch_size': 64, 'num_epochs': 66}. Best is trial 7 with value: 1.0.


epoch 14 | loss: 0.15037 | val_accuracy: 0.875   |  0:00:02s
epoch 15 | loss: 0.00035 | val_accuracy: 0.875   |  0:00:02s

Early stopping occurred at epoch 15 with best_epoch = 5 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.56352 | val_accuracy: 0.125   |  0:00:00s
epoch 1  | loss: 1.27093 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 0.65445 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 0.37594 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.29956 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.18693 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.11478 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.04857 | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.02334 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.01064 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.00861 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.00339 | val_accuracy: 0.75    |  0:00:01s
epoch 12 | loss: 0.00231 | val_accuracy: 0.75    |  0:00:01s


[I 2024-07-26 07:53:07,096] Trial 28 finished with value: 0.75 and parameters: {'n_d': 56, 'n_a': 13, 'n_steps': 6, 'gamma': 1.2282981017988324, 'lambda_sparse': 4.2605455733059956e-05, 'learning_rate': 0.007476064504673291, 'batch_size': 256, 'num_epochs': 16}. Best is trial 7 with value: 1.0.


epoch 13 | loss: 0.0017  | val_accuracy: 0.75    |  0:00:01s
epoch 14 | loss: 0.00139 | val_accuracy: 0.75    |  0:00:01s
epoch 15 | loss: 0.00107 | val_accuracy: 0.75    |  0:00:01s

Early stopping occurred at epoch 15 with best_epoch = 5 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.53035 | val_accuracy: 0.625   |  0:00:00s
epoch 1  | loss: 0.73835 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.83482 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.78673 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.5351  | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.22226 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.18687 | val_accuracy: 0.75    |  0:00:00s
epoch 7  | loss: 0.39319 | val_accuracy: 0.5     |  0:00:00s
epoch 8  | loss: 0.03184 | val_accuracy: 0.5     |  0:00:00s
epoch 9  | loss: 0.17685 | val_accuracy: 0.625   |  0:00:00s
epoch 10 | loss: 0.20432 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.10829 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.14512 | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.09924 | val_accuracy: 0.625   |  0:00:01s


[I 2024-07-26 07:53:08,801] Trial 29 finished with value: 0.75 and parameters: {'n_d': 22, 'n_a': 34, 'n_steps': 5, 'gamma': 1.5453551803432573, 'lambda_sparse': 2.8019504543914807e-05, 'learning_rate': 0.019899916798499444, 'batch_size': 32, 'num_epochs': 33}. Best is trial 7 with value: 1.0.


epoch 14 | loss: 0.09597 | val_accuracy: 0.5     |  0:00:01s
epoch 15 | loss: 0.03303 | val_accuracy: 0.5     |  0:00:01s

Early stopping occurred at epoch 15 with best_epoch = 5 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 9.69684 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 9.49331 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 8.67491 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 7.7418  | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 7.81776 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 7.24997 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 6.59123 | val_accuracy: 0.25    |  0:00:01s
epoch 7  | loss: 6.13259 | val_accuracy: 0.25    |  0:00:01s
epoch 8  | loss: 5.31325 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 5.30109 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 5.00581 | val_accuracy: 0.375   |  0:00:01s
epoch 11 | loss: 4.80086 | val_accuracy: 0.375   |  0:00:01s
epoch 12 | loss: 4.1359  | val_accuracy: 0.375   |  0:00:01s
epoch 13 | loss: 3.71915 | val_accuracy: 0.375   |  0:00:01s
epoch 14 | loss: 3.09687 | val_accuracy: 0.375   |  0:00:02s
epoch 15 | loss: 2.84551 | val_accuracy: 0.375   |  0:00:02s
epoch 16 | loss: 2.30726

[I 2024-07-26 07:53:12,481] Trial 30 finished with value: 0.75 and parameters: {'n_d': 34, 'n_a': 26, 'n_steps': 10, 'gamma': 1.436883085504353, 'lambda_sparse': 0.00025772099482927627, 'learning_rate': 0.0018471330038734779, 'batch_size': 128, 'num_epochs': 90}. Best is trial 7 with value: 1.0.


epoch 26 | loss: 0.31392 | val_accuracy: 0.75    |  0:00:03s
epoch 27 | loss: 0.44991 | val_accuracy: 0.5     |  0:00:03s

Early stopping occurred at epoch 27 with best_epoch = 17 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.25444 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.65625 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 2.52875 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 2.32448 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 1.51845 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 1.62165 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 1.26487 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.9661  | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.8154  | val_accuracy: 0.375   |  0:00:00s
epoch 9  | loss: 0.68673 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 0.62962 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.56533 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.34272 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.42452 | val_accuracy: 0.625   |  0:00:01s
epoch 14 | loss: 0.30016 | val_accuracy: 0.625   |  0:00:01s
epoch 15 | loss: 0.31908 | val_accuracy: 0.5     |  0:00:01s
epoch 16 | loss: 0.24779

[I 2024-07-26 07:53:15,172] Trial 31 finished with value: 0.75 and parameters: {'n_d': 30, 'n_a': 58, 'n_steps': 8, 'gamma': 1.8104785951383606, 'lambda_sparse': 0.00041663029993119287, 'learning_rate': 0.0026842239549012864, 'batch_size': 256, 'num_epochs': 49}. Best is trial 7 with value: 1.0.


epoch 22 | loss: 0.07127 | val_accuracy: 0.625   |  0:00:02s

Early stopping occurred at epoch 22 with best_epoch = 12 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 7.34614 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 6.69593 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 6.31023 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 5.95487 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 5.54906 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 4.63043 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 4.62921 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 4.57672 | val_accuracy: 0.375   |  0:00:00s
epoch 8  | loss: 4.36561 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 4.24154 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 3.93954 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 3.53632 | val_accuracy: 0.5     |  0:00:01s


[I 2024-07-26 07:53:16,913] Trial 32 finished with value: 0.5 and parameters: {'n_d': 40, 'n_a': 58, 'n_steps': 9, 'gamma': 1.716182192887831, 'lambda_sparse': 0.0009687929089092655, 'learning_rate': 0.0010524078524907596, 'batch_size': 256, 'num_epochs': 51}. Best is trial 7 with value: 1.0.


epoch 12 | loss: 3.30935 | val_accuracy: 0.5     |  0:00:01s

Early stopping occurred at epoch 12 with best_epoch = 2 and best_val_accuracy = 0.5


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.6894  | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 3.74136 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 3.77732 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 3.28272 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 3.34689 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 3.43669 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 2.87018 | val_accuracy: 0.375   |  0:00:00s
epoch 7  | loss: 2.92976 | val_accuracy: 0.375   |  0:00:00s
epoch 8  | loss: 3.02573 | val_accuracy: 0.25    |  0:00:00s
epoch 9  | loss: 3.06043 | val_accuracy: 0.25    |  0:00:01s
epoch 10 | loss: 2.91804 | val_accuracy: 0.25    |  0:00:01s


[I 2024-07-26 07:53:18,941] Trial 33 finished with value: 0.5 and parameters: {'n_d': 18, 'n_a': 48, 'n_steps': 8, 'gamma': 1.8552574870081273, 'lambda_sparse': 0.00012986004573661716, 'learning_rate': 0.0005306945342287403, 'batch_size': 256, 'num_epochs': 45}. Best is trial 7 with value: 1.0.


epoch 11 | loss: 2.95775 | val_accuracy: 0.25    |  0:00:01s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.5


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.27622 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 1.54457 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.99091 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 0.45791 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.4306  | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.35022 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 0.30895 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.18397 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.21368 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.27595 | val_accuracy: 0.75    |  0:00:01s
epoch 10 | loss: 0.26252 | val_accuracy: 0.75    |  0:00:01s
epoch 11 | loss: 0.19254 | val_accuracy: 0.875   |  0:00:01s
epoch 12 | loss: 0.08032 | val_accuracy: 0.875   |  0:00:01s
epoch 13 | loss: 0.20193 | val_accuracy: 0.875   |  0:00:01s
epoch 14 | loss: 0.07799 | val_accuracy: 0.875   |  0:00:01s
epoch 15 | loss: 0.08393 | val_accuracy: 0.875   |  0:00:02s
epoch 16 | loss: 0.06866

[I 2024-07-26 07:53:21,888] Trial 34 finished with value: 0.875 and parameters: {'n_d': 33, 'n_a': 60, 'n_steps': 9, 'gamma': 1.5747964174154367, 'lambda_sparse': 0.0002795800144439728, 'learning_rate': 0.0048401622106278444, 'batch_size': 256, 'num_epochs': 59}. Best is trial 7 with value: 1.0.



Early stopping occurred at epoch 21 with best_epoch = 11 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.53506 | val_accuracy: 0.625   |  0:00:00s
epoch 1  | loss: 1.93407 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 0.9458  | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 0.65964 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.33068 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.2742  | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.21756 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.22453 | val_accuracy: 0.875   |  0:00:00s
epoch 8  | loss: 0.15182 | val_accuracy: 0.75    |  0:00:01s
epoch 9  | loss: 0.13861 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.08897 | val_accuracy: 0.875   |  0:00:01s
epoch 11 | loss: 0.07109 | val_accuracy: 0.5     |  0:00:01s
epoch 12 | loss: 0.0511  | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.02488 | val_accuracy: 0.625   |  0:00:01s
epoch 14 | loss: 0.01209 | val_accuracy: 0.625   |  0:00:01s
epoch 15 | loss: 0.00754 | val_accuracy: 0.75    |  0:00:01s
epoch 16 | loss: 0.00573

[I 2024-07-26 07:53:24,003] Trial 35 finished with value: 0.875 and parameters: {'n_d': 27, 'n_a': 55, 'n_steps': 8, 'gamma': 1.3560203777597766, 'lambda_sparse': 0.0006109977276838761, 'learning_rate': 0.011151200682186939, 'batch_size': 256, 'num_epochs': 35}. Best is trial 7 with value: 1.0.


epoch 17 | loss: 0.01228 | val_accuracy: 0.875   |  0:00:01s

Early stopping occurred at epoch 17 with best_epoch = 7 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.74028 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.30749 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 2.35085 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 1.41886 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 1.17023 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 0.7196  | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 1.0612  | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.55987 | val_accuracy: 0.375   |  0:00:00s
epoch 8  | loss: 0.57256 | val_accuracy: 0.375   |  0:00:00s
epoch 9  | loss: 0.64435 | val_accuracy: 0.375   |  0:00:00s
epoch 10 | loss: 0.47835 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.5094  | val_accuracy: 0.5     |  0:00:01s
epoch 12 | loss: 0.35975 | val_accuracy: 0.5     |  0:00:01s
epoch 13 | loss: 0.19583 | val_accuracy: 0.5     |  0:00:01s


[I 2024-07-26 07:53:25,674] Trial 36 finished with value: 0.625 and parameters: {'n_d': 38, 'n_a': 21, 'n_steps': 7, 'gamma': 1.8910367133927672, 'lambda_sparse': 8.402652956468465e-05, 'learning_rate': 0.010194634368893502, 'batch_size': 32, 'num_epochs': 45}. Best is trial 7 with value: 1.0.


epoch 14 | loss: 0.42521 | val_accuracy: 0.5     |  0:00:01s
epoch 15 | loss: 0.23606 | val_accuracy: 0.5     |  0:00:01s

Early stopping occurred at epoch 15 with best_epoch = 5 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.56526 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 3.46152 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 2.7219  | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 2.61589 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 2.51223 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 2.33208 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 2.19197 | val_accuracy: 0.25    |  0:00:00s
epoch 7  | loss: 1.88667 | val_accuracy: 0.125   |  0:00:00s
epoch 8  | loss: 1.63369 | val_accuracy: 0.125   |  0:00:01s
epoch 9  | loss: 1.33703 | val_accuracy: 0.125   |  0:00:01s
epoch 10 | loss: 1.06723 | val_accuracy: 0.125   |  0:00:01s
epoch 11 | loss: 0.90419 | val_accuracy: 0.125   |  0:00:01s
epoch 12 | loss: 0.68899 | val_accuracy: 0.125   |  0:00:01s
epoch 13 | loss: 0.60022 | val_accuracy: 0.125   |  0:00:01s
epoch 14 | loss: 0.46608 | val_accuracy: 0.25    |  0:00:01s
epoch 15 | loss: 0.33382 | val_accuracy: 0.375   |  0:00:01s

Early stopping occurred

[I 2024-07-26 07:53:27,623] Trial 37 finished with value: 0.375 and parameters: {'n_d': 13, 'n_a': 40, 'n_steps': 8, 'gamma': 1.2711887728714317, 'lambda_sparse': 5.398951550280452e-06, 'learning_rate': 0.0029726550955297255, 'batch_size': 256, 'num_epochs': 52}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 6.30705 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 5.30685 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 5.87369 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 5.80424 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 5.76852 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 5.16187 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 5.15159 | val_accuracy: 0.375   |  0:00:01s
epoch 7  | loss: 5.1333  | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 4.91867 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 5.03198 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 4.7411  | val_accuracy: 0.375   |  0:00:01s
epoch 11 | loss: 4.5355  | val_accuracy: 0.375   |  0:00:01s
epoch 12 | loss: 4.75449 | val_accuracy: 0.375   |  0:00:01s


[I 2024-07-26 07:53:29,540] Trial 38 finished with value: 0.5 and parameters: {'n_d': 49, 'n_a': 61, 'n_steps': 9, 'gamma': 1.9657739566087793, 'lambda_sparse': 8.76570981998317e-06, 'learning_rate': 0.0003305792552893228, 'batch_size': 128, 'num_epochs': 39}. Best is trial 7 with value: 1.0.



Early stopping occurred at epoch 12 with best_epoch = 2 and best_val_accuracy = 0.5


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.45463 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 3.6097  | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 2.31082 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 1.05252 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 0.44635 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.2838  | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.28314 | val_accuracy: 0.75    |  0:00:00s
epoch 7  | loss: 0.4186  | val_accuracy: 0.875   |  0:00:00s
epoch 8  | loss: 0.02638 | val_accuracy: 0.75    |  0:00:00s
epoch 9  | loss: 0.01321 | val_accuracy: 0.75    |  0:00:00s
epoch 10 | loss: 0.03048 | val_accuracy: 0.5     |  0:00:00s
epoch 11 | loss: 0.02925 | val_accuracy: 0.5     |  0:00:01s
epoch 12 | loss: 0.04323 | val_accuracy: 0.5     |  0:00:01s
epoch 13 | loss: 0.01548 | val_accuracy: 0.5     |  0:00:01s
epoch 14 | loss: 0.01124 | val_accuracy: 0.625   |  0:00:01s


[I 2024-07-26 07:53:31,155] Trial 39 finished with value: 0.875 and parameters: {'n_d': 59, 'n_a': 46, 'n_steps': 5, 'gamma': 1.1420448082266055, 'lambda_sparse': 0.0003877293264322241, 'learning_rate': 0.0995487290230079, 'batch_size': 64, 'num_epochs': 47}. Best is trial 7 with value: 1.0.


epoch 15 | loss: 0.0335  | val_accuracy: 0.625   |  0:00:01s
epoch 16 | loss: 0.01191 | val_accuracy: 0.75    |  0:00:01s
epoch 17 | loss: 0.01389 | val_accuracy: 0.75    |  0:00:01s

Early stopping occurred at epoch 17 with best_epoch = 7 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.86226 | val_accuracy: 0.0     |  0:00:00s
epoch 1  | loss: 2.14555 | val_accuracy: 0.0     |  0:00:00s
epoch 2  | loss: 2.33664 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 2.1147  | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 1.70731 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 1.56256 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 1.51119 | val_accuracy: 0.25    |  0:00:01s
epoch 7  | loss: 1.63218 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 1.27548 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 1.15785 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 1.54117 | val_accuracy: 0.375   |  0:00:01s
epoch 11 | loss: 0.88421 | val_accuracy: 0.25    |  0:00:01s
epoch 12 | loss: 1.41877 | val_accuracy: 0.375   |  0:00:02s
epoch 13 | loss: 1.28524 | val_accuracy: 0.375   |  0:00:02s
epoch 14 | loss: 1.04793 | val_accuracy: 0.375   |  0:00:02s
epoch 15 | loss: 1.02044 | val_accuracy: 0.375   |  0:00:02s
epoch 16 | loss: 1.03901

[I 2024-07-26 07:53:34,071] Trial 40 finished with value: 0.5 and parameters: {'n_d': 31, 'n_a': 55, 'n_steps': 10, 'gamma': 1.410770211394871, 'lambda_sparse': 3.0858934323569277e-06, 'learning_rate': 0.0012561539599503005, 'batch_size': 32, 'num_epochs': 71}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 4.50669 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.94786 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 1.10407 | val_accuracy: 0.125   |  0:00:00s
epoch 3  | loss: 3.02331 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.3522  | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 0.31021 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 0.15172 | val_accuracy: 0.25    |  0:00:00s
epoch 7  | loss: 0.74175 | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 0.07827 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.04687 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 0.03097 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.02096 | val_accuracy: 0.375   |  0:00:01s
epoch 12 | loss: 0.15248 | val_accuracy: 0.25    |  0:00:01s
epoch 13 | loss: 0.14754 | val_accuracy: 0.25    |  0:00:01s
epoch 14 | loss: 0.00775 | val_accuracy: 0.375   |  0:00:02s
epoch 15 | loss: 0.01134 | val_accuracy: 0.5     |  0:00:02s
epoch 16 | loss: 0.02112

[I 2024-07-26 07:53:36,806] Trial 41 finished with value: 0.625 and parameters: {'n_d': 49, 'n_a': 8, 'n_steps': 10, 'gamma': 1.8434257186106227, 'lambda_sparse': 1.2511043318765517e-06, 'learning_rate': 0.056962684365044376, 'batch_size': 128, 'num_epochs': 78}. Best is trial 7 with value: 1.0.


epoch 18 | loss: 0.00186 | val_accuracy: 0.625   |  0:00:02s

Early stopping occurred at epoch 18 with best_epoch = 8 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 6.85427 | val_accuracy: 0.625   |  0:00:00s
epoch 1  | loss: 6.08919 | val_accuracy: 0.75    |  0:00:00s
epoch 2  | loss: 9.60929 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 6.73365 | val_accuracy: 0.75    |  0:00:00s
epoch 4  | loss: 2.61528 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.75579 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 1.4514  | val_accuracy: 0.75    |  0:00:00s
epoch 7  | loss: 0.51622 | val_accuracy: 0.625   |  0:00:01s
epoch 8  | loss: 0.57949 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 2.45661 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.3574  | val_accuracy: 0.875   |  0:00:01s
epoch 11 | loss: 0.06612 | val_accuracy: 0.875   |  0:00:01s
epoch 12 | loss: 0.10445 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.17816 | val_accuracy: 0.75    |  0:00:02s
epoch 14 | loss: 0.90304 | val_accuracy: 0.75    |  0:00:02s
epoch 15 | loss: 0.26538 | val_accuracy: 0.5     |  0:00:02s
epoch 16 | loss: 0.06999

[I 2024-07-26 07:53:40,394] Trial 42 finished with value: 0.875 and parameters: {'n_d': 56, 'n_a': 10, 'n_steps': 10, 'gamma': 1.9344703406205523, 'lambda_sparse': 1.2709591478948665e-06, 'learning_rate': 0.07035996153229228, 'batch_size': 128, 'num_epochs': 74}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 9.09274 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 2.61489 | val_accuracy: 0.125   |  0:00:01s
epoch 2  | loss: 0.81    | val_accuracy: 0.75    |  0:00:01s
epoch 3  | loss: 0.53626 | val_accuracy: 0.75    |  0:00:02s
epoch 4  | loss: 0.35563 | val_accuracy: 0.625   |  0:00:03s
epoch 5  | loss: 1.17934 | val_accuracy: 0.75    |  0:00:03s
epoch 6  | loss: 0.66638 | val_accuracy: 0.75    |  0:00:04s
epoch 7  | loss: 0.37204 | val_accuracy: 0.375   |  0:00:04s
epoch 8  | loss: 0.08292 | val_accuracy: 0.375   |  0:00:04s
epoch 9  | loss: 0.04339 | val_accuracy: 0.375   |  0:00:05s
epoch 10 | loss: 0.00999 | val_accuracy: 0.75    |  0:00:05s
epoch 11 | loss: 0.01835 | val_accuracy: 0.875   |  0:00:05s
epoch 12 | loss: 0.00979 | val_accuracy: 0.5     |  0:00:05s
epoch 13 | loss: 0.00502 | val_accuracy: 0.5     |  0:00:05s
epoch 14 | loss: 0.00201 | val_accuracy: 0.625   |  0:00:05s
epoch 15 | loss: 0.0009  | val_accuracy: 0.625   |  0:00:06s
epoch 16 | loss: 0.00054

[I 2024-07-26 07:53:47,574] Trial 43 finished with value: 0.875 and parameters: {'n_d': 44, 'n_a': 17, 'n_steps': 9, 'gamma': 1.7071121392012623, 'lambda_sparse': 4.0719079125513675e-06, 'learning_rate': 0.02604007937116507, 'batch_size': 128, 'num_epochs': 63}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.87416 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 4.44346 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 0.51336 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 1.53906 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 2.86004 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 2.7204  | val_accuracy: 0.625   |  0:00:01s
epoch 6  | loss: 1.86495 | val_accuracy: 0.5     |  0:00:01s
epoch 7  | loss: 1.46425 | val_accuracy: 0.875   |  0:00:01s
epoch 8  | loss: 0.16453 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.48328 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.29292 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.02188 | val_accuracy: 0.625   |  0:00:02s
epoch 12 | loss: 0.00475 | val_accuracy: 0.625   |  0:00:02s
epoch 13 | loss: 0.00555 | val_accuracy: 0.625   |  0:00:02s
epoch 14 | loss: 0.13566 | val_accuracy: 0.625   |  0:00:02s
epoch 15 | loss: 0.00131 | val_accuracy: 0.75    |  0:00:02s
epoch 16 | loss: 0.56735

[I 2024-07-26 07:53:50,907] Trial 44 finished with value: 0.875 and parameters: {'n_d': 62, 'n_a': 11, 'n_steps': 10, 'gamma': 1.7970566548536064, 'lambda_sparse': 9.999725497663805e-06, 'learning_rate': 0.049170204709412905, 'batch_size': 128, 'num_epochs': 41}. Best is trial 7 with value: 1.0.


epoch 17 | loss: 0.29242 | val_accuracy: 0.75    |  0:00:03s

Early stopping occurred at epoch 17 with best_epoch = 7 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 7.07223 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 2.59616 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 3.50183 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 3.06094 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 1.60873 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.7972  | val_accuracy: 0.875   |  0:00:00s
epoch 6  | loss: 0.28243 | val_accuracy: 0.75    |  0:00:01s
epoch 7  | loss: 0.63912 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.15636 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.03843 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.63422 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.20867 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.43948 | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.02503 | val_accuracy: 0.625   |  0:00:01s


[I 2024-07-26 07:53:53,424] Trial 45 finished with value: 0.875 and parameters: {'n_d': 52, 'n_a': 17, 'n_steps': 9, 'gamma': 1.8737960839093863, 'lambda_sparse': 0.000199044348626163, 'learning_rate': 0.029104411032351937, 'batch_size': 128, 'num_epochs': 67}. Best is trial 7 with value: 1.0.


epoch 14 | loss: 0.10296 | val_accuracy: 0.625   |  0:00:02s
epoch 15 | loss: 0.05144 | val_accuracy: 0.625   |  0:00:02s

Early stopping occurred at epoch 15 with best_epoch = 5 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.59927 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 7.2009  | val_accuracy: 0.75    |  0:00:00s
epoch 2  | loss: 0.92298 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 3.01105 | val_accuracy: 0.75    |  0:00:00s
epoch 4  | loss: 2.99071 | val_accuracy: 0.75    |  0:00:00s
epoch 5  | loss: 0.83669 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.10073 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.10033 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.34649 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.28345 | val_accuracy: 0.875   |  0:00:01s
epoch 10 | loss: 0.34854 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.31078 | val_accuracy: 0.375   |  0:00:01s
epoch 12 | loss: 0.00656 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.00196 | val_accuracy: 0.875   |  0:00:01s
epoch 14 | loss: 0.05386 | val_accuracy: 0.875   |  0:00:01s
epoch 15 | loss: 0.00157 | val_accuracy: 0.875   |  0:00:02s
epoch 16 | loss: 0.00157

[I 2024-07-26 07:53:56,287] Trial 46 finished with value: 0.875 and parameters: {'n_d': 55, 'n_a': 23, 'n_steps': 8, 'gamma': 1.4786143695514284, 'lambda_sparse': 1.8233110408444885e-06, 'learning_rate': 0.08522082139195931, 'batch_size': 128, 'num_epochs': 92}. Best is trial 7 with value: 1.0.


epoch 19 | loss: 0.11405 | val_accuracy: 0.875   |  0:00:02s

Early stopping occurred at epoch 19 with best_epoch = 9 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.4344  | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.41192 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 1.23978 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 0.29479 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 1.04786 | val_accuracy: 1.0     |  0:00:00s
epoch 5  | loss: 1.1175  | val_accuracy: 0.875   |  0:00:01s
epoch 6  | loss: 0.31609 | val_accuracy: 0.75    |  0:00:01s
epoch 7  | loss: 0.45109 | val_accuracy: 0.75    |  0:00:01s
epoch 8  | loss: 0.3147  | val_accuracy: 0.875   |  0:00:01s
epoch 9  | loss: 0.24384 | val_accuracy: 0.875   |  0:00:01s
epoch 10 | loss: 0.52912 | val_accuracy: 0.875   |  0:00:01s
epoch 11 | loss: 0.27221 | val_accuracy: 0.75    |  0:00:01s
epoch 12 | loss: 0.33142 | val_accuracy: 0.75    |  0:00:02s
epoch 13 | loss: 0.57567 | val_accuracy: 0.625   |  0:00:02s
epoch 14 | loss: 0.53797 | val_accuracy: 1.0     |  0:00:02s

Early stopping occurred at epoch 14 with best_epoch = 4 and best_val_accuracy = 1.0


[I 2024-07-26 07:53:58,978] Trial 47 finished with value: 1.0 and parameters: {'n_d': 27, 'n_a': 28, 'n_steps': 10, 'gamma': 1.3087236723184485, 'lambda_sparse': 3.6010095166494255e-05, 'learning_rate': 0.032136943711229576, 'batch_size': 32, 'num_epochs': 84}. Best is trial 7 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.64556 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.65761 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 1.01829 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.6276  | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.3913  | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.28392 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 0.25716 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.2219  | val_accuracy: 0.25    |  0:00:00s
epoch 8  | loss: 0.28046 | val_accuracy: 0.25    |  0:00:00s
epoch 9  | loss: 0.10343 | val_accuracy: 0.25    |  0:00:00s
epoch 10 | loss: 0.09104 | val_accuracy: 0.25    |  0:00:00s
epoch 11 | loss: 0.25747 | val_accuracy: 0.375   |  0:00:00s
epoch 12 | loss: 0.14195 | val_accuracy: 0.25    |  0:00:00s
epoch 13 | loss: 0.40013 | val_accuracy: 0.375   |  0:00:00s


[I 2024-07-26 07:54:00,225] Trial 48 finished with value: 0.625 and parameters: {'n_d': 28, 'n_a': 34, 'n_steps': 3, 'gamma': 1.3369495069191328, 'lambda_sparse': 2.3445638971758493e-05, 'learning_rate': 0.01572585410784491, 'batch_size': 32, 'num_epochs': 82}. Best is trial 7 with value: 1.0.


epoch 14 | loss: 0.05411 | val_accuracy: 0.25    |  0:00:01s

Early stopping occurred at epoch 14 with best_epoch = 4 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 5.58947 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 4.02601 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 2.98202 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 2.03288 | val_accuracy: 0.75    |  0:00:00s
epoch 4  | loss: 1.39047 | val_accuracy: 0.75    |  0:00:01s
epoch 5  | loss: 0.84137 | val_accuracy: 0.625   |  0:00:01s
epoch 6  | loss: 0.67585 | val_accuracy: 0.5     |  0:00:01s
epoch 7  | loss: 0.34734 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.2705  | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.24225 | val_accuracy: 0.625   |  0:00:02s
epoch 10 | loss: 0.38281 | val_accuracy: 0.5     |  0:00:02s
epoch 11 | loss: 0.16443 | val_accuracy: 0.625   |  0:00:02s
epoch 12 | loss: 0.21916 | val_accuracy: 0.5     |  0:00:02s


[I 2024-07-26 07:54:03,463] Trial 49 finished with value: 0.75 and parameters: {'n_d': 23, 'n_a': 29, 'n_steps': 9, 'gamma': 1.1641557272828789, 'lambda_sparse': 6.600465112439704e-05, 'learning_rate': 0.007821917938746453, 'batch_size': 32, 'num_epochs': 55}. Best is trial 7 with value: 1.0.


epoch 13 | loss: 0.29652 | val_accuracy: 0.625   |  0:00:02s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.75




epoch 0  | loss: 2.67921 | val_accuracy: 0.0     |  0:00:00s
epoch 1  | loss: 2.58568 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 1.93807 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 1.46323 | val_accuracy: 0.5     |  0:00:01s
epoch 4  | loss: 0.95907 | val_accuracy: 0.5     |  0:00:01s
epoch 5  | loss: 1.46162 | val_accuracy: 0.625   |  0:00:01s
epoch 6  | loss: 1.0283  | val_accuracy: 0.875   |  0:00:01s
epoch 7  | loss: 0.95073 | val_accuracy: 0.875   |  0:00:02s
epoch 8  | loss: 0.90242 | val_accuracy: 0.875   |  0:00:02s
epoch 9  | loss: 0.89346 | val_accuracy: 0.875   |  0:00:02s
epoch 10 | loss: 1.30834 | val_accuracy: 0.875   |  0:00:02s
epoch 11 | loss: 0.55264 | val_accuracy: 0.875   |  0:00:02s
epoch 12 | loss: 0.68679 | val_accuracy: 1.0     |  0:00:02s
epoch 13 | loss: 0.79067 | val_accuracy: 1.0     |  0:00:02s
epoch 14 | loss: 0.68402 | val_accuracy: 0.75    |  0:00:03s
epoch 15 | loss: 0.37625 | val_accuracy: 0.75    |  0:00:03s
epoch 16 | loss: 0.74194



In [63]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class KAN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(KAN, self).__init__()
        self.hidden_layer = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h = self.activation(self.hidden_layer(x))
        out = self.output_layer(h)
        return out

def objective(trial):
    # Define hyperparameters to tune for KAN
    hidden_dim = trial.suggest_int('hidden_dim', 32, 256)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the KAN model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = KAN(input_dim, hidden_dim, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final KAN model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = KAN(input_dim, 
                 best_params['hidden_dim'], 
                 output_dim).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    
    _, predicted = torch.max(outputs, 1)
    y_pred = predicted.cpu().numpy()
    proba = torch.softmax(outputs, dim=1).cpu().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        auc = roc_auc_score(y_true, proba[:, 1])
    else:  # Multi-class classification
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['KAN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:54:08,233] A new study created in memory with name: no-name-31e1a594-edde-41c5-8805-881e363c58f3


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:54:08,703] Trial 0 finished with value: 0.75 and parameters: {'hidden_dim': 32, 'learning_rate': 0.008869898978533712, 'batch_size': 128, 'num_epochs': 83}. Best is trial 0 with value: 0.75.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:54:08,881] Trial 1 finished with value: 1.0 and parameters: {'hidden_dim': 86, 'learning_rate': 0.09549506729887208, 'batch_size': 128, 'num_epochs': 48}. Best is trial 1 with value: 1.0.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:54:12,193] Trial 2 finished with value: 0.5 and parameters: {'hidden_dim': 37, 'learning_rate': 0.0003143798772683211, 'batch_size': 128, 'num_epochs': 74}. Best is trial 1 with value: 1.0.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:54:12,563] Trial 3 finished with value: 0.75 and parameters: {'hidden_dim':

                           Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression            0.75  0.927778                    0.002991   
KNN                            0.75       1.0                    0.000996   
Decision Tree                 0.875  0.931746                       0.001   
Random Forest                  0.75  0.933333                    0.118717   
Gradient Boosting             0.875  0.920635                    0.247005   
XGBoost                        0.75  0.805556                    0.082778   
LightGBM                      0.125       0.5                    0.010972   
CatBoost                       0.75  0.977778                    0.119442   
MLP                            0.75       1.0                    0.513627   
DNN                            0.75  0.922222                     0.23946   
DCN                           0.875  0.927778                    0.297203   
Wide_and_Deep                 0.875  0.830556                    0.449732   

In [64]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class SAINT(nn.Module):
    def __init__(self, input_dim, num_classes, dim, depth, heads, mlp_dim, dropout=0.1):
        super(SAINT, self).__init__()
        self.embeds = nn.Linear(input_dim, dim)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=mlp_dim, dropout=dropout),
            num_layers=depth
        )
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        x = self.embeds(x)
        x = x.unsqueeze(1)  # Add sequence dimension
        x = self.transformer(x)
        x = x.squeeze(1)  # Remove sequence dimension
        return self.mlp_head(x)

def objective(trial):
    # Define hyperparameters to tune for SAINT
    heads = trial.suggest_int('heads', 1, 8)
    dim = trial.suggest_int('dim', heads, 256, step=heads)  # Ensure dim is divisible by heads
    depth = trial.suggest_int('depth', 1, 6)
    mlp_dim = trial.suggest_int('mlp_dim', 32, 256)
    dropout = trial.suggest_float('dropout', 0.0, 0.5)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the SAINT model
    input_dim = X_train.shape[1]
    num_classes = len(np.unique(y))
    model = SAINT(input_dim, num_classes, dim, depth, heads, mlp_dim, dropout).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final SAINT model with the best hyperparameters
input_dim = X_train.shape[1]
num_classes = len(np.unique(y))
best_model = SAINT(input_dim, num_classes, 
                   best_params['dim'], 
                   best_params['depth'], 
                   best_params['heads'], 
                   best_params['mlp_dim'], 
                   best_params['dropout']).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    
    _, predicted = torch.max(outputs, 1)
    y_pred = predicted.cpu().numpy()
    proba = torch.softmax(outputs, dim=1).cpu().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    
    if num_classes == 2:  # Binary classification
        auc = roc_auc_score(y_true, proba[:, 1])
    else:  # Multi-class classification
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['SAINT'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:54:25,227] A new study created in memory with name: no-name-d918826b-935d-4448-ba2c-793621b5c467


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:54:28,455] Trial 0 finished with value: 0.5 and parameters: {'heads': 1, 'dim': 3, 'depth': 3, 'mlp_dim': 107, 'dropout': 0.27778102782570907, 'learning_rate': 0.0001716030028123978, 'batch_size': 32, 'num_epochs': 32}. Best is trial 0 with value: 0.5.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:54:30,116] Trial 1 finished with value: 0.75 and parameters: {'heads': 4, 'dim': 36, 'depth': 2, 'mlp_dim': 71, 'dropout': 0.05274879792496301, 'learning_rate': 0.011689208013149276, 'batch_size': 256, 'num_epochs': 46}. Best is trial 1 with value: 0.75.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 07:54:32,500] Trial 2 finished with value: 0.75 and parameters: {'heads': 1, 'dim': 145, 'depth': 2, 'mlp_dim': 114, 'dropout': 0.049208137321176626, 'learning_rate': 0.00016984020008410823, 'batch_size': 32, 'num_epochs': 58}. 

                           Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression            0.75  0.927778                    0.002991   
KNN                            0.75       1.0                    0.000996   
Decision Tree                 0.875  0.931746                       0.001   
Random Forest                  0.75  0.933333                    0.118717   
Gradient Boosting             0.875  0.920635                    0.247005   
XGBoost                        0.75  0.805556                    0.082778   
LightGBM                      0.125       0.5                    0.010972   
CatBoost                       0.75  0.977778                    0.119442   
MLP                            0.75       1.0                    0.513627   
DNN                            0.75  0.922222                     0.23946   
DCN                           0.875  0.927778                    0.297203   
Wide_and_Deep                 0.875  0.830556                    0.449732   

In [65]:
import pandas as pd
import numpy as np
import time
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from pytorch_tabnet.tab_model import TabNetClassifier
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

def objective(trial):
    # Define hyperparameters to tune for VIME-like model (using TabNet as proxy)
    n_d = trial.suggest_int('n_d', 8, 64)
    n_a = trial.suggest_int('n_a', 8, 64)
    n_steps = trial.suggest_int('n_steps', 3, 10)
    gamma = trial.suggest_float('gamma', 1.0, 2.0)
    lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the TabNet model
    model = TabNetClassifier(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        lambda_sparse=lambda_sparse,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=learning_rate),
        device_name=device
    )

    # Training
    model.fit(
        X_train=X_train_scaled, y_train=y_train.values,
        eval_set=[(X_test_scaled, y_test.values)],
        eval_name=['val'],
        eval_metric=['accuracy'],
        max_epochs=num_epochs,
        patience=10,
        batch_size=batch_size,
        virtual_batch_size=batch_size // 2,
        num_workers=0,
        drop_last=False
    )

    # Evaluation
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final TabNet model with the best hyperparameters
best_model = TabNetClassifier(
    n_d=best_params['n_d'],
    n_a=best_params['n_a'],
    n_steps=best_params['n_steps'],
    gamma=best_params['gamma'],
    lambda_sparse=best_params['lambda_sparse'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=best_params['learning_rate']),
    device_name=device
)

training_start_time = time.time()
best_model.fit(
    X_train=X_train_scaled, y_train=y_train.values,
    eval_set=[(X_test_scaled, y_test.values)],
    eval_name=['val'],
    eval_metric=['accuracy'],
    max_epochs=best_params['num_epochs'],
    patience=10,
    batch_size=best_params['batch_size'],
    virtual_batch_size=best_params['batch_size'] // 2,
    num_workers=0,
    drop_last=False
)
training_time = time.time() - training_start_time

# Evaluation
y_pred = best_model.predict(X_test_scaled)
inference_start_time = time.time()
y_pred_proba = best_model.predict_proba(X_test_scaled)
inference_time = time.time() - inference_start_time

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
if len(np.unique(y)) == 2:  # Binary classification
    auc = roc_auc_score(y_test, y_pred_proba[:, 1])
else:  # Multiclass classification
    auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['VIME'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 07:57:32,222] A new study created in memory with name: no-name-9d43ab35-eb79-4e26-a516-277b27d33e12


Using device: cpu


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.50374 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.89285 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 1.9433  | val_accuracy: 0.75    |  0:00:00s
epoch 3  | loss: 1.58311 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 1.61411 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 1.3291  | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 1.40681 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 1.21003 | val_accuracy: 0.625   |  0:00:01s
epoch 8  | loss: 1.38053 | val_accuracy: 0.75    |  0:00:01s
epoch 9  | loss: 1.34901 | val_accuracy: 0.875   |  0:00:01s
epoch 10 | loss: 1.09436 | val_accuracy: 0.875   |  0:00:01s
epoch 11 | loss: 1.05671 | val_accuracy: 0.75    |  0:00:01s
epoch 12 | loss: 1.0244  | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.77091 | val_accuracy: 0.625   |  0:00:01s
epoch 14 | loss: 0.85441 | val_accuracy: 0.625   |  0:00:02s
epoch 15 | loss: 0.63634 | val_accuracy: 0.5     |  0:00:02s
epoch 16 | loss: 0.65374

[I 2024-07-26 07:57:35,108] Trial 0 finished with value: 0.875 and parameters: {'n_d': 11, 'n_a': 16, 'n_steps': 10, 'gamma': 1.6273843140950668, 'lambda_sparse': 6.757651400226338e-05, 'learning_rate': 0.005768493906323893, 'batch_size': 64, 'num_epochs': 35}. Best is trial 0 with value: 0.875.


epoch 19 | loss: 0.49697 | val_accuracy: 0.625   |  0:00:02s

Early stopping occurred at epoch 19 with best_epoch = 9 and best_val_accuracy = 0.875
epoch 0  | loss: 2.71194 | val_accuracy: 0.625   |  0:00:00s


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 1  | loss: 0.65524 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.34635 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 0.1353  | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 0.06497 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 0.04482 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 0.01546 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.02013 | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.00936 | val_accuracy: 0.625   |  0:00:00s


[I 2024-07-26 07:57:35,834] Trial 1 finished with value: 0.625 and parameters: {'n_d': 28, 'n_a': 34, 'n_steps': 3, 'gamma': 1.1064586646341397, 'lambda_sparse': 6.421522258600302e-05, 'learning_rate': 0.04499005936868059, 'batch_size': 64, 'num_epochs': 67}. Best is trial 0 with value: 0.875.


epoch 9  | loss: 0.00526 | val_accuracy: 0.5     |  0:00:00s
epoch 10 | loss: 0.00312 | val_accuracy: 0.5     |  0:00:00s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 11.98331| val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 10.70578| val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 9.05365 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 8.02818 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 6.51115 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 5.31637 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 4.47703 | val_accuracy: 0.375   |  0:00:01s
epoch 7  | loss: 3.43643 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 2.40229 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 1.51601 | val_accuracy: 0.75    |  0:00:01s
epoch 10 | loss: 0.92001 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.4099  | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.30818 | val_accuracy: 0.75    |  0:00:02s
epoch 13 | loss: 0.18114 | val_accuracy: 0.75    |  0:00:02s
epoch 14 | loss: 0.11717 | val_accuracy: 0.75    |  0:00:02s
epoch 15 | loss: 0.12478 | val_accuracy: 0.75    |  0:00:02s
epoch 16 | loss: 0.10967

[I 2024-07-26 07:57:40,094] Trial 2 finished with value: 0.875 and parameters: {'n_d': 45, 'n_a': 37, 'n_steps': 9, 'gamma': 1.1011109035567221, 'lambda_sparse': 5.246541295022321e-05, 'learning_rate': 0.0029550976487433002, 'batch_size': 128, 'num_epochs': 70}. Best is trial 0 with value: 0.875.


epoch 26 | loss: 0.00351 | val_accuracy: 0.75    |  0:00:04s

Early stopping occurred at epoch 26 with best_epoch = 16 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.24197 | val_accuracy: 0.625   |  0:00:00s
epoch 1  | loss: 0.87388 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.71165 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.47937 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.40654 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.34433 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 0.28329 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.28135 | val_accuracy: 0.5     |  0:00:00s


[I 2024-07-26 07:57:41,130] Trial 3 finished with value: 0.625 and parameters: {'n_d': 18, 'n_a': 48, 'n_steps': 5, 'gamma': 1.3998557459768195, 'lambda_sparse': 3.095444578877207e-06, 'learning_rate': 0.0019700817028198126, 'batch_size': 256, 'num_epochs': 93}. Best is trial 0 with value: 0.875.


epoch 8  | loss: 0.21534 | val_accuracy: 0.625   |  0:00:00s
epoch 9  | loss: 0.19121 | val_accuracy: 0.625   |  0:00:00s
epoch 10 | loss: 0.18697 | val_accuracy: 0.625   |  0:00:00s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.96939 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 3.6088  | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 2.34375 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 1.62997 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 1.18817 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 1.22511 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 0.741   | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.68664 | val_accuracy: 0.625   |  0:00:01s
epoch 8  | loss: 0.80786 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.81594 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 1.178   | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.73027 | val_accuracy: 0.75    |  0:00:01s
epoch 12 | loss: 0.56515 | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.50578 | val_accuracy: 0.75    |  0:00:01s
epoch 14 | loss: 0.57784 | val_accuracy: 0.625   |  0:00:02s
epoch 15 | loss: 0.65469 | val_accuracy: 0.625   |  0:00:02s
epoch 16 | loss: 0.80753

[I 2024-07-26 07:57:44,395] Trial 4 finished with value: 0.75 and parameters: {'n_d': 18, 'n_a': 39, 'n_steps': 9, 'gamma': 1.7446834881392843, 'lambda_sparse': 5.857528179341757e-05, 'learning_rate': 0.014112369438392006, 'batch_size': 32, 'num_epochs': 58}. Best is trial 0 with value: 0.875.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.61736 | val_accuracy: 0.625   |  0:00:00s
epoch 1  | loss: 0.83891 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.43252 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 0.07615 | val_accuracy: 0.875   |  0:00:00s
epoch 4  | loss: 0.0999  | val_accuracy: 0.875   |  0:00:00s
epoch 5  | loss: 0.00974 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.00312 | val_accuracy: 0.75    |  0:00:00s
epoch 7  | loss: 0.0022  | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.00177 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.00182 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.0013  | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.00129 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.00088 | val_accuracy: 0.625   |  0:00:01s


[I 2024-07-26 07:57:46,105] Trial 5 finished with value: 0.875 and parameters: {'n_d': 54, 'n_a': 55, 'n_steps': 6, 'gamma': 1.4307181138232081, 'lambda_sparse': 1.6896693073525064e-06, 'learning_rate': 0.01148585948319755, 'batch_size': 128, 'num_epochs': 69}. Best is trial 0 with value: 0.875.


epoch 13 | loss: 0.00144 | val_accuracy: 0.625   |  0:00:01s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 4.39439 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 3.44014 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 3.55524 | val_accuracy: 0.25    |  0:00:01s
epoch 3  | loss: 2.47877 | val_accuracy: 0.375   |  0:00:01s
epoch 4  | loss: 2.26724 | val_accuracy: 0.5     |  0:00:01s
epoch 5  | loss: 1.78668 | val_accuracy: 0.5     |  0:00:01s
epoch 6  | loss: 1.75436 | val_accuracy: 0.375   |  0:00:01s
epoch 7  | loss: 1.60245 | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 1.66498 | val_accuracy: 0.375   |  0:00:02s
epoch 9  | loss: 1.25218 | val_accuracy: 0.625   |  0:00:02s
epoch 10 | loss: 1.11111 | val_accuracy: 0.5     |  0:00:02s
epoch 11 | loss: 0.83505 | val_accuracy: 0.5     |  0:00:02s
epoch 12 | loss: 0.58902 | val_accuracy: 0.625   |  0:00:02s
epoch 13 | loss: 0.44256 | val_accuracy: 0.5     |  0:00:02s
epoch 14 | loss: 0.65791 | val_accuracy: 0.5     |  0:00:03s
epoch 15 | loss: 0.26783 | val_accuracy: 0.5     |  0:00:03s
epoch 16 | loss: 0.21789

[I 2024-07-26 07:57:50,375] Trial 6 finished with value: 0.625 and parameters: {'n_d': 54, 'n_a': 22, 'n_steps': 9, 'gamma': 1.680954230356611, 'lambda_sparse': 4.5682820550622e-05, 'learning_rate': 0.001700705845658843, 'batch_size': 256, 'num_epochs': 48}. Best is trial 0 with value: 0.875.


epoch 19 | loss: 0.11799 | val_accuracy: 0.375   |  0:00:04s

Early stopping occurred at epoch 19 with best_epoch = 9 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 5.78158 | val_accuracy: 0.125   |  0:00:00s
epoch 1  | loss: 4.73881 | val_accuracy: 0.125   |  0:00:00s
epoch 2  | loss: 4.05516 | val_accuracy: 0.125   |  0:00:00s
epoch 3  | loss: 3.31367 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 2.98248 | val_accuracy: 0.375   |  0:00:03s
epoch 5  | loss: 2.49209 | val_accuracy: 0.375   |  0:00:03s
epoch 6  | loss: 2.2387  | val_accuracy: 0.375   |  0:00:03s
epoch 7  | loss: 1.50477 | val_accuracy: 0.5     |  0:00:03s
epoch 8  | loss: 1.15974 | val_accuracy: 0.5     |  0:00:03s
epoch 9  | loss: 0.66804 | val_accuracy: 0.5     |  0:00:03s
epoch 10 | loss: 0.51651 | val_accuracy: 0.5     |  0:00:04s
epoch 11 | loss: 0.54971 | val_accuracy: 0.5     |  0:00:04s
epoch 12 | loss: 0.4114  | val_accuracy: 0.5     |  0:00:04s
epoch 13 | loss: 0.24282 | val_accuracy: 0.5     |  0:00:04s
epoch 14 | loss: 0.2474  | val_accuracy: 0.375   |  0:00:04s
epoch 15 | loss: 0.20594 | val_accuracy: 0.5     |  0:00:04s
epoch 16 | loss: 0.11004

[I 2024-07-26 07:57:56,943] Trial 7 finished with value: 0.75 and parameters: {'n_d': 21, 'n_a': 22, 'n_steps': 10, 'gamma': 1.3022263925106232, 'lambda_sparse': 3.29321236608444e-06, 'learning_rate': 0.007255937347827829, 'batch_size': 64, 'num_epochs': 28}. Best is trial 0 with value: 0.875.


epoch 27 | loss: 0.00444 | val_accuracy: 0.75    |  0:00:06s
Stop training because you reached max_epochs = 28 with best_epoch = 20 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.21366 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 3.23226 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 3.25109 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 3.21364 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 3.22078 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 3.13421 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 3.01928 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 3.0033  | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 2.86331 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 2.88308 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 3.11733 | val_accuracy: 0.375   |  0:00:01s
epoch 11 | loss: 2.96281 | val_accuracy: 0.375   |  0:00:01s
epoch 12 | loss: 2.94956 | val_accuracy: 0.375   |  0:00:01s
epoch 13 | loss: 3.09216 | val_accuracy: 0.375   |  0:00:01s
epoch 14 | loss: 2.93179 | val_accuracy: 0.375   |  0:00:01s
epoch 15 | loss: 2.88976 | val_accuracy: 0.375   |  0:00:01s
epoch 16 | loss: 2.87069

[I 2024-07-26 07:57:59,104] Trial 8 finished with value: 0.5 and parameters: {'n_d': 12, 'n_a': 26, 'n_steps': 9, 'gamma': 1.9617007271486753, 'lambda_sparse': 9.464612100945171e-05, 'learning_rate': 0.0001014320440647219, 'batch_size': 128, 'num_epochs': 93}. Best is trial 0 with value: 0.875.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.1309  | val_accuracy: 0.75    |  0:00:00s
epoch 1  | loss: 0.56968 | val_accuracy: 0.625   |  0:00:00s
epoch 2  | loss: 0.33646 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.27344 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 0.18375 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.13446 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 0.09861 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.07324 | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.05333 | val_accuracy: 0.625   |  0:00:00s


[I 2024-07-26 07:57:59,713] Trial 9 finished with value: 0.75 and parameters: {'n_d': 28, 'n_a': 55, 'n_steps': 3, 'gamma': 1.7953902705873932, 'lambda_sparse': 2.501663240204828e-06, 'learning_rate': 0.014415056744622717, 'batch_size': 128, 'num_epochs': 50}. Best is trial 0 with value: 0.875.


epoch 9  | loss: 0.04085 | val_accuracy: 0.625   |  0:00:00s
epoch 10 | loss: 0.03079 | val_accuracy: 0.5     |  0:00:00s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.04645 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 3.23027 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 3.33237 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 3.16034 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 3.19334 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 3.27941 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 3.02643 | val_accuracy: 0.25    |  0:00:00s
epoch 7  | loss: 2.96474 | val_accuracy: 0.25    |  0:00:00s
epoch 8  | loss: 2.8174  | val_accuracy: 0.25    |  0:00:00s


[I 2024-07-26 07:58:00,966] Trial 10 finished with value: 0.25 and parameters: {'n_d': 8, 'n_a': 8, 'n_steps': 7, 'gamma': 1.57375679303858, 'lambda_sparse': 0.0008206558879857576, 'learning_rate': 0.000450110733517511, 'batch_size': 64, 'num_epochs': 13}. Best is trial 0 with value: 0.875.


epoch 9  | loss: 2.79677 | val_accuracy: 0.25    |  0:00:00s
epoch 10 | loss: 2.88721 | val_accuracy: 0.125   |  0:00:01s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.25


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.90071 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 3.30481 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 2.43334 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 2.17222 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 2.69349 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 2.23141 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 1.53146 | val_accuracy: 0.25    |  0:00:01s
epoch 7  | loss: 1.44682 | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 1.73977 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 1.34117 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 1.10957 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 1.02034 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 1.22196 | val_accuracy: 0.625   |  0:00:02s
epoch 13 | loss: 1.37118 | val_accuracy: 0.625   |  0:00:02s
epoch 14 | loss: 0.91145 | val_accuracy: 0.75    |  0:00:02s
epoch 15 | loss: 0.53126 | val_accuracy: 0.75    |  0:00:02s
epoch 16 | loss: 0.57154

[I 2024-07-26 07:58:05,100] Trial 11 finished with value: 0.75 and parameters: {'n_d': 42, 'n_a': 10, 'n_steps': 10, 'gamma': 1.0665676414027303, 'lambda_sparse': 0.0002582425611202243, 'learning_rate': 0.0011793252098365966, 'batch_size': 32, 'num_epochs': 30}. Best is trial 0 with value: 0.875.


epoch 24 | loss: 0.36729 | val_accuracy: 0.625   |  0:00:03s

Early stopping occurred at epoch 24 with best_epoch = 14 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 6.59475 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 2.58269 | val_accuracy: 0.75    |  0:00:00s
epoch 2  | loss: 0.63155 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 0.18075 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 0.51749 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.26549 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 0.17406 | val_accuracy: 0.375   |  0:00:01s
epoch 7  | loss: 0.07502 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.00786 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.01846 | val_accuracy: 0.5     |  0:00:01s


[I 2024-07-26 07:58:06,992] Trial 12 finished with value: 0.75 and parameters: {'n_d': 42, 'n_a': 42, 'n_steps': 8, 'gamma': 1.216253083848298, 'lambda_sparse': 1.4162014127672613e-05, 'learning_rate': 0.08310278884529404, 'batch_size': 128, 'num_epochs': 79}. Best is trial 0 with value: 0.875.


epoch 10 | loss: 0.00288 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.03398 | val_accuracy: 0.5     |  0:00:01s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.80414 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.05715 | val_accuracy: 0.875   |  0:00:00s
epoch 2  | loss: 0.32982 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 0.11699 | val_accuracy: 0.75    |  0:00:00s
epoch 4  | loss: 0.24796 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.11024 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 0.05937 | val_accuracy: 0.25    |  0:00:00s
epoch 7  | loss: 0.03808 | val_accuracy: 0.25    |  0:00:01s
epoch 8  | loss: 0.03635 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 0.08068 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 0.06389 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.05295 | val_accuracy: 0.625   |  0:00:01s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.875


[I 2024-07-26 07:58:08,579] Trial 13 finished with value: 0.875 and parameters: {'n_d': 60, 'n_a': 32, 'n_steps': 8, 'gamma': 1.5569827723780318, 'lambda_sparse': 1.2220071154050046e-05, 'learning_rate': 0.004776736204548879, 'batch_size': 64, 'num_epochs': 40}. Best is trial 0 with value: 0.875.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 5.02295 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 5.82747 | val_accuracy: 0.125   |  0:00:00s
epoch 2  | loss: 5.66633 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 4.7686  | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 4.77387 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 4.51862 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 4.46372 | val_accuracy: 0.25    |  0:00:00s
epoch 7  | loss: 4.35199 | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 4.22326 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 3.92623 | val_accuracy: 0.375   |  0:00:01s


[I 2024-07-26 07:58:10,280] Trial 14 finished with value: 0.375 and parameters: {'n_d': 37, 'n_a': 64, 'n_steps': 10, 'gamma': 1.9020136004297377, 'lambda_sparse': 0.00018688965736154793, 'learning_rate': 0.0005193371560617653, 'batch_size': 64, 'num_epochs': 77}. Best is trial 0 with value: 0.875.


epoch 10 | loss: 4.20146 | val_accuracy: 0.375   |  0:00:01s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.375


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.26682 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 0.72235 | val_accuracy: 0.625   |  0:00:00s
epoch 2  | loss: 0.29186 | val_accuracy: 0.875   |  0:00:00s
epoch 3  | loss: 0.26244 | val_accuracy: 0.875   |  0:00:00s
epoch 4  | loss: 0.0653  | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.05192 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 0.02126 | val_accuracy: 0.75    |  0:00:00s
epoch 7  | loss: 0.15039 | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.17439 | val_accuracy: 0.625   |  0:00:00s
epoch 9  | loss: 0.00322 | val_accuracy: 0.75    |  0:00:01s
epoch 10 | loss: 0.0156  | val_accuracy: 0.75    |  0:00:01s


[I 2024-07-26 07:58:11,788] Trial 15 finished with value: 0.875 and parameters: {'n_d': 49, 'n_a': 16, 'n_steps': 8, 'gamma': 1.2404636428781979, 'lambda_sparse': 1.6127615244733703e-05, 'learning_rate': 0.02601553013896955, 'batch_size': 128, 'num_epochs': 16}. Best is trial 0 with value: 0.875.


epoch 11 | loss: 0.01099 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.0023  | val_accuracy: 0.625   |  0:00:01s

Early stopping occurred at epoch 12 with best_epoch = 2 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.52978 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.36005 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 1.80907 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 1.51253 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 1.03521 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.87241 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 0.51632 | val_accuracy: 0.375   |  0:00:00s
epoch 7  | loss: 0.38246 | val_accuracy: 0.375   |  0:00:00s
epoch 8  | loss: 0.33696 | val_accuracy: 0.375   |  0:00:00s
epoch 9  | loss: 0.24449 | val_accuracy: 0.5     |  0:00:00s
epoch 10 | loss: 0.28412 | val_accuracy: 0.5     |  0:00:00s
epoch 11 | loss: 0.3223  | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.16941 | val_accuracy: 0.625   |  0:00:01s


[I 2024-07-26 07:58:13,226] Trial 16 finished with value: 0.625 and parameters: {'n_d': 33, 'n_a': 29, 'n_steps': 6, 'gamma': 1.0004006258868772, 'lambda_sparse': 0.00020343521134985344, 'learning_rate': 0.004132356380412554, 'batch_size': 32, 'num_epochs': 32}. Best is trial 0 with value: 0.875.


epoch 13 | loss: 0.14143 | val_accuracy: 0.625   |  0:00:01s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.5044  | val_accuracy: 0.5     |  0:00:01s
epoch 1  | loss: 3.02679 | val_accuracy: 0.375   |  0:00:01s
epoch 2  | loss: 3.07226 | val_accuracy: 0.375   |  0:00:01s
epoch 3  | loss: 2.85474 | val_accuracy: 0.375   |  0:00:01s
epoch 4  | loss: 2.76687 | val_accuracy: 0.375   |  0:00:01s
epoch 5  | loss: 2.43931 | val_accuracy: 0.5     |  0:00:01s
epoch 6  | loss: 2.34271 | val_accuracy: 0.625   |  0:00:01s
epoch 7  | loss: 2.19651 | val_accuracy: 0.5     |  0:00:02s
epoch 8  | loss: 2.45491 | val_accuracy: 0.5     |  0:00:02s
epoch 9  | loss: 2.41725 | val_accuracy: 0.5     |  0:00:03s
epoch 10 | loss: 2.10112 | val_accuracy: 0.5     |  0:00:03s
epoch 11 | loss: 1.86105 | val_accuracy: 0.375   |  0:00:03s
epoch 12 | loss: 1.38531 | val_accuracy: 0.375   |  0:00:03s
epoch 13 | loss: 1.15466 | val_accuracy: 0.5     |  0:00:03s
epoch 14 | loss: 1.05488 | val_accuracy: 0.5     |  0:00:03s


[I 2024-07-26 07:58:17,424] Trial 17 finished with value: 0.625 and parameters: {'n_d': 45, 'n_a': 45, 'n_steps': 7, 'gamma': 1.6521146307348793, 'lambda_sparse': 2.5414877960165033e-05, 'learning_rate': 0.0005136848124139033, 'batch_size': 256, 'num_epochs': 59}. Best is trial 0 with value: 0.875.


epoch 15 | loss: 0.91886 | val_accuracy: 0.5     |  0:00:03s
epoch 16 | loss: 0.9311  | val_accuracy: 0.625   |  0:00:03s

Early stopping occurred at epoch 16 with best_epoch = 6 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 5.58929 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 3.42847 | val_accuracy: 0.625   |  0:00:00s
epoch 2  | loss: 2.68823 | val_accuracy: 0.75    |  0:00:00s
epoch 3  | loss: 1.8308  | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 1.21094 | val_accuracy: 0.75    |  0:00:00s
epoch 5  | loss: 1.22486 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.78502 | val_accuracy: 0.625   |  0:00:01s
epoch 7  | loss: 0.95486 | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 0.8377  | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.88186 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 0.66955 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.46952 | val_accuracy: 0.5     |  0:00:01s


[I 2024-07-26 07:58:19,633] Trial 18 finished with value: 0.75 and parameters: {'n_d': 62, 'n_a': 16, 'n_steps': 9, 'gamma': 1.4525578248326165, 'lambda_sparse': 0.0006119522040081667, 'learning_rate': 0.002938791017366825, 'batch_size': 64, 'num_epochs': 40}. Best is trial 0 with value: 0.875.


epoch 12 | loss: 0.40645 | val_accuracy: 0.625   |  0:00:01s

Early stopping occurred at epoch 12 with best_epoch = 2 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.11002 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.66126 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 1.55696 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 1.11381 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 1.06363 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.95711 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 0.85549 | val_accuracy: 0.375   |  0:00:00s
epoch 7  | loss: 0.72729 | val_accuracy: 0.375   |  0:00:00s
epoch 8  | loss: 0.6274  | val_accuracy: 0.375   |  0:00:00s
epoch 9  | loss: 0.56609 | val_accuracy: 0.5     |  0:00:00s
epoch 10 | loss: 0.48174 | val_accuracy: 0.5     |  0:00:00s


[I 2024-07-26 07:58:20,568] Trial 19 finished with value: 0.5 and parameters: {'n_d': 34, 'n_a': 36, 'n_steps': 5, 'gamma': 1.3737982272837062, 'lambda_sparse': 0.00011735391105975612, 'learning_rate': 0.0010011893349017066, 'batch_size': 128, 'num_epochs': 100}. Best is trial 0 with value: 0.875.


epoch 11 | loss: 0.4102  | val_accuracy: 0.5     |  0:00:00s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.5


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.89163 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 2.51379 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 2.71545 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 2.48709 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 2.74735 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 2.84035 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 2.61419 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 2.67863 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 2.67694 | val_accuracy: 0.5     |  0:00:01s


[I 2024-07-26 07:58:22,194] Trial 20 finished with value: 0.5 and parameters: {'n_d': 25, 'n_a': 50, 'n_steps': 10, 'gamma': 1.838487161989383, 'lambda_sparse': 6.452638873963963e-06, 'learning_rate': 0.00018268199084555472, 'batch_size': 64, 'num_epochs': 79}. Best is trial 0 with value: 0.875.


epoch 9  | loss: 2.51386 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 2.58098 | val_accuracy: 0.5     |  0:00:01s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.5


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.69357 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 0.58895 | val_accuracy: 0.625   |  0:00:00s
epoch 2  | loss: 0.5165  | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 0.46693 | val_accuracy: 0.875   |  0:00:00s
epoch 4  | loss: 0.0903  | val_accuracy: 0.75    |  0:00:00s
epoch 5  | loss: 0.13577 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.15559 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.0245  | val_accuracy: 0.75    |  0:00:00s
epoch 8  | loss: 0.0113  | val_accuracy: 0.75    |  0:00:00s
epoch 9  | loss: 0.00815 | val_accuracy: 0.75    |  0:00:00s
epoch 10 | loss: 0.00917 | val_accuracy: 0.625   |  0:00:00s
epoch 11 | loss: 0.00709 | val_accuracy: 0.75    |  0:00:00s


[I 2024-07-26 07:58:23,460] Trial 21 finished with value: 0.875 and parameters: {'n_d': 54, 'n_a': 62, 'n_steps': 5, 'gamma': 1.5158473878786, 'lambda_sparse': 1.2179913354843392e-06, 'learning_rate': 0.01044708536673412, 'batch_size': 128, 'num_epochs': 68}. Best is trial 0 with value: 0.875.


epoch 12 | loss: 0.06606 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.00963 | val_accuracy: 0.75    |  0:00:01s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.76743 | val_accuracy: 0.625   |  0:00:00s
epoch 1  | loss: 1.44899 | val_accuracy: 0.75    |  0:00:00s
epoch 2  | loss: 0.51906 | val_accuracy: 0.75    |  0:00:00s
epoch 3  | loss: 0.096   | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 0.047   | val_accuracy: 0.75    |  0:00:00s
epoch 5  | loss: 0.31605 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.01277 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.00581 | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.00719 | val_accuracy: 0.75    |  0:00:00s
epoch 9  | loss: 0.00484 | val_accuracy: 0.875   |  0:00:01s
epoch 10 | loss: 0.00301 | val_accuracy: 0.875   |  0:00:01s
epoch 11 | loss: 0.00198 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.00128 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.00065 | val_accuracy: 0.75    |  0:00:01s
epoch 14 | loss: 0.0004  | val_accuracy: 0.75    |  0:00:01s
epoch 15 | loss: 0.00026 | val_accuracy: 0.625   |  0:00:01s
epoch 16 | loss: 0.00017

[I 2024-07-26 07:58:25,494] Trial 22 finished with value: 0.875 and parameters: {'n_d': 54, 'n_a': 54, 'n_steps': 6, 'gamma': 1.167066881458854, 'lambda_sparse': 3.0630741145632804e-05, 'learning_rate': 0.02834262951436619, 'batch_size': 128, 'num_epochs': 68}. Best is trial 0 with value: 0.875.


epoch 19 | loss: 0.00012 | val_accuracy: 0.375   |  0:00:01s

Early stopping occurred at epoch 19 with best_epoch = 9 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.14135 | val_accuracy: 0.625   |  0:00:00s
epoch 1  | loss: 1.03106 | val_accuracy: 0.625   |  0:00:00s
epoch 2  | loss: 0.47019 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 0.23905 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 0.3211  | val_accuracy: 0.75    |  0:00:00s
epoch 5  | loss: 0.08674 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.0248  | val_accuracy: 0.75    |  0:00:00s
epoch 7  | loss: 0.01486 | val_accuracy: 0.75    |  0:00:00s
epoch 8  | loss: 0.00982 | val_accuracy: 0.625   |  0:00:00s
epoch 9  | loss: 0.00511 | val_accuracy: 0.625   |  0:00:00s
epoch 10 | loss: 0.00358 | val_accuracy: 0.625   |  0:00:00s
epoch 11 | loss: 0.00271 | val_accuracy: 0.5     |  0:00:00s
epoch 12 | loss: 0.00217 | val_accuracy: 0.5     |  0:00:00s
epoch 13 | loss: 0.00179 | val_accuracy: 0.5     |  0:00:00s


[I 2024-07-26 07:58:26,688] Trial 23 finished with value: 0.75 and parameters: {'n_d': 48, 'n_a': 58, 'n_steps': 4, 'gamma': 1.3374463167418176, 'lambda_sparse': 7.089277078572495e-06, 'learning_rate': 0.00705166894454283, 'batch_size': 128, 'num_epochs': 73}. Best is trial 0 with value: 0.875.


epoch 14 | loss: 0.00148 | val_accuracy: 0.5     |  0:00:01s

Early stopping occurred at epoch 14 with best_epoch = 4 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.92359 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.72063 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 1.97925 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 2.07942 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 1.55547 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 0.96591 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 0.78694 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.47133 | val_accuracy: 0.75    |  0:00:00s
epoch 8  | loss: 0.29051 | val_accuracy: 0.75    |  0:00:01s
epoch 9  | loss: 0.17436 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.10301 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.07285 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.05722 | val_accuracy: 0.5     |  0:00:01s
epoch 13 | loss: 0.05631 | val_accuracy: 0.375   |  0:00:01s
epoch 14 | loss: 0.03078 | val_accuracy: 0.375   |  0:00:01s
epoch 15 | loss: 0.02217 | val_accuracy: 0.375   |  0:00:01s


[I 2024-07-26 07:58:28,786] Trial 24 finished with value: 0.75 and parameters: {'n_d': 40, 'n_a': 40, 'n_steps': 7, 'gamma': 1.6218963125384522, 'lambda_sparse': 0.00034218703367135003, 'learning_rate': 0.0028478432510061482, 'batch_size': 128, 'num_epochs': 61}. Best is trial 0 with value: 0.875.


epoch 16 | loss: 0.01714 | val_accuracy: 0.375   |  0:00:01s
epoch 17 | loss: 0.01324 | val_accuracy: 0.375   |  0:00:01s

Early stopping occurred at epoch 17 with best_epoch = 7 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.9571  | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.07416 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 0.82134 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.41433 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 0.35413 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 0.11317 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 0.10193 | val_accuracy: 0.5     |  0:00:01s
epoch 7  | loss: 0.02068 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.01559 | val_accuracy: 0.5     |  0:00:02s
epoch 9  | loss: 0.00363 | val_accuracy: 0.625   |  0:00:02s
epoch 10 | loss: 0.0061  | val_accuracy: 0.5     |  0:00:02s
epoch 11 | loss: 0.00483 | val_accuracy: 0.625   |  0:00:02s
epoch 12 | loss: 0.00369 | val_accuracy: 0.625   |  0:00:02s
epoch 13 | loss: 0.00227 | val_accuracy: 0.75    |  0:00:02s
epoch 14 | loss: 0.04998 | val_accuracy: 0.75    |  0:00:02s
epoch 15 | loss: 0.00157 | val_accuracy: 0.75    |  0:00:02s
epoch 16 | loss: 0.00097

[I 2024-07-26 07:58:32,834] Trial 25 finished with value: 0.75 and parameters: {'n_d': 58, 'n_a': 14, 'n_steps': 6, 'gamma': 1.4670775432186125, 'lambda_sparse': 0.0001074132631290509, 'learning_rate': 0.02176148920751818, 'batch_size': 128, 'num_epochs': 86}. Best is trial 0 with value: 0.875.


epoch 23 | loss: 0.00056 | val_accuracy: 0.625   |  0:00:03s

Early stopping occurred at epoch 23 with best_epoch = 13 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 4.22073 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 3.14343 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 3.27465 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 1.53394 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.93024 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 1.14947 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 0.64436 | val_accuracy: 0.375   |  0:00:00s
epoch 7  | loss: 0.32417 | val_accuracy: 0.375   |  0:00:00s
epoch 8  | loss: 0.27221 | val_accuracy: 0.375   |  0:00:00s
epoch 9  | loss: 0.36158 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 0.12649 | val_accuracy: 0.375   |  0:00:01s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.5


[I 2024-07-26 07:58:34,244] Trial 26 finished with value: 0.5 and parameters: {'n_d': 50, 'n_a': 23, 'n_steps': 8, 'gamma': 1.7136186148268095, 'lambda_sparse': 3.7214219082361295e-05, 'learning_rate': 0.005738034349407903, 'batch_size': 256, 'num_epochs': 22}. Best is trial 0 with value: 0.875.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.94432 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 1.95283 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.551   | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 0.87481 | val_accuracy: 0.875   |  0:00:00s
epoch 4  | loss: 0.24836 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.44117 | val_accuracy: 0.75    |  0:00:01s
epoch 6  | loss: 0.34095 | val_accuracy: 0.75    |  0:00:01s
epoch 7  | loss: 0.62406 | val_accuracy: 0.75    |  0:00:01s
epoch 8  | loss: 0.26209 | val_accuracy: 0.75    |  0:00:01s
epoch 9  | loss: 0.11276 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.4099  | val_accuracy: 0.875   |  0:00:01s
epoch 11 | loss: 0.10169 | val_accuracy: 0.75    |  0:00:01s


[I 2024-07-26 07:58:36,734] Trial 27 finished with value: 0.875 and parameters: {'n_d': 64, 'n_a': 49, 'n_steps': 9, 'gamma': 1.2745179984088855, 'lambda_sparse': 1.3049567108295784e-06, 'learning_rate': 0.00994792749154684, 'batch_size': 32, 'num_epochs': 50}. Best is trial 0 with value: 0.875.


epoch 12 | loss: 0.15103 | val_accuracy: 0.75    |  0:00:02s
epoch 13 | loss: 0.24506 | val_accuracy: 0.75    |  0:00:02s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 6.9432  | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 2.16392 | val_accuracy: 0.75    |  0:00:00s
epoch 2  | loss: 0.68187 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 0.67949 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.17484 | val_accuracy: 0.25    |  0:00:01s
epoch 5  | loss: 0.39107 | val_accuracy: 0.375   |  0:00:01s
epoch 6  | loss: 0.02555 | val_accuracy: 0.375   |  0:00:01s
epoch 7  | loss: 0.00856 | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 0.00837 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.00037 | val_accuracy: 0.75    |  0:00:01s
epoch 10 | loss: 0.00031 | val_accuracy: 0.75    |  0:00:01s


[I 2024-07-26 07:58:38,815] Trial 28 finished with value: 0.75 and parameters: {'n_d': 46, 'n_a': 30, 'n_steps': 10, 'gamma': 1.1419741837098498, 'lambda_sparse': 7.560708622125292e-06, 'learning_rate': 0.05207179071566849, 'batch_size': 128, 'num_epochs': 41}. Best is trial 0 with value: 0.875.


epoch 11 | loss: 0.00047 | val_accuracy: 0.75    |  0:00:01s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.26745 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 2.02769 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 1.30501 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 1.00651 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 0.73641 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.50735 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 0.36803 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.27433 | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.2276  | val_accuracy: 0.625   |  0:00:00s
epoch 9  | loss: 0.17042 | val_accuracy: 0.625   |  0:00:00s
epoch 10 | loss: 0.13349 | val_accuracy: 0.625   |  0:00:00s


[I 2024-07-26 07:58:39,818] Trial 29 finished with value: 0.625 and parameters: {'n_d': 30, 'n_a': 37, 'n_steps': 4, 'gamma': 1.0015138947725764, 'lambda_sparse': 5.652537897285923e-05, 'learning_rate': 0.0023308297049763455, 'batch_size': 64, 'num_epochs': 63}. Best is trial 0 with value: 0.875.


epoch 11 | loss: 0.10288 | val_accuracy: 0.625   |  0:00:00s
epoch 12 | loss: 0.08751 | val_accuracy: 0.625   |  0:00:00s

Early stopping occurred at epoch 12 with best_epoch = 2 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.81877 | val_accuracy: 0.75    |  0:00:00s
epoch 1  | loss: 1.03134 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.73949 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 0.16225 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 0.03415 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.0678  | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 0.17773 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.00233 | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.00443 | val_accuracy: 0.625   |  0:00:00s
epoch 9  | loss: 0.00273 | val_accuracy: 0.5     |  0:00:00s
epoch 10 | loss: 0.00143 | val_accuracy: 0.625   |  0:00:01s


[I 2024-07-26 07:58:41,067] Trial 30 finished with value: 0.75 and parameters: {'n_d': 38, 'n_a': 34, 'n_steps': 7, 'gamma': 1.0882058816706188, 'lambda_sparse': 2.6965988632747936e-05, 'learning_rate': 0.04306125030709091, 'batch_size': 64, 'num_epochs': 85}. Best is trial 0 with value: 0.875.



Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.94984 | val_accuracy: 0.125   |  0:00:00s
epoch 1  | loss: 2.64501 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 1.8386  | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 1.26554 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 0.85521 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.54311 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 0.5092  | val_accuracy: 0.5     |  0:00:01s
epoch 7  | loss: 0.33521 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.40992 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 0.23863 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 0.17465 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.31905 | val_accuracy: 0.375   |  0:00:02s
epoch 12 | loss: 0.43888 | val_accuracy: 0.625   |  0:00:02s
epoch 13 | loss: 0.21215 | val_accuracy: 0.625   |  0:00:02s
epoch 14 | loss: 0.23946 | val_accuracy: 0.625   |  0:00:02s
epoch 15 | loss: 0.10086 | val_accuracy: 0.75    |  0:00:02s
epoch 16 | loss: 0.05508

[I 2024-07-26 07:58:46,781] Trial 31 finished with value: 1.0 and parameters: {'n_d': 59, 'n_a': 32, 'n_steps': 8, 'gamma': 1.5408740103567724, 'lambda_sparse': 1.5950842365142268e-05, 'learning_rate': 0.0043312781000985475, 'batch_size': 64, 'num_epochs': 43}. Best is trial 31 with value: 1.0.


epoch 36 | loss: 0.00085 | val_accuracy: 0.875   |  0:00:05s

Early stopping occurred at epoch 36 with best_epoch = 26 and best_val_accuracy = 1.0


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 6.85321 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 6.16179 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 5.08308 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 4.797   | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 4.23418 | val_accuracy: 0.125   |  0:00:00s
epoch 5  | loss: 4.0357  | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 3.24042 | val_accuracy: 0.125   |  0:00:00s
epoch 7  | loss: 2.90419 | val_accuracy: 0.25    |  0:00:00s
epoch 8  | loss: 2.66295 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 2.28824 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 1.97374 | val_accuracy: 0.5     |  0:00:01s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.5


[I 2024-07-26 07:58:48,174] Trial 32 finished with value: 0.5 and parameters: {'n_d': 56, 'n_a': 44, 'n_steps': 8, 'gamma': 1.5810275409147458, 'lambda_sparse': 9.074146598670773e-05, 'learning_rate': 0.001166238518847287, 'batch_size': 64, 'num_epochs': 53}. Best is trial 31 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 4.93326 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 3.3898  | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 2.49898 | val_accuracy: 0.125   |  0:00:00s
epoch 3  | loss: 1.27167 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 1.45984 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 0.65128 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 0.63931 | val_accuracy: 0.75    |  0:00:00s
epoch 7  | loss: 0.59069 | val_accuracy: 0.75    |  0:00:01s
epoch 8  | loss: 0.40915 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.3     | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.29268 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.23673 | val_accuracy: 0.5     |  0:00:01s
epoch 12 | loss: 0.22557 | val_accuracy: 0.5     |  0:00:01s
epoch 13 | loss: 0.18066 | val_accuracy: 0.625   |  0:00:01s
epoch 14 | loss: 0.08991 | val_accuracy: 0.75    |  0:00:01s
epoch 15 | loss: 0.1105  | val_accuracy: 0.625   |  0:00:01s


[I 2024-07-26 07:58:50,371] Trial 33 finished with value: 0.75 and parameters: {'n_d': 51, 'n_a': 18, 'n_steps': 9, 'gamma': 1.4163570749646526, 'lambda_sparse': 4.755465148737065e-06, 'learning_rate': 0.0040269887810437935, 'batch_size': 64, 'num_epochs': 38}. Best is trial 31 with value: 1.0.


epoch 16 | loss: 0.06779 | val_accuracy: 0.75    |  0:00:02s

Early stopping occurred at epoch 16 with best_epoch = 6 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.51837 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.50334 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 0.56697 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.87172 | val_accuracy: 0.75    |  0:00:00s
epoch 4  | loss: 0.48802 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.05809 | val_accuracy: 0.875   |  0:00:00s
epoch 6  | loss: 0.21242 | val_accuracy: 0.75    |  0:00:01s
epoch 7  | loss: 0.19586 | val_accuracy: 0.625   |  0:00:01s
epoch 8  | loss: 0.37268 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.45209 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.60953 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.11319 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.05339 | val_accuracy: 0.5     |  0:00:01s
epoch 13 | loss: 0.0954  | val_accuracy: 0.5     |  0:00:01s
epoch 14 | loss: 0.01193 | val_accuracy: 0.375   |  0:00:01s


[I 2024-07-26 07:58:52,669] Trial 34 finished with value: 0.875 and parameters: {'n_d': 60, 'n_a': 27, 'n_steps': 9, 'gamma': 1.7558604019651474, 'lambda_sparse': 2.1712323054301282e-06, 'learning_rate': 0.014333622897234577, 'batch_size': 256, 'num_epochs': 71}. Best is trial 31 with value: 1.0.


epoch 15 | loss: 0.1367  | val_accuracy: 0.375   |  0:00:02s

Early stopping occurred at epoch 15 with best_epoch = 5 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.94892 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.37918 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 0.54213 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 0.5144  | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.38529 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.38231 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 0.09755 | val_accuracy: 0.375   |  0:00:01s
epoch 7  | loss: 0.04047 | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 0.02356 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.02434 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 0.01243 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.01171 | val_accuracy: 0.625   |  0:00:02s
epoch 12 | loss: 0.00467 | val_accuracy: 0.5     |  0:00:03s
epoch 13 | loss: 0.00227 | val_accuracy: 0.625   |  0:00:03s
epoch 14 | loss: 0.21753 | val_accuracy: 0.5     |  0:00:03s
epoch 15 | loss: 0.00125 | val_accuracy: 0.5     |  0:00:03s

Early stopping occurred

[I 2024-07-26 07:58:56,654] Trial 35 finished with value: 0.625 and parameters: {'n_d': 52, 'n_a': 53, 'n_steps': 8, 'gamma': 1.6332732255004623, 'lambda_sparse': 6.50944703000834e-05, 'learning_rate': 0.009630078846706077, 'batch_size': 64, 'num_epochs': 46}. Best is trial 31 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.22885 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 2.05712 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 2.40016 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 1.8992  | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 2.1894  | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 1.91919 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 2.11978 | val_accuracy: 0.25    |  0:00:01s
epoch 7  | loss: 2.36193 | val_accuracy: 0.125   |  0:00:01s
epoch 8  | loss: 1.58666 | val_accuracy: 0.25    |  0:00:01s
epoch 9  | loss: 1.58973 | val_accuracy: 0.25    |  0:00:01s


[I 2024-07-26 07:58:58,468] Trial 36 finished with value: 0.375 and parameters: {'n_d': 14, 'n_a': 25, 'n_steps': 10, 'gamma': 1.5067153047541284, 'lambda_sparse': 1.711263735568933e-05, 'learning_rate': 0.0019602606665652476, 'batch_size': 32, 'num_epochs': 56}. Best is trial 31 with value: 1.0.


epoch 10 | loss: 1.7698  | val_accuracy: 0.25    |  0:00:01s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.375


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 7.03398 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 5.85626 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 5.10245 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 3.99526 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 2.97964 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 2.03491 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 1.6766  | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.99748 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.77816 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.36757 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 0.22624 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.19019 | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.17109 | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.1836  | val_accuracy: 0.375   |  0:00:01s


[I 2024-07-26 07:59:00,521] Trial 37 finished with value: 0.625 and parameters: {'n_d': 23, 'n_a': 21, 'n_steps': 9, 'gamma': 1.3408489234184484, 'lambda_sparse': 5.236883120029269e-05, 'learning_rate': 0.005989892001361654, 'batch_size': 128, 'num_epochs': 21}. Best is trial 31 with value: 1.0.


epoch 14 | loss: 0.11569 | val_accuracy: 0.5     |  0:00:01s
epoch 15 | loss: 0.10027 | val_accuracy: 0.5     |  0:00:01s

Early stopping occurred at epoch 15 with best_epoch = 5 and best_val_accuracy = 0.625


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.66953 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 1.85675 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 2.02896 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 1.67299 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 1.35288 | val_accuracy: 0.125   |  0:00:00s
epoch 5  | loss: 1.66042 | val_accuracy: 0.375   |  0:00:00s
epoch 6  | loss: 1.3654  | val_accuracy: 0.125   |  0:00:00s
epoch 7  | loss: 0.98912 | val_accuracy: 0.375   |  0:00:00s
epoch 8  | loss: 0.76169 | val_accuracy: 0.375   |  0:00:00s


[I 2024-07-26 07:59:01,553] Trial 38 finished with value: 0.5 and parameters: {'n_d': 45, 'n_a': 46, 'n_steps': 6, 'gamma': 1.687392057422002, 'lambda_sparse': 1.7359457091744607e-06, 'learning_rate': 0.0015385413801399222, 'batch_size': 64, 'num_epochs': 34}. Best is trial 31 with value: 1.0.


epoch 9  | loss: 0.54938 | val_accuracy: 0.25    |  0:00:00s
epoch 10 | loss: 0.37701 | val_accuracy: 0.375   |  0:00:00s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.5


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.25086 | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.12444 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.50771 | val_accuracy: 0.75    |  0:00:00s
epoch 3  | loss: 0.58778 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 0.19285 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.14788 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.02641 | val_accuracy: 0.875   |  0:00:00s
epoch 7  | loss: 0.01495 | val_accuracy: 0.75    |  0:00:01s
epoch 8  | loss: 0.02017 | val_accuracy: 0.75    |  0:00:01s
epoch 9  | loss: 0.01039 | val_accuracy: 0.75    |  0:00:01s
epoch 10 | loss: 0.00721 | val_accuracy: 0.75    |  0:00:01s
epoch 11 | loss: 0.05021 | val_accuracy: 0.75    |  0:00:01s
epoch 12 | loss: 0.01802 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.00295 | val_accuracy: 0.75    |  0:00:01s
epoch 14 | loss: 0.02228 | val_accuracy: 0.75    |  0:00:01s


[I 2024-07-26 07:59:03,713] Trial 39 finished with value: 0.875 and parameters: {'n_d': 58, 'n_a': 59, 'n_steps': 7, 'gamma': 1.41364151793716, 'lambda_sparse': 4.156369402507204e-06, 'learning_rate': 0.003501274296425856, 'batch_size': 256, 'num_epochs': 44}. Best is trial 31 with value: 1.0.


epoch 15 | loss: 0.00139 | val_accuracy: 0.625   |  0:00:01s
epoch 16 | loss: 0.00194 | val_accuracy: 0.625   |  0:00:01s

Early stopping occurred at epoch 16 with best_epoch = 6 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.83064 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 1.00008 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 0.87434 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 0.72055 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 0.82431 | val_accuracy: 0.625   |  0:00:00s
epoch 5  | loss: 0.69291 | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 0.40507 | val_accuracy: 0.625   |  0:00:00s
epoch 7  | loss: 0.46788 | val_accuracy: 0.625   |  0:00:00s
epoch 8  | loss: 0.29796 | val_accuracy: 0.625   |  0:00:00s
epoch 9  | loss: 0.3326  | val_accuracy: 0.75    |  0:00:00s
epoch 10 | loss: 0.32226 | val_accuracy: 0.75    |  0:00:00s
epoch 11 | loss: 0.20975 | val_accuracy: 0.75    |  0:00:01s
epoch 12 | loss: 0.12876 | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.08629 | val_accuracy: 0.75    |  0:00:01s
epoch 14 | loss: 0.13104 | val_accuracy: 0.75    |  0:00:01s
epoch 15 | loss: 0.0555  | val_accuracy: 0.75    |  0:00:01s
epoch 16 | loss: 0.04627

[I 2024-07-26 07:59:06,175] Trial 40 finished with value: 0.875 and parameters: {'n_d': 17, 'n_a': 11, 'n_steps': 10, 'gamma': 1.834405269981342, 'lambda_sparse': 0.0001492086724261764, 'learning_rate': 0.01706907092963576, 'batch_size': 128, 'num_epochs': 27}. Best is trial 31 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.24379 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 1.12827 | val_accuracy: 0.5     |  0:00:00s
epoch 2  | loss: 0.47355 | val_accuracy: 0.625   |  0:00:00s
epoch 3  | loss: 0.22193 | val_accuracy: 0.875   |  0:00:00s
epoch 4  | loss: 0.48924 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.1743  | val_accuracy: 0.625   |  0:00:00s
epoch 6  | loss: 0.12286 | val_accuracy: 0.375   |  0:00:00s
epoch 7  | loss: 0.14829 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.0678  | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.11441 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 0.03505 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.0254  | val_accuracy: 0.625   |  0:00:01s
epoch 12 | loss: 0.01596 | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.01427 | val_accuracy: 0.625   |  0:00:01s


[I 2024-07-26 07:59:08,187] Trial 41 finished with value: 0.875 and parameters: {'n_d': 61, 'n_a': 32, 'n_steps': 8, 'gamma': 1.5539457592606334, 'lambda_sparse': 1.2952289550757822e-05, 'learning_rate': 0.0046779735880009615, 'batch_size': 64, 'num_epochs': 36}. Best is trial 31 with value: 1.0.



Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.75746 | val_accuracy: 0.625   |  0:00:00s
epoch 1  | loss: 1.59292 | val_accuracy: 0.625   |  0:00:00s
epoch 2  | loss: 0.74516 | val_accuracy: 0.75    |  0:00:01s
epoch 3  | loss: 0.83627 | val_accuracy: 0.625   |  0:00:01s
epoch 4  | loss: 0.56993 | val_accuracy: 0.75    |  0:00:01s
epoch 5  | loss: 0.30549 | val_accuracy: 0.625   |  0:00:01s
epoch 6  | loss: 0.23912 | val_accuracy: 0.625   |  0:00:01s
epoch 7  | loss: 0.31375 | val_accuracy: 0.625   |  0:00:01s
epoch 8  | loss: 0.34751 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.26847 | val_accuracy: 0.625   |  0:00:02s
epoch 10 | loss: 0.27595 | val_accuracy: 0.625   |  0:00:02s


[I 2024-07-26 07:59:11,251] Trial 42 finished with value: 0.75 and parameters: {'n_d': 58, 'n_a': 39, 'n_steps': 9, 'gamma': 1.5557848004955233, 'lambda_sparse': 1.0541816921060027e-05, 'learning_rate': 0.007457308462786435, 'batch_size': 64, 'num_epochs': 44}. Best is trial 31 with value: 1.0.


epoch 11 | loss: 0.07615 | val_accuracy: 0.5     |  0:00:02s
epoch 12 | loss: 0.02558 | val_accuracy: 0.5     |  0:00:02s

Early stopping occurred at epoch 12 with best_epoch = 2 and best_val_accuracy = 0.75


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 5.204   | val_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 4.66643 | val_accuracy: 0.375   |  0:00:00s
epoch 2  | loss: 5.06794 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 4.86533 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 4.49774 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 3.74022 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 3.24596 | val_accuracy: 0.25    |  0:00:00s
epoch 7  | loss: 3.44922 | val_accuracy: 0.375   |  0:00:01s
epoch 8  | loss: 2.86126 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 2.61963 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 2.62612 | val_accuracy: 0.375   |  0:00:01s


[I 2024-07-26 07:59:12,817] Trial 43 finished with value: 0.375 and parameters: {'n_d': 64, 'n_a': 33, 'n_steps': 8, 'gamma': 1.6093865379859045, 'lambda_sparse': 1.943543568770303e-05, 'learning_rate': 0.0007601815543087718, 'batch_size': 64, 'num_epochs': 64}. Best is trial 31 with value: 1.0.



Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.375


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.7827  | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 1.82302 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 1.08859 | val_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.66852 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 0.49719 | val_accuracy: 0.375   |  0:00:00s
epoch 5  | loss: 0.21772 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 0.09279 | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.05716 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.05219 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.05967 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 0.07166 | val_accuracy: 0.625   |  0:00:01s
epoch 11 | loss: 0.01773 | val_accuracy: 0.5     |  0:00:01s
epoch 12 | loss: 0.01914 | val_accuracy: 0.875   |  0:00:01s
epoch 13 | loss: 0.01878 | val_accuracy: 0.75    |  0:00:01s
epoch 14 | loss: 0.0275  | val_accuracy: 0.75    |  0:00:01s
epoch 15 | loss: 0.01222 | val_accuracy: 0.875   |  0:00:01s
epoch 16 | loss: 0.00684

[I 2024-07-26 07:59:15,641] Trial 44 finished with value: 0.875 and parameters: {'n_d': 55, 'n_a': 30, 'n_steps': 7, 'gamma': 1.4704565203530249, 'lambda_sparse': 7.145267580087573e-05, 'learning_rate': 0.004771581777653516, 'batch_size': 64, 'num_epochs': 24}. Best is trial 31 with value: 1.0.


epoch 21 | loss: 0.00046 | val_accuracy: 0.875   |  0:00:02s
epoch 22 | loss: 0.00044 | val_accuracy: 0.75    |  0:00:02s

Early stopping occurred at epoch 22 with best_epoch = 12 and best_val_accuracy = 0.875


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.13245 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 3.10792 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 2.29688 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 2.47072 | val_accuracy: 0.5     |  0:00:00s
epoch 4  | loss: 1.97803 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 1.32653 | val_accuracy: 0.375   |  0:00:01s
epoch 6  | loss: 1.12633 | val_accuracy: 0.5     |  0:00:01s
epoch 7  | loss: 0.78939 | val_accuracy: 0.5     |  0:00:02s
epoch 8  | loss: 0.73261 | val_accuracy: 0.5     |  0:00:02s


[I 2024-07-26 07:59:18,150] Trial 45 finished with value: 0.5 and parameters: {'n_d': 42, 'n_a': 20, 'n_steps': 9, 'gamma': 1.5285527035662576, 'lambda_sparse': 3.9265925217367914e-05, 'learning_rate': 0.002424537988238844, 'batch_size': 64, 'num_epochs': 53}. Best is trial 31 with value: 1.0.


epoch 9  | loss: 0.45318 | val_accuracy: 0.5     |  0:00:02s
epoch 10 | loss: 0.40487 | val_accuracy: 0.5     |  0:00:02s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.5


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.82123 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 1.92292 | val_accuracy: 0.625   |  0:00:00s
epoch 2  | loss: 1.31763 | val_accuracy: 0.75    |  0:00:00s
epoch 3  | loss: 0.7775  | val_accuracy: 0.75    |  0:00:00s
epoch 4  | loss: 0.67585 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.44879 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 0.25245 | val_accuracy: 0.5     |  0:00:01s
epoch 7  | loss: 0.1232  | val_accuracy: 0.75    |  0:00:01s
epoch 8  | loss: 0.17511 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.06495 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.22535 | val_accuracy: 0.875   |  0:00:01s
epoch 11 | loss: 0.04789 | val_accuracy: 0.75    |  0:00:01s
epoch 12 | loss: 0.05619 | val_accuracy: 0.75    |  0:00:01s
epoch 13 | loss: 0.01724 | val_accuracy: 0.75    |  0:00:01s
epoch 14 | loss: 0.04076 | val_accuracy: 0.5     |  0:00:01s
epoch 15 | loss: 0.02022 | val_accuracy: 0.75    |  0:00:01s
epoch 16 | loss: 0.01058

[I 2024-07-26 07:59:20,917] Trial 46 finished with value: 0.875 and parameters: {'n_d': 52, 'n_a': 25, 'n_steps': 8, 'gamma': 1.6762600629746514, 'lambda_sparse': 9.797660948053517e-06, 'learning_rate': 0.00929154883522301, 'batch_size': 64, 'num_epochs': 48}. Best is trial 31 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 4.29169 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 4.03317 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 4.12065 | val_accuracy: 0.25    |  0:00:00s
epoch 3  | loss: 3.95388 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 3.66074 | val_accuracy: 0.25    |  0:00:00s
epoch 5  | loss: 3.28639 | val_accuracy: 0.25    |  0:00:00s
epoch 6  | loss: 3.46495 | val_accuracy: 0.25    |  0:00:00s
epoch 7  | loss: 2.85954 | val_accuracy: 0.25    |  0:00:01s
epoch 8  | loss: 3.20364 | val_accuracy: 0.25    |  0:00:01s
epoch 9  | loss: 3.21005 | val_accuracy: 0.25    |  0:00:01s


[I 2024-07-26 07:59:22,829] Trial 47 finished with value: 0.25 and parameters: {'n_d': 10, 'n_a': 41, 'n_steps': 10, 'gamma': 1.7407524029358228, 'lambda_sparse': 2.4215311554306948e-05, 'learning_rate': 0.0014882976094222834, 'batch_size': 32, 'num_epochs': 74}. Best is trial 31 with value: 1.0.


epoch 10 | loss: 3.56346 | val_accuracy: 0.25    |  0:00:01s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.25


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 3.05306 | val_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 2.80653 | val_accuracy: 0.125   |  0:00:00s
epoch 2  | loss: 2.21792 | val_accuracy: 0.125   |  0:00:00s
epoch 3  | loss: 1.49314 | val_accuracy: 0.25    |  0:00:00s
epoch 4  | loss: 0.76207 | val_accuracy: 0.125   |  0:00:00s
epoch 5  | loss: 0.4857  | val_accuracy: 0.125   |  0:00:00s
epoch 6  | loss: 0.27492 | val_accuracy: 0.375   |  0:00:01s
epoch 7  | loss: 0.19728 | val_accuracy: 0.625   |  0:00:01s
epoch 8  | loss: 0.11624 | val_accuracy: 0.625   |  0:00:01s
epoch 9  | loss: 0.10082 | val_accuracy: 0.625   |  0:00:01s
epoch 10 | loss: 0.24119 | val_accuracy: 0.375   |  0:00:01s
epoch 11 | loss: 0.06915 | val_accuracy: 0.25    |  0:00:01s
epoch 12 | loss: 0.0498  | val_accuracy: 0.375   |  0:00:01s
epoch 13 | loss: 0.0356  | val_accuracy: 0.375   |  0:00:01s
epoch 14 | loss: 0.02395 | val_accuracy: 0.625   |  0:00:01s
epoch 15 | loss: 0.01613 | val_accuracy: 0.625   |  0:00:01s
epoch 16 | loss: 0.01127

[I 2024-07-26 07:59:25,003] Trial 48 finished with value: 0.625 and parameters: {'n_d': 47, 'n_a': 37, 'n_steps': 6, 'gamma': 1.2304268068961273, 'lambda_sparse': 3.360291921322935e-06, 'learning_rate': 0.0031980524525252046, 'batch_size': 128, 'num_epochs': 85}. Best is trial 31 with value: 1.0.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 4.31063 | val_accuracy: 0.5     |  0:00:00s
epoch 1  | loss: 1.41809 | val_accuracy: 0.625   |  0:00:00s
epoch 2  | loss: 0.58963 | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 0.33952 | val_accuracy: 0.625   |  0:00:00s
epoch 4  | loss: 0.26917 | val_accuracy: 0.75    |  0:00:00s
epoch 5  | loss: 0.53059 | val_accuracy: 0.75    |  0:00:00s
epoch 6  | loss: 0.16661 | val_accuracy: 0.75    |  0:00:01s
epoch 7  | loss: 0.15403 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.24342 | val_accuracy: 0.5     |  0:00:01s
epoch 9  | loss: 0.27824 | val_accuracy: 0.5     |  0:00:01s
epoch 10 | loss: 0.24446 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.02642 | val_accuracy: 0.5     |  0:00:01s
epoch 12 | loss: 0.022   | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.00969 | val_accuracy: 0.625   |  0:00:01s


[I 2024-07-26 07:59:27,141] Trial 49 finished with value: 0.75 and parameters: {'n_d': 60, 'n_a': 29, 'n_steps': 9, 'gamma': 1.915282786924466, 'lambda_sparse': 0.0003325181149945809, 'learning_rate': 0.012373116095869423, 'batch_size': 64, 'num_epochs': 31}. Best is trial 31 with value: 1.0.


epoch 14 | loss: 0.1169  | val_accuracy: 0.625   |  0:00:01s

Early stopping occurred at epoch 14 with best_epoch = 4 and best_val_accuracy = 0.75




epoch 0  | loss: 3.94984 | val_accuracy: 0.125   |  0:00:00s
epoch 1  | loss: 2.64501 | val_accuracy: 0.25    |  0:00:00s
epoch 2  | loss: 1.8386  | val_accuracy: 0.375   |  0:00:00s
epoch 3  | loss: 1.26554 | val_accuracy: 0.375   |  0:00:00s
epoch 4  | loss: 0.85521 | val_accuracy: 0.5     |  0:00:00s
epoch 5  | loss: 0.54311 | val_accuracy: 0.5     |  0:00:00s
epoch 6  | loss: 0.5092  | val_accuracy: 0.5     |  0:00:00s
epoch 7  | loss: 0.33521 | val_accuracy: 0.5     |  0:00:01s
epoch 8  | loss: 0.40992 | val_accuracy: 0.375   |  0:00:01s
epoch 9  | loss: 0.23863 | val_accuracy: 0.375   |  0:00:01s
epoch 10 | loss: 0.17465 | val_accuracy: 0.5     |  0:00:01s
epoch 11 | loss: 0.31905 | val_accuracy: 0.375   |  0:00:01s
epoch 12 | loss: 0.43888 | val_accuracy: 0.625   |  0:00:01s
epoch 13 | loss: 0.21215 | val_accuracy: 0.625   |  0:00:01s
epoch 14 | loss: 0.23946 | val_accuracy: 0.625   |  0:00:01s
epoch 15 | loss: 0.10086 | val_accuracy: 0.75    |  0:00:01s
epoch 16 | loss: 0.05508

