In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

df =  pd.read_csv('winequality-red.csv',delimiter=';')
df.dropna(inplace=True)

df['Y'] = np.where(df['Y'] >= 6, 1, 0)

In [2]:
#df = df.sample(n=1000, random_state=42)

In [3]:
import pandas as pd
import numpy as np

Y_column = df['Y'].copy()
df.drop('Y', axis=1, inplace=True)

# Identify categorical data (change this based on your actual data)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Standardize only the continuous (non-categorical) columns
continuous_cols = df.columns.difference(categorical_cols)  # Gets the difference, i.e., continuous cols
df[continuous_cols] = (df[continuous_cols] - df[continuous_cols].mean()) / df[continuous_cols].std()

# Filter out outliers in continuous data (|z-score| > 5)
mask = (np.abs(df[continuous_cols]) < 5).all(axis=1)
df = df[mask]

# Reattach the target variable 'Y' to the DataFrame
df['Y'] = Y_column[mask]

In [4]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()

# Apply Label Encoding to each categorical column
for column in categorical_cols:
    # Ensure the column is of type object (string) or category
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        df[column] = le.fit_transform(df[column])
        
df['Y'], unique = pd.factorize(df['Y'])

In [5]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Dictionary of models and their reduced hyperparameter grids
models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000), {
        'C': [0.01, 0.1, 1],
        'solver': ['liblinear', 'lbfgs']
    }),
    'KNN': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    }),
    'Decision Tree': (DecisionTreeClassifier(), {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    }),
    'Random Forest': (RandomForestClassifier(), {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5]
    }),
    'Gradient Boosting': (GradientBoostingClassifier(), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 4]
    }),
    'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 4]
    }),
    'LightGBM': (LGBMClassifier(), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 50]
    }),
    'CatBoost': (CatBoostClassifier(verbose=0), {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'depth': [4, 6]
    })
}

# Dictionary to store results
results = {}

for name, (model, param_grid) in models.items():
    start_time = time.time()
    
    # Perform GridSearchCV
    grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    
    # Get the best model
    best_model = grid_search.best_estimator_

    # Measure training time for best parameters
    best_param_train_start = time.time()
    best_model.fit(X_train_scaled, y_train)
    best_param_train_time = time.time() - best_param_train_start

    # Measure inference time for best parameters
    inference_start_time = time.time()
    y_pred = best_model.predict(X_test_scaled)
    inference_time = time.time() - inference_start_time
    
    # Calculate total computation time
    computation_time = time.time() - start_time
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    if len(np.unique(y)) == 2:  # Binary classification
        auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])
    else:  # Multiclass classification
        auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled), multi_class='ovr', average='macro')

    results[name] = {
        'Accuracy': accuracy,
        'AUC Score': auc,
        'Training Time (Best Params)': best_param_train_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': grid_search.best_params_
    }

# Convert results to a DataFrame
result = pd.DataFrame(results).T

print(result)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 361, number of negative: 415
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000231 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 847
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.465206 -> initscore=-0.139401
[LightGBM] [Info] Start training from score -0.139401
[LightGBM] [Info] Number of positive: 361, number of negative: 415
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000079 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 847
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.465206 -> initscore=-0.139401
[LightGBM] [Info] Start training from score -0.139401
                     Accur

In [6]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neural_network import MLPClassifier

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the MLP model and its hyperparameter grid
mlp = MLPClassifier(max_iter=1000, random_state=42)
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}

# Perform GridSearchCV with StratifiedKFold
start_time = time.time()
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(mlp, param_grid=param_grid, cv=skf, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Measure training time for best parameters
best_param_train_start = time.time()
best_model.fit(X_train_scaled, y_train)
training_time = time.time() - best_param_train_start

# Measure inference time for best parameters
inference_start_time = time.time()
y_pred = best_model.predict(X_test_scaled)
inference_time = time.time() - inference_start_time

# Calculate total computation time
computation_time = time.time() - start_time

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
if len(np.unique(y)) == 2:  # Binary classification
    auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])
else:  # Multiclass classification
    auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled), multi_class='ovr', average='macro')

# Store results in the existing result DataFrame
result.loc['MLP'] = [accuracy, auc, training_time, inference_time, computation_time, grid_search.best_params_]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in grid_search.best_params_.items():
    print(f"{param}: {value}")



                     Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression  0.748718  0.796644                    0.003991   
KNN                  0.753846  0.840861                    0.001994   
Decision Tree        0.728205  0.749789                    0.004987   
Random Forest             0.8  0.864816                    0.727075   
Gradient Boosting         0.8  0.861439                    0.513596   
XGBoost              0.764103  0.853525                      0.1127   
LightGBM             0.789744  0.866083                    0.057846   
CatBoost             0.753846  0.843288                    0.245005   
MLP                  0.764103  0.838863                    3.545068   

                    Inference Time (Best Params) Computation Time (Total)  \
Logistic Regression                          0.0                 4.049861   
KNN                                     0.004881                 0.248299   
Decision Tree                                0.0          



In [7]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

# Define the DNN model
class DNN(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim):
        super(DNN, self).__init__()
        layers = []
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            input_dim = hidden_dim
        layers.append(nn.Linear(input_dim, output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

# Define the objective function for Optuna
def objective(trial):
    # Define hyperparameters to tune
    hidden_dims = [trial.suggest_int(f'hidden_dim_{i}', 32, 256) for i in range(3)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = DNN(input_dim, hidden_dims, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Start timing the entire process
start_time = time.time()

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = DNN(input_dim, [best_params[f'hidden_dim_{i}'] for i in range(3)], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['DNN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:19:59,336] A new study created in memory with name: no-name-7449ee2f-298a-4134-9e5c-edd4dc861a46


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:20:16,422] Trial 0 finished with value: 0.6717948717948717 and parameters: {'hidden_dim_0': 235, 'hidden_dim_1': 183, 'hidden_dim_2': 182, 'learning_rate': 0.039847770232168886, 'batch_size': 32, 'num_epochs': 66}. Best is trial 0 with value: 0.6717948717948717.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:20:16,803] Trial 1 finished with value: 0.7333333333333333 and parameters: {'hidden_dim_0': 136, 'hidden_dim_1': 64, 'hidden_dim_2': 253, 'learning_rate': 0.024940195511163176, 'batch_size': 256, 'num_epochs': 10}. Best is trial 1 with value: 0.7333333333333333.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:20:20,911] Trial 2 finished with value: 0.7743589743589744 and parameters: {'hidden_dim_0': 68, 'hidden_dim_1': 215, 'hidden_dim_2': 39, 'learning_rate': 0.00018052008818718822, 'batch_size': 64, 'num_epochs

                     Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression  0.748718  0.796644                    0.003991   
KNN                  0.753846  0.840861                    0.001994   
Decision Tree        0.728205  0.749789                    0.004987   
Random Forest             0.8  0.864816                    0.727075   
Gradient Boosting         0.8  0.861439                    0.513596   
XGBoost              0.764103  0.853525                      0.1127   
LightGBM             0.789744  0.866083                    0.057846   
CatBoost             0.753846  0.843288                    0.245005   
MLP                  0.764103  0.838863                    3.545068   
DNN                  0.774359  0.836116                    2.187616   

                    Inference Time (Best Params) Computation Time (Total)  \
Logistic Regression                          0.0                 4.049861   
KNN                                     0.004881                

In [8]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class CrossLayer(nn.Module):
    def __init__(self, input_dim):
        super(CrossLayer, self).__init__()
        self.weight = nn.Parameter(torch.Tensor(input_dim, 1))
        self.bias = nn.Parameter(torch.Tensor(input_dim, 1))
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, x0, x):
        x = x.unsqueeze(2)
        x0 = x0.unsqueeze(2)
        interaction = torch.matmul(x0, torch.matmul(x.transpose(1, 2), self.weight))
        return x0.squeeze(2) + interaction.squeeze(2) + self.bias.T

class DCN(nn.Module):
    def __init__(self, input_dim, cross_layers, hidden_layers, output_dim):
        super(DCN, self).__init__()
        self.cross_layers = nn.ModuleList([CrossLayer(input_dim) for _ in range(cross_layers)])
        
        deep_layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                deep_layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                deep_layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            deep_layers.append(nn.ReLU())
        self.deep_net = nn.Sequential(*deep_layers)
        
        self.final_layer = nn.Linear(input_dim + hidden_layers[-1], output_dim)

    def forward(self, x):
        cross_out = x
        for layer in self.cross_layers:
            cross_out = layer(x, cross_out)
        deep_out = self.deep_net(x)
        concat_out = torch.cat([cross_out, deep_out], dim=1)
        return self.final_layer(concat_out)

def objective(trial):
    # Define hyperparameters to tune
    cross_layers = trial.suggest_int('cross_layers', 1, 5)
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = DCN(input_dim, cross_layers, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Start timing the entire process
start_time = time.time()

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = DCN(input_dim, best_params['cross_layers'], 
                 [best_params[f'hidden_layer_{i}'] for i in range(3)], 
                 output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['DCN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:21:33,469] A new study created in memory with name: no-name-9b0175e2-7e6f-4a9b-9b07-0d9a97c7e8ab


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:21:36,380] Trial 0 finished with value: 0.6820512820512821 and parameters: {'cross_layers': 2, 'hidden_layer_0': 204, 'hidden_layer_1': 220, 'hidden_layer_2': 80, 'learning_rate': 0.0788376381887241, 'batch_size': 256, 'num_epochs': 45}. Best is trial 0 with value: 0.6820512820512821.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:21:42,154] Trial 1 finished with value: 0.7435897435897436 and parameters: {'cross_layers': 5, 'hidden_layer_0': 120, 'hidden_layer_1': 54, 'hidden_layer_2': 35, 'learning_rate': 0.007413982251349101, 'batch_size': 64, 'num_epochs': 55}. Best is trial 1 with value: 0.7435897435897436.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:21:43,161] Trial 2 finished with value: 0.7333333333333333 and parameters: {'cross_layers': 4, 'hidden_layer_0': 202, 'hidden_layer_1': 105, 'hidden_layer_2': 15

                     Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression  0.748718  0.796644                    0.003991   
KNN                  0.753846  0.840861                    0.001994   
Decision Tree        0.728205  0.749789                    0.004987   
Random Forest             0.8  0.864816                    0.727075   
Gradient Boosting         0.8  0.861439                    0.513596   
XGBoost              0.764103  0.853525                      0.1127   
LightGBM             0.789744  0.866083                    0.057846   
CatBoost             0.753846  0.843288                    0.245005   
MLP                  0.764103  0.838863                    3.545068   
DNN                  0.774359  0.836116                    2.187616   
DCN                  0.723077  0.778793                    5.800046   

                    Inference Time (Best Params) Computation Time (Total)  \
Logistic Regression                          0.0                 4.049

In [9]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class WideAndDeepNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(WideAndDeepNetwork, self).__init__()
        
        # Wide part
        self.wide = nn.Linear(input_dim, output_dim)
        
        # Deep part
        deep_layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                deep_layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                deep_layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            deep_layers.append(nn.ReLU())
        deep_layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.deep = nn.Sequential(*deep_layers)

    def forward(self, x):
        wide_out = self.wide(x)
        deep_out = self.deep(x)
        return wide_out + deep_out

def objective(trial):
    # Define hyperparameters to tune
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = WideAndDeepNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Start timing the entire process
start_time = time.time()

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = WideAndDeepNetwork(input_dim, 
                                [best_params[f'hidden_layer_{i}'] for i in range(3)], 
                                output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['Wide_and_Deep'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:23:41,009] A new study created in memory with name: no-name-71699cb6-bb7f-474c-9c91-096da33e2d23


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:23:51,254] Trial 0 finished with value: 0.7589743589743589 and parameters: {'hidden_layer_0': 134, 'hidden_layer_1': 83, 'hidden_layer_2': 109, 'learning_rate': 0.0007483649547044918, 'batch_size': 32, 'num_epochs': 83}. Best is trial 0 with value: 0.7589743589743589.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:23:52,307] Trial 1 finished with value: 0.7230769230769231 and parameters: {'hidden_layer_0': 42, 'hidden_layer_1': 150, 'hidden_layer_2': 214, 'learning_rate': 0.051399645412128415, 'batch_size': 128, 'num_epochs': 22}. Best is trial 0 with value: 0.7589743589743589.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:24:05,435] Trial 2 finished with value: 0.7333333333333333 and parameters: {'hidden_layer_0': 194, 'hidden_layer_1': 253, 'hidden_layer_2': 209, 'learning_rate': 0.026910724450281875, 'batch_size

                     Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression  0.748718  0.796644                    0.003991   
KNN                  0.753846  0.840861                    0.001994   
Decision Tree        0.728205  0.749789                    0.004987   
Random Forest             0.8  0.864816                    0.727075   
Gradient Boosting         0.8  0.861439                    0.513596   
XGBoost              0.764103  0.853525                      0.1127   
LightGBM             0.789744  0.866083                    0.057846   
CatBoost             0.753846  0.843288                    0.245005   
MLP                  0.764103  0.838863                    3.545068   
DNN                  0.774359  0.836116                    2.187616   
DCN                  0.723077  0.778793                    5.800046   
Wide_and_Deep        0.774359  0.835693                    2.907177   

                    Inference Time (Best Params) Computation Time (Total)  \

In [10]:
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_layers:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.ReLU())
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for XGBoost
    xgb_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }

    # Train XGBoost model
    xgb_model = XGBClassifier(**xgb_params, use_label_encoder=False, eval_metric='mlogloss')
    xgb_model.fit(X_train_scaled, y_train)
    
    # Extract features using XGBoost
    X_train_transformed = xgb_model.apply(X_train_scaled)
    X_test_transformed = xgb_model.apply(X_test_scaled)
    
    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    y_train_tensor = torch.LongTensor(y_train.values).to(device)
    y_test_tensor = torch.LongTensor(y_test.values).to(device)
    
    # Define hyperparameters to tune for Neural Network
    hidden_layers = []
    for i in range(3):  # Allow up to 3 hidden layers
        if trial.suggest_categorical(f'use_hidden_layer_{i}', [True, False]):
            hidden_layers.append(trial.suggest_int(f'hidden_layer_{i}', 32, 256))
    
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train_transformed.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_transformed_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final XGBoost model with the best hyperparameters
xgb_best_params = {
    'n_estimators': best_params['n_estimators'],
    'max_depth': best_params['max_depth'],
    'learning_rate': best_params['xgb_learning_rate'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree']
}
xgb_model = XGBClassifier(**xgb_best_params, use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_scaled, y_train)

# Extract features using XGBoost
X_train_transformed = xgb_model.apply(X_train_scaled)
X_test_transformed = xgb_model.apply(X_test_scaled)

# Convert to PyTorch tensors
X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

# Train the final Neural Network model with the best hyperparameters
nn_best_params = {
    'hidden_layers': [],
    'learning_rate': best_params['nn_learning_rate'],
    'batch_size': best_params['batch_size'],
    'num_epochs': best_params['num_epochs']
}

# Dynamically determine the number of hidden layers
for i in range(3):  # Assuming max 3 hidden layers
    if f'use_hidden_layer_{i}' in best_params and best_params[f'use_hidden_layer_{i}']:
        nn_best_params['hidden_layers'].append(best_params[f'hidden_layer_{i}'])

input_dim = X_train_transformed.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers'], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(nn_best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_transformed_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Assuming 'result' DataFrame is defined elsewhere
result.loc['XGBoost + NN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")
# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['XGBoost + NN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:25:17,315] A new study created in memory with name: no-name-cc01aa6d-730a-48ca-b724-101c9adc575b


Using device: cpu


  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
Parameters: { "use_label_encoder" } are not used.

  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:25:25,298] Trial 0 finished with value: 0.7076923076923077 and parameters: {'n_estimators': 114, 'max_depth': 10, 'xgb_learning_rate': 0.049553802710495774, 'subsample': 0.9706575962810082, 'colsample_bytree': 0.6392478582249902, 'hidden_layer_0': 248, 'hidden_layer_1': 111, 'hidden_layer_2': 113, 'nn_learning_rate': 0.00048584002658937146, 'batch_size': 64, 'num_epochs': 91}. Best is trial 0 with value: 0.7076923076923077.
  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
Parameters: { "use_label_encoder" } are not used.

  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:25:32,940] Trial 1 finished with value: 0.5333333333333333 and parameters: {'n_estimators': 90, 'max_depth': 4, 'xgb_learnin

                     Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression  0.748718  0.796644                    0.003991   
KNN                  0.753846  0.840861                    0.001994   
Decision Tree        0.728205  0.749789                    0.004987   
Random Forest             0.8  0.864816                    0.727075   
Gradient Boosting         0.8  0.861439                    0.513596   
XGBoost              0.764103  0.853525                      0.1127   
LightGBM             0.789744  0.866083                    0.057846   
CatBoost             0.753846  0.843288                    0.245005   
MLP                  0.764103  0.838863                    3.545068   
DNN                  0.774359  0.836116                    2.187616   
DCN                  0.723077  0.778793                    5.800046   
Wide_and_Deep        0.774359  0.835693                    2.907177   
XGBoost + NN         0.697436    0.8209                    3.071482   

     

In [11]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from lightgbm import LGBMClassifier
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for LightGBM
    lgb_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }

    # Train LightGBM model
    lgb_model = LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train_scaled, y_train)
    
    # Extract features using LightGBM
    X_train_transformed = lgb_model.predict_proba(X_train_scaled)
    X_test_transformed = lgb_model.predict_proba(X_test_scaled)
    
    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    y_train_tensor = torch.LongTensor(y_train.values).to(device)
    y_test_tensor = torch.LongTensor(y_test.values).to(device)
    
    # Define hyperparameters to tune for Neural Network
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train_transformed.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_transformed_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final LightGBM model with the best hyperparameters
lgb_best_params = {
    'n_estimators': best_params['n_estimators'],
    'max_depth': best_params['max_depth'],
    'learning_rate': best_params['lgb_learning_rate'],
    'num_leaves': best_params['num_leaves'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree']
}
lgb_model = LGBMClassifier(**lgb_best_params)
lgb_model.fit(X_train_scaled, y_train)

# Extract features using LightGBM
X_train_transformed = lgb_model.predict_proba(X_train_scaled)
X_test_transformed = lgb_model.predict_proba(X_test_scaled)

# Convert to PyTorch tensors
X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

# Train the final Neural Network model with the best hyperparameters
nn_best_params = {
    'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
    'learning_rate': best_params['nn_learning_rate'],
    'batch_size': best_params['batch_size'],
    'num_epochs': best_params['num_epochs']
}
input_dim = X_train_transformed.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers'], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(nn_best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_transformed_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['LightGBM + NN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:27:04,788] A new study created in memory with name: no-name-9aafeffd-ea8f-43ea-8934-dd4af9c60c7b


Using device: cpu
[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:27:11,915] Trial 0 finished with value: 0.6820512820512821 and parameters: {'n_estimators': 171, 'max_depth': 6, 'lgb_learning_rate': 0.0007303950843652628, 'num_leaves': 91, 'subsample': 0.5890687346881405, 'colsample_bytree': 0.8765431388466591, 'hidden_layer_0': 149, 'hidden_layer_1': 251, 'hidden_layer_2': 174, 'nn_learning_rate': 0.07048332057737904, 'batch_size': 64, 'num_epochs': 80}. Best is trial 0 with value: 0.6820512820512821.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:27:17,370] Trial 1 finished with value: 0.676923076923077 and parameters: {'n_estimators': 155, 'max_depth': 5, 'lgb_learning_rate': 0.0008708741477612304, 'num_leaves': 72, 'subsample': 0.668279713210599, 'colsample_bytree': 0.9867303864548873, 'hidden_layer_0': 175, 'hidden_layer_1': 211, 'hidden_layer_2': 108, 'nn_learning_rate': 0.002780997393517395, 'batch_size': 128, 'num_epochs': 89}. Best is trial 0 with value: 0.6820512820512821.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:27:25,314] Trial 2 finished with value: 0.5333333333333333 and parameters: {'n_estimators': 99, 'max_depth': 9, 'lgb_learning_rate': 0.00018809740820026834, 'num_leaves': 83, 'subsample': 0.8558071191416066, 'colsample_bytree': 0.6885621058950526, 'hidden_layer_0': 101, 'hidden_layer_1': 249, 'hidden_layer_2': 44, 'nn_learning_rate': 0.04492118955836811, 'batch_size': 32, 'num_epochs': 59}. Best is trial 0 with value: 0.6820512820512821.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:27:27,128] Trial 3 finished with value: 0.7230769230769231 and parameters: {'n_estimators': 190, 'max_depth': 6, 'lgb_learning_rate': 0.003999048582494491, 'num_leaves': 92, 'subsample': 0.9126899052552253, 'colsample_bytree': 0.6609367499354242, 'hidden_layer_0': 93, 'hidden_layer_1': 153, 'hidden_layer_2': 251, 'nn_learning_rate': 0.01535996206204883, 'batch_size': 256, 'num_epochs': 45}. Best is trial 3 with value: 0.7230769230769231.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:27:31,697] Trial 4 finished with value: 0.7333333333333333 and parameters: {'n_estimators': 227, 'max_depth': 6, 'lgb_learning_rate': 0.0004287477771335191, 'num_leaves': 34, 'subsample': 0.9079524355272293, 'colsample_bytree': 0.515456175452567, 'hidden_layer_0': 85, 'hidden_layer_1': 35, 'hidden_layer_2': 255, 'nn_learning_rate': 0.035820036633057065, 'batch_size': 128, 'num_epochs': 100}. Best is trial 4 with value: 0.7333333333333333.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:27:42,803] Trial 5 finished with value: 0.7076923076923077 and parameters: {'n_estimators': 192, 'max_depth': 9, 'lgb_learning_rate': 0.00894143720380435, 'num_leaves': 40, 'subsample': 0.9554407256290511, 'colsample_bytree': 0.9239478559256313, 'hidden_layer_0': 248, 'hidden_layer_1': 83, 'hidden_layer_2': 76, 'nn_learning_rate': 0.08525988993132252, 'batch_size': 64, 'num_epochs': 57}. Best is trial 4 with value: 0.7333333333333333.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:27:59,522] Trial 6 finished with value: 0.7282051282051282 and parameters: {'n_estimators': 274, 'max_depth': 7, 'lgb_learning_rate': 0.006831094283558482, 'num_leaves': 73, 'subsample': 0.8214206558165986, 'colsample_bytree': 0.700787644833244, 'hidden_layer_0': 145, 'hidden_layer_1': 67, 'hidden_layer_2': 145, 'nn_learning_rate': 0.00873366652098684, 'batch_size': 64, 'num_epochs': 63}. Best is trial 4 with value: 0.7333333333333333.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:00,662] Trial 7 finished with value: 0.7025641025641025 and parameters: {'n_estimators': 215, 'max_depth': 9, 'lgb_learning_rate': 0.003963805837895997, 'num_leaves': 81, 'subsample': 0.7891176705458761, 'colsample_bytree': 0.8523388048006024, 'hidden_layer_0': 234, 'hidden_layer_1': 189, 'hidden_layer_2': 49, 'nn_learning_rate': 0.0007543437947927383, 'batch_size': 256, 'num_epochs': 25}. Best is trial 4 with value: 0.7333333333333333.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:05,124] Trial 8 finished with value: 0.717948717948718 and parameters: {'n_estimators': 161, 'max_depth': 6, 'lgb_learning_rate': 0.0006437790679925271, 'num_leaves': 80, 'subsample': 0.828882209245673, 'colsample_bytree': 0.7228628139116648, 'hidden_layer_0': 153, 'hidden_layer_1': 49, 'hidden_layer_2': 171, 'nn_learning_rate': 0.00023123293141331843, 'batch_size': 128, 'num_epochs': 93}. Best is trial 4 with value: 0.7333333333333333.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:07,891] Trial 9 finished with value: 0.6615384615384615 and parameters: {'n_estimators': 280, 'max_depth': 10, 'lgb_learning_rate': 0.0001985706301163377, 'num_leaves': 25, 'subsample': 0.7055449540123271, 'colsample_bytree': 0.5021867428615998, 'hidden_layer_0': 210, 'hidden_layer_1': 140, 'hidden_layer_2': 187, 'nn_learning_rate': 0.00011802097547372629, 'batch_size': 128, 'num_epochs': 50}. Best is trial 4 with value: 0.7333333333333333.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001924 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:28:09,856] Trial 10 finished with value: 0.7333333333333333 and parameters: {'n_estimators': 239, 'max_depth': 3, 'lgb_learning_rate': 0.09719286093110663, 'num_leaves': 51, 'subsample': 0.9937278019888522, 'colsample_bytree': 0.5323509792326571, 'hidden_layer_0': 42, 'hidden_layer_1': 108, 'hidden_layer_2': 248, 'nn_learning_rate': 0.003201004006439562, 'batch_size': 32, 'num_epochs': 15}. Best is trial 4 with value: 0.7333333333333333.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:11,384] Trial 11 finished with value: 0.7435897435897436 and parameters: {'n_estimators': 241, 'max_depth': 3, 'lgb_learning_rate': 0.09485852647530857, 'num_leaves': 48, 'subsample': 0.996469927459809, 'colsample_bytree': 0.5134722418584569, 'hidden_layer_0': 39, 'hidden_layer_1': 107, 'hidden_layer_2': 253, 'nn_learning_rate': 0.0018083302232842315, 'batch_size': 32, 'num_epochs': 10}. Best is trial 11 with value: 0.7435897435897436.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:14,537] Trial 12 finished with value: 0.7487179487179487 and parameters: {'n_estimators': 245, 'max_depth': 3, 'lgb_learning_rate': 0.07112849300838935, 'num_leaves': 34, 'subsample': 0.8964531405074034, 'colsample_bytree': 0.5985622269567084, 'hidden_layer_0': 35, 'hidden_layer_1': 32, 'hidden_layer_2': 222, 'nn_learning_rate': 0.0011237283168162103, 'batch_size': 32, 'num_epochs': 31}. Best is trial 12 with value: 0.7487179487179487.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:18,533] Trial 13 finished with value: 0.7589743589743589 and parameters: {'n_estimators': 300, 'max_depth': 3, 'lgb_learning_rate': 0.09233454666079594, 'num_leaves': 55, 'subsample': 0.9889088645886437, 'colsample_bytree': 0.6045886889998339, 'hidden_layer_0': 33, 'hidden_layer_1': 106, 'hidden_layer_2': 204, 'nn_learning_rate': 0.0010399074870392705, 'batch_size': 32, 'num_epochs': 30}. Best is trial 13 with value: 0.7589743589743589.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:28:22,352] Trial 14 finished with value: 0.7487179487179487 and parameters: {'n_estimators': 300, 'max_depth': 4, 'lgb_learning_rate': 0.02325585618882458, 'num_leaves': 60, 'subsample': 0.9041381374435047, 'colsample_bytree': 0.6034063335610549, 'hidden_layer_0': 66, 'hidden_layer_1': 95, 'hidden_layer_2': 214, 'nn_learning_rate': 0.0006531631915918229, 'batch_size': 32, 'num_epochs': 32}. Best is trial 13 with value: 0.7589743589743589.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:27,531] Trial 15 finished with value: 0.7743589743589744 and parameters: {'n_estimators': 269, 'max_depth': 4, 'lgb_learning_rate': 0.03063915257442697, 'num_leaves': 23, 'subsample': 0.507096330591258, 'colsample_bytree': 0.6170710476981663, 'hidden_layer_0': 116, 'hidden_layer_1': 146, 'hidden_layer_2': 213, 'nn_learning_rate': 0.0007141130417522767, 'batch_size': 32, 'num_epochs': 37}. Best is trial 15 with value: 0.7743589743589744.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:32,887] Trial 16 finished with value: 0.7487179487179487 and parameters: {'n_estimators': 120, 'max_depth': 4, 'lgb_learning_rate': 0.033109392180075456, 'num_leaves': 22, 'subsample': 0.5041831774613531, 'colsample_bytree': 0.7823146315500309, 'hidden_layer_0': 114, 'hidden_layer_1': 141, 'hidden_layer_2': 206, 'nn_learning_rate': 0.0003797243442536347, 'batch_size': 32, 'num_epochs': 41}. Best is trial 15 with value: 0.7743589743589744.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:35,773] Trial 17 finished with value: 0.7230769230769231 and parameters: {'n_estimators': 51, 'max_depth': 4, 'lgb_learning_rate': 0.02461576887855604, 'num_leaves': 61, 'subsample': 0.6155082398448791, 'colsample_bytree': 0.6103809217280667, 'hidden_layer_0': 187, 'hidden_layer_1': 173, 'hidden_layer_2': 128, 'nn_learning_rate': 0.005676138744619477, 'batch_size': 32, 'num_epochs': 21}. Best is trial 15 with value: 0.7743589743589744.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:40,552] Trial 18 finished with value: 0.7538461538461538 and parameters: {'n_estimators': 272, 'max_depth': 5, 'lgb_learning_rate': 0.04308524863136817, 'num_leaves': 50, 'subsample': 0.5239516435239182, 'colsample_bytree': 0.7838011324150067, 'hidden_layer_0': 119, 'hidden_layer_1': 119, 'hidden_layer_2': 154, 'nn_learning_rate': 0.0002648370320288155, 'batch_size': 32, 'num_epochs': 39}. Best is trial 15 with value: 0.7743589743589744.
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222


[I 2024-07-26 21:28:43,605] Trial 19 finished with value: 0.7435897435897436 and parameters: {'n_estimators': 293, 'max_depth': 5, 'lgb_learning_rate': 0.012423604624350425, 'num_leaves': 100, 'subsample': 0.7010085370253263, 'colsample_bytree': 0.646119129265326, 'hidden_layer_0': 69, 'hidden_layer_1': 171, 'hidden_layer_2': 197, 'nn_learning_rate': 0.0015715305325937172, 'batch_size': 256, 'num_epochs': 74}. Best is trial 15 with value: 0.7743589743589744.


[LightGBM] [Info] Number of positive: 362, number of negative: 414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466495 -> initscore=-0.134222
[LightGBM] [Info] Start training from score -0.134222
                     Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression  0.748718  0.796644                    0.003991   
KNN                  0.753846  0.840861                    0.001994   
Decision Tree        0.728205  0.749789                    0.004987   
Random Forest             0.8  0.864816                    0.727075   
Gradient Boosting         0.8  0.861439                    0.513596   
XGBoost              0.764103  0.853525                      0.1127   
LightGBM        

In [12]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for CatBoost
    catboost_params = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('catboost_learning_rate', 1e-4, 1e-1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-4, 1e-1),
        'border_count': trial.suggest_int('border_count', 32, 255)
    }

    # Train CatBoost model
    catboost_model = CatBoostClassifier(**catboost_params, verbose=0)
    catboost_model.fit(X_train_scaled, y_train)
    
    # Extract features using CatBoost
    X_train_transformed = catboost_model.predict_proba(X_train_scaled)
    X_test_transformed = catboost_model.predict_proba(X_test_scaled)
    
    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    y_train_tensor = torch.LongTensor(y_train.values).to(device)
    y_test_tensor = torch.LongTensor(y_test.values).to(device)
    
    # Define hyperparameters to tune for Neural Network
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train_transformed.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_transformed_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final CatBoost model with the best hyperparameters
catboost_best_params = {
    'iterations': best_params['iterations'],
    'depth': best_params['depth'],
    'learning_rate': best_params['catboost_learning_rate'],
    'l2_leaf_reg': best_params['l2_leaf_reg'],
    'border_count': best_params['border_count']
}
catboost_model = CatBoostClassifier(**catboost_best_params, verbose=0)
catboost_model.fit(X_train_scaled, y_train)

# Extract features using CatBoost
X_train_transformed = catboost_model.predict_proba(X_train_scaled)
X_test_transformed = catboost_model.predict_proba(X_test_scaled)

# Convert to PyTorch tensors
X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

# Train the final Neural Network model with the best hyperparameters
nn_best_params = {
    'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
    'learning_rate': best_params['nn_learning_rate'],
    'batch_size': best_params['batch_size'],
    'num_epochs': best_params['num_epochs']
}
input_dim = X_train_transformed.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers'], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(nn_best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_transformed_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['CatBoost + NN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:28:49,038] A new study created in memory with name: no-name-172a0f46-1890-408d-a37b-c702268bd013


Using device: cpu


  'learning_rate': trial.suggest_loguniform('catboost_learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:28:51,519] Trial 0 finished with value: 0.5333333333333333 and parameters: {'iterations': 248, 'depth': 6, 'catboost_learning_rate': 0.0001450537062924843, 'l2_leaf_reg': 0.021428272894819977, 'border_count': 161, 'hidden_layer_0': 235, 'hidden_layer_1': 253, 'hidden_layer_2': 34, 'nn_learning_rate': 0.06195799767824983, 'batch_size': 256, 'num_epochs': 45}. Best is trial 0 with value: 0.5333333333333333.
  'learning_rate': trial.suggest_loguniform('catboost_learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:28:58,387] Trial 1 finished with value: 0.7384615384615385 and parameters: {'iterations': 300, 

                     Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression  0.748718  0.796644                    0.003991   
KNN                  0.753846  0.840861                    0.001994   
Decision Tree        0.728205  0.749789                    0.004987   
Random Forest             0.8  0.864816                    0.727075   
Gradient Boosting         0.8  0.861439                    0.513596   
XGBoost              0.764103  0.853525                      0.1127   
LightGBM             0.789744  0.866083                    0.057846   
CatBoost             0.753846  0.843288                    0.245005   
MLP                  0.764103  0.838863                    3.545068   
DNN                  0.774359  0.836116                    2.187616   
DCN                  0.723077  0.778793                    5.800046   
Wide_and_Deep        0.774359  0.835693                    2.907177   
XGBoost + NN         0.697436    0.8209                    3.071482   
LightG

In [13]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class AutoInt(nn.Module):
    def __init__(self, input_dim, embedding_dim, num_heads, num_layers):
        super(AutoInt, self).__init__()
        self.embedding = nn.Linear(input_dim, embedding_dim)
        self.attention_layers = nn.ModuleList([
            nn.MultiheadAttention(embedding_dim, num_heads) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)
        for attn_layer in self.attention_layers:
            x, _ = attn_layer(x, x, x)
        x = x.squeeze(1)
        x = self.fc(x)
        return x

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for AutoInt
    num_heads = trial.suggest_int('num_heads', 1, 8)
    embedding_dim = trial.suggest_int('embedding_dim', num_heads, 64, step=num_heads)
    num_layers = trial.suggest_int('num_layers', 1, 3)
    
    # Train AutoInt model
    autoint_model = AutoInt(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
    optimizer = optim.Adam(autoint_model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor.float().unsqueeze(1))
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    for epoch in range(10):  # Fixed number of epochs for AutoInt
        autoint_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = autoint_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    
    # Extract features using AutoInt
    autoint_model.eval()
    with torch.no_grad():
        X_train_transformed = autoint_model.embedding(X_train_tensor).cpu().numpy()
        X_test_transformed = autoint_model.embedding(X_test_tensor).cpu().numpy()
    
    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    
    # Define hyperparameters to tune for Neural Network
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train_transformed.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_transformed_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final AutoInt model with the best hyperparameters
embedding_dim = best_params['embedding_dim']
num_heads = best_params['num_heads']
num_layers = best_params['num_layers']
autoint_model = AutoInt(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
optimizer = optim.Adam(autoint_model.parameters(), lr=0.001)
criterion = nn.MSELoss()

train_dataset = TensorDataset(X_train_tensor, y_train_tensor.float().unsqueeze(1))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

for epoch in range(10):  # Fixed number of epochs for AutoInt
    autoint_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = autoint_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

# Extract features using AutoInt
autoint_model.eval()
with torch.no_grad():
    X_train_transformed = autoint_model.embedding(X_train_tensor).cpu().numpy()
    X_test_transformed = autoint_model.embedding(X_test_tensor).cpu().numpy()

# Convert to PyTorch tensors
X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)

# Train the final Neural Network model with the best hyperparameters
nn_best_params = {
    'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
    'learning_rate': best_params['nn_learning_rate'],
    'batch_size': best_params['batch_size'],
    'num_epochs': best_params['num_epochs']
}
input_dim = X_train_transformed.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers'], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(nn_best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_transformed_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['AutoInt'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:30:48,199] A new study created in memory with name: no-name-9eccdf7f-7363-40f3-98b7-3f01ae87034a


Using device: cpu


  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:30:56,223] Trial 0 finished with value: 0.7794871794871795 and parameters: {'num_heads': 8, 'embedding_dim': 56, 'num_layers': 3, 'hidden_layer_0': 138, 'hidden_layer_1': 155, 'hidden_layer_2': 68, 'nn_learning_rate': 0.0008061114172516176, 'batch_size': 64, 'num_epochs': 61}. Best is trial 0 with value: 0.7794871794871795.
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:30:59,681] Trial 1 finished with value: 0.7128205128205128 and parameters: {'num_heads': 3, 'embedding_dim': 57, 'num_layers': 3, 'hidden_layer_0': 234, 'hidden_layer_1': 173, 'hidden_layer_2': 50, 'nn_learning_rate': 0.04208325164012932, 'batch_size': 128, 'num_epochs': 18}. Best is trial 0 with value: 0.7794871794871795.
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:31:02,832] Trial 2 finished with value: 0.7743589743589744 and p

                     Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression  0.748718  0.796644                    0.003991   
KNN                  0.753846  0.840861                    0.001994   
Decision Tree        0.728205  0.749789                    0.004987   
Random Forest             0.8  0.864816                    0.727075   
Gradient Boosting         0.8  0.861439                    0.513596   
XGBoost              0.764103  0.853525                      0.1127   
LightGBM             0.789744  0.866083                    0.057846   
CatBoost             0.753846  0.843288                    0.245005   
MLP                  0.764103  0.838863                    3.545068   
DNN                  0.774359  0.836116                    2.187616   
DCN                  0.723077  0.778793                    5.800046   
Wide_and_Deep        0.774359  0.835693                    2.907177   
XGBoost + NN         0.697436    0.8209                    3.071482   
LightG

In [14]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class FTTransformer(nn.Module):
    def __init__(self, input_dim, embedding_dim, num_heads, num_layers):
        super(FTTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, embedding_dim)
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embedding_dim, input_dim)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)
        for transformer_layer in self.transformer_layers:
            x = transformer_layer(x)
        x = x.squeeze(1)
        x = self.fc(x)
        return x

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for FT-Transformer
    num_heads = trial.suggest_int('num_heads', 1, 8)
    embedding_dim = trial.suggest_int('embedding_dim', num_heads, 64, step=num_heads)
    num_layers = trial.suggest_int('num_layers', 1, 3)
    
    # Train FT-Transformer model
    ft_transformer_model = FTTransformer(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
    optimizer = optim.Adam(ft_transformer_model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor.float().unsqueeze(1))
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    for epoch in range(10):  # Fixed number of epochs for FT-Transformer
        ft_transformer_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = ft_transformer_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    
    # Extract features using FT-Transformer
    ft_transformer_model.eval()
    with torch.no_grad():
        X_train_transformed = ft_transformer_model.embedding(X_train_tensor).cpu().numpy()
        X_test_transformed = ft_transformer_model.embedding(X_test_tensor).cpu().numpy()
    
    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    
    # Define hyperparameters to tune for Neural Network
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
    nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train_transformed.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_transformed_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final FT-Transformer model with the best hyperparameters
embedding_dim = best_params['embedding_dim']
num_heads = best_params['num_heads']
num_layers = best_params['num_layers']
ft_transformer_model = FTTransformer(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
optimizer = optim.Adam(ft_transformer_model.parameters(), lr=0.001)
criterion = nn.MSELoss()

train_dataset = TensorDataset(X_train_tensor, y_train_tensor.float().unsqueeze(1))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

for epoch in range(10):  # Fixed number of epochs for FT-Transformer
    ft_transformer_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = ft_transformer_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

# Extract features using FT-Transformer
ft_transformer_model.eval()
with torch.no_grad():
    X_train_transformed = ft_transformer_model.embedding(X_train_tensor).cpu().numpy()
    X_test_transformed = ft_transformer_model.embedding(X_test_tensor).cpu().numpy()

# Convert to PyTorch tensors
X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)

# Train the final Neural Network model with the best hyperparameters
nn_best_params = {
    'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
    'learning_rate': best_params['nn_learning_rate'],
    'batch_size': best_params['batch_size'],
    'num_epochs': best_params['num_epochs']
}
input_dim = X_train_transformed.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers'], output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(nn_best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_transformed_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['FT-Transformer'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:33:22,387] A new study created in memory with name: no-name-da008a59-395d-4d19-8668-ae5a3353f64b


Using device: cpu


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:33:34,711] Trial 0 finished with value: 0.764102564102564 and parameters: {'num_heads': 4, 'embedding_dim': 40, 'num_layers': 1, 'hidden_layer_0': 194, 'hidden_layer_1': 153, 'hidden_layer_2': 73, 'nn_learning_rate': 0.00039393719638241317, 'batch_size': 64, 'num_epochs': 98}. Best is trial 0 with value: 0.764102564102564.
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:33:42,431] Trial 1 finished with value: 0.7487179487179487 and parameters: {'num_heads': 4, 'embedding_dim': 40, 'num_layers': 2, 'hidden_layer_0': 38, 'hidden_layer_1': 233, 'hidden_layer_2': 62, 'nn_learning_rate': 0.010375821295189151, 'b

                     Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression  0.748718  0.796644                    0.003991   
KNN                  0.753846  0.840861                    0.001994   
Decision Tree        0.728205  0.749789                    0.004987   
Random Forest             0.8  0.864816                    0.727075   
Gradient Boosting         0.8  0.861439                    0.513596   
XGBoost              0.764103  0.853525                      0.1127   
LightGBM             0.789744  0.866083                    0.057846   
CatBoost             0.753846  0.843288                    0.245005   
MLP                  0.764103  0.838863                    3.545068   
DNN                  0.774359  0.836116                    2.187616   
DCN                  0.723077  0.778793                    5.800046   
Wide_and_Deep        0.774359  0.835693                    2.907177   
XGBoost + NN         0.697436    0.8209                    3.071482   
LightG

In [15]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim):
        super(NeuralNetwork, self).__init__()
        layers = []
        for i in range(len(hidden_layers)):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_layers[i]))
            else:
                layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_layers[-1], output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def objective(trial):
    # Define hyperparameters to tune for Neural Network
    num_layers = trial.suggest_int('num_layers', 1, 5)
    hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(num_layers)]
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the Neural Network model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = NeuralNetwork(input_dim, hidden_layers, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final Neural Network model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = NeuralNetwork(input_dim, 
                           [best_params[f'hidden_layer_{i}'] for i in range(best_params['num_layers'])], 
                           output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y_true, proba)
    else:  # Multi-class classification
        proba = torch.softmax(outputs, dim=1).cpu().numpy()
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['Neural Architecture Search'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:37:52,750] A new study created in memory with name: no-name-ecd9127e-b104-49a2-86c9-6f016ad41ae4


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:37:54,045] Trial 0 finished with value: 0.6974358974358974 and parameters: {'num_layers': 5, 'hidden_layer_0': 256, 'hidden_layer_1': 195, 'hidden_layer_2': 65, 'hidden_layer_3': 63, 'hidden_layer_4': 35, 'learning_rate': 0.027251445591806936, 'batch_size': 256, 'num_epochs': 30}. Best is trial 0 with value: 0.6974358974358974.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:37:58,271] Trial 1 finished with value: 0.7538461538461538 and parameters: {'num_layers': 2, 'hidden_layer_0': 130, 'hidden_layer_1': 203, 'learning_rate': 0.0010551988083198915, 'batch_size': 64, 'num_epochs': 58}. Best is trial 1 with value: 0.7538461538461538.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:37:59,592] Trial 2 finished with value: 0.7692307692307693 and parameters: {'num_layers': 2, 'hidden_layer_0': 245, 'hidden_layer_1': 164, 

                            Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression         0.748718  0.796644                    0.003991   
KNN                         0.753846  0.840861                    0.001994   
Decision Tree               0.728205  0.749789                    0.004987   
Random Forest                    0.8  0.864816                    0.727075   
Gradient Boosting                0.8  0.861439                    0.513596   
XGBoost                     0.764103  0.853525                      0.1127   
LightGBM                    0.789744  0.866083                    0.057846   
CatBoost                    0.753846  0.843288                    0.245005   
MLP                         0.764103  0.838863                    3.545068   
DNN                         0.774359  0.836116                    2.187616   
DCN                         0.723077  0.778793                    5.800046   
Wide_and_Deep               0.774359  0.835693                  

In [16]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class NODE(nn.Module):
    def __init__(self, input_dim, num_layers, num_trees, tree_dim, output_dim):
        super(NODE, self).__init__()
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            layer = nn.ModuleList()
            for _ in range(num_trees):
                tree = nn.Sequential(
                    nn.Linear(input_dim, tree_dim),
                    nn.ReLU(),
                    nn.Linear(tree_dim, 1)
                )
                layer.append(tree)
            self.layers.append(layer)
        self.output = nn.Linear(num_layers * num_trees, output_dim)

    def forward(self, x):
        tree_outputs = []
        for layer in self.layers:
            layer_outputs = []
            for tree in layer:
                layer_outputs.append(tree(x))
            layer_output = torch.cat(layer_outputs, dim=1)
            tree_outputs.append(layer_output)
        x = torch.cat(tree_outputs, dim=1)
        return self.output(x)

def objective(trial):
    # Define hyperparameters to tune for NODE
    num_layers = trial.suggest_int('num_layers', 1, 5)
    num_trees = trial.suggest_int('num_trees', 1, 10)
    tree_dim = trial.suggest_int('tree_dim', 8, 64)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the NODE model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = NODE(input_dim, num_layers, num_trees, tree_dim, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final NODE model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = NODE(input_dim, 
                  best_params['num_layers'], 
                  best_params['num_trees'], 
                  best_params['tree_dim'], 
                  output_dim).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    
    _, predicted = torch.max(outputs, 1)
    y_pred = predicted.cpu().numpy()
    proba = torch.softmax(outputs, dim=1).cpu().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        auc = roc_auc_score(y_true, proba[:, 1])
    else:  # Multi-class classification
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['NODE'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:39:35,442] A new study created in memory with name: no-name-41b9623f-8eba-40ce-8cca-f15a80fda323


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:39:56,396] Trial 0 finished with value: 0.7487179487179487 and parameters: {'num_layers': 4, 'num_trees': 8, 'tree_dim': 23, 'learning_rate': 0.051167047104067796, 'batch_size': 128, 'num_epochs': 90}. Best is trial 0 with value: 0.7487179487179487.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:40:28,571] Trial 1 finished with value: 0.7589743589743589 and parameters: {'num_layers': 4, 'num_trees': 8, 'tree_dim': 42, 'learning_rate': 0.00017034100178255545, 'batch_size': 32, 'num_epochs': 56}. Best is trial 1 with value: 0.7589743589743589.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:40:58,759] Trial 2 finished with value: 0.7538461538461538 and parameters: {'num_layers': 4, 'num_trees': 6, 'tree_dim': 39, 'learning_rate': 0.0019947496524994153, 'batch_size': 32, 'num_epochs': 58}. Best is trial 1 with value: 0.

                            Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression         0.748718  0.796644                    0.003991   
KNN                         0.753846  0.840861                    0.001994   
Decision Tree               0.728205  0.749789                    0.004987   
Random Forest                    0.8  0.864816                    0.727075   
Gradient Boosting                0.8  0.861439                    0.513596   
XGBoost                     0.764103  0.853525                      0.1127   
LightGBM                    0.789744  0.866083                    0.057846   
CatBoost                    0.753846  0.843288                    0.245005   
MLP                         0.764103  0.838863                    3.545068   
DNN                         0.774359  0.836116                    2.187616   
DCN                         0.723077  0.778793                    5.800046   
Wide_and_Deep               0.774359  0.835693                  

In [17]:
import pandas as pd
import numpy as np
import time
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from pytorch_tabnet.tab_model import TabNetClassifier
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

def objective(trial):
    # Define hyperparameters to tune for TabNet
    n_d = trial.suggest_int('n_d', 8, 64)
    n_a = trial.suggest_int('n_a', 8, 64)
    n_steps = trial.suggest_int('n_steps', 3, 10)
    gamma = trial.suggest_float('gamma', 1.0, 2.0)
    lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the TabNet model
    model = TabNetClassifier(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        lambda_sparse=lambda_sparse,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=learning_rate),
        device_name=device
    )

    # Training
    model.fit(
        X_train=X_train_scaled, y_train=y_train.values,
        eval_set=[(X_test_scaled, y_test.values)],
        eval_name=['val'],
        eval_metric=['accuracy'],
        max_epochs=num_epochs,
        patience=10,
        batch_size=batch_size,
        virtual_batch_size=batch_size // 2,
        num_workers=0,
        drop_last=False
    )

    # Evaluation
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final TabNet model with the best hyperparameters
best_model = TabNetClassifier(
    n_d=best_params['n_d'],
    n_a=best_params['n_a'],
    n_steps=best_params['n_steps'],
    gamma=best_params['gamma'],
    lambda_sparse=best_params['lambda_sparse'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=best_params['learning_rate']),
    device_name=device
)

training_start_time = time.time()
best_model.fit(
    X_train=X_train_scaled, y_train=y_train.values,
    eval_set=[(X_test_scaled, y_test.values)],
    eval_name=['val'],
    eval_metric=['accuracy'],
    max_epochs=best_params['num_epochs'],
    patience=10,
    batch_size=best_params['batch_size'],
    virtual_batch_size=best_params['batch_size'] // 2,
    num_workers=0,
    drop_last=False
)
training_time = time.time() - training_start_time

# Evaluation
y_pred = best_model.predict(X_test_scaled)
inference_start_time = time.time()
y_pred_proba = best_model.predict_proba(X_test_scaled)
inference_time = time.time() - inference_start_time

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
if len(np.unique(y)) == 2:  # Binary classification
    auc = roc_auc_score(y_test, y_pred_proba[:, 1])
else:  # Multiclass classification
    auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['TabNet'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:44:31,670] A new study created in memory with name: no-name-8ee6defe-9ae0-49e3-84b8-860d75358470


Using device: cpu


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.3455  | val_accuracy: 0.53846 |  0:00:02s
epoch 1  | loss: 1.23687 | val_accuracy: 0.54872 |  0:00:04s
epoch 2  | loss: 1.10571 | val_accuracy: 0.6     |  0:00:06s
epoch 3  | loss: 1.05679 | val_accuracy: 0.56923 |  0:00:09s
epoch 4  | loss: 1.11113 | val_accuracy: 0.58974 |  0:00:11s
epoch 5  | loss: 1.04627 | val_accuracy: 0.61538 |  0:00:13s
epoch 6  | loss: 0.92673 | val_accuracy: 0.57949 |  0:00:15s
epoch 7  | loss: 0.8847  | val_accuracy: 0.58974 |  0:00:18s
epoch 8  | loss: 0.91821 | val_accuracy: 0.62564 |  0:00:20s
epoch 9  | loss: 0.9308  | val_accuracy: 0.58974 |  0:00:22s
epoch 10 | loss: 0.87777 | val_accuracy: 0.59487 |  0:00:24s
epoch 11 | loss: 0.8088  | val_accuracy: 0.61026 |  0:00:27s
epoch 12 | loss: 0.84667 | val_accuracy: 0.61538 |  0:00:29s
epoch 13 | loss: 0.80574 | val_accuracy: 0.61026 |  0:00:31s
epoch 14 | loss: 0.82285 | val_accuracy: 0.61538 |  0:00:33s
epoch 15 | loss: 0.78058 | val_accuracy: 0.64103 |  0:00:36s
epoch 16 | loss: 0.82244

[I 2024-07-26 21:45:32,038] Trial 0 finished with value: 0.6410256410256411 and parameters: {'n_d': 59, 'n_a': 33, 'n_steps': 6, 'gamma': 1.5476170306282893, 'lambda_sparse': 8.999343538588935e-06, 'learning_rate': 0.00039879252081584095, 'batch_size': 32, 'num_epochs': 75}. Best is trial 0 with value: 0.6410256410256411.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.18003 | val_accuracy: 0.57949 |  0:00:00s
epoch 1  | loss: 0.91123 | val_accuracy: 0.55897 |  0:00:01s
epoch 2  | loss: 0.84461 | val_accuracy: 0.61538 |  0:00:03s
epoch 3  | loss: 0.76336 | val_accuracy: 0.59487 |  0:00:04s
epoch 4  | loss: 0.70471 | val_accuracy: 0.66667 |  0:00:05s
epoch 5  | loss: 0.67972 | val_accuracy: 0.6359  |  0:00:06s
epoch 6  | loss: 0.65276 | val_accuracy: 0.61538 |  0:00:07s
epoch 7  | loss: 0.69059 | val_accuracy: 0.65128 |  0:00:08s
epoch 8  | loss: 0.6712  | val_accuracy: 0.6359  |  0:00:10s
epoch 9  | loss: 0.60784 | val_accuracy: 0.64615 |  0:00:11s
epoch 10 | loss: 0.60861 | val_accuracy: 0.65641 |  0:00:12s
epoch 11 | loss: 0.59797 | val_accuracy: 0.67179 |  0:00:13s
epoch 12 | loss: 0.59042 | val_accuracy: 0.67179 |  0:00:14s
epoch 13 | loss: 0.58979 | val_accuracy: 0.68205 |  0:00:15s
epoch 14 | loss: 0.58307 | val_accuracy: 0.67692 |  0:00:16s
epoch 15 | loss: 0.59343 | val_accuracy: 0.67179 |  0:00:16s
epoch 16 | loss: 0.55876

[I 2024-07-26 21:46:05,056] Trial 1 finished with value: 0.6974358974358974 and parameters: {'n_d': 40, 'n_a': 57, 'n_steps': 4, 'gamma': 1.3414110165602908, 'lambda_sparse': 0.0008024543100408923, 'learning_rate': 0.0013123960913818625, 'batch_size': 64, 'num_epochs': 38}. Best is trial 1 with value: 0.6974358974358974.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.65466 | val_accuracy: 0.49744 |  0:00:00s
epoch 1  | loss: 1.61414 | val_accuracy: 0.49231 |  0:00:01s
epoch 2  | loss: 1.6684  | val_accuracy: 0.44615 |  0:00:01s
epoch 3  | loss: 1.63902 | val_accuracy: 0.44103 |  0:00:02s
epoch 4  | loss: 1.70561 | val_accuracy: 0.52821 |  0:00:02s
epoch 5  | loss: 1.54254 | val_accuracy: 0.49744 |  0:00:03s
epoch 6  | loss: 1.5023  | val_accuracy: 0.48205 |  0:00:04s
epoch 7  | loss: 1.58504 | val_accuracy: 0.48718 |  0:00:04s
epoch 8  | loss: 1.62683 | val_accuracy: 0.48718 |  0:00:05s
epoch 9  | loss: 1.52458 | val_accuracy: 0.47692 |  0:00:05s
epoch 10 | loss: 1.5991  | val_accuracy: 0.47179 |  0:00:06s
epoch 11 | loss: 1.54844 | val_accuracy: 0.48205 |  0:00:06s
epoch 12 | loss: 1.5282  | val_accuracy: 0.48718 |  0:00:07s
epoch 13 | loss: 1.43934 | val_accuracy: 0.52308 |  0:00:07s
epoch 14 | loss: 1.43686 | val_accuracy: 0.47692 |  0:00:08s

Early stopping occurred at epoch 14 with best_epoch = 4 and best_val_accuracy = 0.52

[I 2024-07-26 21:46:13,824] Trial 2 finished with value: 0.5282051282051282 and parameters: {'n_d': 19, 'n_a': 13, 'n_steps': 10, 'gamma': 1.5509700315695467, 'lambda_sparse': 2.3398906937090753e-06, 'learning_rate': 0.00018917775478587736, 'batch_size': 256, 'num_epochs': 28}. Best is trial 1 with value: 0.6974358974358974.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.04211 | val_accuracy: 0.62051 |  0:00:01s
epoch 1  | loss: 0.64577 | val_accuracy: 0.65641 |  0:00:02s
epoch 2  | loss: 0.63166 | val_accuracy: 0.61538 |  0:00:04s
epoch 3  | loss: 0.66746 | val_accuracy: 0.63077 |  0:00:05s
epoch 4  | loss: 0.60867 | val_accuracy: 0.65641 |  0:00:05s
epoch 5  | loss: 0.58108 | val_accuracy: 0.62051 |  0:00:06s
epoch 6  | loss: 0.56772 | val_accuracy: 0.6     |  0:00:07s
epoch 7  | loss: 0.55775 | val_accuracy: 0.6359  |  0:00:08s
epoch 8  | loss: 0.56667 | val_accuracy: 0.68205 |  0:00:10s
epoch 9  | loss: 0.57901 | val_accuracy: 0.69744 |  0:00:11s
epoch 10 | loss: 0.56908 | val_accuracy: 0.71795 |  0:00:12s
epoch 11 | loss: 0.56374 | val_accuracy: 0.69744 |  0:00:14s
epoch 12 | loss: 0.56633 | val_accuracy: 0.70256 |  0:00:15s
epoch 13 | loss: 0.54468 | val_accuracy: 0.72821 |  0:00:16s
epoch 14 | loss: 0.54174 | val_accuracy: 0.70769 |  0:00:18s
epoch 15 | loss: 0.55033 | val_accuracy: 0.67179 |  0:00:19s
epoch 16 | loss: 0.57272

[I 2024-07-26 21:46:45,352] Trial 3 finished with value: 0.7282051282051282 and parameters: {'n_d': 11, 'n_a': 16, 'n_steps': 9, 'gamma': 1.404230164236873, 'lambda_sparse': 1.2337829675273215e-06, 'learning_rate': 0.07134761793748194, 'batch_size': 64, 'num_epochs': 70}. Best is trial 3 with value: 0.7282051282051282.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.29735 | val_accuracy: 0.5641  |  0:00:03s
epoch 1  | loss: 1.11579 | val_accuracy: 0.55385 |  0:00:06s
epoch 2  | loss: 1.06315 | val_accuracy: 0.57436 |  0:00:10s
epoch 3  | loss: 0.88803 | val_accuracy: 0.57949 |  0:00:13s
epoch 4  | loss: 0.90835 | val_accuracy: 0.6     |  0:00:17s
epoch 5  | loss: 0.91372 | val_accuracy: 0.58974 |  0:00:19s
epoch 6  | loss: 0.86397 | val_accuracy: 0.61026 |  0:00:23s
epoch 7  | loss: 0.86803 | val_accuracy: 0.61538 |  0:00:26s
epoch 8  | loss: 0.82269 | val_accuracy: 0.62051 |  0:00:29s
epoch 9  | loss: 0.83786 | val_accuracy: 0.6359  |  0:00:31s
epoch 10 | loss: 0.81276 | val_accuracy: 0.68718 |  0:00:34s
epoch 11 | loss: 0.78183 | val_accuracy: 0.68205 |  0:00:36s
epoch 12 | loss: 0.71076 | val_accuracy: 0.64103 |  0:00:38s
epoch 13 | loss: 0.80223 | val_accuracy: 0.62564 |  0:00:41s
epoch 14 | loss: 0.73746 | val_accuracy: 0.65641 |  0:00:43s
epoch 15 | loss: 0.77867 | val_accuracy: 0.64103 |  0:00:46s
epoch 16 | loss: 0.77091

[I 2024-07-26 21:47:44,925] Trial 4 finished with value: 0.6871794871794872 and parameters: {'n_d': 56, 'n_a': 39, 'n_steps': 7, 'gamma': 1.8213532806930595, 'lambda_sparse': 3.980811905742596e-06, 'learning_rate': 0.0013708519968903326, 'batch_size': 32, 'num_epochs': 94}. Best is trial 3 with value: 0.7282051282051282.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.36585 | val_accuracy: 0.65128 |  0:00:00s
epoch 1  | loss: 1.02572 | val_accuracy: 0.66667 |  0:00:01s
epoch 2  | loss: 0.90372 | val_accuracy: 0.60513 |  0:00:02s
epoch 3  | loss: 0.7925  | val_accuracy: 0.62564 |  0:00:03s
epoch 4  | loss: 0.80796 | val_accuracy: 0.65641 |  0:00:04s
epoch 5  | loss: 0.82694 | val_accuracy: 0.64615 |  0:00:05s
epoch 6  | loss: 0.78873 | val_accuracy: 0.64103 |  0:00:06s
epoch 7  | loss: 0.70785 | val_accuracy: 0.63077 |  0:00:07s
epoch 8  | loss: 0.84992 | val_accuracy: 0.58974 |  0:00:08s
epoch 9  | loss: 0.92667 | val_accuracy: 0.64103 |  0:00:09s
epoch 10 | loss: 0.78721 | val_accuracy: 0.62564 |  0:00:09s
epoch 11 | loss: 0.8261  | val_accuracy: 0.62564 |  0:00:10s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.66667


[I 2024-07-26 21:47:56,309] Trial 5 finished with value: 0.6666666666666666 and parameters: {'n_d': 42, 'n_a': 30, 'n_steps': 9, 'gamma': 1.6142740263713264, 'lambda_sparse': 1.2647803192835822e-06, 'learning_rate': 0.004554398523193363, 'batch_size': 128, 'num_epochs': 56}. Best is trial 3 with value: 0.7282051282051282.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.94312 | val_accuracy: 0.47692 |  0:00:00s
epoch 1  | loss: 1.77209 | val_accuracy: 0.48205 |  0:00:01s
epoch 2  | loss: 1.63438 | val_accuracy: 0.46667 |  0:00:02s
epoch 3  | loss: 1.44056 | val_accuracy: 0.46154 |  0:00:03s
epoch 4  | loss: 1.37958 | val_accuracy: 0.50769 |  0:00:04s
epoch 5  | loss: 1.44444 | val_accuracy: 0.47692 |  0:00:05s
epoch 6  | loss: 1.29807 | val_accuracy: 0.48718 |  0:00:05s
epoch 7  | loss: 1.27879 | val_accuracy: 0.50769 |  0:00:06s
epoch 8  | loss: 1.19025 | val_accuracy: 0.51282 |  0:00:07s
epoch 9  | loss: 1.16078 | val_accuracy: 0.50256 |  0:00:08s
epoch 10 | loss: 1.15595 | val_accuracy: 0.52821 |  0:00:09s
epoch 11 | loss: 1.21282 | val_accuracy: 0.54872 |  0:00:10s
epoch 12 | loss: 1.00202 | val_accuracy: 0.54359 |  0:00:10s
epoch 13 | loss: 1.11962 | val_accuracy: 0.53333 |  0:00:11s
epoch 14 | loss: 1.023   | val_accuracy: 0.54359 |  0:00:12s
epoch 15 | loss: 1.07079 | val_accuracy: 0.53333 |  0:00:13s
epoch 16 | loss: 0.98627

[I 2024-07-26 21:48:20,554] Trial 6 finished with value: 0.5846153846153846 and parameters: {'n_d': 17, 'n_a': 13, 'n_steps': 10, 'gamma': 1.9909554932764073, 'lambda_sparse': 1.1833416998961726e-05, 'learning_rate': 0.0012965475065764526, 'batch_size': 128, 'num_epochs': 43}. Best is trial 3 with value: 0.7282051282051282.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.89265 | val_accuracy: 0.53846 |  0:00:01s
epoch 1  | loss: 2.40044 | val_accuracy: 0.52308 |  0:00:03s
epoch 2  | loss: 1.64817 | val_accuracy: 0.68205 |  0:00:05s
epoch 3  | loss: 0.64844 | val_accuracy: 0.55897 |  0:00:07s
epoch 4  | loss: 0.67787 | val_accuracy: 0.67692 |  0:00:09s
epoch 5  | loss: 0.60283 | val_accuracy: 0.66667 |  0:00:11s
epoch 6  | loss: 0.55835 | val_accuracy: 0.65128 |  0:00:13s
epoch 7  | loss: 0.59571 | val_accuracy: 0.69231 |  0:00:14s
epoch 8  | loss: 0.55819 | val_accuracy: 0.65641 |  0:00:16s
epoch 9  | loss: 0.57792 | val_accuracy: 0.71282 |  0:00:18s
epoch 10 | loss: 0.5819  | val_accuracy: 0.72308 |  0:00:20s
epoch 11 | loss: 0.58184 | val_accuracy: 0.70769 |  0:00:22s
epoch 12 | loss: 0.59785 | val_accuracy: 0.67179 |  0:00:24s
Stop training because you reached max_epochs = 13 with best_epoch = 10 and best_val_accuracy = 0.72308


[I 2024-07-26 21:48:45,508] Trial 7 finished with value: 0.7230769230769231 and parameters: {'n_d': 63, 'n_a': 54, 'n_steps': 8, 'gamma': 1.8129968435339547, 'lambda_sparse': 2.9241409019451484e-05, 'learning_rate': 0.03411868127500712, 'batch_size': 64, 'num_epochs': 13}. Best is trial 3 with value: 0.7282051282051282.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.78655 | val_accuracy: 0.56923 |  0:00:01s
epoch 1  | loss: 1.55849 | val_accuracy: 0.58462 |  0:00:02s
epoch 2  | loss: 1.50347 | val_accuracy: 0.58462 |  0:00:04s
epoch 3  | loss: 1.38113 | val_accuracy: 0.55385 |  0:00:05s
epoch 4  | loss: 1.28113 | val_accuracy: 0.58462 |  0:00:07s
epoch 5  | loss: 1.18666 | val_accuracy: 0.57436 |  0:00:08s
epoch 6  | loss: 1.08573 | val_accuracy: 0.61026 |  0:00:10s
epoch 7  | loss: 1.1432  | val_accuracy: 0.58974 |  0:00:11s
epoch 8  | loss: 1.06433 | val_accuracy: 0.56923 |  0:00:12s
epoch 9  | loss: 1.03129 | val_accuracy: 0.63077 |  0:00:14s
epoch 10 | loss: 0.93909 | val_accuracy: 0.62051 |  0:00:15s
epoch 11 | loss: 0.94224 | val_accuracy: 0.62051 |  0:00:17s
epoch 12 | loss: 0.91405 | val_accuracy: 0.63077 |  0:00:18s
epoch 13 | loss: 0.97575 | val_accuracy: 0.6359  |  0:00:19s
epoch 14 | loss: 0.87155 | val_accuracy: 0.61538 |  0:00:21s
epoch 15 | loss: 0.87649 | val_accuracy: 0.64615 |  0:00:22s
epoch 16 | loss: 0.84449

[I 2024-07-26 21:49:36,284] Trial 8 finished with value: 0.6871794871794872 and parameters: {'n_d': 28, 'n_a': 42, 'n_steps': 7, 'gamma': 1.1794628755577543, 'lambda_sparse': 0.0006027494074851571, 'learning_rate': 0.00023648338170391018, 'batch_size': 64, 'num_epochs': 35}. Best is trial 3 with value: 0.7282051282051282.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.12046 | val_accuracy: 0.62051 |  0:00:00s
epoch 1  | loss: 1.47043 | val_accuracy: 0.57949 |  0:00:01s
epoch 2  | loss: 1.56357 | val_accuracy: 0.5641  |  0:00:01s
epoch 3  | loss: 1.9029  | val_accuracy: 0.52308 |  0:00:02s
epoch 4  | loss: 2.07697 | val_accuracy: 0.66154 |  0:00:02s
epoch 5  | loss: 0.92697 | val_accuracy: 0.60513 |  0:00:03s
epoch 6  | loss: 0.70518 | val_accuracy: 0.66154 |  0:00:03s
epoch 7  | loss: 0.77203 | val_accuracy: 0.66667 |  0:00:04s
epoch 8  | loss: 0.65628 | val_accuracy: 0.64103 |  0:00:04s
epoch 9  | loss: 0.61985 | val_accuracy: 0.67692 |  0:00:05s
epoch 10 | loss: 0.58931 | val_accuracy: 0.66667 |  0:00:05s
epoch 11 | loss: 0.55519 | val_accuracy: 0.67692 |  0:00:06s
epoch 12 | loss: 0.5697  | val_accuracy: 0.62564 |  0:00:06s
epoch 13 | loss: 0.55774 | val_accuracy: 0.65128 |  0:00:07s
epoch 14 | loss: 0.69059 | val_accuracy: 0.67692 |  0:00:07s
epoch 15 | loss: 0.56876 | val_accuracy: 0.66667 |  0:00:08s
epoch 16 | loss: 0.56167

[I 2024-07-26 21:49:47,936] Trial 9 finished with value: 0.7025641025641025 and parameters: {'n_d': 48, 'n_a': 41, 'n_steps': 7, 'gamma': 1.4511936041145181, 'lambda_sparse': 0.0005605219569625983, 'learning_rate': 0.03800782640949999, 'batch_size': 256, 'num_epochs': 21}. Best is trial 3 with value: 0.7282051282051282.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 0.725   | val_accuracy: 0.62051 |  0:00:00s
epoch 1  | loss: 0.61806 | val_accuracy: 0.64103 |  0:00:01s
epoch 2  | loss: 0.5714  | val_accuracy: 0.61538 |  0:00:01s
epoch 3  | loss: 0.60143 | val_accuracy: 0.64615 |  0:00:02s
epoch 4  | loss: 0.58444 | val_accuracy: 0.74872 |  0:00:02s
epoch 5  | loss: 0.59563 | val_accuracy: 0.67179 |  0:00:03s
epoch 6  | loss: 0.55921 | val_accuracy: 0.67179 |  0:00:04s
epoch 7  | loss: 0.55897 | val_accuracy: 0.69231 |  0:00:05s
epoch 8  | loss: 0.54446 | val_accuracy: 0.68718 |  0:00:05s
epoch 9  | loss: 0.56299 | val_accuracy: 0.67692 |  0:00:06s
epoch 10 | loss: 0.57437 | val_accuracy: 0.68205 |  0:00:07s
epoch 11 | loss: 0.56951 | val_accuracy: 0.68718 |  0:00:07s
epoch 12 | loss: 0.55318 | val_accuracy: 0.69231 |  0:00:08s
epoch 13 | loss: 0.53782 | val_accuracy: 0.69231 |  0:00:08s
epoch 14 | loss: 0.56136 | val_accuracy: 0.67692 |  0:00:09s

Early stopping occurred at epoch 14 with best_epoch = 4 and best_val_accuracy = 0.74

[I 2024-07-26 21:49:58,694] Trial 10 finished with value: 0.7487179487179487 and parameters: {'n_d': 11, 'n_a': 23, 'n_steps': 3, 'gamma': 1.0028132741310292, 'lambda_sparse': 0.00011931663455668031, 'learning_rate': 0.08997719231506193, 'batch_size': 64, 'num_epochs': 72}. Best is trial 10 with value: 0.7487179487179487.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 0.73411 | val_accuracy: 0.55897 |  0:00:01s
epoch 1  | loss: 0.60479 | val_accuracy: 0.70769 |  0:00:02s
epoch 2  | loss: 0.57832 | val_accuracy: 0.60513 |  0:00:03s
epoch 3  | loss: 0.56852 | val_accuracy: 0.65128 |  0:00:04s
epoch 4  | loss: 0.57539 | val_accuracy: 0.70769 |  0:00:07s
epoch 5  | loss: 0.55346 | val_accuracy: 0.71282 |  0:00:08s
epoch 6  | loss: 0.60003 | val_accuracy: 0.71282 |  0:00:09s
epoch 7  | loss: 0.58335 | val_accuracy: 0.67179 |  0:00:11s
epoch 8  | loss: 0.57541 | val_accuracy: 0.67692 |  0:00:12s
epoch 9  | loss: 0.55548 | val_accuracy: 0.68718 |  0:00:13s
epoch 10 | loss: 0.5339  | val_accuracy: 0.70769 |  0:00:14s
epoch 11 | loss: 0.5391  | val_accuracy: 0.69744 |  0:00:15s
epoch 12 | loss: 0.55718 | val_accuracy: 0.68205 |  0:00:15s
epoch 13 | loss: 0.54926 | val_accuracy: 0.71795 |  0:00:16s
epoch 14 | loss: 0.52958 | val_accuracy: 0.72308 |  0:00:16s
epoch 15 | loss: 0.52479 | val_accuracy: 0.71282 |  0:00:17s
epoch 16 | loss: 0.53687

[I 2024-07-26 21:50:26,526] Trial 11 finished with value: 0.7333333333333333 and parameters: {'n_d': 10, 'n_a': 24, 'n_steps': 3, 'gamma': 1.019914815326508, 'lambda_sparse': 0.00011787027972966686, 'learning_rate': 0.06500824374847447, 'batch_size': 64, 'num_epochs': 69}. Best is trial 10 with value: 0.7487179487179487.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 0.85509 | val_accuracy: 0.60513 |  0:00:00s
epoch 1  | loss: 0.66413 | val_accuracy: 0.68205 |  0:00:01s
epoch 2  | loss: 0.61612 | val_accuracy: 0.66667 |  0:00:01s
epoch 3  | loss: 0.57155 | val_accuracy: 0.6359  |  0:00:02s
epoch 4  | loss: 0.54151 | val_accuracy: 0.66154 |  0:00:02s
epoch 5  | loss: 0.55906 | val_accuracy: 0.67179 |  0:00:03s
epoch 6  | loss: 0.54475 | val_accuracy: 0.64103 |  0:00:04s
epoch 7  | loss: 0.53734 | val_accuracy: 0.71282 |  0:00:05s
epoch 8  | loss: 0.52914 | val_accuracy: 0.68718 |  0:00:05s
epoch 9  | loss: 0.53314 | val_accuracy: 0.70256 |  0:00:06s
epoch 10 | loss: 0.54296 | val_accuracy: 0.68718 |  0:00:07s
epoch 11 | loss: 0.50815 | val_accuracy: 0.66667 |  0:00:07s
epoch 12 | loss: 0.51784 | val_accuracy: 0.67179 |  0:00:08s
epoch 13 | loss: 0.52516 | val_accuracy: 0.66667 |  0:00:09s
epoch 14 | loss: 0.51352 | val_accuracy: 0.68205 |  0:00:09s
epoch 15 | loss: 0.53195 | val_accuracy: 0.68718 |  0:00:10s
epoch 16 | loss: 0.53236

[I 2024-07-26 21:50:38,306] Trial 12 finished with value: 0.7128205128205128 and parameters: {'n_d': 10, 'n_a': 23, 'n_steps': 3, 'gamma': 1.007226909344278, 'lambda_sparse': 0.00014435615750468147, 'learning_rate': 0.013181071475732711, 'batch_size': 64, 'num_epochs': 83}. Best is trial 10 with value: 0.7487179487179487.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.01223 | val_accuracy: 0.63077 |  0:00:01s
epoch 1  | loss: 0.74535 | val_accuracy: 0.73333 |  0:00:02s
epoch 2  | loss: 0.63896 | val_accuracy: 0.69744 |  0:00:03s
epoch 3  | loss: 0.58618 | val_accuracy: 0.73333 |  0:00:04s
epoch 4  | loss: 0.58325 | val_accuracy: 0.70769 |  0:00:05s
epoch 5  | loss: 0.56194 | val_accuracy: 0.69744 |  0:00:06s
epoch 6  | loss: 0.5624  | val_accuracy: 0.64103 |  0:00:07s
epoch 7  | loss: 0.55993 | val_accuracy: 0.68718 |  0:00:08s
epoch 8  | loss: 0.55517 | val_accuracy: 0.70769 |  0:00:09s
epoch 9  | loss: 0.54822 | val_accuracy: 0.70769 |  0:00:10s
epoch 10 | loss: 0.56334 | val_accuracy: 0.71795 |  0:00:12s
epoch 11 | loss: 0.56111 | val_accuracy: 0.71282 |  0:00:14s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.73333


[I 2024-07-26 21:50:53,475] Trial 13 finished with value: 0.7333333333333333 and parameters: {'n_d': 28, 'n_a': 24, 'n_steps': 5, 'gamma': 1.0012248384119773, 'lambda_sparse': 0.0001245041106542963, 'learning_rate': 0.09313275025245546, 'batch_size': 64, 'num_epochs': 59}. Best is trial 10 with value: 0.7487179487179487.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 0.98997 | val_accuracy: 0.60513 |  0:00:00s
epoch 1  | loss: 0.6521  | val_accuracy: 0.56923 |  0:00:02s
epoch 2  | loss: 0.59444 | val_accuracy: 0.62564 |  0:00:03s
epoch 3  | loss: 0.59227 | val_accuracy: 0.62564 |  0:00:04s
epoch 4  | loss: 0.58246 | val_accuracy: 0.65641 |  0:00:04s
epoch 5  | loss: 0.57851 | val_accuracy: 0.62051 |  0:00:05s
epoch 6  | loss: 0.57358 | val_accuracy: 0.64103 |  0:00:05s
epoch 7  | loss: 0.56627 | val_accuracy: 0.66154 |  0:00:06s
epoch 8  | loss: 0.56012 | val_accuracy: 0.67179 |  0:00:07s
epoch 9  | loss: 0.5514  | val_accuracy: 0.67692 |  0:00:07s
epoch 10 | loss: 0.52606 | val_accuracy: 0.68718 |  0:00:08s
epoch 11 | loss: 0.52105 | val_accuracy: 0.69744 |  0:00:08s
epoch 12 | loss: 0.51747 | val_accuracy: 0.69231 |  0:00:09s
epoch 13 | loss: 0.53074 | val_accuracy: 0.71795 |  0:00:10s
epoch 14 | loss: 0.50838 | val_accuracy: 0.70769 |  0:00:10s
epoch 15 | loss: 0.52697 | val_accuracy: 0.69231 |  0:00:11s
epoch 16 | loss: 0.54188

[I 2024-07-26 21:51:16,973] Trial 14 finished with value: 0.7435897435897436 and parameters: {'n_d': 26, 'n_a': 22, 'n_steps': 3, 'gamma': 1.1770541747833232, 'lambda_sparse': 0.00011336981078064036, 'learning_rate': 0.012671095734378352, 'batch_size': 64, 'num_epochs': 98}. Best is trial 10 with value: 0.7487179487179487.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.19408 | val_accuracy: 0.61538 |  0:00:00s
epoch 1  | loss: 0.7105  | val_accuracy: 0.65641 |  0:00:01s
epoch 2  | loss: 0.68093 | val_accuracy: 0.67179 |  0:00:02s
epoch 3  | loss: 0.67994 | val_accuracy: 0.70256 |  0:00:03s
epoch 4  | loss: 0.62574 | val_accuracy: 0.68718 |  0:00:03s
epoch 5  | loss: 0.61391 | val_accuracy: 0.66154 |  0:00:04s
epoch 6  | loss: 0.6218  | val_accuracy: 0.66154 |  0:00:05s
epoch 7  | loss: 0.59396 | val_accuracy: 0.68718 |  0:00:05s
epoch 8  | loss: 0.60163 | val_accuracy: 0.65641 |  0:00:06s
epoch 9  | loss: 0.59205 | val_accuracy: 0.69231 |  0:00:07s
epoch 10 | loss: 0.56455 | val_accuracy: 0.70769 |  0:00:07s
epoch 11 | loss: 0.56053 | val_accuracy: 0.67692 |  0:00:08s
epoch 12 | loss: 0.55965 | val_accuracy: 0.69744 |  0:00:10s
epoch 13 | loss: 0.54866 | val_accuracy: 0.74359 |  0:00:11s
epoch 14 | loss: 0.54653 | val_accuracy: 0.72308 |  0:00:12s
epoch 15 | loss: 0.53961 | val_accuracy: 0.71282 |  0:00:14s
epoch 16 | loss: 0.54778

[I 2024-07-26 21:51:40,210] Trial 15 finished with value: 0.7435897435897436 and parameters: {'n_d': 28, 'n_a': 8, 'n_steps': 4, 'gamma': 1.2276909650001775, 'lambda_sparse': 4.609742620157796e-05, 'learning_rate': 0.01046104598396773, 'batch_size': 64, 'num_epochs': 99}. Best is trial 10 with value: 0.7487179487179487.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.07586 | val_accuracy: 0.5641  |  0:00:00s
epoch 1  | loss: 0.81028 | val_accuracy: 0.64103 |  0:00:01s
epoch 2  | loss: 0.70333 | val_accuracy: 0.63077 |  0:00:02s
epoch 3  | loss: 0.63239 | val_accuracy: 0.60513 |  0:00:04s
epoch 4  | loss: 0.6014  | val_accuracy: 0.69744 |  0:00:05s
epoch 5  | loss: 0.56139 | val_accuracy: 0.66667 |  0:00:06s
epoch 6  | loss: 0.56018 | val_accuracy: 0.67179 |  0:00:06s
epoch 7  | loss: 0.54414 | val_accuracy: 0.65128 |  0:00:07s
epoch 8  | loss: 0.52857 | val_accuracy: 0.68205 |  0:00:07s
epoch 9  | loss: 0.52169 | val_accuracy: 0.70256 |  0:00:08s
epoch 10 | loss: 0.51725 | val_accuracy: 0.69231 |  0:00:08s
epoch 11 | loss: 0.53137 | val_accuracy: 0.72821 |  0:00:09s
epoch 12 | loss: 0.52413 | val_accuracy: 0.70256 |  0:00:09s
epoch 13 | loss: 0.52285 | val_accuracy: 0.69231 |  0:00:10s
epoch 14 | loss: 0.49649 | val_accuracy: 0.68718 |  0:00:10s
epoch 15 | loss: 0.49719 | val_accuracy: 0.71282 |  0:00:11s
epoch 16 | loss: 0.49657

[I 2024-07-26 21:51:55,510] Trial 16 finished with value: 0.7282051282051282 and parameters: {'n_d': 22, 'n_a': 21, 'n_steps': 5, 'gamma': 1.180796063007985, 'lambda_sparse': 0.00024659741018132205, 'learning_rate': 0.015507565148104318, 'batch_size': 128, 'num_epochs': 89}. Best is trial 10 with value: 0.7487179487179487.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.18468 | val_accuracy: 0.64103 |  0:00:00s
epoch 1  | loss: 0.8381  | val_accuracy: 0.62564 |  0:00:00s
epoch 2  | loss: 0.7023  | val_accuracy: 0.63077 |  0:00:00s
epoch 3  | loss: 0.62516 | val_accuracy: 0.58462 |  0:00:01s
epoch 4  | loss: 0.61198 | val_accuracy: 0.64103 |  0:00:01s
epoch 5  | loss: 0.55528 | val_accuracy: 0.65128 |  0:00:01s
epoch 6  | loss: 0.54548 | val_accuracy: 0.63077 |  0:00:02s
epoch 7  | loss: 0.54303 | val_accuracy: 0.6359  |  0:00:02s
epoch 8  | loss: 0.54383 | val_accuracy: 0.63077 |  0:00:02s
epoch 9  | loss: 0.54389 | val_accuracy: 0.66667 |  0:00:03s
epoch 10 | loss: 0.55261 | val_accuracy: 0.68205 |  0:00:03s
epoch 11 | loss: 0.49234 | val_accuracy: 0.68205 |  0:00:04s
epoch 12 | loss: 0.53854 | val_accuracy: 0.6359  |  0:00:04s
epoch 13 | loss: 0.50797 | val_accuracy: 0.68205 |  0:00:04s
epoch 14 | loss: 0.49345 | val_accuracy: 0.69231 |  0:00:05s
epoch 15 | loss: 0.50098 | val_accuracy: 0.65641 |  0:00:05s
epoch 16 | loss: 0.48586

[I 2024-07-26 21:52:09,397] Trial 17 finished with value: 0.7282051282051282 and parameters: {'n_d': 34, 'n_a': 47, 'n_steps': 4, 'gamma': 1.1213119598089212, 'lambda_sparse': 4.5657879639362306e-05, 'learning_rate': 0.007294674115382554, 'batch_size': 256, 'num_epochs': 82}. Best is trial 10 with value: 0.7487179487179487.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 0.74272 | val_accuracy: 0.66154 |  0:00:01s
epoch 1  | loss: 0.60172 | val_accuracy: 0.65641 |  0:00:02s
epoch 2  | loss: 0.5981  | val_accuracy: 0.64103 |  0:00:03s
epoch 3  | loss: 0.59535 | val_accuracy: 0.66667 |  0:00:05s
epoch 4  | loss: 0.57755 | val_accuracy: 0.68718 |  0:00:07s
epoch 5  | loss: 0.56265 | val_accuracy: 0.72821 |  0:00:08s
epoch 6  | loss: 0.54351 | val_accuracy: 0.71282 |  0:00:09s
epoch 7  | loss: 0.57042 | val_accuracy: 0.69744 |  0:00:10s
epoch 8  | loss: 0.54942 | val_accuracy: 0.69744 |  0:00:12s
epoch 9  | loss: 0.5601  | val_accuracy: 0.70256 |  0:00:13s
epoch 10 | loss: 0.52986 | val_accuracy: 0.73333 |  0:00:15s
epoch 11 | loss: 0.55152 | val_accuracy: 0.70256 |  0:00:16s
epoch 12 | loss: 0.51986 | val_accuracy: 0.70256 |  0:00:17s
epoch 13 | loss: 0.56096 | val_accuracy: 0.71282 |  0:00:19s
epoch 14 | loss: 0.54058 | val_accuracy: 0.72821 |  0:00:20s
epoch 15 | loss: 0.5455  | val_accuracy: 0.69744 |  0:00:21s
epoch 16 | loss: 0.53142

[I 2024-07-26 21:52:41,086] Trial 18 finished with value: 0.7333333333333333 and parameters: {'n_d': 18, 'n_a': 30, 'n_steps': 3, 'gamma': 1.3014580853472242, 'lambda_sparse': 0.00032963018224346937, 'learning_rate': 0.027742108297274918, 'batch_size': 32, 'num_epochs': 100}. Best is trial 10 with value: 0.7487179487179487.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.06468 | val_accuracy: 0.6359  |  0:00:01s
epoch 1  | loss: 0.70026 | val_accuracy: 0.68205 |  0:00:03s
epoch 2  | loss: 0.72526 | val_accuracy: 0.67692 |  0:00:05s
epoch 3  | loss: 0.67881 | val_accuracy: 0.69231 |  0:00:07s
epoch 4  | loss: 0.6566  | val_accuracy: 0.65128 |  0:00:09s
epoch 5  | loss: 0.64935 | val_accuracy: 0.67692 |  0:00:11s
epoch 6  | loss: 0.58683 | val_accuracy: 0.65641 |  0:00:12s
epoch 7  | loss: 0.58752 | val_accuracy: 0.68205 |  0:00:16s
epoch 8  | loss: 0.61817 | val_accuracy: 0.69231 |  0:00:17s
epoch 9  | loss: 0.60115 | val_accuracy: 0.70769 |  0:00:20s
epoch 10 | loss: 0.57936 | val_accuracy: 0.68718 |  0:00:21s
epoch 11 | loss: 0.58927 | val_accuracy: 0.68718 |  0:00:23s
epoch 12 | loss: 0.558   | val_accuracy: 0.70769 |  0:00:25s
epoch 13 | loss: 0.56991 | val_accuracy: 0.70769 |  0:00:28s
epoch 14 | loss: 0.56246 | val_accuracy: 0.70769 |  0:00:30s
epoch 15 | loss: 0.54717 | val_accuracy: 0.69231 |  0:00:32s
epoch 16 | loss: 0.5657 

[I 2024-07-26 21:53:41,798] Trial 19 finished with value: 0.7794871794871795 and parameters: {'n_d': 33, 'n_a': 8, 'n_steps': 5, 'gamma': 1.108918887339927, 'lambda_sparse': 6.34631345516932e-05, 'learning_rate': 0.0034942897183386835, 'batch_size': 64, 'num_epochs': 63}. Best is trial 19 with value: 0.7794871794871795.


epoch 0  | loss: 1.06468 | val_accuracy: 0.6359  |  0:00:00s
epoch 1  | loss: 0.70026 | val_accuracy: 0.68205 |  0:00:01s
epoch 2  | loss: 0.72526 | val_accuracy: 0.67692 |  0:00:02s
epoch 3  | loss: 0.67881 | val_accuracy: 0.69231 |  0:00:03s
epoch 4  | loss: 0.6566  | val_accuracy: 0.65128 |  0:00:04s
epoch 5  | loss: 0.64935 | val_accuracy: 0.67692 |  0:00:05s
epoch 6  | loss: 0.58683 | val_accuracy: 0.65641 |  0:00:05s
epoch 7  | loss: 0.58752 | val_accuracy: 0.68205 |  0:00:06s
epoch 8  | loss: 0.61817 | val_accuracy: 0.69231 |  0:00:07s
epoch 9  | loss: 0.60115 | val_accuracy: 0.70769 |  0:00:08s
epoch 10 | loss: 0.57936 | val_accuracy: 0.68718 |  0:00:09s
epoch 11 | loss: 0.58927 | val_accuracy: 0.68718 |  0:00:09s
epoch 12 | loss: 0.558   | val_accuracy: 0.70769 |  0:00:10s
epoch 13 | loss: 0.56991 | val_accuracy: 0.70769 |  0:00:11s
epoch 14 | loss: 0.56246 | val_accuracy: 0.70769 |  0:00:12s
epoch 15 | loss: 0.54717 | val_accuracy: 0.69231 |  0:00:13s
epoch 16 | loss: 0.5657 



                            Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression         0.748718  0.796644                    0.003991   
KNN                         0.753846  0.840861                    0.001994   
Decision Tree               0.728205  0.749789                    0.004987   
Random Forest                    0.8  0.864816                    0.727075   
Gradient Boosting                0.8  0.861439                    0.513596   
XGBoost                     0.764103  0.853525                      0.1127   
LightGBM                    0.789744  0.866083                    0.057846   
CatBoost                    0.753846  0.843288                    0.245005   
MLP                         0.764103  0.838863                    3.545068   
DNN                         0.774359  0.836116                    2.187616   
DCN                         0.723077  0.778793                    5.800046   
Wide_and_Deep               0.774359  0.835693                  

In [18]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class KAN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(KAN, self).__init__()
        self.hidden_layer = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h = self.activation(self.hidden_layer(x))
        out = self.output_layer(h)
        return out

def objective(trial):
    # Define hyperparameters to tune for KAN
    hidden_dim = trial.suggest_int('hidden_dim', 32, 256)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the KAN model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y))
    model = KAN(input_dim, hidden_dim, output_dim).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final KAN model with the best hyperparameters
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
best_model = KAN(input_dim, 
                 best_params['hidden_dim'], 
                 output_dim).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    
    _, predicted = torch.max(outputs, 1)
    y_pred = predicted.cpu().numpy()
    proba = torch.softmax(outputs, dim=1).cpu().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    
    if output_dim == 2:  # Binary classification
        auc = roc_auc_score(y_true, proba[:, 1])
    else:  # Multi-class classification
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['KAN'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:54:38,885] A new study created in memory with name: no-name-d6b10731-f73c-45c5-ad13-a68318d79bfe


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:54:41,019] Trial 0 finished with value: 0.7230769230769231 and parameters: {'hidden_dim': 186, 'learning_rate': 0.0001924859163305034, 'batch_size': 128, 'num_epochs': 54}. Best is trial 0 with value: 0.7230769230769231.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:54:41,940] Trial 1 finished with value: 0.7282051282051282 and parameters: {'hidden_dim': 63, 'learning_rate': 0.0007027299733553533, 'batch_size': 32, 'num_epochs': 19}. Best is trial 1 with value: 0.7282051282051282.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:54:43,477] Trial 2 finished with value: 0.7333333333333333 and parameters: {'hidden_dim': 51, 'learning_rate': 0.07606912126560084, 'batch_size': 256, 'num_epochs': 63}. Best is trial 2 with value: 0.7333333333333333.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 

                            Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression         0.748718  0.796644                    0.003991   
KNN                         0.753846  0.840861                    0.001994   
Decision Tree               0.728205  0.749789                    0.004987   
Random Forest                    0.8  0.864816                    0.727075   
Gradient Boosting                0.8  0.861439                    0.513596   
XGBoost                     0.764103  0.853525                      0.1127   
LightGBM                    0.789744  0.866083                    0.057846   
CatBoost                    0.753846  0.843288                    0.245005   
MLP                         0.764103  0.838863                    3.545068   
DNN                         0.774359  0.836116                    2.187616   
DCN                         0.723077  0.778793                    5.800046   
Wide_and_Deep               0.774359  0.835693                  

In [19]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

class SAINT(nn.Module):
    def __init__(self, input_dim, num_classes, dim, depth, heads, mlp_dim, dropout=0.1):
        super(SAINT, self).__init__()
        self.embeds = nn.Linear(input_dim, dim)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=mlp_dim, dropout=dropout),
            num_layers=depth
        )
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        x = self.embeds(x)
        x = x.unsqueeze(1)  # Add sequence dimension
        x = self.transformer(x)
        x = x.squeeze(1)  # Remove sequence dimension
        return self.mlp_head(x)

def objective(trial):
    # Define hyperparameters to tune for SAINT
    heads = trial.suggest_int('heads', 1, 8)
    dim = trial.suggest_int('dim', heads, 256, step=heads)  # Ensure dim is divisible by heads
    depth = trial.suggest_int('depth', 1, 6)
    mlp_dim = trial.suggest_int('mlp_dim', 32, 256)
    dropout = trial.suggest_float('dropout', 0.0, 0.5)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the SAINT model
    input_dim = X_train.shape[1]
    num_classes = len(np.unique(y))
    model = SAINT(input_dim, num_classes, dim, depth, heads, mlp_dim, dropout).to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final SAINT model with the best hyperparameters
input_dim = X_train.shape[1]
num_classes = len(np.unique(y))
best_model = SAINT(input_dim, num_classes, 
                   best_params['dim'], 
                   best_params['depth'], 
                   best_params['heads'], 
                   best_params['mlp_dim'], 
                   best_params['dropout']).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

training_start_time = time.time()
for epoch in range(best_params['num_epochs']):
    best_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

training_time = time.time() - training_start_time

# Evaluation
best_model.eval()
with torch.no_grad():
    inference_start_time = time.time()
    outputs = best_model(X_test_tensor)
    inference_time = time.time() - inference_start_time

    # Convert to numpy for metric calculation
    y_true = y_test_tensor.cpu().numpy()
    
    _, predicted = torch.max(outputs, 1)
    y_pred = predicted.cpu().numpy()
    proba = torch.softmax(outputs, dim=1).cpu().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    
    if num_classes == 2:  # Binary classification
        auc = roc_auc_score(y_true, proba[:, 1])
    else:  # Multi-class classification
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['SAINT'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 21:55:23,406] A new study created in memory with name: no-name-ef99e6f9-4a27-4d29-b141-0008901e9a55


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:55:28,225] Trial 0 finished with value: 0.4666666666666667 and parameters: {'heads': 7, 'dim': 245, 'depth': 2, 'mlp_dim': 66, 'dropout': 0.38122637242381197, 'learning_rate': 0.03438537350792628, 'batch_size': 128, 'num_epochs': 10}. Best is trial 0 with value: 0.4666666666666667.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:56:07,752] Trial 1 finished with value: 0.5897435897435898 and parameters: {'heads': 5, 'dim': 255, 'depth': 6, 'mlp_dim': 52, 'dropout': 0.27613722450196104, 'learning_rate': 0.001214249767907352, 'batch_size': 256, 'num_epochs': 66}. Best is trial 1 with value: 0.5897435897435898.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-07-26 21:56:56,549] Trial 2 finished with value: 0.7282051282051282 and parameters: {'heads': 7, 'dim': 91, 'depth': 6, 'mlp_dim': 140, 'dropout': 0.3209336072844951, 'learnin

                            Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression         0.748718  0.796644                    0.003991   
KNN                         0.753846  0.840861                    0.001994   
Decision Tree               0.728205  0.749789                    0.004987   
Random Forest                    0.8  0.864816                    0.727075   
Gradient Boosting                0.8  0.861439                    0.513596   
XGBoost                     0.764103  0.853525                      0.1127   
LightGBM                    0.789744  0.866083                    0.057846   
CatBoost                    0.753846  0.843288                    0.245005   
MLP                         0.764103  0.838863                    3.545068   
DNN                         0.774359  0.836116                    2.187616   
DCN                         0.723077  0.778793                    5.800046   
Wide_and_Deep               0.774359  0.835693                  

In [20]:
import pandas as pd
import numpy as np
import time
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from pytorch_tabnet.tab_model import TabNetClassifier
import optuna

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start timing the entire process
start_time = time.time()

# Assuming df is already defined
X = df.drop('Y', axis=1)
y = df['Y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

def objective(trial):
    # Define hyperparameters to tune for VIME-like model (using TabNet as proxy)
    n_d = trial.suggest_int('n_d', 8, 64)
    n_a = trial.suggest_int('n_a', 8, 64)
    n_steps = trial.suggest_int('n_steps', 3, 10)
    gamma = trial.suggest_float('gamma', 1.0, 2.0)
    lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    num_epochs = trial.suggest_int('num_epochs', 10, 100)

    # Create the TabNet model
    model = TabNetClassifier(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        lambda_sparse=lambda_sparse,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=learning_rate),
        device_name=device
    )

    # Training
    model.fit(
        X_train=X_train_scaled, y_train=y_train.values,
        eval_set=[(X_test_scaled, y_test.values)],
        eval_name=['val'],
        eval_metric=['accuracy'],
        max_epochs=num_epochs,
        patience=10,
        batch_size=batch_size,
        virtual_batch_size=batch_size // 2,
        num_workers=0,
        drop_last=False
    )

    # Evaluation
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Perform hyperparameter tuning with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

# Train the final TabNet model with the best hyperparameters
best_model = TabNetClassifier(
    n_d=best_params['n_d'],
    n_a=best_params['n_a'],
    n_steps=best_params['n_steps'],
    gamma=best_params['gamma'],
    lambda_sparse=best_params['lambda_sparse'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=best_params['learning_rate']),
    device_name=device
)

training_start_time = time.time()
best_model.fit(
    X_train=X_train_scaled, y_train=y_train.values,
    eval_set=[(X_test_scaled, y_test.values)],
    eval_name=['val'],
    eval_metric=['accuracy'],
    max_epochs=best_params['num_epochs'],
    patience=10,
    batch_size=best_params['batch_size'],
    virtual_batch_size=best_params['batch_size'] // 2,
    num_workers=0,
    drop_last=False
)
training_time = time.time() - training_start_time

# Evaluation
y_pred = best_model.predict(X_test_scaled)
inference_start_time = time.time()
y_pred_proba = best_model.predict_proba(X_test_scaled)
inference_time = time.time() - inference_start_time

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
if len(np.unique(y)) == 2:  # Binary classification
    auc = roc_auc_score(y_test, y_pred_proba[:, 1])
else:  # Multiclass classification
    auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')

# Calculate total computation time
computation_time = time.time() - start_time

# Store results in the existing result DataFrame
result.loc['VIME'] = [accuracy, auc, training_time, inference_time, computation_time, best_params]

print(result)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

[I 2024-07-26 22:06:02,049] A new study created in memory with name: no-name-4e434e35-c773-4e31-bf25-2bb6ea7b287c


Using device: cpu


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.88328 | val_accuracy: 0.51795 |  0:00:01s
epoch 1  | loss: 1.15528 | val_accuracy: 0.6     |  0:00:01s
epoch 2  | loss: 1.03478 | val_accuracy: 0.5641  |  0:00:03s
epoch 3  | loss: 0.94845 | val_accuracy: 0.60513 |  0:00:04s
epoch 4  | loss: 0.82715 | val_accuracy: 0.61538 |  0:00:05s
epoch 5  | loss: 0.62127 | val_accuracy: 0.62564 |  0:00:05s
epoch 6  | loss: 0.65437 | val_accuracy: 0.60513 |  0:00:06s
epoch 7  | loss: 0.64205 | val_accuracy: 0.68718 |  0:00:07s
epoch 8  | loss: 0.67313 | val_accuracy: 0.59487 |  0:00:08s
epoch 9  | loss: 0.70581 | val_accuracy: 0.64615 |  0:00:09s
epoch 10 | loss: 0.74422 | val_accuracy: 0.64615 |  0:00:10s
epoch 11 | loss: 0.60619 | val_accuracy: 0.66667 |  0:00:11s
epoch 12 | loss: 0.58805 | val_accuracy: 0.61538 |  0:00:13s
epoch 13 | loss: 0.58355 | val_accuracy: 0.65128 |  0:00:14s
epoch 14 | loss: 0.57404 | val_accuracy: 0.66154 |  0:00:15s
epoch 15 | loss: 0.58828 | val_accuracy: 0.68205 |  0:00:16s
epoch 16 | loss: 0.60357

[I 2024-07-26 22:06:21,717] Trial 0 finished with value: 0.6871794871794872 and parameters: {'n_d': 25, 'n_a': 55, 'n_steps': 9, 'gamma': 1.9612932175244775, 'lambda_sparse': 1.5699959648691783e-05, 'learning_rate': 0.03553586705153356, 'batch_size': 128, 'num_epochs': 34}. Best is trial 0 with value: 0.6871794871794872.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.47676 | val_accuracy: 0.50769 |  0:00:00s
epoch 1  | loss: 0.86558 | val_accuracy: 0.55385 |  0:00:00s
epoch 2  | loss: 0.71355 | val_accuracy: 0.59487 |  0:00:01s
epoch 3  | loss: 0.7631  | val_accuracy: 0.60513 |  0:00:01s
epoch 4  | loss: 0.80486 | val_accuracy: 0.6359  |  0:00:02s
epoch 5  | loss: 0.63896 | val_accuracy: 0.60513 |  0:00:02s
epoch 6  | loss: 0.63563 | val_accuracy: 0.66667 |  0:00:03s
epoch 7  | loss: 0.58952 | val_accuracy: 0.68718 |  0:00:03s
epoch 8  | loss: 0.56929 | val_accuracy: 0.65128 |  0:00:04s
epoch 9  | loss: 0.60126 | val_accuracy: 0.67692 |  0:00:04s
epoch 10 | loss: 0.58348 | val_accuracy: 0.69231 |  0:00:05s
epoch 11 | loss: 0.61497 | val_accuracy: 0.6359  |  0:00:05s
epoch 12 | loss: 0.62353 | val_accuracy: 0.67179 |  0:00:05s
epoch 13 | loss: 0.54518 | val_accuracy: 0.65128 |  0:00:06s
epoch 14 | loss: 0.58905 | val_accuracy: 0.66667 |  0:00:06s
epoch 15 | loss: 0.54842 | val_accuracy: 0.65128 |  0:00:07s
epoch 16 | loss: 0.59893

[I 2024-07-26 22:06:31,568] Trial 1 finished with value: 0.6923076923076923 and parameters: {'n_d': 62, 'n_a': 49, 'n_steps': 5, 'gamma': 1.654980627930387, 'lambda_sparse': 2.4182017114232987e-06, 'learning_rate': 0.002657059212656804, 'batch_size': 256, 'num_epochs': 52}. Best is trial 1 with value: 0.6923076923076923.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 4.70057 | val_accuracy: 0.56923 |  0:00:00s
epoch 1  | loss: 1.40543 | val_accuracy: 0.61538 |  0:00:00s
epoch 2  | loss: 1.19291 | val_accuracy: 0.58974 |  0:00:01s
epoch 3  | loss: 0.76889 | val_accuracy: 0.58462 |  0:00:01s
epoch 4  | loss: 0.63891 | val_accuracy: 0.58462 |  0:00:02s
epoch 5  | loss: 0.58485 | val_accuracy: 0.61538 |  0:00:02s
epoch 6  | loss: 0.57306 | val_accuracy: 0.66667 |  0:00:03s
epoch 7  | loss: 0.56439 | val_accuracy: 0.6     |  0:00:03s
epoch 8  | loss: 0.54318 | val_accuracy: 0.67179 |  0:00:04s
epoch 9  | loss: 0.54122 | val_accuracy: 0.64615 |  0:00:04s
epoch 10 | loss: 0.55174 | val_accuracy: 0.6     |  0:00:05s
epoch 11 | loss: 0.61805 | val_accuracy: 0.57436 |  0:00:05s
epoch 12 | loss: 0.60409 | val_accuracy: 0.60513 |  0:00:06s
epoch 13 | loss: 0.60081 | val_accuracy: 0.63077 |  0:00:06s
epoch 14 | loss: 0.55015 | val_accuracy: 0.6     |  0:00:07s
epoch 15 | loss: 0.56708 | val_accuracy: 0.65128 |  0:00:07s
epoch 16 | loss: 0.54928

[I 2024-07-26 22:06:40,767] Trial 2 finished with value: 0.6717948717948717 and parameters: {'n_d': 63, 'n_a': 43, 'n_steps': 5, 'gamma': 1.223088639494534, 'lambda_sparse': 1.2437835631239294e-06, 'learning_rate': 0.06787040404699463, 'batch_size': 256, 'num_epochs': 39}. Best is trial 1 with value: 0.6923076923076923.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.08794 | val_accuracy: 0.48718 |  0:00:01s
epoch 1  | loss: 1.04908 | val_accuracy: 0.47179 |  0:00:03s
epoch 2  | loss: 0.89947 | val_accuracy: 0.47692 |  0:00:04s
epoch 3  | loss: 0.90516 | val_accuracy: 0.49744 |  0:00:06s
epoch 4  | loss: 0.88597 | val_accuracy: 0.52821 |  0:00:08s
epoch 5  | loss: 0.82173 | val_accuracy: 0.49231 |  0:00:09s
epoch 6  | loss: 0.80319 | val_accuracy: 0.52821 |  0:00:10s
epoch 7  | loss: 0.81732 | val_accuracy: 0.51795 |  0:00:12s
epoch 8  | loss: 0.75448 | val_accuracy: 0.53846 |  0:00:13s
epoch 9  | loss: 0.77094 | val_accuracy: 0.57949 |  0:00:15s
epoch 10 | loss: 0.75104 | val_accuracy: 0.62564 |  0:00:16s
epoch 11 | loss: 0.78321 | val_accuracy: 0.62051 |  0:00:18s
epoch 12 | loss: 0.75977 | val_accuracy: 0.58462 |  0:00:19s
epoch 13 | loss: 0.74527 | val_accuracy: 0.60513 |  0:00:21s
epoch 14 | loss: 0.72776 | val_accuracy: 0.55897 |  0:00:22s
epoch 15 | loss: 0.74338 | val_accuracy: 0.53846 |  0:00:23s
epoch 16 | loss: 0.74885

[I 2024-07-26 22:07:12,619] Trial 3 finished with value: 0.6256410256410256 and parameters: {'n_d': 8, 'n_a': 22, 'n_steps': 6, 'gamma': 1.836387208302698, 'lambda_sparse': 0.00027972725077287237, 'learning_rate': 0.000722995582126127, 'batch_size': 32, 'num_epochs': 98}. Best is trial 1 with value: 0.6923076923076923.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.08711 | val_accuracy: 0.52821 |  0:00:00s
epoch 1  | loss: 0.95701 | val_accuracy: 0.58462 |  0:00:00s
epoch 2  | loss: 0.82448 | val_accuracy: 0.66154 |  0:00:01s
epoch 3  | loss: 0.80476 | val_accuracy: 0.67692 |  0:00:01s
epoch 4  | loss: 0.762   | val_accuracy: 0.67179 |  0:00:02s
epoch 5  | loss: 0.77394 | val_accuracy: 0.69231 |  0:00:02s
epoch 6  | loss: 0.7314  | val_accuracy: 0.68205 |  0:00:03s
epoch 7  | loss: 0.70217 | val_accuracy: 0.68205 |  0:00:03s
epoch 8  | loss: 0.68799 | val_accuracy: 0.67692 |  0:00:04s
epoch 9  | loss: 0.70371 | val_accuracy: 0.69231 |  0:00:04s
epoch 10 | loss: 0.63584 | val_accuracy: 0.69231 |  0:00:04s
epoch 11 | loss: 0.62409 | val_accuracy: 0.71282 |  0:00:05s
epoch 12 | loss: 0.66376 | val_accuracy: 0.69231 |  0:00:05s
epoch 13 | loss: 0.62097 | val_accuracy: 0.70256 |  0:00:06s
epoch 14 | loss: 0.62068 | val_accuracy: 0.70769 |  0:00:06s
epoch 15 | loss: 0.62996 | val_accuracy: 0.71282 |  0:00:06s
epoch 16 | loss: 0.59327

[I 2024-07-26 22:07:25,244] Trial 4 finished with value: 0.7282051282051282 and parameters: {'n_d': 58, 'n_a': 23, 'n_steps': 3, 'gamma': 1.4949923485646366, 'lambda_sparse': 0.00022078454573570624, 'learning_rate': 0.0004491821337873118, 'batch_size': 128, 'num_epochs': 29}. Best is trial 4 with value: 0.7282051282051282.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.32642 | val_accuracy: 0.64615 |  0:00:01s
epoch 1  | loss: 0.70338 | val_accuracy: 0.61026 |  0:00:03s
epoch 2  | loss: 0.61983 | val_accuracy: 0.66667 |  0:00:05s
epoch 3  | loss: 0.58908 | val_accuracy: 0.6359  |  0:00:07s
epoch 4  | loss: 0.60174 | val_accuracy: 0.67692 |  0:00:09s
epoch 5  | loss: 0.63881 | val_accuracy: 0.70256 |  0:00:11s
epoch 6  | loss: 0.57826 | val_accuracy: 0.68718 |  0:00:14s
epoch 7  | loss: 0.57884 | val_accuracy: 0.71282 |  0:00:16s
epoch 8  | loss: 0.56155 | val_accuracy: 0.68718 |  0:00:18s
epoch 9  | loss: 0.55539 | val_accuracy: 0.65641 |  0:00:20s
epoch 10 | loss: 0.57132 | val_accuracy: 0.70256 |  0:00:23s
epoch 11 | loss: 0.55171 | val_accuracy: 0.71282 |  0:00:24s
epoch 12 | loss: 0.56378 | val_accuracy: 0.71795 |  0:00:26s
epoch 13 | loss: 0.56245 | val_accuracy: 0.72308 |  0:00:28s
epoch 14 | loss: 0.55521 | val_accuracy: 0.73333 |  0:00:30s
epoch 15 | loss: 0.54629 | val_accuracy: 0.71795 |  0:00:31s
epoch 16 | loss: 0.5386 

[I 2024-07-26 22:08:15,496] Trial 5 finished with value: 0.7333333333333333 and parameters: {'n_d': 48, 'n_a': 50, 'n_steps': 5, 'gamma': 1.4128893552268842, 'lambda_sparse': 5.779193004449135e-05, 'learning_rate': 0.05854127294781648, 'batch_size': 32, 'num_epochs': 85}. Best is trial 5 with value: 0.7333333333333333.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.07406 | val_accuracy: 0.57949 |  0:00:01s
epoch 1  | loss: 1.84265 | val_accuracy: 0.55897 |  0:00:03s
epoch 2  | loss: 1.77555 | val_accuracy: 0.54359 |  0:00:05s
epoch 3  | loss: 1.64133 | val_accuracy: 0.52821 |  0:00:07s
epoch 4  | loss: 1.48044 | val_accuracy: 0.55897 |  0:00:09s
epoch 5  | loss: 1.30863 | val_accuracy: 0.54359 |  0:00:12s
epoch 6  | loss: 1.27771 | val_accuracy: 0.5641  |  0:00:14s
epoch 7  | loss: 1.24047 | val_accuracy: 0.57436 |  0:00:16s
epoch 8  | loss: 1.11613 | val_accuracy: 0.57949 |  0:00:18s
epoch 9  | loss: 1.10209 | val_accuracy: 0.5641  |  0:00:20s
epoch 10 | loss: 1.02838 | val_accuracy: 0.56923 |  0:00:22s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_accuracy = 0.57949


[I 2024-07-26 22:08:39,255] Trial 6 finished with value: 0.5794871794871795 and parameters: {'n_d': 54, 'n_a': 42, 'n_steps': 6, 'gamma': 1.3347420752920536, 'lambda_sparse': 4.536029794521018e-06, 'learning_rate': 0.00011509941764771661, 'batch_size': 32, 'num_epochs': 92}. Best is trial 5 with value: 0.7333333333333333.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.74543 | val_accuracy: 0.51795 |  0:00:02s
epoch 1  | loss: 1.54756 | val_accuracy: 0.50769 |  0:00:05s
epoch 2  | loss: 1.60461 | val_accuracy: 0.53846 |  0:00:08s
epoch 3  | loss: 1.39158 | val_accuracy: 0.54359 |  0:00:10s
epoch 4  | loss: 1.38244 | val_accuracy: 0.54872 |  0:00:13s
epoch 5  | loss: 1.33098 | val_accuracy: 0.5641  |  0:00:17s
epoch 6  | loss: 1.42078 | val_accuracy: 0.53846 |  0:00:21s
epoch 7  | loss: 1.27209 | val_accuracy: 0.5641  |  0:00:23s
epoch 8  | loss: 1.21233 | val_accuracy: 0.53333 |  0:00:26s
epoch 9  | loss: 1.26424 | val_accuracy: 0.5641  |  0:00:29s
epoch 10 | loss: 1.18271 | val_accuracy: 0.57436 |  0:00:32s
epoch 11 | loss: 1.14847 | val_accuracy: 0.54872 |  0:00:34s
epoch 12 | loss: 1.12524 | val_accuracy: 0.52308 |  0:00:37s
epoch 13 | loss: 1.06377 | val_accuracy: 0.51282 |  0:00:41s
epoch 14 | loss: 1.12152 | val_accuracy: 0.55897 |  0:00:44s
epoch 15 | loss: 1.1613  | val_accuracy: 0.54872 |  0:00:47s
epoch 16 | loss: 1.1666 

[I 2024-07-26 22:09:58,687] Trial 7 finished with value: 0.6564102564102564 and parameters: {'n_d': 31, 'n_a': 47, 'n_steps': 9, 'gamma': 1.7107384275359525, 'lambda_sparse': 2.19497404424172e-06, 'learning_rate': 0.00011607507600617685, 'batch_size': 32, 'num_epochs': 27}. Best is trial 5 with value: 0.7333333333333333.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.59291 | val_accuracy: 0.46667 |  0:00:02s
epoch 1  | loss: 1.39128 | val_accuracy: 0.44615 |  0:00:04s
epoch 2  | loss: 1.09584 | val_accuracy: 0.50256 |  0:00:07s
epoch 3  | loss: 1.12521 | val_accuracy: 0.50769 |  0:00:09s
epoch 4  | loss: 0.97416 | val_accuracy: 0.57949 |  0:00:11s
epoch 5  | loss: 1.00793 | val_accuracy: 0.55385 |  0:00:14s
epoch 6  | loss: 0.83649 | val_accuracy: 0.56923 |  0:00:16s
epoch 7  | loss: 0.84505 | val_accuracy: 0.57436 |  0:00:18s
epoch 8  | loss: 0.85968 | val_accuracy: 0.58462 |  0:00:21s
epoch 9  | loss: 0.7496  | val_accuracy: 0.59487 |  0:00:24s
epoch 10 | loss: 0.73191 | val_accuracy: 0.61026 |  0:00:26s
epoch 11 | loss: 0.74879 | val_accuracy: 0.60513 |  0:00:29s
epoch 12 | loss: 0.76167 | val_accuracy: 0.62564 |  0:00:32s
epoch 13 | loss: 0.73047 | val_accuracy: 0.61026 |  0:00:35s
epoch 14 | loss: 0.7209  | val_accuracy: 0.64103 |  0:00:37s
epoch 15 | loss: 0.72402 | val_accuracy: 0.67179 |  0:00:40s
epoch 16 | loss: 0.75449

[I 2024-07-26 22:11:24,245] Trial 8 finished with value: 0.7025641025641025 and parameters: {'n_d': 46, 'n_a': 52, 'n_steps': 7, 'gamma': 1.504904713892269, 'lambda_sparse': 1.936495763845598e-05, 'learning_rate': 0.0007392877341858976, 'batch_size': 32, 'num_epochs': 78}. Best is trial 5 with value: 0.7333333333333333.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 0.83577 | val_accuracy: 0.59487 |  0:00:00s
epoch 1  | loss: 0.66056 | val_accuracy: 0.64615 |  0:00:00s
epoch 2  | loss: 0.63301 | val_accuracy: 0.64615 |  0:00:00s
epoch 3  | loss: 0.58083 | val_accuracy: 0.72308 |  0:00:01s
epoch 4  | loss: 0.59479 | val_accuracy: 0.73333 |  0:00:01s
epoch 5  | loss: 0.54701 | val_accuracy: 0.71282 |  0:00:01s
epoch 6  | loss: 0.55524 | val_accuracy: 0.65641 |  0:00:02s
epoch 7  | loss: 0.55357 | val_accuracy: 0.65128 |  0:00:02s
epoch 8  | loss: 0.54112 | val_accuracy: 0.66667 |  0:00:03s
epoch 9  | loss: 0.53087 | val_accuracy: 0.66667 |  0:00:03s
epoch 10 | loss: 0.5437  | val_accuracy: 0.69231 |  0:00:03s
epoch 11 | loss: 0.54106 | val_accuracy: 0.67179 |  0:00:03s
epoch 12 | loss: 0.54541 | val_accuracy: 0.68205 |  0:00:04s
epoch 13 | loss: 0.54076 | val_accuracy: 0.68205 |  0:00:04s


[I 2024-07-26 22:11:29,296] Trial 9 finished with value: 0.7333333333333333 and parameters: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.864532166875735, 'lambda_sparse': 1.1227648446765894e-06, 'learning_rate': 0.00822026148716611, 'batch_size': 128, 'num_epochs': 81}. Best is trial 5 with value: 0.7333333333333333.


epoch 14 | loss: 0.53923 | val_accuracy: 0.69744 |  0:00:04s

Early stopping occurred at epoch 14 with best_epoch = 4 and best_val_accuracy = 0.73333


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.86321 | val_accuracy: 0.61538 |  0:00:01s
epoch 1  | loss: 1.01606 | val_accuracy: 0.6     |  0:00:03s
epoch 2  | loss: 0.85721 | val_accuracy: 0.61538 |  0:00:05s
epoch 3  | loss: 0.75975 | val_accuracy: 0.66667 |  0:00:07s
epoch 4  | loss: 0.81256 | val_accuracy: 0.62564 |  0:00:08s
epoch 5  | loss: 0.62794 | val_accuracy: 0.65128 |  0:00:10s
epoch 6  | loss: 0.6372  | val_accuracy: 0.68205 |  0:00:12s
epoch 7  | loss: 0.58164 | val_accuracy: 0.63077 |  0:00:13s
epoch 8  | loss: 0.59999 | val_accuracy: 0.69744 |  0:00:15s
epoch 9  | loss: 0.53638 | val_accuracy: 0.67179 |  0:00:17s
epoch 10 | loss: 0.55047 | val_accuracy: 0.65641 |  0:00:19s
epoch 11 | loss: 0.53529 | val_accuracy: 0.68205 |  0:00:21s
epoch 12 | loss: 0.55651 | val_accuracy: 0.69744 |  0:00:23s
epoch 13 | loss: 0.51592 | val_accuracy: 0.70256 |  0:00:24s
epoch 14 | loss: 0.54308 | val_accuracy: 0.76923 |  0:00:26s
epoch 15 | loss: 0.5295  | val_accuracy: 0.71795 |  0:00:28s
epoch 16 | loss: 0.56023

[I 2024-07-26 22:12:19,808] Trial 10 finished with value: 0.7692307692307693 and parameters: {'n_d': 44, 'n_a': 64, 'n_steps': 8, 'gamma': 1.0536806680260573, 'lambda_sparse': 0.0009651741843731034, 'learning_rate': 0.014909371860316725, 'batch_size': 64, 'num_epochs': 65}. Best is trial 10 with value: 0.7692307692307693.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.67551 | val_accuracy: 0.57949 |  0:00:02s
epoch 1  | loss: 0.91818 | val_accuracy: 0.59487 |  0:00:04s
epoch 2  | loss: 0.76845 | val_accuracy: 0.57436 |  0:00:07s
epoch 3  | loss: 0.66997 | val_accuracy: 0.62564 |  0:00:09s
epoch 4  | loss: 0.64081 | val_accuracy: 0.62051 |  0:00:13s
epoch 5  | loss: 0.57597 | val_accuracy: 0.68205 |  0:00:15s
epoch 6  | loss: 0.61934 | val_accuracy: 0.72821 |  0:00:17s
epoch 7  | loss: 0.56822 | val_accuracy: 0.75897 |  0:00:19s
epoch 8  | loss: 0.58899 | val_accuracy: 0.73333 |  0:00:20s
epoch 9  | loss: 0.57647 | val_accuracy: 0.73333 |  0:00:22s
epoch 10 | loss: 0.53334 | val_accuracy: 0.69231 |  0:00:24s
epoch 11 | loss: 0.58584 | val_accuracy: 0.64103 |  0:00:26s
epoch 12 | loss: 0.57667 | val_accuracy: 0.62564 |  0:00:28s
epoch 13 | loss: 0.54785 | val_accuracy: 0.67179 |  0:00:30s
epoch 14 | loss: 0.53163 | val_accuracy: 0.67692 |  0:00:32s
epoch 15 | loss: 0.52438 | val_accuracy: 0.74359 |  0:00:34s
epoch 16 | loss: 0.53225

[I 2024-07-26 22:12:59,229] Trial 11 finished with value: 0.7589743589743589 and parameters: {'n_d': 43, 'n_a': 64, 'n_steps': 8, 'gamma': 1.097541715979277, 'lambda_sparse': 0.0009383845836298559, 'learning_rate': 0.019471490331383366, 'batch_size': 64, 'num_epochs': 67}. Best is trial 10 with value: 0.7692307692307693.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.38387 | val_accuracy: 0.62051 |  0:00:02s
epoch 1  | loss: 0.99445 | val_accuracy: 0.69744 |  0:00:03s
epoch 2  | loss: 0.85932 | val_accuracy: 0.71282 |  0:00:05s
epoch 3  | loss: 0.79368 | val_accuracy: 0.71282 |  0:00:07s
epoch 4  | loss: 0.67337 | val_accuracy: 0.63077 |  0:00:08s
epoch 5  | loss: 0.75427 | val_accuracy: 0.66667 |  0:00:10s
epoch 6  | loss: 0.64566 | val_accuracy: 0.66154 |  0:00:12s
epoch 7  | loss: 0.5644  | val_accuracy: 0.65128 |  0:00:14s
epoch 8  | loss: 0.53521 | val_accuracy: 0.6359  |  0:00:16s
epoch 9  | loss: 0.53326 | val_accuracy: 0.67179 |  0:00:18s
epoch 10 | loss: 0.53919 | val_accuracy: 0.69231 |  0:00:20s
epoch 11 | loss: 0.52142 | val_accuracy: 0.69744 |  0:00:21s
epoch 12 | loss: 0.50656 | val_accuracy: 0.71795 |  0:00:23s
epoch 13 | loss: 0.49952 | val_accuracy: 0.73846 |  0:00:25s
epoch 14 | loss: 0.5179  | val_accuracy: 0.73333 |  0:00:27s
epoch 15 | loss: 0.51551 | val_accuracy: 0.72308 |  0:00:29s
epoch 16 | loss: 0.50081

[I 2024-07-26 22:13:52,074] Trial 12 finished with value: 0.7487179487179487 and parameters: {'n_d': 38, 'n_a': 64, 'n_steps': 8, 'gamma': 1.0225020269811136, 'lambda_sparse': 0.0009305941441358778, 'learning_rate': 0.010654302434232382, 'batch_size': 64, 'num_epochs': 61}. Best is trial 10 with value: 0.7692307692307693.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 2.52814 | val_accuracy: 0.61026 |  0:00:02s
epoch 1  | loss: 1.39212 | val_accuracy: 0.64615 |  0:00:04s
epoch 2  | loss: 1.116   | val_accuracy: 0.65128 |  0:00:06s
epoch 3  | loss: 0.82908 | val_accuracy: 0.72821 |  0:00:08s
epoch 4  | loss: 0.66796 | val_accuracy: 0.67692 |  0:00:10s
epoch 5  | loss: 0.63114 | val_accuracy: 0.65641 |  0:00:13s
epoch 6  | loss: 0.60503 | val_accuracy: 0.64615 |  0:00:15s
epoch 7  | loss: 0.57009 | val_accuracy: 0.70256 |  0:00:17s
epoch 8  | loss: 0.56346 | val_accuracy: 0.72308 |  0:00:19s
epoch 9  | loss: 0.64884 | val_accuracy: 0.67179 |  0:00:21s
epoch 10 | loss: 0.62235 | val_accuracy: 0.70256 |  0:00:23s
epoch 11 | loss: 0.57914 | val_accuracy: 0.71795 |  0:00:25s
epoch 12 | loss: 0.56637 | val_accuracy: 0.67692 |  0:00:27s
epoch 13 | loss: 0.5456  | val_accuracy: 0.68718 |  0:00:30s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_accuracy = 0.72821


[I 2024-07-26 22:14:23,115] Trial 13 finished with value: 0.7282051282051282 and parameters: {'n_d': 42, 'n_a': 63, 'n_steps': 10, 'gamma': 1.0010939230641154, 'lambda_sparse': 0.0009554191077814104, 'learning_rate': 0.019310823579486002, 'batch_size': 64, 'num_epochs': 66}. Best is trial 10 with value: 0.7692307692307693.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.08702 | val_accuracy: 0.59487 |  0:00:01s
epoch 1  | loss: 0.8354  | val_accuracy: 0.69231 |  0:00:03s
epoch 2  | loss: 0.76382 | val_accuracy: 0.64103 |  0:00:05s
epoch 3  | loss: 0.78146 | val_accuracy: 0.67179 |  0:00:07s
epoch 4  | loss: 0.76232 | val_accuracy: 0.67692 |  0:00:09s
epoch 5  | loss: 0.77418 | val_accuracy: 0.69231 |  0:00:10s
epoch 6  | loss: 0.6962  | val_accuracy: 0.66667 |  0:00:12s
epoch 7  | loss: 0.68205 | val_accuracy: 0.71795 |  0:00:14s
epoch 8  | loss: 0.65973 | val_accuracy: 0.72308 |  0:00:16s
epoch 9  | loss: 0.6593  | val_accuracy: 0.67692 |  0:00:18s
Stop training because you reached max_epochs = 10 with best_epoch = 8 and best_val_accuracy = 0.72308


[I 2024-07-26 22:14:42,856] Trial 14 finished with value: 0.7230769230769231 and parameters: {'n_d': 31, 'n_a': 59, 'n_steps': 8, 'gamma': 1.1794831160910817, 'lambda_sparse': 0.0002578983847684253, 'learning_rate': 0.0036691049977703766, 'batch_size': 64, 'num_epochs': 10}. Best is trial 10 with value: 0.7692307692307693.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.33553 | val_accuracy: 0.57949 |  0:00:01s
epoch 1  | loss: 0.99244 | val_accuracy: 0.58974 |  0:00:03s
epoch 2  | loss: 0.83556 | val_accuracy: 0.59487 |  0:00:04s
epoch 3  | loss: 0.76208 | val_accuracy: 0.63077 |  0:00:06s
epoch 4  | loss: 0.61018 | val_accuracy: 0.60513 |  0:00:07s
epoch 5  | loss: 0.75727 | val_accuracy: 0.61026 |  0:00:09s
epoch 6  | loss: 0.64777 | val_accuracy: 0.64615 |  0:00:10s
epoch 7  | loss: 0.58232 | val_accuracy: 0.61538 |  0:00:12s
epoch 8  | loss: 0.62005 | val_accuracy: 0.66667 |  0:00:14s
epoch 9  | loss: 0.56622 | val_accuracy: 0.67692 |  0:00:15s
epoch 10 | loss: 0.58128 | val_accuracy: 0.66667 |  0:00:17s
epoch 11 | loss: 0.56318 | val_accuracy: 0.69231 |  0:00:19s
epoch 12 | loss: 0.57402 | val_accuracy: 0.6359  |  0:00:21s
epoch 13 | loss: 0.60607 | val_accuracy: 0.67179 |  0:00:22s
epoch 14 | loss: 0.59626 | val_accuracy: 0.70256 |  0:00:24s
epoch 15 | loss: 0.55107 | val_accuracy: 0.67179 |  0:00:25s
epoch 16 | loss: 0.58588

[I 2024-07-26 22:15:39,943] Trial 15 finished with value: 0.7333333333333333 and parameters: {'n_d': 50, 'n_a': 30, 'n_steps': 8, 'gamma': 1.1356078007331136, 'lambda_sparse': 8.44268409753167e-05, 'learning_rate': 0.021529987841165235, 'batch_size': 64, 'num_epochs': 69}. Best is trial 10 with value: 0.7692307692307693.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.25674 | val_accuracy: 0.54872 |  0:00:02s
epoch 1  | loss: 0.95389 | val_accuracy: 0.61538 |  0:00:04s
epoch 2  | loss: 0.79866 | val_accuracy: 0.67692 |  0:00:06s
epoch 3  | loss: 0.7904  | val_accuracy: 0.65128 |  0:00:08s
epoch 4  | loss: 0.7908  | val_accuracy: 0.6359  |  0:00:10s
epoch 5  | loss: 0.72806 | val_accuracy: 0.66154 |  0:00:12s
epoch 6  | loss: 0.73316 | val_accuracy: 0.67179 |  0:00:13s
epoch 7  | loss: 0.77101 | val_accuracy: 0.64103 |  0:00:15s
epoch 8  | loss: 0.85507 | val_accuracy: 0.64615 |  0:00:17s
epoch 9  | loss: 0.68954 | val_accuracy: 0.65641 |  0:00:19s
epoch 10 | loss: 0.72392 | val_accuracy: 0.62051 |  0:00:21s
epoch 11 | loss: 0.94064 | val_accuracy: 0.62051 |  0:00:23s
epoch 12 | loss: 0.90019 | val_accuracy: 0.64615 |  0:00:24s

Early stopping occurred at epoch 12 with best_epoch = 2 and best_val_accuracy = 0.67692


[I 2024-07-26 22:16:05,849] Trial 16 finished with value: 0.676923076923077 and parameters: {'n_d': 37, 'n_a': 36, 'n_steps': 10, 'gamma': 1.285331090166534, 'lambda_sparse': 0.0004984515747833953, 'learning_rate': 0.004332465188586322, 'batch_size': 64, 'num_epochs': 51}. Best is trial 10 with value: 0.7692307692307693.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.38351 | val_accuracy: 0.67179 |  0:00:00s
epoch 1  | loss: 0.64832 | val_accuracy: 0.6     |  0:00:02s
epoch 2  | loss: 0.6603  | val_accuracy: 0.61538 |  0:00:03s
epoch 3  | loss: 0.67697 | val_accuracy: 0.47692 |  0:00:04s
epoch 4  | loss: 0.69816 | val_accuracy: 0.64103 |  0:00:05s
epoch 5  | loss: 0.58287 | val_accuracy: 0.65641 |  0:00:07s
epoch 6  | loss: 0.60594 | val_accuracy: 0.68718 |  0:00:08s
epoch 7  | loss: 0.5895  | val_accuracy: 0.64615 |  0:00:09s
epoch 8  | loss: 0.58006 | val_accuracy: 0.68718 |  0:00:10s
epoch 9  | loss: 0.58752 | val_accuracy: 0.70256 |  0:00:11s
epoch 10 | loss: 0.57782 | val_accuracy: 0.67692 |  0:00:13s
epoch 11 | loss: 0.59959 | val_accuracy: 0.68205 |  0:00:14s
epoch 12 | loss: 0.55179 | val_accuracy: 0.70256 |  0:00:15s
epoch 13 | loss: 0.5887  | val_accuracy: 0.69231 |  0:00:16s
epoch 14 | loss: 0.55777 | val_accuracy: 0.70256 |  0:00:17s
epoch 15 | loss: 0.5708  | val_accuracy: 0.70769 |  0:00:19s
epoch 16 | loss: 0.58068

[I 2024-07-26 22:16:51,337] Trial 17 finished with value: 0.764102564102564 and parameters: {'n_d': 24, 'n_a': 8, 'n_steps': 7, 'gamma': 1.104921695089, 'lambda_sparse': 0.00011834813690312385, 'learning_rate': 0.09067809977281778, 'batch_size': 64, 'num_epochs': 72}. Best is trial 10 with value: 0.7692307692307693.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.36853 | val_accuracy: 0.63077 |  0:00:01s
epoch 1  | loss: 0.66708 | val_accuracy: 0.67179 |  0:00:02s
epoch 2  | loss: 0.61813 | val_accuracy: 0.6     |  0:00:04s
epoch 3  | loss: 0.59657 | val_accuracy: 0.63077 |  0:00:05s
epoch 4  | loss: 0.65278 | val_accuracy: 0.63077 |  0:00:07s
epoch 5  | loss: 0.60921 | val_accuracy: 0.64615 |  0:00:08s
epoch 6  | loss: 0.65094 | val_accuracy: 0.66667 |  0:00:09s
epoch 7  | loss: 0.58791 | val_accuracy: 0.66154 |  0:00:10s
epoch 8  | loss: 0.59263 | val_accuracy: 0.66667 |  0:00:11s
epoch 9  | loss: 0.61428 | val_accuracy: 0.67179 |  0:00:13s
epoch 10 | loss: 0.59989 | val_accuracy: 0.62564 |  0:00:14s
epoch 11 | loss: 0.57562 | val_accuracy: 0.67179 |  0:00:15s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_accuracy = 0.67179


[I 2024-07-26 22:17:07,980] Trial 18 finished with value: 0.6717948717948717 and parameters: {'n_d': 22, 'n_a': 10, 'n_steps': 7, 'gamma': 1.328126807120161, 'lambda_sparse': 0.00010280566361474043, 'learning_rate': 0.09972131386652579, 'batch_size': 64, 'num_epochs': 44}. Best is trial 10 with value: 0.7692307692307693.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 1.32713 | val_accuracy: 0.63077 |  0:00:01s
epoch 1  | loss: 0.69975 | val_accuracy: 0.61026 |  0:00:02s
epoch 2  | loss: 0.63773 | val_accuracy: 0.60513 |  0:00:03s
epoch 3  | loss: 0.69081 | val_accuracy: 0.64615 |  0:00:04s
epoch 4  | loss: 0.58557 | val_accuracy: 0.67692 |  0:00:05s
epoch 5  | loss: 0.60118 | val_accuracy: 0.67179 |  0:00:06s
epoch 6  | loss: 0.6362  | val_accuracy: 0.61538 |  0:00:07s
epoch 7  | loss: 0.63918 | val_accuracy: 0.70256 |  0:00:08s
epoch 8  | loss: 0.57456 | val_accuracy: 0.71282 |  0:00:10s
epoch 9  | loss: 0.59137 | val_accuracy: 0.68718 |  0:00:11s
epoch 10 | loss: 0.55959 | val_accuracy: 0.72308 |  0:00:12s
epoch 11 | loss: 0.55957 | val_accuracy: 0.70256 |  0:00:13s
epoch 12 | loss: 0.549   | val_accuracy: 0.66154 |  0:00:15s
epoch 13 | loss: 0.58846 | val_accuracy: 0.67179 |  0:00:16s
epoch 14 | loss: 0.57942 | val_accuracy: 0.72308 |  0:00:17s
epoch 15 | loss: 0.55671 | val_accuracy: 0.68718 |  0:00:18s
epoch 16 | loss: 0.57221

[I 2024-07-26 22:17:33,676] Trial 19 finished with value: 0.7230769230769231 and parameters: {'n_d': 16, 'n_a': 8, 'n_steps': 9, 'gamma': 1.0609717646774035, 'lambda_sparse': 4.068106497222082e-05, 'learning_rate': 0.041600966352928884, 'batch_size': 64, 'num_epochs': 74}. Best is trial 10 with value: 0.7692307692307693.


epoch 0  | loss: 1.86321 | val_accuracy: 0.61538 |  0:00:01s
epoch 1  | loss: 1.01606 | val_accuracy: 0.6     |  0:00:03s
epoch 2  | loss: 0.85721 | val_accuracy: 0.61538 |  0:00:05s
epoch 3  | loss: 0.75975 | val_accuracy: 0.66667 |  0:00:07s
epoch 4  | loss: 0.81256 | val_accuracy: 0.62564 |  0:00:09s
epoch 5  | loss: 0.62794 | val_accuracy: 0.65128 |  0:00:11s
epoch 6  | loss: 0.6372  | val_accuracy: 0.68205 |  0:00:14s
epoch 7  | loss: 0.58164 | val_accuracy: 0.63077 |  0:00:17s
epoch 8  | loss: 0.59999 | val_accuracy: 0.69744 |  0:00:19s
epoch 9  | loss: 0.53638 | val_accuracy: 0.67179 |  0:00:20s
epoch 10 | loss: 0.55047 | val_accuracy: 0.65641 |  0:00:22s
epoch 11 | loss: 0.53529 | val_accuracy: 0.68205 |  0:00:24s
epoch 12 | loss: 0.55651 | val_accuracy: 0.69744 |  0:00:25s
epoch 13 | loss: 0.51592 | val_accuracy: 0.70256 |  0:00:27s
epoch 14 | loss: 0.54308 | val_accuracy: 0.76923 |  0:00:29s
epoch 15 | loss: 0.5295  | val_accuracy: 0.71795 |  0:00:32s
epoch 16 | loss: 0.56023



                            Accuracy AUC Score Training Time (Best Params)  \
Logistic Regression         0.748718  0.796644                    0.003991   
KNN                         0.753846  0.840861                    0.001994   
Decision Tree               0.728205  0.749789                    0.004987   
Random Forest                    0.8  0.864816                    0.727075   
Gradient Boosting                0.8  0.861439                    0.513596   
XGBoost                     0.764103  0.853525                      0.1127   
LightGBM                    0.789744  0.866083                    0.057846   
CatBoost                    0.753846  0.843288                    0.245005   
MLP                         0.764103  0.838863                    3.545068   
DNN                         0.774359  0.836116                    2.187616   
DCN                         0.723077  0.778793                    5.800046   
Wide_and_Deep               0.774359  0.835693                  

In [21]:
result.to_csv("result_white", index=True)