In [505]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def encode_categorical_data(df, target_column='Y'):
    df_encoded = df.copy()
    le = LabelEncoder()
    categorical_cols = df_encoded.select_dtypes(include=['object', 'category']).columns
    categorical_cols = [col for col in categorical_cols if col != target_column]
    
    for column in categorical_cols:
        if df_encoded[column].dtype == 'object' or df_encoded[column].dtype.name == 'category':
            df_encoded[column] = le.fit_transform(df_encoded[column].astype(str))
    
    return df_encoded

In [506]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

def robust_transform(X):
    scaler = RobustScaler()
    if isinstance(X, pd.DataFrame):
        return pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    else:
        X = np.asarray(X)
        return scaler.fit_transform(X)

def apply_robust_transform(X, y):
    X_transformed = robust_transform(X)
    return X_transformed, y

In [507]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

def apply_synthetic_data_to_training(X, y, test_size=0.2, random_state=42):
    # Ensure X is a DataFrame and reset its index
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    X = X.reset_index(drop=True)
    
    # Ensure y is a Series and reset its index
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    y = y.reset_index(drop=True)
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Combine X_train and y_train into a single DataFrame
    train_data = pd.concat([X_train, y_train.rename('Y')], axis=1)
    
    # Generate synthetic data using Decision Tree
    synthetic_data = generate_data_decision_tree(train_data)
    
    # Combine the synthetic data with the original training data
    augmented_train_data = pd.concat([train_data, synthetic_data], ignore_index=True)
    
    # Separate features and target for the augmented training data
    X_train_augmented = augmented_train_data.drop('Y', axis=1)
    y_train_augmented = augmented_train_data['Y']
    
    # Combine the augmented training data with the original test data
    X_combined = pd.concat([X_train_augmented, X_test], ignore_index=True)
    y_combined = pd.concat([y_train_augmented, y_test], ignore_index=True)
    
    return X_combined, y_combined

def generate_data_decision_tree(train_data):
    """Generate synthetic data using Decision Tree."""
    X = train_data.drop('Y', axis=1)
    y = train_data['Y']
    
    dt = DecisionTreeRegressor(random_state=42)
    dt.fit(X, y)
    
    synthetic_X = pd.DataFrame(columns=X.columns)
    synthetic_y = []
    
    for _ in range(len(train_data)):
        sample = X.sample(n=1, replace=True)
        synthetic_sample = sample.copy()
        
        for feature in X.columns:
            if np.random.rand() < 0.5:  # 50% chance to modify each feature
                feature_min = X[feature].min()
                feature_max = X[feature].max()
                synthetic_sample[feature] = np.random.uniform(feature_min, feature_max)
        
        synthetic_X = pd.concat([synthetic_X, synthetic_sample], ignore_index=True)
        synthetic_y.append(dt.predict(synthetic_sample)[0])
    
    synthetic_y = pd.Series(synthetic_y, name='Y')
    
    synthetic_data = pd.concat([synthetic_X, synthetic_y], axis=1)
    return synthetic_data


In [508]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

def model_comparison(df, target_column, test_size=0.2, random_state=42, cv=5):
    # Prepare the data
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Dictionary of models and their reduced hyperparameter grids
    models = {
        'Linear Regression': (LinearRegression(), {}),
        'Ridge': (Ridge(), {
            'alpha': [0.1, 1.0, 10.0],
            'solver': ['auto', 'svd', 'cholesky']
        }),
        'Lasso': (Lasso(), {
            'alpha': [0.1, 1.0, 10.0],
            'selection': ['cyclic', 'random']
        }),
        'KNN': (KNeighborsRegressor(), {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }),
        'Decision Tree': (DecisionTreeRegressor(), {
            'max_depth': [None, 10, 20, 40],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        }),
        'Random Forest': (RandomForestRegressor(), {
            'n_estimators': [100, 200, 400],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [2, 4]
        }),
        'Gradient Boosting': (GradientBoostingRegressor(), {
            'n_estimators': [100, 200, 400],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 4]
        }),
        'XGBoost': (XGBRegressor(), {
            'n_estimators': [100, 200, 400],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 4],
            'gamma': [0, 0.1]
        }),
        'LightGBM': (LGBMRegressor(), {
            'n_estimators': [100, 200, 400],
            'learning_rate': [0.01, 0.05, 0.1],
            'num_leaves': [31, 50, 70]
        }),
        'CatBoost': (CatBoostRegressor(verbose=0), {
            'iterations': [100, 200, 400],
            'learning_rate': [0.01, 0.05, 0.1],
            'depth': [4, 6, 8]
        })
    }
    
    # Dictionary to store results
    results = {}

    for name, (model, param_grid) in models.items():
        start_time = time.time()
        
        # Perform GridSearchCV
        grid_search = GridSearchCV(model, param_grid=param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train_scaled, y_train)
        
        # Get the best model
        best_model = grid_search.best_estimator_

        # Measure training time for best parameters
        best_param_train_start = time.time()
        best_model.fit(X_train_scaled, y_train)
        best_param_train_time = time.time() - best_param_train_start

        # Measure inference time for best parameters
        inference_start_time = time.time()
        y_pred = best_model.predict(X_test_scaled)
        inference_time = time.time() - inference_start_time
        
        # Calculate total computation time
        computation_time = time.time() - start_time
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Perform cross-validation
        cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=cv, scoring='neg_mean_squared_error')
        cv_rmse = np.sqrt(-cv_scores)

        results[name] = {
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
            'R2 Score': r2,
            'CV Mean RMSE': np.mean(cv_rmse),
            'CV Std RMSE': np.std(cv_rmse),
            'Training Time (Best Params)': best_param_train_time,
            'Inference Time (Best Params)': inference_time,
            'Computation Time (Total)': computation_time,
            'Best Parameters': grid_search.best_params_
        }

    # Convert results to a DataFrame
    result_df = pd.DataFrame(results).T

    return result_df


In [509]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neural_network import MLPRegressor

def mlp_comparison(X, y, result_df, test_size=0.2, random_state=42, cv=3):
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define the MLP model and its hyperparameter grid
    mlp = MLPRegressor(max_iter=1000, random_state=random_state)
    param_grid = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    }

    # Perform GridSearchCV with KFold
    start_time = time.time()
    kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)
    grid_search = GridSearchCV(mlp, param_grid=param_grid, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Measure training time for best parameters
    best_param_train_start = time.time()
    best_model.fit(X_train_scaled, y_train)
    training_time = time.time() - best_param_train_start

    # Measure inference time for best parameters
    inference_start_time = time.time()
    y_pred = best_model.predict(X_test_scaled)
    inference_time = time.time() - inference_start_time

    # Calculate total computation time
    computation_time = time.time() - start_time

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Perform cross-validation
    cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=cv, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores)

    # Store results in the existing result DataFrame
    result_df.loc['MLP'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': np.mean(cv_rmse),
        'CV Std RMSE': np.std(cv_rmse),
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': grid_search.best_params_
    }

    return result_df, grid_search.best_params_


In [510]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna

def dnn_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=10, n_folds=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_train_tensor = torch.FloatTensor(y_train.values.reshape(-1, 1)).to(device)
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    y_test_tensor = torch.FloatTensor(y_test.values.reshape(-1, 1)).to(device)

    # Define the DNN model
    class DNN(nn.Module):
        def __init__(self, input_dim, hidden_dims):
            super(DNN, self).__init__()
            layers = []
            for hidden_dim in hidden_dims:
                layers.append(nn.Linear(input_dim, hidden_dim))
                layers.append(nn.ReLU())
                input_dim = hidden_dim
            layers.append(nn.Linear(input_dim, 1))
            self.network = nn.Sequential(*layers)
        
        def forward(self, x):
            return self.network(x).squeeze()

    # Define the objective function for Optuna
    def objective(trial):
        hidden_dims = [trial.suggest_int(f'hidden_dim_{i}', 32, 256) for i in range(3)]
        learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        num_epochs = trial.suggest_int('num_epochs', 10, 100)

        input_dim = X_train.shape[1]
        model = DNN(input_dim, hidden_dims).to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            predictions = model(X_test_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    start_time = time.time()

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params

    input_dim = X_train.shape[1]
    best_model = DNN(input_dim, [best_params[f'hidden_dim_{i}'] for i in range(3)]).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_tensor)
        inference_time = time.time() - inference_start_time

        y_pred = predictions.cpu().numpy()
        y_true = y_test_tensor.cpu().numpy()
        
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train.values[train_index], y_train.values[val_index]

        X_fold_train_tensor = torch.FloatTensor(X_fold_train).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_model = DNN(input_dim, [best_params[f'hidden_dim_{i}'] for i in range(3)]).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=best_params['learning_rate'])
        fold_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_loader = DataLoader(fold_dataset, batch_size=best_params['batch_size'], shuffle=True)

        for epoch in range(best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    computation_time = time.time() - start_time

    result_df.loc['DNN'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': np.mean(cv_rmse),
        'CV Std RMSE': np.std(cv_rmse),
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [511]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna

def dcn_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=10, n_folds=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_train_tensor = torch.FloatTensor(y_train.values.reshape(-1, 1)).to(device)
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    y_test_tensor = torch.FloatTensor(y_test.values.reshape(-1, 1)).to(device)

    class CrossLayer(nn.Module):
        def __init__(self, input_dim):
            super(CrossLayer, self).__init__()
            self.weight = nn.Parameter(torch.Tensor(input_dim, 1))
            self.bias = nn.Parameter(torch.Tensor(input_dim, 1))
            nn.init.xavier_uniform_(self.weight)
            nn.init.zeros_(self.bias)

        def forward(self, x0, x):
            x = x.unsqueeze(2)
            x0 = x0.unsqueeze(2)
            interaction = torch.matmul(x0, torch.matmul(x.transpose(1, 2), self.weight))
            return x0.squeeze(2) + interaction.squeeze(2) + self.bias.T

    class DCN(nn.Module):
        def __init__(self, input_dim, cross_layers, hidden_layers):
            super(DCN, self).__init__()
            self.cross_layers = nn.ModuleList([CrossLayer(input_dim) for _ in range(cross_layers)])
            
            deep_layers = []
            for i in range(len(hidden_layers)):
                if i == 0:
                    deep_layers.append(nn.Linear(input_dim, hidden_layers[i]))
                else:
                    deep_layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
                deep_layers.append(nn.ReLU())
            self.deep_net = nn.Sequential(*deep_layers)
            
            self.final_layer = nn.Linear(input_dim + hidden_layers[-1], 1)

        def forward(self, x):
            cross_out = x
            for layer in self.cross_layers:
                cross_out = layer(x, cross_out)
            deep_out = self.deep_net(x)
            concat_out = torch.cat([cross_out, deep_out], dim=1)
            return self.final_layer(concat_out).squeeze()

    def objective(trial):
        cross_layers = trial.suggest_int('cross_layers', 1, 5)
        hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
        learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        num_epochs = trial.suggest_int('num_epochs', 10, 100)

        input_dim = X_train.shape[1]
        model = DCN(input_dim, cross_layers, hidden_layers).to(device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            predictions = model(X_test_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    start_time = time.time()

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params

    input_dim = X_train.shape[1]
    best_model = DCN(input_dim, best_params['cross_layers'], 
                     [best_params[f'hidden_layer_{i}'] for i in range(3)]).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y.squeeze())
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_tensor)
        inference_time = time.time() - inference_start_time

        y_pred = predictions.cpu().numpy()
        y_true = y_test_tensor.cpu().numpy().squeeze()
        
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    # Cross-validation
    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train.values[train_index], y_train.values[val_index]

        X_fold_train_tensor = torch.FloatTensor(X_fold_train).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_model = DCN(input_dim, best_params['cross_layers'], 
                         [best_params[f'hidden_layer_{i}'] for i in range(3)]).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=best_params['learning_rate'])
        fold_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_loader = DataLoader(fold_dataset, batch_size=best_params['batch_size'], shuffle=True)

        for epoch in range(best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    computation_time = time.time() - start_time

    result_df.loc['DCN'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': np.mean(cv_rmse),
        'CV Std RMSE': np.std(cv_rmse),
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [512]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna

def wide_and_deep_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=10, n_folds=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_train_tensor = torch.FloatTensor(y_train.values.reshape(-1, 1)).to(device)
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    y_test_tensor = torch.FloatTensor(y_test.values.reshape(-1, 1)).to(device)

    class WideAndDeepNetwork(nn.Module):
        def __init__(self, input_dim, hidden_layers):
            super(WideAndDeepNetwork, self).__init__()
            
            # Wide part
            self.wide = nn.Linear(input_dim, 1)
            
            # Deep part
            deep_layers = []
            for i in range(len(hidden_layers)):
                if i == 0:
                    deep_layers.append(nn.Linear(input_dim, hidden_layers[i]))
                else:
                    deep_layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
                deep_layers.append(nn.ReLU())
            deep_layers.append(nn.Linear(hidden_layers[-1], 1))
            self.deep = nn.Sequential(*deep_layers)

        def forward(self, x):
            wide_out = self.wide(x)
            deep_out = self.deep(x)
            return wide_out + deep_out

    def objective(trial):
        hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
        learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        num_epochs = trial.suggest_int('num_epochs', 10, 100)

        input_dim = X_train.shape[1]
        model = WideAndDeepNetwork(input_dim, hidden_layers).to(device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            predictions = model(X_test_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    start_time = time.time()

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params

    input_dim = X_train.shape[1]
    best_model = WideAndDeepNetwork(input_dim, 
                                    [best_params[f'hidden_layer_{i}'] for i in range(3)]).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_tensor)
        inference_time = time.time() - inference_start_time

        y_pred = predictions.cpu().numpy().squeeze()
        y_true = y_test_tensor.cpu().numpy().squeeze()
        
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    # Cross-validation
    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train.values[train_index], y_train.values[val_index]

        X_fold_train_tensor = torch.FloatTensor(X_fold_train).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_model = WideAndDeepNetwork(input_dim, 
                                        [best_params[f'hidden_layer_{i}'] for i in range(3)]).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=best_params['learning_rate'])
        fold_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_loader = DataLoader(fold_dataset, batch_size=best_params['batch_size'], shuffle=True)

        for epoch in range(best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    computation_time = time.time() - start_time

    result_df.loc['Wide_and_Deep'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': np.mean(cv_rmse),
        'CV Std RMSE': np.std(cv_rmse),
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [513]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import optuna

def xgb_nn_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=10, n_folds=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    start_time = time.time()

    # Ensure X and y are numpy arrays
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = y.values if isinstance(y, pd.Series) else y

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    class NeuralNetwork(nn.Module):
        def __init__(self, input_dim, hidden_layers):
            super(NeuralNetwork, self).__init__()
            layers = []
            prev_dim = input_dim
            for hidden_dim in hidden_layers:
                layers.append(nn.Linear(prev_dim, hidden_dim))
                layers.append(nn.ReLU())
                prev_dim = hidden_dim
            layers.append(nn.Linear(prev_dim, 1))
            self.network = nn.Sequential(*layers)
        
        def forward(self, x):
            return self.network(x).squeeze()

    def objective(trial):
        # Define hyperparameters to tune for XGBoost
        xgb_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
        }

        # Train XGBoost model
        xgb_model = XGBRegressor(**xgb_params)
        xgb_model.fit(X_train_scaled, y_train)
        
        # Extract features using XGBoost
        X_train_transformed = xgb_model.apply(X_train_scaled)
        X_test_transformed = xgb_model.apply(X_test_scaled)
        
        # Convert to PyTorch tensors
        X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
        X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
        y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
        y_test_tensor = torch.FloatTensor(y_test.reshape(-1, 1)).to(device)
        
        # Define hyperparameters to tune for Neural Network
        hidden_layers = []
        for i in range(3):  # Allow up to 3 hidden layers
            if trial.suggest_categorical(f'use_hidden_layer_{i}', [True, False]):
                hidden_layers.append(trial.suggest_int(f'hidden_layer_{i}', 32, 256))
        
        nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        num_epochs = trial.suggest_int('num_epochs', 10, 100)

        # Create the Neural Network model
        input_dim = X_train_transformed.shape[1]
        model = NeuralNetwork(input_dim, hidden_layers).to(device)

        # Define loss function and optimizer
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

        # Create DataLoader
        train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Training
        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            predictions = model(X_test_transformed_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    # Perform hyperparameter tuning with Optuna
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Get the best hyperparameters
    best_params = study.best_params

    # Train the final XGBoost model with the best hyperparameters
    xgb_best_params = {
        'n_estimators': best_params['n_estimators'],
        'max_depth': best_params['max_depth'],
        'learning_rate': best_params['xgb_learning_rate'],
        'subsample': best_params['subsample'],
        'colsample_bytree': best_params['colsample_bytree']
    }
    xgb_model = XGBRegressor(**xgb_best_params)
    xgb_model.fit(X_train_scaled, y_train)

    # Extract features using XGBoost
    X_train_transformed = xgb_model.apply(X_train_scaled)
    X_test_transformed = xgb_model.apply(X_test_scaled)

    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
    y_test_tensor = torch.FloatTensor(y_test.reshape(-1, 1)).to(device)

    # Train the final Neural Network model with the best hyperparameters
    nn_best_params = {
        'hidden_layers': [],
        'learning_rate': best_params['nn_learning_rate'],
        'batch_size': best_params['batch_size'],
        'num_epochs': best_params['num_epochs']
    }

    for i in range(3):  # Assuming max 3 hidden layers
        if f'use_hidden_layer_{i}' in best_params and best_params[f'use_hidden_layer_{i}']:
            nn_best_params['hidden_layers'].append(best_params[f'hidden_layer_{i}'])

    input_dim = X_train_transformed.shape[1]
    best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers']).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(nn_best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    # Evaluation
    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_transformed_tensor)
        inference_time = time.time() - inference_start_time

        y_pred = predictions.cpu().numpy()
        y_true = y_test_tensor.cpu().numpy().squeeze()
        
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    # Cross-validation
    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        xgb_model.fit(X_fold_train, y_fold_train)
        X_fold_train_transformed = xgb_model.apply(X_fold_train)
        X_fold_val_transformed = xgb_model.apply(X_fold_val)

        X_fold_train_tensor = torch.FloatTensor(X_fold_train_transformed).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val_transformed).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers']).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=nn_best_params['learning_rate'])
        fold_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_loader = DataLoader(fold_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

        for epoch in range(nn_best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    computation_time = time.time() - start_time

    result_df.loc['XGBoost + NN'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': np.mean(cv_rmse),
        'CV Std RMSE': np.std(cv_rmse),
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [514]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lightgbm import LGBMRegressor
import optuna

def lgbm_nn_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=10, n_folds=5):
    start_time = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Ensure X and y are numpy arrays
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = y.values if isinstance(y, pd.Series) else y

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    class NeuralNetwork(nn.Module):
        def __init__(self, input_dim, hidden_layers):
            super(NeuralNetwork, self).__init__()
            layers = []
            for i in range(len(hidden_layers)):
                if i == 0:
                    layers.append(nn.Linear(input_dim, hidden_layers[i]))
                else:
                    layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
                layers.append(nn.ReLU())
            layers.append(nn.Linear(hidden_layers[-1], 1))
            self.network = nn.Sequential(*layers)
        
        def forward(self, x):
            return self.network(x).squeeze()

    def objective(trial):
        # Define hyperparameters to tune for LightGBM
        lgb_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('lgb_learning_rate', 1e-4, 1e-1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
        }

        # Train LightGBM model
        lgb_model = LGBMRegressor(**lgb_params)
        lgb_model.fit(X_train_scaled, y_train)
        
        # Extract features using LightGBM
        X_train_transformed = lgb_model.predict(X_train_scaled).reshape(-1, 1)
        X_test_transformed = lgb_model.predict(X_test_scaled).reshape(-1, 1)
        
        # Convert to PyTorch tensors
        X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
        X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
        y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
        y_test_tensor = torch.FloatTensor(y_test.reshape(-1, 1)).to(device)
        
        # Define hyperparameters to tune for Neural Network
        hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
        nn_learning_rate = trial.suggest_float('nn_learning_rate', 1e-4, 1e-1, log=True)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        num_epochs = trial.suggest_int('num_epochs', 10, 100)

        # Create the Neural Network model
        input_dim = X_train_transformed.shape[1]
        model = NeuralNetwork(input_dim, hidden_layers).to(device)

        # Define loss function and optimizer
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

        # Create DataLoader
        train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Training
        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            predictions = model(X_test_transformed_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    # Perform hyperparameter tuning with Optuna
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Get the best hyperparameters
    best_params = study.best_params

    # Train the final LightGBM model with the best hyperparameters
    lgb_best_params = {
        'n_estimators': best_params['n_estimators'],
        'max_depth': best_params['max_depth'],
        'learning_rate': best_params['lgb_learning_rate'],
        'num_leaves': best_params['num_leaves'],
        'subsample': best_params['subsample'],
        'colsample_bytree': best_params['colsample_bytree']
    }
    lgb_model = LGBMRegressor(**lgb_best_params)
    lgb_model.fit(X_train_scaled, y_train)

    # Extract features using LightGBM
    X_train_transformed = lgb_model.predict(X_train_scaled).reshape(-1, 1)
    X_test_transformed = lgb_model.predict(X_test_scaled).reshape(-1, 1)

    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
    y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
    y_test_tensor = torch.FloatTensor(y_test.reshape(-1, 1)).to(device)

    # Train the final Neural Network model with the best hyperparameters
    nn_best_params = {
        'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
        'learning_rate': best_params['nn_learning_rate'],
        'batch_size': best_params['batch_size'],
        'num_epochs': best_params['num_epochs']
    }
    input_dim = X_train_transformed.shape[1]
    best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers']).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(nn_best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y.squeeze())
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    # Evaluation
    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_transformed_tensor)
        inference_time = time.time() - inference_start_time

        y_pred = predictions.cpu().numpy()
        y_true = y_test_tensor.cpu().numpy().squeeze()
        
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    # Cross-validation
    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        lgb_model.fit(X_fold_train, y_fold_train)
        X_fold_train_transformed = lgb_model.predict(X_fold_train).reshape(-1, 1)
        X_fold_val_transformed = lgb_model.predict(X_fold_val).reshape(-1, 1)

        X_fold_train_tensor = torch.FloatTensor(X_fold_train_transformed).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val_transformed).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers']).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=nn_best_params['learning_rate'])
        fold_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_loader = DataLoader(fold_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

        for epoch in range(nn_best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    # Calculate total computation time
    computation_time = time.time() - start_time

    # Store results in the existing result DataFrame
    result_df.loc['LightGBM + NN'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': np.mean(cv_rmse),
        'CV Std RMSE': np.std(cv_rmse),
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [515]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna

def autoint_nn_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=10, n_folds=5):
    start_time = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Ensure X and y are numpy arrays
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = y.values if isinstance(y, pd.Series) else y

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    y_test_tensor = torch.FloatTensor(y_test.reshape(-1, 1)).to(device)

    class AutoInt(nn.Module):
        def __init__(self, input_dim, embedding_dim, num_heads, num_layers):
            super(AutoInt, self).__init__()
            self.embedding = nn.Linear(input_dim, embedding_dim)
            self.attention_layers = nn.ModuleList([
                nn.MultiheadAttention(embedding_dim, num_heads) for _ in range(num_layers)
            ])
            self.fc = nn.Linear(embedding_dim, 1)

        def forward(self, x):
            x = self.embedding(x).unsqueeze(1)
            for attn_layer in self.attention_layers:
                x, _ = attn_layer(x, x, x)
            x = x.squeeze(1)
            x = self.fc(x)
            return x

    class NeuralNetwork(nn.Module):
        def __init__(self, input_dim, hidden_layers):
            super(NeuralNetwork, self).__init__()
            layers = []
            for i in range(len(hidden_layers)):
                if i == 0:
                    layers.append(nn.Linear(input_dim, hidden_layers[i]))
                else:
                    layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
                layers.append(nn.ReLU())
            layers.append(nn.Linear(hidden_layers[-1], 1))
            self.network = nn.Sequential(*layers)
        
        def forward(self, x):
            return self.network(x).squeeze()

    def objective(trial):
        # Define hyperparameters to tune for AutoInt
        num_heads = trial.suggest_int('num_heads', 1, 8)
        embedding_dim = trial.suggest_int('embedding_dim', num_heads, 64, step=num_heads)
        num_layers = trial.suggest_int('num_layers', 1, 3)
        
        # Train AutoInt model
        autoint_model = AutoInt(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
        optimizer = optim.Adam(autoint_model.parameters(), lr=0.001)
        criterion = nn.MSELoss()
        
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        
        for epoch in range(10):  # Fixed number of epochs for AutoInt
            autoint_model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = autoint_model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
        
        # Extract features using AutoInt
        autoint_model.eval()
        with torch.no_grad():
            X_train_transformed = autoint_model.embedding(X_train_tensor).cpu().numpy()
            X_test_transformed = autoint_model.embedding(X_test_tensor).cpu().numpy()
        
        # Convert to PyTorch tensors
        X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
        X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
        
        # Define hyperparameters to tune for Neural Network
        hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
        nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        num_epochs = trial.suggest_int('num_epochs', 10, 100)

        # Create the Neural Network model
        input_dim = X_train_transformed.shape[1]
        model = NeuralNetwork(input_dim, hidden_layers).to(device)

        # Define loss function and optimizer
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

        # Create DataLoader
        train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Training
        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            predictions = model(X_test_transformed_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    # Perform hyperparameter tuning with Optuna
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Get the best hyperparameters
    best_params = study.best_params

    # Train the final AutoInt model with the best hyperparameters
    embedding_dim = best_params['embedding_dim']
    num_heads = best_params['num_heads']
    num_layers = best_params['num_layers']
    autoint_model = AutoInt(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
    optimizer = optim.Adam(autoint_model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    for epoch in range(10):  # Fixed number of epochs for AutoInt
        autoint_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = autoint_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Extract features using AutoInt
    autoint_model.eval()
    with torch.no_grad():
        X_train_transformed = autoint_model.embedding(X_train_tensor).cpu().numpy()
        X_test_transformed = autoint_model.embedding(X_test_tensor).cpu().numpy()

    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)

    # Train the final Neural Network model with the best hyperparameters
    nn_best_params = {
        'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
        'learning_rate': best_params['nn_learning_rate'],
        'batch_size': best_params['batch_size'],
        'num_epochs': best_params['num_epochs']
    }
    input_dim = X_train_transformed.shape[1]
    best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers']).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(nn_best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y.squeeze())
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    # Evaluation
    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_transformed_tensor)
        inference_time = time.time() - inference_start_time

        y_pred = predictions.cpu().numpy()
        y_true = y_test_tensor.cpu().numpy().squeeze()
        
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    # Cross-validation
    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        X_fold_train_tensor = torch.FloatTensor(X_fold_train).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_autoint_model = AutoInt(X_fold_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
        fold_optimizer = optim.Adam(fold_autoint_model.parameters(), lr=0.001)
        fold_criterion = nn.MSELoss()
        fold_train_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_train_loader = DataLoader(fold_train_dataset, batch_size=32, shuffle=True)

        for epoch in range(10):  # Fixed number of epochs for AutoInt
            fold_autoint_model.train()
            for batch_X, batch_y in fold_train_loader:
                fold_optimizer.zero_grad()
                outputs = fold_autoint_model(batch_X)
                loss = fold_criterion(outputs, batch_y)
                loss.backward()
                fold_optimizer.step()

        fold_autoint_model.eval()
        with torch.no_grad():
            X_fold_train_transformed = fold_autoint_model.embedding(X_fold_train_tensor).cpu().numpy()
            X_fold_val_transformed = fold_autoint_model.embedding(X_fold_val_tensor).cpu().numpy()

        X_fold_train_transformed_tensor = torch.FloatTensor(X_fold_train_transformed).to(device)
        X_fold_val_transformed_tensor = torch.FloatTensor(X_fold_val_transformed).to(device)

        fold_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers']).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=nn_best_params['learning_rate'])
        fold_train_dataset = TensorDataset(X_fold_train_transformed_tensor, y_fold_train_tensor)
        fold_train_loader = DataLoader(fold_train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

        for epoch in range(nn_best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_train_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_transformed_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    # Calculate total computation time
    computation_time = time.time() - start_time

       # Store results in the existing result DataFrame
    result_df.loc['AutoInt + NN'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': np.mean(cv_rmse),
        'CV Std RMSE': np.std(cv_rmse),
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params

In [516]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna

def ft_transformer_nn_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=10, n_folds=5):
    start_time = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Ensure X and y are numpy arrays
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = y.values if isinstance(y, pd.Series) else y

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    y_test_tensor = torch.FloatTensor(y_test.reshape(-1, 1)).to(device)

    class FTTransformer(nn.Module):
        def __init__(self, input_dim, embedding_dim, num_heads, num_layers):
            super(FTTransformer, self).__init__()
            self.embedding = nn.Linear(input_dim, embedding_dim)
            self.transformer_layers = nn.ModuleList([
                nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads) for _ in range(num_layers)
            ])
            self.fc = nn.Linear(embedding_dim, input_dim)

        def forward(self, x):
            x = self.embedding(x).unsqueeze(1)
            for transformer_layer in self.transformer_layers:
                x = transformer_layer(x)
            x = x.squeeze(1)
            x = self.fc(x)
            return x

    class NeuralNetwork(nn.Module):
        def __init__(self, input_dim, hidden_layers):
            super(NeuralNetwork, self).__init__()
            layers = []
            for i in range(len(hidden_layers)):
                if i == 0:
                    layers.append(nn.Linear(input_dim, hidden_layers[i]))
                else:
                    layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
                layers.append(nn.ReLU())
            layers.append(nn.Linear(hidden_layers[-1], 1))
            self.network = nn.Sequential(*layers)
        
        def forward(self, x):
            return self.network(x).squeeze()

    def objective(trial):
        # Define hyperparameters to tune for FT-Transformer
        num_heads = trial.suggest_int('num_heads', 1, 8)
        embedding_dim = trial.suggest_int('embedding_dim', num_heads, 64, step=num_heads)
        num_layers = trial.suggest_int('num_layers', 1, 3)
        
        # Train FT-Transformer model
        ft_transformer_model = FTTransformer(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
        optimizer = optim.Adam(ft_transformer_model.parameters(), lr=0.001)
        criterion = nn.MSELoss()
        
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        
        for epoch in range(10):  # Fixed number of epochs for FT-Transformer
            ft_transformer_model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = ft_transformer_model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
        
        # Extract features using FT-Transformer
        ft_transformer_model.eval()
        with torch.no_grad():
            X_train_transformed = ft_transformer_model.embedding(X_train_tensor).cpu().numpy()
            X_test_transformed = ft_transformer_model.embedding(X_test_tensor).cpu().numpy()
        
        # Convert to PyTorch tensors
        X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
        X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)
        
        # Define hyperparameters to tune for Neural Network
        hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(3)]
        nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        num_epochs = trial.suggest_int('num_epochs', 10, 100)

        # Create the Neural Network model
        input_dim = X_train_transformed.shape[1]
        model = NeuralNetwork(input_dim, hidden_layers).to(device)

        # Define loss function and optimizer
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=nn_learning_rate)

        # Create DataLoader
        train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Training
        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            predictions = model(X_test_transformed_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    # Perform hyperparameter tuning with Optuna
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Get the best hyperparameters
    best_params = study.best_params

    # Train the final FT-Transformer model with the best hyperparameters
    embedding_dim = best_params['embedding_dim']
    num_heads = best_params['num_heads']
    num_layers = best_params['num_layers']
    ft_transformer_model = FTTransformer(X_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
    optimizer = optim.Adam(ft_transformer_model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    for epoch in range(10):  # Fixed number of epochs for FT-Transformer
        ft_transformer_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = ft_transformer_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Extract features using FT-Transformer
    ft_transformer_model.eval()
    with torch.no_grad():
        X_train_transformed = ft_transformer_model.embedding(X_train_tensor).cpu().numpy()
        X_test_transformed = ft_transformer_model.embedding(X_test_tensor).cpu().numpy()

    # Convert to PyTorch tensors
    X_train_transformed_tensor = torch.FloatTensor(X_train_transformed).to(device)
    X_test_transformed_tensor = torch.FloatTensor(X_test_transformed).to(device)

    # Train the final Neural Network model with the best hyperparameters
    nn_best_params = {
        'hidden_layers': [best_params[f'hidden_layer_{i}'] for i in range(3)],
        'learning_rate': best_params['nn_learning_rate'],
        'batch_size': best_params['batch_size'],
        'num_epochs': best_params['num_epochs']
    }
    input_dim = X_train_transformed.shape[1]
    best_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers']).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=nn_best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_transformed_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(nn_best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    # Evaluation
    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_transformed_tensor)
        inference_time = time.time() - inference_start_time

        y_pred = predictions.cpu().numpy()
        y_true = y_test_tensor.cpu().numpy().squeeze()
        
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    # Cross-validation
    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        X_fold_train_tensor = torch.FloatTensor(X_fold_train).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_ft_transformer_model = FTTransformer(X_fold_train.shape[1], embedding_dim, num_heads, num_layers).to(device)
        fold_optimizer = optim.Adam(fold_ft_transformer_model.parameters(), lr=0.001)
        fold_criterion = nn.MSELoss()

        fold_train_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_train_loader = DataLoader(fold_train_dataset, batch_size=32, shuffle=True)

        for epoch in range(10):
            fold_ft_transformer_model.train()
            for batch_X, batch_y in fold_train_loader:
                fold_optimizer.zero_grad()
                outputs = fold_ft_transformer_model(batch_X)
                loss = fold_criterion(outputs, batch_y)
                loss.backward()
                fold_optimizer.step()

        fold_ft_transformer_model.eval()
        with torch.no_grad():
            X_fold_train_transformed = fold_ft_transformer_model.embedding(X_fold_train_tensor).cpu().numpy()
            X_fold_val_transformed = fold_ft_transformer_model.embedding(X_fold_val_tensor).cpu().numpy()

        X_fold_train_transformed_tensor = torch.FloatTensor(X_fold_train_transformed).to(device)
        X_fold_val_transformed_tensor = torch.FloatTensor(X_fold_val_transformed).to(device)

        fold_model = NeuralNetwork(input_dim, nn_best_params['hidden_layers']).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=nn_best_params['learning_rate'])
        fold_train_dataset = TensorDataset(X_fold_train_transformed_tensor, y_fold_train_tensor)
        fold_train_loader = DataLoader(fold_train_dataset, batch_size=nn_best_params['batch_size'], shuffle=True)

        for epoch in range(nn_best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_train_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_transformed_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    cv_mean_rmse = np.mean(cv_rmse)
    cv_std_rmse = np.std(cv_rmse)

    # Calculate total computation time
    computation_time = time.time() - start_time

    # Store results in the existing result DataFrame
    result_df.loc['FT-Transformer'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': cv_mean_rmse,
        'CV Std RMSE': cv_std_rmse,
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [517]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna

def neural_architecture_search(X, y, result_df, test_size=0.2, random_state=42, n_trials=10, n_folds=5):
    start_time = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Ensure X and y are numpy arrays
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = y.values if isinstance(y, pd.Series) else y

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    y_test_tensor = torch.FloatTensor(y_test.reshape(-1, 1)).to(device)

    class NeuralNetwork(nn.Module):
        def __init__(self, input_dim, hidden_layers):
            super(NeuralNetwork, self).__init__()
            layers = []
            for i in range(len(hidden_layers)):
                if i == 0:
                    layers.append(nn.Linear(input_dim, hidden_layers[i]))
                else:
                    layers.append(nn.Linear(hidden_layers[i-1], hidden_layers[i]))
                layers.append(nn.ReLU())
            layers.append(nn.Linear(hidden_layers[-1], 1))
            self.network = nn.Sequential(*layers)
        
        def forward(self, x):
            return self.network(x).squeeze()

    def objective(trial):
        # Define hyperparameters to tune for Neural Network
        num_layers = trial.suggest_int('num_layers', 1, 5)
        hidden_layers = [trial.suggest_int(f'hidden_layer_{i}', 32, 256) for i in range(num_layers)]
        learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        num_epochs = trial.suggest_int('num_epochs', 10, 100)

        # Create the Neural Network model
        input_dim = X_train.shape[1]
        model = NeuralNetwork(input_dim, hidden_layers).to(device)

        # Define loss function and optimizer
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Create DataLoader
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Training
        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            predictions = model(X_test_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    # Perform hyperparameter tuning with Optuna
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Get the best hyperparameters
    best_params = study.best_params

    # Train the final Neural Network model with the best hyperparameters
    input_dim = X_train.shape[1]
    best_model = NeuralNetwork(input_dim, 
                               [best_params[f'hidden_layer_{i}'] for i in range(best_params['num_layers'])]).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    # Evaluation
    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_tensor)
        inference_time = time.time() - inference_start_time

        y_pred = predictions.cpu().numpy()
        y_true = y_test_tensor.cpu().numpy().squeeze()
        
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    # Cross-validation
    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        X_fold_train_tensor = torch.FloatTensor(X_fold_train).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_model = NeuralNetwork(input_dim, 
                                   [best_params[f'hidden_layer_{i}'] for i in range(best_params['num_layers'])]).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=best_params['learning_rate'])
        fold_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_loader = DataLoader(fold_dataset, batch_size=best_params['batch_size'], shuffle=True)

        for epoch in range(best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    cv_mean_rmse = np.mean(cv_rmse)
    cv_std_rmse = np.std(cv_rmse)

    # Calculate total computation time
    computation_time = time.time() - start_time

    # Store results in the existing result DataFrame
    result_df.loc['Neural Architecture Search'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': cv_mean_rmse,
        'CV Std RMSE': cv_std_rmse,
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [518]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna

def kan_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=10, n_folds=5):
    start_time = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Ensure X and y are numpy arrays
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = y.values if isinstance(y, pd.Series) else y

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    y_test_tensor = torch.FloatTensor(y_test.reshape(-1, 1)).to(device)

    class KAN(nn.Module):
        def __init__(self, input_dim, hidden_dim):
            super(KAN, self).__init__()
            self.hidden_layer = nn.Linear(input_dim, hidden_dim)
            self.activation = nn.ReLU()
            self.output_layer = nn.Linear(hidden_dim, 1)

        def forward(self, x):
            h = self.activation(self.hidden_layer(x))
            out = self.output_layer(h)
            return out.squeeze()

    def objective(trial):
        hidden_dim = trial.suggest_int('hidden_dim', 32, 256)
        learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        num_epochs = trial.suggest_int('num_epochs', 10, 100)

        input_dim = X_train.shape[1]
        model = KAN(input_dim, hidden_dim).to(device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            predictions = model(X_test_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params

    input_dim = X_train.shape[1]
    best_model = KAN(input_dim, best_params['hidden_dim']).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y.squeeze())
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_tensor)
        inference_time = time.time() - inference_start_time

        y_true = y_test_tensor.cpu().numpy().squeeze()
        y_pred = predictions.cpu().numpy()

        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        X_fold_train_tensor = torch.FloatTensor(X_fold_train).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_model = KAN(input_dim, best_params['hidden_dim']).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=best_params['learning_rate'])
        fold_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_loader = DataLoader(fold_dataset, batch_size=best_params['batch_size'], shuffle=True)

        for epoch in range(best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    computation_time = time.time() - start_time

    result_df.loc['KAN'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': np.mean(cv_rmse),
        'CV Std RMSE': np.std(cv_rmse),
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [519]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna

def node_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=10, n_folds=5):
    start_time = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Ensure X and y are numpy arrays
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = y.values if isinstance(y, pd.Series) else y

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    y_test_tensor = torch.FloatTensor(y_test.reshape(-1, 1)).to(device)

    class NODE(nn.Module):
        def __init__(self, input_dim, num_layers, num_trees, tree_dim):
            super(NODE, self).__init__()
            self.layers = nn.ModuleList()
            for _ in range(num_layers):
                layer = nn.ModuleList()
                for _ in range(num_trees):
                    tree = nn.Sequential(
                        nn.Linear(input_dim, tree_dim),
                        nn.ReLU(),
                        nn.Linear(tree_dim, 1)
                    )
                    layer.append(tree)
                self.layers.append(layer)
            self.output = nn.Linear(num_layers * num_trees, 1)

        def forward(self, x):
            tree_outputs = []
            for layer in self.layers:
                layer_outputs = []
                for tree in layer:
                    layer_outputs.append(tree(x))
                layer_output = torch.cat(layer_outputs, dim=1)
                tree_outputs.append(layer_output)
            x = torch.cat(tree_outputs, dim=1)
            return self.output(x).squeeze()

    def objective(trial):
        # Define hyperparameters to tune for NODE
        num_layers = trial.suggest_int('num_layers', 1, 5)
        num_trees = trial.suggest_int('num_trees', 1, 10)
        tree_dim = trial.suggest_int('tree_dim', 8, 64)
        learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        num_epochs = trial.suggest_int('num_epochs', 10, 100)

        # Create the NODE model
        input_dim = X_train.shape[1]
        model = NODE(input_dim, num_layers, num_trees, tree_dim).to(device)

        # Define loss function and optimizer
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Create DataLoader
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Training
        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            predictions = model(X_test_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    # Perform hyperparameter tuning with Optuna
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Get the best hyperparameters
    best_params = study.best_params

    # Train the final NODE model with the best hyperparameters
    input_dim = X_train.shape[1]
    best_model = NODE(input_dim, 
                      best_params['num_layers'], 
                      best_params['num_trees'], 
                      best_params['tree_dim']).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y.squeeze())
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    # Evaluation
    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_tensor)
        inference_time = time.time() - inference_start_time

        y_pred = predictions.cpu().numpy()
        y_true = y_test_tensor.cpu().numpy().squeeze()
        
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    # Cross-validation
    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        X_fold_train_tensor = torch.FloatTensor(X_fold_train).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_model = NODE(input_dim, 
                          best_params['num_layers'], 
                          best_params['num_trees'], 
                          best_params['tree_dim']).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=best_params['learning_rate'])
        fold_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_loader = DataLoader(fold_dataset, batch_size=best_params['batch_size'], shuffle=True)

        for epoch in range(best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    computation_time = time.time() - start_time

    result_df.loc['NODE'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': np.mean(cv_rmse),
        'CV Std RMSE': np.std(cv_rmse),
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [520]:
import pandas as pd
import numpy as np
import time
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from pytorch_tabnet.tab_model import TabNetRegressor
import optuna

def tabnet_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=5, n_folds=5):
    start_time = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Ensure X and y are numpy arrays
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = y.values if isinstance(y, pd.Series) else y

    # Reshape y to be 2D
    y = y.reshape(-1, 1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    def objective(trial):
        # Define hyperparameters to tune for TabNet
        n_d = trial.suggest_int('n_d', 8, 64)
        n_a = trial.suggest_int('n_a', 8, 64)
        n_steps = trial.suggest_int('n_steps', 3, 10)
        gamma = trial.suggest_float('gamma', 1.0, 2.0)
        lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
        learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128,256])
        num_epochs = trial.suggest_int('num_epochs', 10, 40)

        # Create the TabNet model
        model = TabNetRegressor(
            n_d=n_d,
            n_a=n_a,
            n_steps=n_steps,
            gamma=gamma,
            lambda_sparse=lambda_sparse,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=learning_rate),
            device_name=device
        )

        # Training
        model.fit(
            X_train=X_train_scaled, y_train=y_train,
            eval_set=[(X_test_scaled, y_test)],
            eval_name=['val'],
            eval_metric=['rmse'],
            max_epochs=num_epochs,
            patience=10,
            batch_size=batch_size,
            virtual_batch_size=batch_size // 2,
            num_workers=0,
            drop_last=False
        )

        # Evaluation
        y_pred = model.predict(X_test_scaled)
        mse = mean_squared_error(y_test, y_pred)

        return mse

    # Perform hyperparameter tuning with Optuna
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Get the best hyperparameters
    best_params = study.best_params

    # Train the final TabNet model with the best hyperparameters
    best_model = TabNetRegressor(
        n_d=best_params['n_d'],
        n_a=best_params['n_a'],
        n_steps=best_params['n_steps'],
        gamma=best_params['gamma'],
        lambda_sparse=best_params['lambda_sparse'],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=best_params['learning_rate']),
        device_name=device
    )

    training_start_time = time.time()
    best_model.fit(
        X_train=X_train_scaled, y_train=y_train,
        eval_set=[(X_test_scaled, y_test)],
        eval_name=['val'],
        eval_metric=['rmse'],
        max_epochs=best_params['num_epochs'],
        patience=10,
        batch_size=best_params['batch_size'],
        virtual_batch_size=best_params['batch_size'] // 2,
        num_workers=0,
        drop_last=False
    )
    training_time = time.time() - training_start_time

    # Evaluation
    inference_start_time = time.time()
    y_pred = best_model.predict(X_test_scaled)
    inference_time = time.time() - inference_start_time

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Cross-validation
    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        fold_model = TabNetRegressor(
            n_d=best_params['n_d'],
            n_a=best_params['n_a'],
            n_steps=best_params['n_steps'],
            gamma=best_params['gamma'],
            lambda_sparse=best_params['lambda_sparse'],
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=best_params['learning_rate']),
            device_name=device
        )

        fold_model.fit(
            X_train=X_fold_train, y_train=y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            eval_name=['val'],
            eval_metric=['rmse'],
            max_epochs=best_params['num_epochs'],
            patience=10,
            batch_size=best_params['batch_size'],
            virtual_batch_size=best_params['batch_size'] // 2,
            num_workers=0,
            drop_last=False
        )

        y_fold_pred = fold_model.predict(X_fold_val)
        fold_rmse = np.sqrt(mean_squared_error(y_fold_val, y_fold_pred))
        cv_rmse.append(fold_rmse)

    cv_mean_rmse = np.mean(cv_rmse)
    cv_std_rmse = np.std(cv_rmse)

    # Calculate total computation time
    computation_time = time.time() - start_time

    # Store results in the existing result DataFrame
    result_df.loc['TabNet'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': cv_mean_rmse,
        'CV Std RMSE': cv_std_rmse,
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [521]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna

def saint_comparison(X, y, result_df, test_size=0.2, random_state=42, n_trials=5, n_folds=5):
    start_time = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Ensure X and y are numpy arrays
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = y.values if isinstance(y, pd.Series) else y

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    y_test_tensor = torch.FloatTensor(y_test.reshape(-1, 1)).to(device)

    class SAINT(nn.Module):
        def __init__(self, input_dim, dim, depth, heads, mlp_dim, dropout=0.1):
            super(SAINT, self).__init__()
            self.embeds = nn.Linear(input_dim, dim)
            self.transformer = nn.TransformerEncoder(
                nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=mlp_dim, dropout=dropout),
                num_layers=depth
            )
            self.mlp_head = nn.Sequential(
                nn.LayerNorm(dim),
                nn.Linear(dim, 1)
            )

        def forward(self, x):
            x = self.embeds(x)
            x = x.unsqueeze(1)  # Add sequence dimension
            x = self.transformer(x)
            x = x.squeeze(1)  # Remove sequence dimension
            return self.mlp_head(x).squeeze()

    def objective(trial):
        heads = trial.suggest_int('heads', 1, 8)
        dim = trial.suggest_int('dim', heads, 256, step=heads)
        depth = trial.suggest_int('depth', 1, 6)
        mlp_dim = trial.suggest_int('mlp_dim', 32, 256)
        dropout = trial.suggest_float('dropout', 0.0, 0.5)
        learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128,256])
        num_epochs = trial.suggest_int('num_epochs', 10, 40)

        input_dim = X_train.shape[1]
        model = SAINT(input_dim, dim, depth, heads, mlp_dim, dropout).to(device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(num_epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            predictions = model(X_test_tensor)
            mse = mean_squared_error(y_test_tensor.cpu().numpy(), predictions.cpu().numpy())

        return mse

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params

    input_dim = X_train.shape[1]
    best_model = SAINT(input_dim, best_params['dim'], best_params['depth'], 
                       best_params['heads'], best_params['mlp_dim'], 
                       best_params['dropout']).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

    training_start_time = time.time()
    for epoch in range(best_params['num_epochs']):
        best_model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = best_model(batch_X)
            loss = criterion(outputs, batch_y.squeeze())
            loss.backward()
            optimizer.step()

    training_time = time.time() - training_start_time

    best_model.eval()
    with torch.no_grad():
        inference_start_time = time.time()
        predictions = best_model(X_test_tensor)
        inference_time = time.time() - inference_start_time

        y_true = y_test_tensor.cpu().numpy().squeeze()
        y_pred = predictions.cpu().numpy()

        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

    cv_rmse = []
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_scaled)):
        X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        X_fold_train_tensor = torch.FloatTensor(X_fold_train).to(device)
        y_fold_train_tensor = torch.FloatTensor(y_fold_train.reshape(-1, 1)).to(device)
        X_fold_val_tensor = torch.FloatTensor(X_fold_val).to(device)
        y_fold_val_tensor = torch.FloatTensor(y_fold_val.reshape(-1, 1)).to(device)

        fold_model = SAINT(input_dim, best_params['dim'], best_params['depth'], 
                           best_params['heads'], best_params['mlp_dim'], 
                           best_params['dropout']).to(device)
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=best_params['learning_rate'])
        fold_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
        fold_loader = DataLoader(fold_dataset, batch_size=best_params['batch_size'], shuffle=True)

        for epoch in range(best_params['num_epochs']):
            fold_model.train()
            for batch_X, batch_y in fold_loader:
                fold_optimizer.zero_grad()
                outputs = fold_model(batch_X)
                loss = criterion(outputs, batch_y.squeeze())
                loss.backward()
                fold_optimizer.step()

        fold_model.eval()
        with torch.no_grad():
            fold_predictions = fold_model(X_fold_val_tensor)
            fold_mse = mean_squared_error(y_fold_val_tensor.cpu().numpy(), fold_predictions.cpu().numpy())
            cv_rmse.append(np.sqrt(fold_mse))

    cv_mean_rmse = np.mean(cv_rmse)
    cv_std_rmse = np.std(cv_rmse)

    computation_time = time.time() - start_time

    result_df.loc['SAINT'] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2,
        'CV Mean RMSE': cv_mean_rmse,
        'CV Std RMSE': cv_std_rmse,
        'Training Time (Best Params)': training_time,
        'Inference Time (Best Params)': inference_time,
        'Computation Time (Total)': computation_time,
        'Best Parameters': best_params
    }

    return result_df, best_params


In [522]:
file_prefix = "crop"  # Change this to any word you like
df =  pd.read_csv(f'Dataset/{file_prefix}.csv')
df = encode_categorical_data(df)
X = df.drop('Y', axis=1)
y = df['Y']
X, y = apply_robust_transform(X, y)
#X, y = apply_synthetic_data_to_training(X, y)
'''
result = model_comparison(df, 'Y')
print(result)
result, best_params = mlp_comparison(X, y, result)
print(result)
result, best_params = dnn_comparison(X, y, result)
print(result)
result, best_params = dcn_comparison(X, y, result)
print(result)
result, best_params = wide_and_deep_comparison(X, y, result)
print(result)
result, best_params = xgb_nn_comparison(X, y, result)
print(result)
result, best_params = lgbm_nn_comparison(X, y, result)
print(result)
result, best_params = autoint_nn_comparison(X, y, result)
print(result)
result, best_params = ft_transformer_nn_comparison(X, y, result)
print(result)
result, best_params = neural_architecture_search(X, y, result)
print(result)
result, best_params = kan_comparison(X, y, result)
print(result)
result, best_params = node_comparison(X, y, result)
print(result)
result, best_params = tabnet_comparison(X, y, result)
print(result)
'''
result, best_params = saint_comparison(X, y, result)
print(result)

result.to_csv(f'result/comparison/regression/{file_prefix}_result.csv', index=True)

[I 2024-08-06 10:41:11,568] A new study created in memory with name: no-name-3a6281e6-829b-47bc-8554-cd5dcd1da1fa


Using device: cpu


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:41:26,496] Trial 0 finished with value: 11205650.0 and parameters: {'heads': 5, 'dim': 65, 'depth': 5, 'mlp_dim': 160, 'dropout': 0.11698618521862919, 'learning_rate': 0.001180545242414642, 'batch_size': 64, 'num_epochs': 17}. Best is trial 0 with value: 11205650.0.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:41:32,179] Trial 1 finished with value: 5076915.0 and parameters: {'heads': 2, 'dim': 70, 'depth': 2, 'mlp_dim': 233, 'dropout': 0.03905578596842341, 'learning_rate': 0.0261447175860303, 'batch_size': 64, 'num_epochs': 15}. Best is trial 1 with value: 5076915.0.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:41:49,673] Trial 2 finished with value: 10311736.0 and parameters: {'heads': 2, 'dim': 44, 'depth': 4, 'mlp_dim': 179, 'dropout': 0.43043403669016667, 'learning_rate': 0.0014926498458922672, 'batch_size'

                                       MSE         RMSE          MAE  \
Linear Regression           4889229.831828  2211.160291  1691.734253   
Ridge                        4883088.90566  2209.771234  1690.705258   
Lasso                       4865416.912044  2205.769007  1688.270692   
KNN                          3273818.03677  1809.369514  1027.592047   
Decision Tree               8273080.036686  2876.296236  1650.821209   
Random Forest               3824053.360395  1955.518693  1397.043044   
Gradient Boosting           4477412.204499  2115.989651   1623.35943   
XGBoost                     4467102.980339  2113.552219  1630.558773   
LightGBM                     3659382.94254   1912.95137   1404.21618   
CatBoost                    4676739.350386  2162.577016  1655.313719   
MLP                         4847092.507387  2201.611343  1687.336878   
DNN                              5001051.5  2236.302979  1701.878906   
DCN                              4962697.5  2227.711182  1694.13

In [523]:
file_prefix = "bike"  # Change this to any word you like
df =  pd.read_csv(f'Dataset/{file_prefix}.csv')
df = encode_categorical_data(df)
X = df.drop('Y', axis=1)
y = df['Y']
X, y = apply_robust_transform(X, y)
#X, y = apply_synthetic_data_to_training(X, y)

result = model_comparison(df, 'Y')
print(result)
result, best_params = mlp_comparison(X, y, result)
print(result)
result, best_params = dnn_comparison(X, y, result)
print(result)
result, best_params = dcn_comparison(X, y, result)
print(result)
result, best_params = wide_and_deep_comparison(X, y, result)
print(result)
result, best_params = xgb_nn_comparison(X, y, result)
print(result)
result, best_params = lgbm_nn_comparison(X, y, result)
print(result)
result, best_params = autoint_nn_comparison(X, y, result)
print(result)
result, best_params = ft_transformer_nn_comparison(X, y, result)
print(result)
result, best_params = neural_architecture_search(X, y, result)
print(result)
result, best_params = kan_comparison(X, y, result)
print(result)
result, best_params = node_comparison(X, y, result)
print(result)
result, best_params = tabnet_comparison(X, y, result)
print(result)
result, best_params = saint_comparison(X, y, result)
print(result)

result.to_csv(f'result/comparison/regression/{file_prefix}_result.csv', index=True)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1276
[LightGBM] [Info] Number of data points in the train set: 467, number of used features: 14
[LightGBM] [Info] Start tra

60 fits failed out of a total of 288.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Tommy\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Tommy\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Tommy\anaconda3\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 751, in fit
    return self._fit(X, y, incremental=False)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Us

                             MSE        RMSE         MAE  R2 Score  \
Linear Regression            0.0         0.0         0.0       1.0   
Ridge                   0.242336    0.492277    0.398738       1.0   
Lasso                   0.013676    0.116946    0.096502       1.0   
KNN                161030.811799  401.286446  296.465121  0.952954   
Decision Tree       30628.274734  175.009356  123.978311  0.991052   
Random Forest       11178.686815  105.729309   65.787632  0.996734   
Gradient Boosting    8478.942997   92.081176     64.2453  0.997523   
XGBoost              9031.190737   95.032577    66.63099  0.997361   
LightGBM            12274.312832  110.789498   72.435986  0.996414   
CatBoost             6256.066985   79.095303   61.673092  0.998172   
MLP                112512.623433  335.429014  264.401499  0.967129   

                  CV Mean RMSE CV Std RMSE Training Time (Best Params)  \
Linear Regression          0.0         0.0                    0.001997   
Ridge      

[I 2024-08-06 10:48:33,429] Trial 0 finished with value: 3675501.5 and parameters: {'hidden_dim_0': 252, 'hidden_dim_1': 241, 'hidden_dim_2': 33, 'learning_rate': 0.03535070946595991, 'batch_size': 256, 'num_epochs': 82}. Best is trial 0 with value: 3675501.5.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-08-06 10:48:34,297] Trial 1 finished with value: 22352352.0 and parameters: {'hidden_dim_0': 102, 'hidden_dim_1': 167, 'hidden_dim_2': 45, 'learning_rate': 0.00011294308168232602, 'batch_size': 256, 'num_epochs': 39}. Best is trial 0 with value: 3675501.5.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-08-06 10:48:39,506] Trial 2 finished with value: 3342298.25 and parameters: {'

                             MSE         RMSE          MAE  R2 Score  \
Linear Regression            0.0          0.0          0.0       1.0   
Ridge                   0.242336     0.492277     0.398738       1.0   
Lasso                   0.013676     0.116946     0.096502       1.0   
KNN                161030.811799   401.286446   296.465121  0.952954   
Decision Tree       30628.274734   175.009356   123.978311  0.991052   
Random Forest       11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting    8478.942997    92.081176      64.2453  0.997523   
XGBoost              9031.190737    95.032577     66.63099  0.997361   
LightGBM            12274.312832   110.789498    72.435986  0.996414   
CatBoost             6256.066985    79.095303    61.673092  0.998172   
MLP                112512.623433   335.429014   264.401499  0.967129   
DNN                    3837166.5  1958.868652  1641.655029 -0.121047   

                  CV Mean RMSE CV Std RMSE Training Time (Best 

  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:49:18,448] Trial 0 finished with value: 38926.41015625 and parameters: {'cross_layers': 2, 'hidden_layer_0': 149, 'hidden_layer_1': 226, 'hidden_layer_2': 249, 'learning_rate': 0.01224087062035887, 'batch_size': 64, 'num_epochs': 36}. Best is trial 0 with value: 38926.41015625.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:49:28,280] Trial 1 finished with value: 18221.07421875 and parameters: {'cross_layers': 4, 'hidden_layer_0': 139, 'hidden_layer_1': 125, 'hidden_layer_2': 75, 'learning_rate': 0.004134234444134273, 'batch_size': 32, 'num_epochs': 81}. Best is trial 1 with value: 18221.07421875.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:49:31,585] Trial 2 finished with value: 163052.84375 and parameters: {'cross_layers': 3, 'hidden_layer_0': 219, 'hidden_layer_1': 160, 'hidden_layer_2': 71, 'learning_rate': 0

                             MSE         RMSE          MAE  R2 Score  \
Linear Regression            0.0          0.0          0.0       1.0   
Ridge                   0.242336     0.492277     0.398738       1.0   
Lasso                   0.013676     0.116946     0.096502       1.0   
KNN                161030.811799   401.286446   296.465121  0.952954   
Decision Tree       30628.274734   175.009356   123.978311  0.991052   
Random Forest       11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting    8478.942997    92.081176      64.2453  0.997523   
XGBoost              9031.190737    95.032577     66.63099  0.997361   
LightGBM            12274.312832   110.789498    72.435986  0.996414   
CatBoost             6256.066985    79.095303    61.673092  0.998172   
MLP                112512.623433   335.429014   264.401499  0.967129   
DNN                    3837166.5  1958.868652  1641.655029 -0.121047   
DCN                 11457.277344   107.038673    84.923981  0.99

  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:50:19,062] Trial 0 finished with value: 312861.40625 and parameters: {'hidden_layer_0': 163, 'hidden_layer_1': 109, 'hidden_layer_2': 204, 'learning_rate': 0.0015575770976062023, 'batch_size': 256, 'num_epochs': 68}. Best is trial 0 with value: 312861.40625.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:50:20,255] Trial 1 finished with value: 39647.0546875 and parameters: {'hidden_layer_0': 96, 'hidden_layer_1': 237, 'hidden_layer_2': 127, 'learning_rate': 0.04651181970325781, 'batch_size': 256, 'num_epochs': 44}. Best is trial 1 with value: 39647.0546875.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:50:22,162] Trial 2 finished with value: 18973.21484375 and parameters: {'hidden_layer_0': 160, 'hidden_layer_1': 213, 'hidden_layer_2': 167, 'learning_rate': 0.019265541176803028, 'batch_size': 64, 'num_epochs': 34}.

                             MSE         RMSE          MAE  R2 Score  \
Linear Regression            0.0          0.0          0.0       1.0   
Ridge                   0.242336     0.492277     0.398738       1.0   
Lasso                   0.013676     0.116946     0.096502       1.0   
KNN                161030.811799   401.286446   296.465121  0.952954   
Decision Tree       30628.274734   175.009356   123.978311  0.991052   
Random Forest       11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting    8478.942997    92.081176      64.2453  0.997523   
XGBoost              9031.190737    95.032577     66.63099  0.997361   
LightGBM            12274.312832   110.789498    72.435986  0.996414   
CatBoost             6256.066985    79.095303    61.673092  0.998172   
MLP                112512.623433   335.429014   264.401499  0.967129   
DNN                    3837166.5  1958.868652  1641.655029 -0.121047   
DCN                 11457.277344   107.038673    84.923981  0.99

  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-08-06 10:51:10,619] Trial 0 finished with value: 9889692.0 and parameters: {'n_estimators': 127, 'max_depth': 8, 'xgb_learning_rate': 0.011959792063634055, 'subsample': 0.7244652389389996, 'colsample_bytree': 0.8689664654973686, 'use_hidden_layer_0': False, 'use_hidden_layer_1': True, 'hidden_layer_1': 147, 'use_hidden_layer_2': True, 'hidden_layer_2': 136, 'nn_learning_rate': 0.00011287203413641882, 'batch_size': 128, 'num_epochs': 32}. Best is trial 0 with value: 9889692.0.
  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_l

                             MSE         RMSE          MAE  R2 Score  \
Linear Regression            0.0          0.0          0.0       1.0   
Ridge                   0.242336     0.492277     0.398738       1.0   
Lasso                   0.013676     0.116946     0.096502       1.0   
KNN                161030.811799   401.286446   296.465121  0.952954   
Decision Tree       30628.274734   175.009356   123.978311  0.991052   
Random Forest       11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting    8478.942997    92.081176      64.2453  0.997523   
XGBoost              9031.190737    95.032577     66.63099  0.997361   
LightGBM            12274.312832   110.789498    72.435986  0.996414   
CatBoost             6256.066985    79.095303    61.673092  0.998172   
MLP                112512.623433   335.429014   264.401499  0.967129   
DNN                    3837166.5  1958.868652  1641.655029 -0.121047   
DCN                 11457.277344   107.038673    84.923981  0.99

[I 2024-08-06 10:51:33,721] Trial 0 finished with value: 2868993.25 and parameters: {'n_estimators': 159, 'max_depth': 4, 'lgb_learning_rate': 0.000671034630683982, 'num_leaves': 24, 'subsample': 0.9183903949102641, 'colsample_bytree': 0.7890466128306823, 'hidden_layer_0': 188, 'hidden_layer_1': 52, 'hidden_layer_2': 217, 'nn_learning_rate': 0.0031833409938854295, 'batch_size': 128, 'num_epochs': 37}. Best is trial 0 with value: 2868993.25.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301


[I 2024-08-06 10:51:35,903] Trial 1 finished with value: 228554.84375 and parameters: {'n_estimators': 59, 'max_depth': 4, 'lgb_learning_rate': 0.029496532955236084, 'num_leaves': 50, 'subsample': 0.6889563629891278, 'colsample_bytree': 0.5425971897863822, 'hidden_layer_0': 186, 'hidden_layer_1': 96, 'hidden_layer_2': 87, 'nn_learning_rate': 0.024683404777309677, 'batch_size': 32, 'num_epochs': 31}. Best is trial 1 with value: 228554.84375.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301


[I 2024-08-06 10:51:36,752] Trial 2 finished with value: 3325210.75 and parameters: {'n_estimators': 130, 'max_depth': 10, 'lgb_learning_rate': 0.0001732649368776703, 'num_leaves': 30, 'subsample': 0.7674263780423947, 'colsample_bytree': 0.7293481512219095, 'hidden_layer_0': 209, 'hidden_layer_1': 148, 'hidden_layer_2': 40, 'nn_learning_rate': 0.0002530426938510906, 'batch_size': 128, 'num_epochs': 22}. Best is trial 1 with value: 228554.84375.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301


[I 2024-08-06 10:51:37,947] Trial 3 finished with value: 73897.109375 and parameters: {'n_estimators': 89, 'max_depth': 7, 'lgb_learning_rate': 0.02467774945633356, 'num_leaves': 32, 'subsample': 0.6727491590082778, 'colsample_bytree': 0.8822756169693712, 'hidden_layer_0': 33, 'hidden_layer_1': 203, 'hidden_layer_2': 42, 'nn_learning_rate': 0.03514830460902628, 'batch_size': 256, 'num_epochs': 59}. Best is trial 3 with value: 73897.109375.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301


[I 2024-08-06 10:51:41,704] Trial 4 finished with value: 19285.455078125 and parameters: {'n_estimators': 132, 'max_depth': 3, 'lgb_learning_rate': 0.09196238369415112, 'num_leaves': 61, 'subsample': 0.9848122774199848, 'colsample_bytree': 0.926831856626592, 'hidden_layer_0': 155, 'hidden_layer_1': 46, 'hidden_layer_2': 71, 'nn_learning_rate': 0.007218946885761279, 'batch_size': 64, 'num_epochs': 100}. Best is trial 4 with value: 19285.455078125.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301


[I 2024-08-06 10:51:42,529] Trial 5 finished with value: 2379060.25 and parameters: {'n_estimators': 92, 'max_depth': 6, 'lgb_learning_rate': 0.002345131754625289, 'num_leaves': 49, 'subsample': 0.788051902656306, 'colsample_bytree': 0.5563092429642966, 'hidden_layer_0': 121, 'hidden_layer_1': 64, 'hidden_layer_2': 40, 'nn_learning_rate': 0.0002594834562767456, 'batch_size': 128, 'num_epochs': 33}. Best is trial 4 with value: 19285.455078125.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301


[I 2024-08-06 10:51:46,160] Trial 6 finished with value: 2481433.75 and parameters: {'n_estimators': 72, 'max_depth': 8, 'lgb_learning_rate': 0.0024385561555397006, 'num_leaves': 40, 'subsample': 0.8937575421405362, 'colsample_bytree': 0.9529447808587821, 'hidden_layer_0': 182, 'hidden_layer_1': 156, 'hidden_layer_2': 71, 'nn_learning_rate': 0.051986027253995724, 'batch_size': 64, 'num_epochs': 74}. Best is trial 4 with value: 19285.455078125.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000098 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301


[I 2024-08-06 10:51:46,634] Trial 7 finished with value: 1136585.625 and parameters: {'n_estimators': 146, 'max_depth': 3, 'lgb_learning_rate': 0.004790133570810796, 'num_leaves': 99, 'subsample': 0.8498604037395934, 'colsample_bytree': 0.6238814587503646, 'hidden_layer_0': 230, 'hidden_layer_1': 138, 'hidden_layer_2': 132, 'nn_learning_rate': 0.00039865899475304966, 'batch_size': 256, 'num_epochs': 18}. Best is trial 4 with value: 19285.455078125.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301


[I 2024-08-06 10:51:49,235] Trial 8 finished with value: 175843.484375 and parameters: {'n_estimators': 130, 'max_depth': 8, 'lgb_learning_rate': 0.013134729535915362, 'num_leaves': 63, 'subsample': 0.8899849784419127, 'colsample_bytree': 0.6826857045878769, 'hidden_layer_0': 237, 'hidden_layer_1': 184, 'hidden_layer_2': 121, 'nn_learning_rate': 0.0004967123921114462, 'batch_size': 256, 'num_epochs': 98}. Best is trial 4 with value: 19285.455078125.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301


[I 2024-08-06 10:51:50,359] Trial 9 finished with value: 3239205.75 and parameters: {'n_estimators': 290, 'max_depth': 4, 'lgb_learning_rate': 0.0001279387134732265, 'num_leaves': 70, 'subsample': 0.7805148632515138, 'colsample_bytree': 0.9067804893857552, 'hidden_layer_0': 174, 'hidden_layer_1': 64, 'hidden_layer_2': 202, 'nn_learning_rate': 0.00018389745449068516, 'batch_size': 256, 'num_epochs': 44}. Best is trial 4 with value: 19285.455078125.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 14
[LightGBM] [Info] Start training from score 4546.361301
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1272
[LightGBM] [Info] Number of data points in the train set: 467, number of used features: 14
[LightGBM] [Info] Start training from score 4612.049251
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000117 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train s

[I 2024-08-06 10:52:10,247] A new study created in memory with name: no-name-029eebe5-de14-468e-bb48-ca95b1fc9c0c


                             MSE         RMSE          MAE  R2 Score  \
Linear Regression            0.0          0.0          0.0       1.0   
Ridge                   0.242336     0.492277     0.398738       1.0   
Lasso                   0.013676     0.116946     0.096502       1.0   
KNN                161030.811799   401.286446   296.465121  0.952954   
Decision Tree       30628.274734   175.009356   123.978311  0.991052   
Random Forest       11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting    8478.942997    92.081176      64.2453  0.997523   
XGBoost              9031.190737    95.032577     66.63099  0.997361   
LightGBM            12274.312832   110.789498    72.435986  0.996414   
CatBoost             6256.066985    79.095303    61.673092  0.998172   
MLP                112512.623433   335.429014   264.401499  0.967129   
DNN                    3837166.5  1958.868652  1641.655029 -0.121047   
DCN                 11457.277344   107.038673    84.923981  0.99

  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:52:11,838] Trial 0 finished with value: 2652992.0 and parameters: {'num_heads': 5, 'embedding_dim': 5, 'num_layers': 2, 'hidden_layer_0': 207, 'hidden_layer_1': 155, 'hidden_layer_2': 191, 'nn_learning_rate': 0.005765542935304501, 'batch_size': 256, 'num_epochs': 25}. Best is trial 0 with value: 2652992.0.
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:52:13,406] Trial 1 finished with value: 19184310.0 and parameters: {'num_heads': 1, 'embedding_dim': 12, 'num_layers': 2, 'hidden_layer_0': 104, 'hidden_layer_1': 185, 'hidden_layer_2': 82, 'nn_learning_rate': 0.0003972802075288308, 'batch_size': 128, 'num_epochs': 27}. Best is trial 0 with value: 2652992.0.
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:52:17,639] Trial 2 finished with value: 573273.125 and parameters: {'num_heads': 2, 'embedding_di

                             MSE         RMSE          MAE  R2 Score  \
Linear Regression            0.0          0.0          0.0       1.0   
Ridge                   0.242336     0.492277     0.398738       1.0   
Lasso                   0.013676     0.116946     0.096502       1.0   
KNN                161030.811799   401.286446   296.465121  0.952954   
Decision Tree       30628.274734   175.009356   123.978311  0.991052   
Random Forest       11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting    8478.942997    92.081176      64.2453  0.997523   
XGBoost              9031.190737    95.032577     66.63099  0.997361   
LightGBM            12274.312832   110.789498    72.435986  0.996414   
CatBoost             6256.066985    79.095303    61.673092  0.998172   
MLP                112512.623433   335.429014   264.401499  0.967129   
DNN                    3837166.5  1958.868652  1641.655029 -0.121047   
DCN                 11457.277344   107.038673    84.923981  0.99

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-08-06 10:53:01,368] Trial 0 finished with value: 3421469.0 and parameters: {'num_heads': 3, 'embedding_dim': 63, 'num_layers': 2, 'hidden_layer_0': 252, 'hidden_layer_1': 243, 'hidden_layer_2': 185, 'nn_learning_rate': 0.00010342671135657405, 'batch_size': 32, 'num_epochs': 73}. Best is trial 0 with value: 3421469.0.
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  nn_learning_rate = trial.suggest_loguniform('nn_learning_rate', 1e-4, 1e-1)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-08-06 10:53:08,681] T

                             MSE         RMSE          MAE  R2 Score  \
Linear Regression            0.0          0.0          0.0       1.0   
Ridge                   0.242336     0.492277     0.398738       1.0   
Lasso                   0.013676     0.116946     0.096502       1.0   
KNN                161030.811799   401.286446   296.465121  0.952954   
Decision Tree       30628.274734   175.009356   123.978311  0.991052   
Random Forest       11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting    8478.942997    92.081176      64.2453  0.997523   
XGBoost              9031.190737    95.032577     66.63099  0.997361   
LightGBM            12274.312832   110.789498    72.435986  0.996414   
CatBoost             6256.066985    79.095303    61.673092  0.998172   
MLP                112512.623433   335.429014   264.401499  0.967129   
DNN                    3837166.5  1958.868652  1641.655029 -0.121047   
DCN                 11457.277344   107.038673    84.923981  0.99

  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-08-06 10:54:25,926] Trial 0 finished with value: 4488808.5 and parameters: {'num_layers': 4, 'hidden_layer_0': 36, 'hidden_layer_1': 152, 'hidden_layer_2': 194, 'hidden_layer_3': 205, 'learning_rate': 0.01350052291519599, 'batch_size': 64, 'num_epochs': 21}. Best is trial 0 with value: 4488808.5.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-08-06 10:54:28,908] Trial 1 finished with value: 3085277.25 and parameters: {'num_layers': 2, 'hidden_layer_0': 88, 'hidden_layer_1': 186, 'learning_rate': 0.047488785766772144, 'batch_size': 64, 'num_epochs': 81}. Best is trial 1 with value: 3085277.25.
  learning_rate = trial.suggest_loguniform(

                                      MSE         RMSE          MAE  R2 Score  \
Linear Regression                     0.0          0.0          0.0       1.0   
Ridge                            0.242336     0.492277     0.398738       1.0   
Lasso                            0.013676     0.116946     0.096502       1.0   
KNN                         161030.811799   401.286446   296.465121  0.952954   
Decision Tree                30628.274734   175.009356   123.978311  0.991052   
Random Forest                11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting             8478.942997    92.081176      64.2453  0.997523   
XGBoost                       9031.190737    95.032577     66.63099  0.997361   
LightGBM                     12274.312832   110.789498    72.435986  0.996414   
CatBoost                      6256.066985    79.095303    61.673092  0.998172   
MLP                         112512.623433   335.429014   264.401499  0.967129   
DNN                         

  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:55:05,090] Trial 0 finished with value: 9680211.0 and parameters: {'hidden_dim': 143, 'learning_rate': 0.007395867885574298, 'batch_size': 256, 'num_epochs': 66}. Best is trial 0 with value: 9680211.0.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:55:06,714] Trial 1 finished with value: 790641.6875 and parameters: {'hidden_dim': 220, 'learning_rate': 0.01001065513729506, 'batch_size': 128, 'num_epochs': 84}. Best is trial 1 with value: 790641.6875.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:55:07,237] Trial 2 finished with value: 1021323.1875 and parameters: {'hidden_dim': 249, 'learning_rate': 0.027279409337392695, 'batch_size': 128, 'num_epochs': 26}. Best is trial 1 with value: 790641.6875.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:55:09,841] Trial 3 finished wi

                                      MSE         RMSE          MAE  R2 Score  \
Linear Regression                     0.0          0.0          0.0       1.0   
Ridge                            0.242336     0.492277     0.398738       1.0   
Lasso                            0.013676     0.116946     0.096502       1.0   
KNN                         161030.811799   401.286446   296.465121  0.952954   
Decision Tree                30628.274734   175.009356   123.978311  0.991052   
Random Forest                11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting             8478.942997    92.081176      64.2453  0.997523   
XGBoost                       9031.190737    95.032577     66.63099  0.997361   
LightGBM                     12274.312832   110.789498    72.435986  0.996414   
CatBoost                      6256.066985    79.095303    61.673092  0.998172   
MLP                         112512.623433   335.429014   264.401499  0.967129   
DNN                         

[I 2024-08-06 10:55:35,739] Trial 0 finished with value: 22289934.0 and parameters: {'num_layers': 3, 'num_trees': 9, 'tree_dim': 22, 'learning_rate': 0.000488390552065308, 'batch_size': 256, 'num_epochs': 68}. Best is trial 0 with value: 22289934.0.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:55:40,243] Trial 1 finished with value: 2410389.25 and parameters: {'num_layers': 4, 'num_trees': 2, 'tree_dim': 45, 'learning_rate': 0.001593274747913012, 'batch_size': 64, 'num_epochs': 85}. Best is trial 1 with value: 2410389.25.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:55:46,636] Trial 2 finished with value: 1460056.25 and parameters: {'num_layers': 4, 'num_trees': 9, 'tree_dim': 60, 'learning_rate': 0.002842355564862037, 'batch_size': 128, 'num_epochs': 53}. Best is trial 2 with value: 1460056.25.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 10:55:47,744] Tria

                                      MSE         RMSE          MAE  R2 Score  \
Linear Regression                     0.0          0.0          0.0       1.0   
Ridge                            0.242336     0.492277     0.398738       1.0   
Lasso                            0.013676     0.116946     0.096502       1.0   
KNN                         161030.811799   401.286446   296.465121  0.952954   
Decision Tree                30628.274734   175.009356   123.978311  0.991052   
Random Forest                11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting             8478.942997    92.081176      64.2453  0.997523   
XGBoost                       9031.190737    95.032577     66.63099  0.997361   
LightGBM                     12274.312832   110.789498    72.435986  0.996414   
CatBoost                      6256.066985    79.095303    61.673092  0.998172   
MLP                         112512.623433   335.429014   264.401499  0.967129   
DNN                         

  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 24452210.38356| val_rmse: 4699.63363|  0:00:00s
epoch 1  | loss: 24217496.08219| val_rmse: 4654.52826|  0:00:02s
epoch 2  | loss: 23688959.72603| val_rmse: 4564.50705|  0:00:03s
epoch 3  | loss: 22578614.28767| val_rmse: 4394.94117|  0:00:04s
epoch 4  | loss: 20718039.78082| val_rmse: 3963.44347|  0:00:05s
epoch 5  | loss: 17054912.93151| val_rmse: 3176.75772|  0:00:06s
epoch 6  | loss: 11635838.08219| val_rmse: 1909.47455|  0:00:07s
epoch 7  | loss: 5556557.08904| val_rmse: 1018.34752|  0:00:08s
epoch 8  | loss: 1772434.12329| val_rmse: 2144.8063|  0:00:09s
epoch 9  | loss: 798684.55565| val_rmse: 2773.9313|  0:00:10s
epoch 10 | loss: 631880.92808| val_rmse: 1450.50534|  0:00:11s
epoch 11 | loss: 547872.94521| val_rmse: 985.79514|  0:00:12s
epoch 12 | loss: 339305.11644| val_rmse: 1246.25962|  0:00:14s
epoch 13 | loss: 320559.12414| val_rmse: 910.63539|  0:00:15s
epoch 14 | loss: 329848.46704| val_rmse: 841.20781|  0:00:16s
epoch 15 | loss: 293487.26199| val_rmse: 742

[I 2024-08-06 10:57:05,736] Trial 0 finished with value: 76199.7500882905 and parameters: {'n_d': 8, 'n_a': 20, 'n_steps': 10, 'gamma': 1.2357305038853414, 'lambda_sparse': 1.2028553619612116e-05, 'learning_rate': 0.07369443477914511, 'batch_size': 64, 'num_epochs': 34}. Best is trial 0 with value: 76199.7500882905.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 24413452.35616| val_rmse: 4662.21144|  0:00:00s
epoch 1  | loss: 24096747.91781| val_rmse: 4545.17092|  0:00:01s
epoch 2  | loss: 23496734.19178| val_rmse: 4458.74753|  0:00:01s
epoch 3  | loss: 22488928.32877| val_rmse: 4321.93886|  0:00:02s
epoch 4  | loss: 21054356.16438| val_rmse: 4134.86253|  0:00:02s
epoch 5  | loss: 19083935.91781| val_rmse: 3873.31716|  0:00:03s
epoch 6  | loss: 16472949.28767| val_rmse: 3516.44022|  0:00:04s
epoch 7  | loss: 13370234.94521| val_rmse: 3004.28952|  0:00:04s
epoch 8  | loss: 10066234.0| val_rmse: 2589.60607|  0:00:05s
epoch 9  | loss: 6950763.09589| val_rmse: 1511.68131|  0:00:05s
epoch 10 | loss: 4203651.22603| val_rmse: 1165.24406|  0:00:06s
epoch 11 | loss: 1837469.48116| val_rmse: 1395.40444|  0:00:06s
epoch 12 | loss: 725363.07577| val_rmse: 2289.79228|  0:00:07s
epoch 13 | loss: 432445.27269| val_rmse: 2576.39484|  0:00:07s
epoch 14 | loss: 410195.5854| val_rmse: 2258.3481|  0:00:08s
epoch 15 | loss: 414475.28168| val_rmse:

[I 2024-08-06 10:57:16,088] Trial 1 finished with value: 1357793.7128748728 and parameters: {'n_d': 14, 'n_a': 55, 'n_steps': 8, 'gamma': 1.1467907717737122, 'lambda_sparse': 0.0008453940954853845, 'learning_rate': 0.078293614917899, 'batch_size': 128, 'num_epochs': 18}. Best is trial 0 with value: 76199.7500882905.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 24486133.75342| val_rmse: 4731.08483|  0:00:00s
epoch 1  | loss: 24485065.53425| val_rmse: 4731.10512|  0:00:00s
epoch 2  | loss: 24484192.30137| val_rmse: 4731.10133|  0:00:00s
epoch 3  | loss: 24483564.82192| val_rmse: 4730.96262|  0:00:00s
epoch 4  | loss: 24482556.60274| val_rmse: 4730.93655|  0:00:01s
epoch 5  | loss: 24482091.94521| val_rmse: 4730.92259|  0:00:01s
epoch 6  | loss: 24480974.82192| val_rmse: 4730.88377|  0:00:01s
epoch 7  | loss: 24480501.50685| val_rmse: 4730.8717|  0:00:02s
epoch 8  | loss: 24479470.73973| val_rmse: 4730.7897|  0:00:02s
epoch 9  | loss: 24478917.45205| val_rmse: 4730.67139|  0:00:02s
epoch 10 | loss: 24477880.73973| val_rmse: 4730.58492|  0:00:02s
epoch 11 | loss: 24477315.53425| val_rmse: 4730.52335|  0:00:02s
epoch 12 | loss: 24476212.63014| val_rmse: 4730.49947|  0:00:03s
epoch 13 | loss: 24475692.60274| val_rmse: 4730.44644|  0:00:03s
epoch 14 | loss: 24474866.82192| val_rmse: 4730.36838|  0:00:03s
epoch 15 | loss: 24473854.8

[I 2024-08-06 10:57:25,681] Trial 2 finished with value: 22355799.065775596 and parameters: {'n_d': 8, 'n_a': 48, 'n_steps': 3, 'gamma': 1.2834916001409709, 'lambda_sparse': 4.920167058145946e-05, 'learning_rate': 0.0005914133483117685, 'batch_size': 128, 'num_epochs': 39}. Best is trial 0 with value: 76199.7500882905.


epoch 38 | loss: 24453136.10959| val_rmse: 4728.19194|  0:00:09s
Stop training because you reached max_epochs = 39 with best_epoch = 38 and best_val_rmse = 4728.19194


  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 24480732.54795| val_rmse: 4729.77081|  0:00:00s
epoch 1  | loss: 24467553.75342| val_rmse: 4727.84109|  0:00:01s
epoch 2  | loss: 24451686.76712| val_rmse: 4726.90527|  0:00:01s
epoch 3  | loss: 24438625.09589| val_rmse: 4725.62572|  0:00:02s
epoch 4  | loss: 24427010.9589| val_rmse: 4724.59537|  0:00:02s
epoch 5  | loss: 24412820.13699| val_rmse: 4724.0054|  0:00:03s
epoch 6  | loss: 24403594.63014| val_rmse: 4722.80717|  0:00:04s
epoch 7  | loss: 24392263.34247| val_rmse: 4722.24709|  0:00:04s
epoch 8  | loss: 24382890.27397| val_rmse: 4721.12929|  0:00:05s
epoch 9  | loss: 24372352.9589| val_rmse: 4719.83091|  0:00:05s
epoch 10 | loss: 24362125.9726| val_rmse: 4718.92372|  0:00:06s
epoch 11 | loss: 24351390.82192| val_rmse: 4717.92682|  0:00:06s
epoch 12 | loss: 24342890.63014| val_rmse: 4717.17622|  0:00:07s
epoch 13 | loss: 24338006.13699| val_rmse: 4716.68049|  0:00:08s
epoch 14 | loss: 24330807.67123| val_rmse: 4715.95697|  0:00:08s
epoch 15 | loss: 24318998.520

[I 2024-08-06 10:57:42,514] Trial 3 finished with value: 22088987.204420976 and parameters: {'n_d': 28, 'n_a': 10, 'n_steps': 10, 'gamma': 1.5433950905710887, 'lambda_sparse': 4.5040512233346874e-05, 'learning_rate': 0.0030057199503047993, 'batch_size': 128, 'num_epochs': 28}. Best is trial 0 with value: 76199.7500882905.
  lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-6, 1e-3)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)


epoch 0  | loss: 24194045.23288| val_rmse: 4639.36141|  0:00:01s
epoch 1  | loss: 23182136.93151| val_rmse: 4484.09677|  0:00:03s
epoch 2  | loss: 21296910.19178| val_rmse: 4261.9565|  0:00:05s
epoch 3  | loss: 18329377.56164| val_rmse: 3920.79005|  0:00:07s
epoch 4  | loss: 14170822.13699| val_rmse: 3229.0747|  0:00:09s
epoch 5  | loss: 9001659.46404| val_rmse: 2482.45138|  0:00:11s
epoch 6  | loss: 4746198.55137| val_rmse: 1812.74439|  0:00:13s
epoch 7  | loss: 1873058.25342| val_rmse: 921.70611|  0:00:14s
epoch 8  | loss: 854688.23716| val_rmse: 739.16533|  0:00:16s
epoch 9  | loss: 980503.25342| val_rmse: 659.14614|  0:00:18s
epoch 10 | loss: 826414.8476| val_rmse: 677.52522|  0:00:20s
epoch 11 | loss: 798523.35788| val_rmse: 555.56248|  0:00:22s
epoch 12 | loss: 573836.73031| val_rmse: 532.15729|  0:00:23s
epoch 13 | loss: 618201.31571| val_rmse: 468.65609|  0:00:25s
epoch 14 | loss: 572446.72774| val_rmse: 452.31771|  0:00:27s
epoch 15 | loss: 609627.91096| val_rmse: 422.4022|  0

[I 2024-08-06 10:58:39,802] Trial 4 finished with value: 97539.57682783014 and parameters: {'n_d': 50, 'n_a': 35, 'n_steps': 9, 'gamma': 1.116697820972655, 'lambda_sparse': 2.6334513984166876e-05, 'learning_rate': 0.019131552859410053, 'batch_size': 32, 'num_epochs': 29}. Best is trial 0 with value: 76199.7500882905.


epoch 0  | loss: 24452210.38356| val_rmse: 4699.63363|  0:00:01s
epoch 1  | loss: 24217496.08219| val_rmse: 4654.52826|  0:00:02s
epoch 2  | loss: 23688959.72603| val_rmse: 4564.50705|  0:00:03s
epoch 3  | loss: 22578614.28767| val_rmse: 4394.94117|  0:00:04s
epoch 4  | loss: 20718039.78082| val_rmse: 3963.44347|  0:00:05s
epoch 5  | loss: 17054912.93151| val_rmse: 3176.75772|  0:00:05s
epoch 6  | loss: 11635838.08219| val_rmse: 1909.47455|  0:00:06s
epoch 7  | loss: 5556557.08904| val_rmse: 1018.34752|  0:00:07s
epoch 8  | loss: 1772434.12329| val_rmse: 2144.8063|  0:00:08s
epoch 9  | loss: 798684.55565| val_rmse: 2773.9313|  0:00:09s
epoch 10 | loss: 631880.92808| val_rmse: 1450.50534|  0:00:10s
epoch 11 | loss: 547872.94521| val_rmse: 985.79514|  0:00:11s
epoch 12 | loss: 339305.11644| val_rmse: 1246.25962|  0:00:12s
epoch 13 | loss: 320559.12414| val_rmse: 910.63539|  0:00:13s
epoch 14 | loss: 329848.46704| val_rmse: 841.20781|  0:00:14s
epoch 15 | loss: 293487.26199| val_rmse: 742



epoch 0  | loss: 25169468.10278| val_rmse: 4634.35733|  0:00:00s
epoch 1  | loss: 24999813.30193| val_rmse: 4605.66987|  0:00:01s
epoch 2  | loss: 24792012.30407| val_rmse: 4564.68171|  0:00:02s
epoch 3  | loss: 24441288.6167| val_rmse: 4504.18548|  0:00:03s
epoch 4  | loss: 23889182.68951| val_rmse: 4371.63852|  0:00:03s
epoch 5  | loss: 22598617.75161| val_rmse: 4061.71298|  0:00:04s
epoch 6  | loss: 20115911.48394| val_rmse: 3570.03648|  0:00:05s
epoch 7  | loss: 16286042.11349| val_rmse: 2737.0023|  0:00:06s
epoch 8  | loss: 11496987.93576| val_rmse: 1389.00026|  0:00:06s
epoch 9  | loss: 7094881.79015| val_rmse: 1240.31717|  0:00:07s
epoch 10 | loss: 3389781.08137| val_rmse: 2095.9284|  0:00:08s
epoch 11 | loss: 1332766.15538| val_rmse: 2364.6902|  0:00:09s
epoch 12 | loss: 804508.03239| val_rmse: 2313.56318|  0:00:09s
epoch 13 | loss: 723574.74762| val_rmse: 1531.84139|  0:00:10s
epoch 14 | loss: 896832.00187| val_rmse: 1210.2204|  0:00:11s
epoch 15 | loss: 939486.89601| val_rmse



epoch 0  | loss: 24022977.05782| val_rmse: 5106.86792|  0:00:00s
epoch 1  | loss: 23853603.24197| val_rmse: 5079.44603|  0:00:01s
epoch 2  | loss: 23587466.26981| val_rmse: 5009.64075|  0:00:02s
epoch 3  | loss: 23071381.4818| val_rmse: 4920.32872|  0:00:03s
epoch 4  | loss: 22126108.12848| val_rmse: 4734.16062|  0:00:04s
epoch 5  | loss: 20528513.88437| val_rmse: 4385.87282|  0:00:04s
epoch 6  | loss: 18140814.62527| val_rmse: 3983.69233|  0:00:05s
epoch 7  | loss: 15113480.46467| val_rmse: 3271.50572|  0:00:06s
epoch 8  | loss: 11156376.73233| val_rmse: 2394.34973|  0:00:06s
epoch 9  | loss: 7068091.76338| val_rmse: 1393.096|  0:00:07s
epoch 10 | loss: 3221020.48796| val_rmse: 1060.0737|  0:00:08s
epoch 11 | loss: 763399.01164| val_rmse: 1982.5137|  0:00:09s
epoch 12 | loss: 739029.53466| val_rmse: 1990.16649|  0:00:11s
epoch 13 | loss: 430174.3298| val_rmse: 2193.3237|  0:00:12s
epoch 14 | loss: 229050.01077| val_rmse: 2101.11711|  0:00:14s
epoch 15 | loss: 311527.60278| val_rmse: 1



epoch 0  | loss: 24747712.65525| val_rmse: 4811.90522|  0:00:00s
epoch 1  | loss: 24557667.4454| val_rmse: 4782.56199|  0:00:01s
epoch 2  | loss: 24273730.55246| val_rmse: 4739.589|  0:00:02s
epoch 3  | loss: 23649539.01071| val_rmse: 4682.37813|  0:00:03s
epoch 4  | loss: 22652369.66167| val_rmse: 4549.31562|  0:00:03s
epoch 5  | loss: 21356676.70664| val_rmse: 4348.59452|  0:00:05s
epoch 6  | loss: 19334711.32762| val_rmse: 4035.45819|  0:00:05s
epoch 7  | loss: 16228246.19486| val_rmse: 3299.68273|  0:00:06s
epoch 8  | loss: 12037929.55567| val_rmse: 2453.85656|  0:00:07s
epoch 9  | loss: 7195992.21039| val_rmse: 1596.89912|  0:00:08s
epoch 10 | loss: 3545655.49518| val_rmse: 1025.51965|  0:00:08s
epoch 11 | loss: 1339541.21547| val_rmse: 1576.31228|  0:00:09s
epoch 12 | loss: 713916.54216| val_rmse: 3177.95412|  0:00:10s
epoch 13 | loss: 871446.77797| val_rmse: 2450.64964|  0:00:11s
epoch 14 | loss: 671382.32468| val_rmse: 1300.88421|  0:00:12s
epoch 15 | loss: 482878.0273| val_rms



epoch 0  | loss: 23906812.66381| val_rmse: 5148.89825|  0:00:00s
epoch 1  | loss: 23752080.88651| val_rmse: 5110.82522|  0:00:01s
epoch 2  | loss: 23499087.45182| val_rmse: 5067.97158|  0:00:02s
epoch 3  | loss: 23089232.82227| val_rmse: 5005.11234|  0:00:03s
epoch 4  | loss: 22268926.61884| val_rmse: 4838.43999|  0:00:04s
epoch 5  | loss: 20586522.10278| val_rmse: 4537.17993|  0:00:05s
epoch 6  | loss: 18113000.7409| val_rmse: 4145.85509|  0:00:06s
epoch 7  | loss: 14726377.80728| val_rmse: 3287.68825|  0:00:08s
epoch 8  | loss: 10221076.97002| val_rmse: 2294.47 |  0:00:09s
epoch 9  | loss: 5188571.50749| val_rmse: 1074.07164|  0:00:10s
epoch 10 | loss: 1565902.23608| val_rmse: 1276.1466|  0:00:11s
epoch 11 | loss: 652345.85934| val_rmse: 3324.45964|  0:00:12s
epoch 12 | loss: 459559.01934| val_rmse: 3008.62838|  0:00:13s
epoch 13 | loss: 408233.44841| val_rmse: 2649.63594|  0:00:13s
epoch 14 | loss: 437759.57317| val_rmse: 2137.59463|  0:00:14s
epoch 15 | loss: 411092.74993| val_rmse



epoch 0  | loss: 24498725.57265| val_rmse: 4920.85647|  0:00:00s
epoch 1  | loss: 24331322.90598| val_rmse: 4870.4998|  0:00:01s
epoch 2  | loss: 24052455.7265| val_rmse: 4847.83747|  0:00:02s
epoch 3  | loss: 23510567.62393| val_rmse: 4769.44624|  0:00:02s
epoch 4  | loss: 22564583.24786| val_rmse: 4569.64282|  0:00:03s
epoch 5  | loss: 20996712.18803| val_rmse: 4206.66165|  0:00:04s
epoch 6  | loss: 18488050.0| val_rmse: 3727.02999|  0:00:05s
epoch 7  | loss: 15182956.94017| val_rmse: 3004.43316|  0:00:05s
epoch 8  | loss: 11095453.80342| val_rmse: 1734.46439|  0:00:06s
epoch 9  | loss: 6745256.61752| val_rmse: 903.92626|  0:00:07s
epoch 10 | loss: 3025149.5| val_rmse: 2414.38468|  0:00:08s
epoch 11 | loss: 891748.80101| val_rmse: 3096.08453|  0:00:09s
epoch 12 | loss: 618228.76656| val_rmse: 3533.15333|  0:00:09s
epoch 13 | loss: 585663.2703| val_rmse: 3127.70878|  0:00:10s
epoch 14 | loss: 606631.04808| val_rmse: 2297.85575|  0:00:11s
epoch 15 | loss: 479162.47569| val_rmse: 1549.1

[I 2024-08-06 11:01:40,724] A new study created in memory with name: no-name-9b091097-93c8-4796-b545-3b00490a06cd


                                      MSE         RMSE          MAE  R2 Score  \
Linear Regression                     0.0          0.0          0.0       1.0   
Ridge                            0.242336     0.492277     0.398738       1.0   
Lasso                            0.013676     0.116946     0.096502       1.0   
KNN                         161030.811799   401.286446   296.465121  0.952954   
Decision Tree                30628.274734   175.009356   123.978311  0.991052   
Random Forest                11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting             8478.942997    92.081176      64.2453  0.997523   
XGBoost                       9031.190737    95.032577     66.63099  0.997361   
LightGBM                     12274.312832   110.789498    72.435986  0.996414   
CatBoost                      6256.066985    79.095303    61.673092  0.998172   
MLP                         112512.623433   335.429014   264.401499  0.967129   
DNN                         

  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 11:01:57,007] Trial 0 finished with value: 5691730.5 and parameters: {'heads': 5, 'dim': 205, 'depth': 5, 'mlp_dim': 245, 'dropout': 0.1675405283586982, 'learning_rate': 0.012337207550717549, 'batch_size': 128, 'num_epochs': 31}. Best is trial 0 with value: 5691730.5.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 11:02:15,790] Trial 1 finished with value: 3447522.75 and parameters: {'heads': 8, 'dim': 176, 'depth': 6, 'mlp_dim': 182, 'dropout': 0.29067775782058936, 'learning_rate': 0.027944919564903945, 'batch_size': 64, 'num_epochs': 26}. Best is trial 1 with value: 3447522.75.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
[I 2024-08-06 11:02:22,351] Trial 2 finished with value: 22112586.0 and parameters: {'heads': 5, 'dim': 205, 'depth': 4, 'mlp_dim': 189, 'dropout': 0.40429882267265616, 'learning_rate': 0.0007160417220854537, 'batch_

                                      MSE         RMSE          MAE  R2 Score  \
Linear Regression                     0.0          0.0          0.0       1.0   
Ridge                            0.242336     0.492277     0.398738       1.0   
Lasso                            0.013676     0.116946     0.096502       1.0   
KNN                         161030.811799   401.286446   296.465121  0.952954   
Decision Tree                30628.274734   175.009356   123.978311  0.991052   
Random Forest                11178.686815   105.729309    65.787632  0.996734   
Gradient Boosting             8478.942997    92.081176      64.2453  0.997523   
XGBoost                       9031.190737    95.032577     66.63099  0.997361   
LightGBM                     12274.312832   110.789498    72.435986  0.996414   
CatBoost                      6256.066985    79.095303    61.673092  0.998172   
MLP                         112512.623433   335.429014   264.401499  0.967129   
DNN                         