In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
random_state = 42
# Encoded dataset containing both the features and the "DR" target; community is coded 0-9
raw_dataset = pd.read_csv("./original_dataset/processed_data_encoded.csv")
X = raw_dataset.drop(columns=["BMI", "DR"])
Y = pd.DataFrame(raw_dataset["DR"])

# Hold out a stratified 10% as the final test set; the remaining 90% is kept for the folds
X_FOR_FOLDS, X_FINAL_TEST, Y_FOR_FOLDS, Y_FINAL_TEST = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=random_state,
    stratify=Y,
)
# Re-join features and target of the fold portion into one frame with a fresh 0..n-1 index
df = pd.concat([X_FOR_FOLDS, Y_FOR_FOLDS], axis=1).reset_index(drop=True)

In [2]:
import torch
import torch.nn as nn
class FeedForwardBlock(nn.Module):
    """Linear -> BatchNorm1d -> activation stack, optionally followed by Dropout.

    Args:
        in_features: size of the incoming feature dimension.
        out_features: size produced by the linear layer and normalised by BatchNorm1d.
        dropout: dropout probability; ``None`` or any value <= 0 leaves the Dropout layer out.
        activation: zero-argument callable returning the activation module (``nn.ReLU6`` by default).
    """

    def __init__(self, in_features, out_features, dropout=None, activation=nn.ReLU6):
        super().__init__()
        wants_dropout = bool(dropout) and dropout > 0
        # Sub-modules are created in forward order so their indices inside the Sequential are stable
        stack = [
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            activation(),
        ] + ([nn.Dropout(dropout)] if wants_dropout else [])
        self.block = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        out = self.block(x)
        return out
def print_activations(x, name):
    """Print the minimum and maximum of tensor ``x`` labelled with ``name``; return ``x`` unchanged."""
    lowest = x.min().item()
    highest = x.max().item()
    print(f"{name}: min={lowest}, max={highest}")
    return x
class MyModel(nn.Module):
    """Three stacked ``FeedForwardBlock``s followed by a single-unit linear head.

    Args:
        input_dim: number of input features.
        hidden_dim: width of the first block.
        output_dim: width of the third block (the input width of the final linear head).
        dropout: dropout probability for the third block; the second block uses half of it
            and the first block uses none. ``None`` disables dropout everywhere.
        hidden_dim2: width of the second block; falls back to ``hidden_dim`` when ``None``.

    ``forward`` returns the raw output of the final ``nn.Linear`` (shape ``(..., 1)``);
    no sigmoid/softmax is applied inside the model.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout= None, hidden_dim2= None):
        super().__init__()

        # The signature allows both of these to be None, so resolve them before building layers:
        # nn.Linear cannot take a None width and None / 2 raises TypeError.
        if hidden_dim2 is None:
            hidden_dim2 = hidden_dim
        half_dropout = dropout / 2 if dropout is not None else None

        # A couple of FeedForward blocks with progressively stronger dropout
        self.block1 = FeedForwardBlock(input_dim, hidden_dim, dropout=None)
        self.block2 = FeedForwardBlock(hidden_dim, hidden_dim2, dropout=half_dropout)
        self.block3 = FeedForwardBlock(hidden_dim2, output_dim, dropout)

        # Final single-unit head (regression or binary classification on one output)
        self.output_layer = nn.Linear(output_dim, 1)

    def forward(self, x):
        """Pass ``x`` through the three blocks and the linear head."""
        # For debugging, wrap any intermediate in print_activations(x, "after blockN")
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.output_layer(x)  # Final linear layer
        return x

#* Smoke test: build a small model and print its layer structure
test_model = MyModel(input_dim=20, hidden_dim=64, hidden_dim2=16, output_dim=4, dropout=0.2)
print(test_model)


MyModel(
  (block1): FeedForwardBlock(
    (block): Sequential(
      (0): Linear(in_features=20, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6()
    )
  )
  (block2): FeedForwardBlock(
    (block): Sequential(
      (0): Linear(in_features=64, out_features=16, bias=True)
      (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6()
      (3): Dropout(p=0.1, inplace=False)
    )
  )
  (block3): FeedForwardBlock(
    (block): Sequential(
      (0): Linear(in_features=16, out_features=4, bias=True)
      (1): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6()
      (3): Dropout(p=0.2, inplace=False)
    )
  )
  (output_layer): Linear(in_features=4, out_features=1, bias=True)
)


In [3]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using", device)
def train_and_evaluate(model, criterion, optimiser, scheduler, train_loader, val_loader, epochs=20, patience=5, device=device, threshold = 0.5, warmup_epochs=30):
    """Train ``model`` with early stopping and score the best checkpoint on the validation data.

    Args:
        model: network whose outputs are treated as logits (a sigmoid is applied when scoring).
        criterion: loss module called as ``criterion(outputs, labels)``; it is moved to ``device``.
        optimiser: optimiser, stepped once per training batch.
        scheduler: learning-rate scheduler, also stepped once per training batch.
        train_loader: sized iterable of ``(inputs, labels)`` training batches.
        val_loader: sized iterable of ``(inputs, labels)`` validation batches.
        epochs: maximum number of training epochs.
        patience: number of non-improving epochs (after warm-up) tolerated before stopping.
        device: device the criterion is moved to. The model and the batches are NOT moved here
            -- presumably the caller has already placed them; verify against caller.
        threshold: probability cut-off at or above which a sample is predicted positive.
        warmup_epochs: number of initial epochs during which no checkpoint is recorded and
            early stopping cannot trigger.

    Returns:
        ``(model, accuracy, precision, recall, f1, auc)``. ``model`` carries the weights of the
        best recorded validation epoch, or its final weights if training ended before any
        checkpoint was recorded (``epochs <= warmup_epochs``). ``auc`` is computed from the
        predicted probabilities; the other metrics from the thresholded predictions.
    """
    best_val_loss = float('inf')
    best_model_state = None
    wait = 0
    criterion.to(device) #? Move criterion to device

    # The between-epoch pos_weight update rebuilds a BCEWithLogitsLoss, so it is only applied
    # when that is what the caller supplied; any other criterion is used unchanged throughout.
    adaptive_pos_weight = isinstance(criterion, nn.BCEWithLogitsLoss)

    # Global print option (set once): never summarise tensors when they are printed
    torch.set_printoptions(threshold=float('inf'))

    #* Epoch Training loop for this fold
    for epoch in range(1, epochs + 1):
        #* Set model to training mode: essential for dropout and batch norm layers
        model.train()
        running_loss = 0.0 #? loss for this epoch
        #* Mini-batch training loop
        for inputs, labels in train_loader:
            optimiser.zero_grad() #? Zero the gradients

            assert not torch.isnan(inputs).any(), "Input has NaNs"
            assert not torch.isinf(inputs).any(), "Input has Infs"
            outputs = model(inputs) #? Forward pass through the model
            assert not torch.isnan(outputs).any(), "Model output has NaNs"
            assert not torch.isinf(outputs).any(), "Model output has Infs"
            loss = criterion(outputs, labels) #? Calculate loss
            assert not torch.isnan(loss).any(), "Model loss has NaNs"
            loss.backward() #? Backpropagation
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            running_loss += loss.item()
            optimiser.step() #? Update weights
            scheduler.step()

        train_loss = running_loss / len(train_loader)

        #* Evaluate on the validation set to track training vs validation loss
        model.eval() #? Set model to evaluation mode
        val_loss = 0.0
        with torch.no_grad(): #? No need to track gradients during evaluation
            for inputs, labels in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
        avg_val_loss = val_loss / len(val_loader)

        # Re-weight the positive class by the (per-batch average) validation/training loss ratio.
        # Skipped when the training loss is zero to avoid dividing by zero.
        if adaptive_pos_weight and train_loss > 0:
            pos_weight = avg_val_loss / train_loss
            criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight]).to(device))

        # Early stopping (only active once the warm-up epochs are over)
        if epoch > warmup_epochs:
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                # state_dict() hands back references to the live tensors; clone them so later
                # optimiser steps cannot overwrite the stored checkpoint.
                best_model_state = {
                    key: value.detach().clone() for key, value in model.state_dict().items()
                }
                wait = 0
            elif avg_val_loss * 0.95 <= best_val_loss:
                # Within 5% of the best loss: counts as "not worse", so reset patience
                wait = 0
            else:
                wait += 1
        if wait >= patience:
            print(f"Early stopping triggered at epoch {epoch}, best val loss: {best_val_loss:.4f}")
            break
        print(f"Epoch: {epoch}".ljust(12), f"training loss:{train_loss:.3f}".ljust(16), f"best_val_loss:{best_val_loss:.3f}".ljust(12), f"Val Loss: {avg_val_loss:.3f}", f"Scheduler lr: {scheduler.get_last_lr()}".ljust(50),end="\r")

    #* Use the best model to calculate metrics on the validation set
    if best_model_state is not None:
        model.load_state_dict(best_model_state) #? Load the best model state
    model.eval()

    # Collect every validation batch so the metrics cover the whole set, not just the last batch
    label_chunks, probability_chunks, prediction_chunks = [], [], []
    with torch.no_grad():
        for inputs, labels in val_loader:
            probabilities = torch.sigmoid(model(inputs))
            label_chunks.append(labels.cpu())
            probability_chunks.append(probabilities.cpu())
            prediction_chunks.append((probabilities >= threshold).float().cpu())

    labels = torch.cat(label_chunks).numpy().ravel()
    probabilities = torch.cat(probability_chunks).numpy().ravel()
    predictions = torch.cat(prediction_chunks).numpy().ravel()

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, pos_label=1, zero_division=0)
    recall = recall_score(labels, predictions, pos_label=1, zero_division=0)
    f1 = f1_score(labels, predictions, pos_label=1, zero_division=0)
    # ROC AUC is a ranking metric: score it on probabilities, not on thresholded labels
    auc = roc_auc_score(labels, probabilities)

    return model, accuracy, precision, recall, f1, auc

Using cuda


In [4]:
from Training_Helper_Functions import *
from Preprocessing_Functions import * 
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
import numpy as np
import optuna
from torch import optim
def maximise_combined_score(trial):
    """Optuna objective: build resampled folds, train ``MyModel`` and return the mean F1.

    The trial samples the outlier-detector contamination rates, the synthesizer settings,
    dropout, decision threshold, weight decay and the focal-loss ``alpha``/``gamma``.
    Several parameters are suggested over a single-value range (e.g. ``hidden_dim``), which
    pins them while still recording them in the trial.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        The average F1 over the evaluated folds. Because of the ``break`` at the end of the
        fold loop, only the first fold is currently evaluated.

    NOTE(review): this calls the ``train_and_evaluate`` bound at call time, which may be the
    one from ``Training_Helper_Functions`` if its star import redefines it -- confirm.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    # Upper bound on training epochs passed to train_and_evaluate (distinct from the
    # synthesizer "epochs" hyperparameter sampled below)
    max_train_epochs = 10000
    random_state = 42

    # Scaler
    scaler = RobustScaler() # Or StandardScaler() - You can also make this a hyperparameter if you want

    contamination_majority = trial.suggest_float("contamination_majority", 0.01, 0.4)
    contamination_minority = trial.suggest_float("contamination_minority", 0.01, 0.2)
    kFolds = FOLDS_GENERATOR(
        df,
        n_splits=5,
        random_state=random_state,
        OD_majority=IsolationForest(contamination=contamination_majority, random_state=random_state),
        OD_minority=IsolationForest(contamination=contamination_minority, random_state=random_state),
        oversampler_first=trial.suggest_categorical("oversampler_first", [False]),
        synthesizer=trial.suggest_categorical("synthesizer", ["TVAE", "CTGAN"]),
        epochs=trial.suggest_int("epochs", 100, 1000, step=100),
        n_synthetic_data=trial.suggest_int("n_synthetic_data", 1000, 10000, step=1000),
        scaler=scaler,
    ) # Pass outlier models and scaler

    # Model hyperparameters (first-level optimization)
    hidden_dim = trial.suggest_int("hidden_dim", 512, 512, step=32)
    hidden_dim2 = trial.suggest_int("hidden_dim2", 256, 256, step=32)
    output_dim = trial.suggest_int("output_dim", 96, 96, step=2)

    dropout = trial.suggest_float("dropout", 0.0, 0.4)
    threshold = trial.suggest_float("threshold", 0.3, 0.8)
    initial_lr = trial.suggest_float("initial_lr", 1e-4, 1e-4, log=True)
    max_lr = trial.suggest_float("max_lr", 1e-4, 1e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)

    # Loss function hyperparameters. All three start as None so every branch (including the
    # fallback) leaves them defined for criterion_mapping below.
    criterion_choice = "FocalLoss"
    pos_weight = None
    alpha = None
    gamma = None
    if criterion_choice == "BCEWithLogitsLoss":
        pos_weight = trial.suggest_int("pos_weight", 1, 1)
    elif criterion_choice == "FocalLoss":
        alpha = trial.suggest_float("alpha", 0.25, 0.75)
        gamma = trial.suggest_float("gamma", 1.0, 5.0)

    # Per-fold metrics
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    auc_list = []

    # Cross-validation loop
    for fold, (train_x, test_x, train_y, test_y) in enumerate(kFolds, start=1):
        print(f"Fold {fold}:")
        # Create DataLoader for current fold
        train_loader, val_loader = fold_to_dataloader_tensor(
            train_x, test_x, train_y, test_y, batch_size=64, device=device
        )
        # Instantiate and initialize the model
        model = MyModel(
            input_dim=get_feature_count(train_loader),
            hidden_dim=hidden_dim,
            hidden_dim2=hidden_dim2,
            output_dim=output_dim,
            dropout=dropout,
        )
        model.to(device)
        model.apply(init_weights)

        # Map the choice to the actual loss function
        criterion = criterion_mapping(criterion_choice, pos_weight, alpha, gamma)
        optimiser = optim.Adam(model.parameters(), lr=initial_lr, weight_decay=weight_decay)
        # NOTE(review): CyclicLR drives the learning rate between base_lr and max_lr, so the
        # fixed base_lr=1e-5 takes precedence over `initial_lr` -- confirm this is intended.
        scheduler = torch.optim.lr_scheduler.CyclicLR(
            optimiser,
            base_lr=1e-5,
            max_lr=max_lr,
            cycle_momentum=False)

        # Train and evaluate the model on the current fold
        model, accuracy, precision, recall, f1, auc = train_and_evaluate(
            model, criterion, optimiser, scheduler, train_loader, val_loader,
            epochs=max_train_epochs, patience=100, device=device, threshold=threshold
        )
        print(f"Accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}, auc: {auc:.4f}")
        # Drop references to the fold's model and loaders before the next fold
        del model
        del train_loader
        del val_loader

        # Append the metrics from the current fold
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
        auc_list.append(auc)
        # NOTE(review): stops after the first fold, so the "average" below is a single-fold
        # score -- remove this break to score all folds.
        break

    # Average metrics across the evaluated folds
    avg_accuracy = np.mean(accuracy_list)
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_list)
    avg_auc = np.mean(auc_list)

    # Combine metrics into a single "score"
    # combined_score = (avg_f1 + avg_precision + avg_recall + avg_accuracy + avg_auc) / 5
    combined_score = avg_f1

    return combined_score

Using cuda


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# import threading
# import optuna
# from optuna_dashboard import run_server

# def start_dashboard():
#     run_server(storage)

# storage = optuna.storages.InMemoryStorage()
# study = optuna.create_study(direction="maximize", storage=storage, study_name="Basic")

# # Start dashboard in a separate thread
# dashboard_thread = threading.Thread(target=start_dashboard, daemon=True)
# dashboard_thread.start()

# # Run optimization
# study.optimize(maximise_combined_score, n_trials=30)

# # After optimization, print results
# print("Best trial:")
# trial = study.best_trial
# print(f"  Combined score: {trial.value}")
# print("  Best hyperparameters:")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")


In [6]:
import threading
import optuna
from optuna_dashboard import run_server
# !fuser -k 8080/tcp

# Persistent storage for the study
storage = "sqlite:///opt6.db"

# Create the study on first run, or load it when it already exists in the storage
study_name = "optuna6"
study = optuna.create_study(
    study_name=study_name,
    direction="maximize",
    storage=storage,
    load_if_exists=True,
)

# Start Optuna Dashboard in a separate daemon thread so it does not block the optimisation.
# Re-running this cell must not launch a second server on the same port, so only start one
# when no previous dashboard thread is still alive in this kernel.
if "dashboard_thread" not in globals() or not dashboard_thread.is_alive():
    dashboard_thread = threading.Thread(target=lambda: run_server(storage), daemon=True)
    dashboard_thread.start()

# Run optimization
# Ensure the 'DR' column exists in the DataFrame
if 'DR' not in df.columns:
    raise KeyError("'DR' column is missing in the DataFrame. Please ensure it is present before running the optimization.")

study.optimize(maximise_combined_score, n_trials=1000)

# Print results
print("Best trial:")
trial = study.best_trial
print(f"  Combined score: {trial.value}")
print("  Best hyperparameters:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64


Bottle v0.13.2 server starting up (using WSGIRefServer())...
Listening on http://localhost:8080/
Hit Ctrl-C to quit.



After OD, majority: 3508
After OD, minority: 402
Before oversampling & synthetic data: DR 
0.0    3508
1.0     402
Name: count, dtype: int64
No balancing condition applied


Gen. (-3.60) | Discrim. (-0.25): 100%|██████████| 900/900 [12:39<00:00,  1.19it/s]



Applying SMOTENC oversampling...
                Age        Gender     Community          UAlb           Ucr  \
count  20270.000000  20270.000000  20270.000000  20270.000000  20270.000000   
mean      63.727532      0.631722      3.960582     20.331354   3607.456854   
std        6.959672      0.482349      2.807606     39.342070   5239.106526   
min       36.000000      0.000000      0.000000      0.100000      1.000000   
25%       59.000000      0.000000      2.000000      4.184483      4.396464   
50%       64.000000      1.000000      4.000000      9.063473     76.000000   
75%       68.357314      1.000000      6.000000     18.000000   7773.608541   
max       92.000000      1.000000      9.000000    743.200000  21612.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  20270.000000  20270.000000  20270.000000  20270.000000  20270.000000   
mean      23.776713      5.361760      1.432968      5.129870      3.501887   
std       57.0524

[I 2025-04-20 03:23:05,545] Trial 8 finished with value: 0.27251732101616627 and parameters: {'contamination_majority': 0.15021697850474855, 'contamination_minority': 0.13233392596415058, 'oversampler_first': False, 'synthesizer': 'CTGAN', 'epochs': 900, 'n_synthetic_data': 8000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.38946861215834033, 'threshold': 0.6584412288432107, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 1.574611387172148e-06, 'alpha': 0.316893542580654, 'gamma': 3.7027944858375617}. Best is trial 8 with value: 0.27251732101616627.


Early stopping triggered at epoch 101, best val loss: 0.0474
Accuracy: 0.7258, precision: 0.1861, recall: 0.5086, f1: 0.2725, auc: 0.6294
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 3144
After OD, minority: 391
Before oversampling & synthetic data: DR 
0.0    3144
1.0     391
Name: count, dtype: int64
No balancing condition applied


Gen. (-3.95) | Discrim. (-0.25): 100%|██████████| 300/300 [03:49<00:00,  1.31it/s]



Applying SMOTENC oversampling...
                Age        Gender     Community          UAlb           Ucr  \
count  21372.000000  21372.000000  21372.000000  21372.000000  21372.000000   
mean      64.286581      0.669240      4.160444     31.842400   3322.294502   
std        6.326107      0.470498      2.945617     43.836541   5086.263665   
min       36.000000      0.000000      0.000000      0.100000      1.000000   
25%       60.862722      0.000000      2.000000      5.600000      1.000000   
50%       64.373740      1.000000      4.000000     13.800000     29.082433   
75%       68.352838      1.000000      7.000000     37.992704   6024.499548   
max       92.000000      1.000000      9.000000    579.200000  21612.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  21372.000000  21372.000000  21372.000000  21372.000000  21372.000000   
mean      22.612860      5.067257      1.490302      4.306000      3.228803   
std       44.4174

[I 2025-04-20 03:29:46,206] Trial 9 finished with value: 0.2736842105263158 and parameters: {'contamination_majority': 0.23861170649269892, 'contamination_minority': 0.15654091518236513, 'oversampler_first': False, 'synthesizer': 'CTGAN', 'epochs': 300, 'n_synthetic_data': 10000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.3569178698105516, 'threshold': 0.6709228786237011, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 0.000175474918734877, 'alpha': 0.5726982593417291, 'gamma': 3.0367612828221273}. Best is trial 9 with value: 0.2736842105263158.


Early stopping triggered at epoch 101, best val loss: 0.0451
Accuracy: 0.7598, precision: 0.1970, recall: 0.4483, f1: 0.2737, auc: 0.6215
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 2624
After OD, minority: 386
Before oversampling & synthetic data: DR 
0.0    2624
1.0     386
Name: count, dtype: int64
No balancing condition applied


Loss: -3.542: 100%|██████████| 700/700 [01:08<00:00, 10.15it/s]



Applying SMOTENC oversampling...
                Age        Gender     Community          UAlb           Ucr  \
count  19676.000000  19676.000000  19676.000000  19676.000000  19676.000000   
mean      63.234350      0.520533      4.581165     27.678497   2973.074380   
std        6.434681      0.499591      3.443760     43.244719   4729.288833   
min       36.000000      0.000000      0.000000      0.100000      1.000000   
25%       59.000000      0.000000      1.000000      5.388990      5.507641   
50%       64.000000      1.000000      5.000000     11.948979     18.000000   
75%       68.000000      1.000000      8.000000     29.125291   5618.500000   
max       92.000000      1.000000      9.000000    579.200000  21406.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  19676.000000  19676.000000  19676.000000  19676.000000  19676.000000   
mean      35.026480      5.226429      1.531858      4.499504      3.277258   
std       69.5359

[I 2025-04-20 03:33:26,154] Trial 10 finished with value: 0.31140350877192985 and parameters: {'contamination_majority': 0.36440847556669387, 'contamination_minority': 0.1666859063609198, 'oversampler_first': False, 'synthesizer': 'TVAE', 'epochs': 700, 'n_synthetic_data': 8000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.22071422861933293, 'threshold': 0.3613903167751564, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 3.479181528123841e-05, 'alpha': 0.5966234271313775, 'gamma': 4.826243998827755}. Best is trial 10 with value: 0.31140350877192985.


Early stopping triggered at epoch 101, best val loss: 0.0622
Accuracy: 0.7267, precision: 0.2088, recall: 0.6121, f1: 0.3114, auc: 0.6758
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 3743
After OD, minority: 400
Before oversampling & synthetic data: DR 
0.0    3743
1.0     400
Name: count, dtype: int64
No balancing condition applied


Gen. (-3.86) | Discrim. (0.37): 100%|██████████| 400/400 [06:04<00:00,  1.10it/s] 



Applying SMOTENC oversampling...
                Age        Gender     Community          UAlb           Ucr  \
count  20480.000000  20480.000000  20480.000000  20480.000000  20480.000000   
mean      64.803974      0.505371      3.960010     24.864483   3563.782994   
std        6.387233      0.499983      2.842104     44.186971   5548.064988   
min       36.000000      0.000000      0.000000      0.100000      1.000000   
25%       61.000000      0.000000      1.000000      4.500000      1.000000   
50%       65.000000      1.000000      4.000000     10.800000     13.000000   
75%       69.000000      1.000000      6.000000     25.100000   6639.000000   
max       92.000000      1.000000      9.000000    743.200000  21612.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  20480.000000  20480.000000  20480.000000  20480.000000  20480.000000   
mean      31.784570      5.332882      1.444258      4.502561      3.338774   
std       60.6651

[I 2025-04-20 03:42:11,351] Trial 11 finished with value: 0.26785714285714285 and parameters: {'contamination_majority': 0.09347636814387006, 'contamination_minority': 0.1376385578197567, 'oversampler_first': False, 'synthesizer': 'CTGAN', 'epochs': 400, 'n_synthetic_data': 8000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.30767711325996694, 'threshold': 0.7064846371685825, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 1.5751711978868503e-05, 'alpha': 0.42801786924273644, 'gamma': 4.262391319788994}. Best is trial 10 with value: 0.31140350877192985.


Early stopping triggered at epoch 101, best val loss: 0.0270
Accuracy: 0.7859, precision: 0.2045, recall: 0.3879, f1: 0.2679, auc: 0.6093
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 2686
After OD, minority: 432
Before oversampling & synthetic data: DR 
0.0    2686
1.0     432
Name: count, dtype: int64
No balancing condition applied


Gen. (-3.63) | Discrim. (-0.07): 100%|██████████| 100/100 [01:09<00:00,  1.43it/s]



Applying SMOTENC oversampling...
                Age        Gender     Community          UAlb           Ucr  \
count  19508.000000  19508.000000  19508.000000  19508.000000  19508.000000   
mean      62.466747      0.472165      4.335965     42.416610   6253.750644   
std        7.328257      0.499237      2.808639     79.400377   6561.770504   
min       36.000000      0.000000      0.000000      0.100000      1.000000   
25%       58.000000      0.000000      2.000000     10.600000     17.000000   
50%       62.232475      0.000000      4.000000     19.000000   4731.897364   
75%       67.000000      1.000000      7.000000     34.900000  11276.500000   
max       92.000000      1.000000      9.000000    804.400000  21612.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  19508.000000  19508.000000  19508.000000  19508.000000  19508.000000   
mean      32.426278      5.149449      1.517541      5.213474      3.278851   
std       84.7823

[I 2025-04-20 03:46:18,190] Trial 12 finished with value: 0.2507836990595611 and parameters: {'contamination_majority': 0.34948158022741505, 'contamination_minority': 0.06880033836275448, 'oversampler_first': False, 'synthesizer': 'CTGAN', 'epochs': 100, 'n_synthetic_data': 10000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.22980910173224448, 'threshold': 0.7299757346167486, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 0.0005616159846069074, 'alpha': 0.6436548259943973, 'gamma': 2.535446310244834}. Best is trial 10 with value: 0.31140350877192985.


Early stopping triggered at epoch 101, best val loss: 0.0735
Accuracy: 0.7920, precision: 0.1970, recall: 0.3448, f1: 0.2508, auc: 0.5935
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 3750
After OD, minority: 418
Before oversampling & synthetic data: DR 
0.0    3750
1.0     418
Name: count, dtype: int64
No balancing condition applied


Loss: -4.731: 100%|██████████| 1000/1000 [02:26<00:00,  6.81it/s]



Applying SMOTENC oversampling...
                Age       Gender     Community          UAlb           Ucr  \
count  26444.000000  26444.00000  26444.000000  26444.000000  26444.000000   
mean      63.445500      0.52095      4.387687     28.498645   3078.681710   
std        7.237485      0.49957      2.923765     49.175075   4986.763579   
min       36.000000      0.00000      0.000000      0.100000      1.000000   
25%       57.918611      0.00000      2.000000      6.300000      4.075326   
50%       64.000000      1.00000      4.000000     12.284973     12.077442   
75%       69.000000      1.00000      7.000000     28.790159   5104.000000   
max       92.000000      1.00000      9.000000    743.200000  21612.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  26444.000000  26444.000000  26444.000000  26444.000000  26444.000000   
mean      37.033843      5.262373      1.555843      4.447261      3.243586   
std       73.520988      0

[I 2025-04-20 03:52:08,322] Trial 13 finished with value: 0.24415584415584415 and parameters: {'contamination_majority': 0.09167246266803016, 'contamination_minority': 0.09763134290899277, 'oversampler_first': False, 'synthesizer': 'TVAE', 'epochs': 1000, 'n_synthetic_data': 10000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.032374092175299476, 'threshold': 0.7061348333247492, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 2.2775039913916632e-05, 'alpha': 0.26303810229712793, 'gamma': 4.381132269825942}. Best is trial 10 with value: 0.31140350877192985.


Early stopping triggered at epoch 101, best val loss: 0.0791
Accuracy: 0.7467, precision: 0.1747, recall: 0.4052, f1: 0.2442, auc: 0.5951
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 3764
After OD, minority: 429
Before oversampling & synthetic data: DR 
0.0    3764
1.0     429
Name: count, dtype: int64
No balancing condition applied


Loss: 17.063: 100%|██████████| 300/300 [00:44<00:00,  6.81it/s]



Applying SMOTENC oversampling...
                Age        Gender     Community          UAlb           Ucr  \
count  25060.000000  25060.000000  25060.000000  25060.000000  25060.000000   
mean      64.033484      0.561253      4.369314     38.944037   3390.209463   
std        7.115336      0.496244      3.089659     69.808625   5278.953564   
min       36.000000      0.000000      0.000000      0.100000      1.000000   
25%       58.936735      0.000000      2.000000      6.022186      8.000000   
50%       64.570969      1.000000      4.000000     14.000000     27.000000   
75%       69.202804      1.000000      7.000000     35.600000   5893.250000   
max       92.000000      1.000000      9.000000    804.400000  21612.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  25060.000000  25060.000000  25060.000000  25060.000000  25060.000000   
mean      54.216495      5.265660      1.564289      4.462741      3.242040   
std      112.4116

[I 2025-04-20 03:56:04,886] Trial 14 finished with value: 0.24930747922437674 and parameters: {'contamination_majority': 0.0882122429276147, 'contamination_minority': 0.07549747945788421, 'oversampler_first': False, 'synthesizer': 'TVAE', 'epochs': 300, 'n_synthetic_data': 9000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.05419787963167862, 'threshold': 0.6177616301702312, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 6.430939497617512e-05, 'alpha': 0.5220249977579714, 'gamma': 2.4367042389655893}. Best is trial 10 with value: 0.31140350877192985.


Early stopping triggered at epoch 101, best val loss: 0.0910
Accuracy: 0.7641, precision: 0.1837, recall: 0.3879, f1: 0.2493, auc: 0.5972
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 3020
After OD, minority: 389
Before oversampling & synthetic data: DR 
0.0    3020
1.0     389
Name: count, dtype: int64
No balancing condition applied


Loss: -0.362: 100%|██████████| 600/600 [01:07<00:00,  8.91it/s]



Applying SMOTENC oversampling...
                Age        Gender     Community          UAlb           Ucr  \
count  15346.000000  15346.000000  15346.000000  15346.000000  15346.000000   
mean      63.659678      0.551610      4.380360     27.343251   3179.418542   
std        6.872849      0.497346      2.920103     47.581588   4947.380146   
min       36.000000      0.000000      0.000000      0.100000      1.000000   
25%       59.000000      0.000000      2.000000      5.799271      5.000000   
50%       64.000000      1.000000      4.000000     12.400000     14.000000   
75%       68.374668      1.000000      7.000000     28.721352   6031.000000   
max       92.000000      1.000000      9.000000    579.200000  21612.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  15346.000000  15346.000000  15346.000000  15346.000000  15346.000000   
mean      38.190983      5.256019      1.469136      4.357191      3.244171   
std       82.4502

[I 2025-04-20 03:59:29,797] Trial 15 finished with value: 0.25806451612903225 and parameters: {'contamination_majority': 0.2684181595160283, 'contamination_minority': 0.16142373570522398, 'oversampler_first': False, 'synthesizer': 'TVAE', 'epochs': 600, 'n_synthetic_data': 5000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.10723672154587219, 'threshold': 0.4623827402855553, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 3.6405554998036576e-06, 'alpha': 0.44271380090463985, 'gamma': 1.4201276947510975}. Best is trial 10 with value: 0.31140350877192985.


Early stopping triggered at epoch 101, best val loss: 0.1046
Accuracy: 0.7398, precision: 0.1812, recall: 0.4483, f1: 0.2581, auc: 0.6104
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 3577
After OD, minority: 433
Before oversampling & synthetic data: DR 
0.0    3577
1.0     433
Name: count, dtype: int64
No balancing condition applied


Gen. (-3.87) | Discrim. (0.05): 100%|██████████| 500/500 [07:14<00:00,  1.15it/s] 



Applying SMOTENC oversampling...
               Age       Gender    Community         UAlb           Ucr  \
count  8764.000000  8764.000000  8764.000000  8764.000000   8764.000000   
mean     63.583436     0.472958     4.267115    33.915327   4179.199434   
std       6.717980     0.499297     2.956987    61.563867   5592.047919   
min      36.000000     0.000000     0.000000     0.100000      1.000000   
25%      59.468633     0.000000     2.000000     5.800000      6.000000   
50%      64.000000     0.000000     4.000000    13.055662     16.000000   
75%      68.000000     1.000000     7.000000    30.912765   7919.703430   
max      92.000000     1.000000     9.000000   804.400000  21612.000000   

              UACR           TC           TG         TCTG         LDLC  \
count  8764.000000  8764.000000  8764.000000  8764.000000  8764.000000   
mean     39.324108     5.193275     1.645470     4.476721     3.333495   
std      92.385673     0.957094     1.044818     2.193227     0.8766

[I 2025-04-20 04:08:09,884] Trial 16 finished with value: 0.2560386473429952 and parameters: {'contamination_majority': 0.13350000246196955, 'contamination_minority': 0.06511384855049375, 'oversampler_first': False, 'synthesizer': 'CTGAN', 'epochs': 500, 'n_synthetic_data': 1000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.027051775019285487, 'threshold': 0.7421586126544666, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 1.1584984744557185e-06, 'alpha': 0.6007009866448916, 'gamma': 4.070817717019664}. Best is trial 10 with value: 0.31140350877192985.


Early stopping triggered at epoch 101, best val loss: 0.0401
Accuracy: 0.7319, precision: 0.1779, recall: 0.4569, f1: 0.2560, auc: 0.6099
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 2610
After OD, minority: 378
Before oversampling & synthetic data: DR 
0.0    2610
1.0     378
Name: count, dtype: int64
No balancing condition applied


Loss: -2.405: 100%|██████████| 700/700 [01:07<00:00, 10.33it/s]



Applying SMOTENC oversampling...
                Age        Gender     Community          UAlb           Ucr  \
count  18754.000000  18754.000000  18754.000000  18754.000000  18754.000000   
mean      63.278412      0.506719      4.086808     34.130288   3325.108925   
std        6.506807      0.499968      3.088595     51.271001   4706.847869   
min       36.000000      0.000000      0.000000      0.100000      1.000000   
25%       58.542238      0.000000      1.000000      6.700000      5.000000   
50%       63.520094      1.000000      4.000000     15.000000     13.000000   
75%       68.000000      1.000000      7.000000     36.747193   6513.750000   
max       92.000000      1.000000      9.000000    579.200000  21406.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  18754.000000  18754.000000  18754.000000  18754.000000  18754.000000   
mean      41.606587      5.253787      1.504384      4.479052      3.282440   
std       70.1885

[I 2025-04-20 04:11:44,963] Trial 17 finished with value: 0.31140350877192985 and parameters: {'contamination_majority': 0.3677957934944276, 'contamination_minority': 0.1853724964612426, 'oversampler_first': False, 'synthesizer': 'TVAE', 'epochs': 700, 'n_synthetic_data': 7000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.18550673087954428, 'threshold': 0.30579345693132365, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 0.00010945611253891039, 'alpha': 0.7171065064908012, 'gamma': 4.997157203710595}. Best is trial 10 with value: 0.31140350877192985.


Early stopping triggered at epoch 101, best val loss: 0.0369
Accuracy: 0.7267, precision: 0.2088, recall: 0.6121, f1: 0.3114, auc: 0.6758
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 2478
After OD, minority: 372
Before oversampling & synthetic data: DR 
0.0    2478
1.0     372
Name: count, dtype: int64
No balancing condition applied


Loss: 0.959: 100%|██████████| 700/700 [01:09<00:00, 10.09it/s] 



Applying SMOTENC oversampling...
                Age        Gender     Community          UAlb           Ucr  \
count  17382.000000  17382.000000  17382.000000  17382.000000  17382.000000   
mean      62.754943      0.565182      3.925555     23.035883   2618.238704   
std        6.833126      0.495747      3.149550     42.479395   4397.636455   
min       36.000000      0.000000      0.000000      0.100000      1.000000   
25%       57.000000      0.000000      1.000000      5.218618      2.656958   
50%       62.529859      1.000000      4.000000      9.800000     14.000000   
75%       68.000000      1.000000      7.000000     21.100000   4516.500000   
max       92.000000      1.000000      9.000000    509.800000  21406.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  17382.000000  17382.000000  17382.000000  17382.000000  17382.000000   
mean      27.492605      5.294046      1.451746      4.418661      3.302407   
std       54.9348

[I 2025-04-20 04:15:12,166] Trial 18 finished with value: 0.30303030303030304 and parameters: {'contamination_majority': 0.3997959726280539, 'contamination_minority': 0.19760227650362944, 'oversampler_first': False, 'synthesizer': 'TVAE', 'epochs': 700, 'n_synthetic_data': 7000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.1893991178514301, 'threshold': 0.3033899976082144, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 8.921769103648588e-05, 'alpha': 0.738700985930516, 'gamma': 4.950084658554453}. Best is trial 10 with value: 0.31140350877192985.


Early stopping triggered at epoch 101, best val loss: 0.0198
Accuracy: 0.6797, precision: 0.1942, recall: 0.6897, f1: 0.3030, auc: 0.6841
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 2810
After OD, minority: 372
Before oversampling & synthetic data: DR 
0.0    2810
1.0     372
Name: count, dtype: int64
No balancing condition applied


Loss: -3.804: 100%|██████████| 800/800 [01:31<00:00,  8.78it/s]



Applying SMOTENC oversampling...
                Age        Gender     Community          UAlb           Ucr  \
count  17006.000000  17006.000000  17006.000000  17006.000000  17006.000000   
mean      64.088426      0.531695      4.202517     31.226735   3249.790758   
std        6.584280      0.499009      2.947672     50.081098   4791.578947   
min       36.000000      0.000000      0.000000      0.100000      1.000000   
25%       59.541029      0.000000      2.000000      7.851886      5.000000   
50%       64.532528      1.000000      4.000000     15.200000     13.374327   
75%       69.000000      1.000000      7.000000     30.493021   5906.883464   
max       92.000000      1.000000      9.000000    509.800000  21612.000000   

               UACR            TC            TG          TCTG          LDLC  \
count  17006.000000  17006.000000  17006.000000  17006.000000  17006.000000   
mean      36.217419      5.262489      1.519368      4.625401      3.290117   
std       68.4226

[I 2025-04-20 04:18:57,932] Trial 19 finished with value: 0.288135593220339 and parameters: {'contamination_majority': 0.31934634909443355, 'contamination_minority': 0.19847624714754336, 'oversampler_first': False, 'synthesizer': 'TVAE', 'epochs': 800, 'n_synthetic_data': 6000, 'hidden_dim': 512, 'hidden_dim2': 256, 'output_dim': 96, 'dropout': 0.1890473137698285, 'threshold': 0.310353568379071, 'initial_lr': 0.0001, 'max_lr': 0.0001, 'weight_decay': 0.00032951332417461177, 'alpha': 0.7487986308480048, 'gamma': 4.832263121016547}. Best is trial 10 with value: 0.31140350877192985.


Early stopping triggered at epoch 101, best val loss: 0.0299
Accuracy: 0.7076, precision: 0.1910, recall: 0.5862, f1: 0.2881, auc: 0.6537
Using device: cuda
Original class distribution: DR
0.0    4129
1.0     464
Name: count, dtype: int64
After OD, majority: 2519
After OD, minority: 384
Before oversampling & synthetic data: DR 
0.0    2519
1.0     384
Name: count, dtype: int64
No balancing condition applied


Loss: 15.466:  44%|████▎     | 305/700 [00:31<00:40,  9.73it/s]
[W 2025-04-20 04:20:00,646] Trial 20 failed with parameters: {'contamination_majority': 0.38992761803719844, 'contamination_minority': 0.17177103086362977, 'oversampler_first': False, 'synthesizer': 'TVAE', 'epochs': 700, 'n_synthetic_data': 3000} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/jovyan/.local/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_3786/3368046877.py", line 17, in maximise_combined_score
    kFolds = FOLDS_GENERATOR(df, n_splits=5, random_state=42,
  File "/home/jovyan/ADL2/Preprocessing_Functions.py", line 197, in FOLDS_GENERATOR
    X_train_processed = Synthetic_Data_Generator(X_train_processed, synthesizer=synthesizer, conditions=None, epochs=epochs, batch_size=512, n_synthetic_data=n_synthetic_data)
                     

KeyboardInterrupt: 