In [1]:
import optuna
import torch
import numpy as np
import tqdm
import sklearn
import networkx as nx
import random
import warnings
import time

  from .autonotebook import tqdm as notebook_tqdm


# 1. Model definition

In [2]:
class AutoEncoder(torch.nn.Module):
    """
    Autoencoder with a single hidden layer and sigmoid activations.

    A linear encoder maps `input_dim` features to `hidden_dim` features and a
    linear decoder maps them back to `input_dim`; a sigmoid follows each of them.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        linear = torch.nn.Linear
        # Encoder is built first, then the decoder (fixes the parameter order)
        self.encoder = linear(input_dim, hidden_dim)
        self.decoder = linear(hidden_dim, input_dim)

    def forward(self, x):
        """Return the tuple (hidden code, reconstruction) computed from x."""
        latent = self.encoder(x).sigmoid()
        reconstruction = self.decoder(latent).sigmoid()
        return latent, reconstruction

In [3]:
class GraphEncoder(torch.nn.Module):
    """
    Stack of AutoEncoder layers.

    Layer i maps the output size of layer i-1 (or `input_dim` for the first
    layer) to `hidden_dims[i]`. The layers are stored in `self.autoencoders`.
    """

    def __init__(self, input_dim, hidden_dims):
        super().__init__()
        self.autoencoders = torch.nn.ModuleList()
        # Consecutive pairs of sizes give the (input, output) size of each layer
        layer_dims = [input_dim, *hidden_dims]
        for in_dim, out_dim in zip(layer_dims, layer_dims[1:]):
            self.autoencoders.append(AutoEncoder(in_dim, out_dim))

    def forward(self, x):
        """Return (encoded, decoded): the innermost code and the full reconstruction."""
        # Encoding pass: apply every encoder, first layer to last
        hidden = x
        for layer in self.autoencoders:
            hidden = torch.sigmoid(layer.encoder(hidden))
        encoded = hidden

        # Decoding pass: apply every decoder, last layer to first
        reconstruction = encoded
        for layer in reversed(self.autoencoders):
            reconstruction = torch.sigmoid(layer.decoder(reconstruction))

        return encoded, reconstruction

# 2. Test on benchmark "Wine"

In [4]:
def compute_ncut(s, labels):
    r"""
    Compute the normalized cut of a clustering for a given similarity matrix:
      Ncut = sum_k cut(C_k, V\C_k) / assoc(C_k, V)
    where
      cut(C, V\C) = sum_{i in C, j not in C} s[i,j]
      assoc(C, V) = sum_{i in C, j in V} s[i,j]  (i.e., volume of C)

    Parameters
    ----------
    s : array-like of shape (n, n)
        Adjacency/similarity matrix. Row sums are used as node degrees.
    labels : array-like of length n
        Cluster label of each node (numpy array, list, tuple, ...).

    Returns
    -------
    float
        The Ncut value. Clusters whose volume is zero contribute 0.

    Raises
    ------
    ValueError
        If s is not a square 2-D array, if the number of labels does not match
        the number of rows of s, or if a label matches no node (e.g. NaN labels).
    """

    # Convert up front: comparing a plain list/tuple with `==` yields a single
    # bool instead of an element-wise mask, which would select no node at all
    s = np.asarray(s)
    labels = np.asarray(labels).ravel()

    if s.ndim != 2 or s.shape[0] != s.shape[1]:
        raise ValueError(f"compute_ncut: s must be a square 2-D array, got shape {s.shape}.")
    if labels.shape[0] != s.shape[0]:
        raise ValueError(
            f"compute_ncut: got {labels.shape[0]} labels for {s.shape[0]} nodes."
        )

    # Precompute degrees (volume per node)
    degrees = s.sum(axis=1)

    # Accumulate the contribution of every cluster
    ncut = 0.0
    for lab in np.unique(labels):

        # Indices of the nodes assigned to cluster lab
        idx = np.flatnonzero(labels == lab)
        if idx.size == 0:
            # Only reachable for labels that do not compare equal to themselves (NaN)
            raise ValueError("compute_ncut: empty cluster found in labels.")

        # Volume = sum of the degrees of the nodes in the cluster
        volume = degrees[idx].sum()

        # A zero-volume cluster has no outgoing weight either: its cut term is 0
        if volume == 0:
            continue

        # link = sum over i in C, j not in C, of s[i,j] = volume - internal connections
        internal_connections = s[np.ix_(idx, idx)].sum()
        link = volume - internal_connections

        ncut += link / volume

    return ncut

# Turn scikit-learn's ConvergenceWarning into a raised exception so that code
# can react to it with try/except instead of it only being reported
warnings.filterwarnings(
    action="error",
    category=sklearn.exceptions.ConvergenceWarning,
)

## 2.1. Data loading

In [5]:
# Load the Wine benchmark: x holds the features, y the ground-truth class labels
x, y = sklearn.datasets.load_wine(as_frame=False, return_X_y=True)

# Min-max scale the features, then build the pairwise cosine-similarity matrix s
x = sklearn.preprocessing.MinMaxScaler().fit_transform(x)
s = sklearn.metrics.pairwise.cosine_similarity(x, x)

# nts: s with every row divided by its row sum
nts = s / s.sum(axis=1, keepdims=True)
print("[*] nts.shape:", nts.shape)
print("[*] number of clusters:", len(set(y)))

# Baseline: KMeans run directly on the rows of nts, scored with NMI against y
# and with the normalized cut on nts
y_pred = sklearn.cluster.KMeans(
    n_clusters=len(set(y)),
    n_init=100,
    random_state=97,
).fit_predict(nts)
nmi = sklearn.metrics.normalized_mutual_info_score(y, y_pred)
ncut = compute_ncut(nts, y_pred)
print("[*] nmi:", nmi)
print("[*] ncut:", ncut)

[*] nts.shape: (178, 178)
[*] number of clusters: 3
[*] nmi: 0.6351524906645799
[*] ncut: 1.8980409224697135


In [6]:
# Baseline: spectral clustering fed with s as a precomputed affinity matrix;
# the normalized cut is still evaluated on the row-normalized matrix nts
y_pred = sklearn.cluster.SpectralClustering(
    n_clusters=len(set(y)),
    affinity='precomputed',
    assign_labels='kmeans',
    n_init=100,
    random_state=97,
).fit_predict(s)
nmi = sklearn.metrics.normalized_mutual_info_score(y, y_pred)
ncut = compute_ncut(nts, y_pred)
print("[*] nmi:", nmi)
print("[*] ncut:", ncut)

[*] nmi: 0.7126929756859989
[*] ncut: 1.8958455727716612


In [7]:
def objective(trial):
    """
    Optuna objective: train a stacked sparse autoencoder layer by layer on
    `x_train`, cluster the innermost codes with KMeans and return the
    normalized cut of that clustering (to be minimized).

    Hyper-parameters suggested through `trial` (in this order):
      - dim_decay_rate : ratio between the sizes of two consecutive layers
      - n_layers       : number of stacked autoencoders that are kept
      - rho, beta      : target activation and weight of the sparsity penalty
      - lr             : AdamW learning rate

    Each layer is trained on the codes produced by the previous one until the
    loss changed by less than `loss_tolerance` for `stab_tolerance` consecutive
    iterations, or until `max_time_per_layer` seconds have elapsed.

    Module-level names that are read: x_train, y, nts, device, batch_size,
    loss_tolerance, stab_tolerance, max_time_per_layer.
    Module-level names that are updated: best_nmi, best_ncut, best_ncut_nmi.

    Returns
    -------
    float
        The normalized cut computed by compute_ncut(nts, y_pred), or
        float('inf') when KMeans emits a ConvergenceWarning.
    """

    # Only the best-result trackers are rebound here; the other settings are read-only
    global best_nmi
    global best_ncut
    global best_ncut_nmi

    # Print trial number
    print(f"\ntrial {trial.number}----------------------------")

    # Set random seeds
    torch.manual_seed(97)
    np.random.seed(97)
    random.seed(97)

    n_clusters = len(set(y))

    # Suggest a decay rate for hidden dimensions
    dim_decay_rate = trial.suggest_float("dim_decay_rate", 0.6, 0.9, step=0.05)

    # Shrink the layer size geometrically for as long as the next size would
    # still be at least the number of clusters
    latent_dim = int(x_train.shape[1] * dim_decay_rate)
    hidden_dims = [latent_dim]
    while latent_dim * dim_decay_rate >= n_clusters:
        latent_dim = int(latent_dim * dim_decay_rate)
        hidden_dims.append(latent_dim)

    # Suggest the number of layers and keep only that many
    n_layers = trial.suggest_int("n_layers", 1, len(hidden_dims), step=1)
    hidden_dims = hidden_dims[:n_layers]

    # Create the model using the hidden dimensions
    model = GraphEncoder(input_dim=x_train.shape[1], hidden_dims=hidden_dims).to(device)

    # Suggest rho and beta for the sparsity constraint
    rho = trial.suggest_float("rho", 1e-4, 1e-1, log=True)
    beta = trial.suggest_float("beta", 1e-2, 1e3, log=True)

    # Suggest a learning rate for the optimizers
    lr = trial.suggest_float("lr", 1e-3, 1e-2, log=True)

    # Print some hyper parameters
    print("> hidden dims =", hidden_dims)
    print("> rho =", rho)
    print("> beta =", beta)

    # Keeps the mean activation strictly inside (0, 1) so that the logarithms
    # and divisions of the sparsity penalty stay finite when sigmoids saturate
    rho_hat_eps = 1e-6

    # Input of the layer currently being trained (codes of the previous layer)
    current_x_train = x_train.clone().to(device)

    # Greedy layer-wise training
    for layer_number, autoencoder in enumerate(model.autoencoders):

        # One optimizer per layer: a step can only ever modify the layer being
        # trained, never the layers whose codes were already computed
        optimizer = torch.optim.AdamW(autoencoder.parameters(), lr=lr, weight_decay=1e-4)

        dataloader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(current_x_train),
            batch_size=batch_size,
            shuffle=True
        )
        dataloader_iter = iter(dataloader)

        stab = 0
        last_loss = None
        start_time = time.time()

        # The context manager closes the bar on every exit path, exceptions included
        with tqdm.tqdm(desc=f"layer: {layer_number}") as pb:
            while True:
                # Cycle through the dataloader indefinitely
                try:
                    (x_batch,) = next(dataloader_iter)
                except StopIteration:
                    dataloader_iter = iter(dataloader)
                    (x_batch,) = next(dataloader_iter)
                x_batch = x_batch.to(device)

                optimizer.zero_grad()
                encoded, decoded = autoencoder(x_batch)

                # Reconstruction error
                loss_1 = torch.nn.functional.mse_loss(decoded, x_batch, reduction='sum')

                # Sparsity penalty: KL divergence between Bernoulli(rho) and the
                # mean activation of every hidden unit over the batch
                rho_hat = torch.mean(encoded, dim=0).clamp(min=rho_hat_eps, max=1 - rho_hat_eps)
                loss_2 = torch.sum(
                    rho * torch.log(rho / rho_hat)
                    + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat))
                )

                loss = loss_1 + beta * loss_2
                loss.backward()
                optimizer.step()

                # Read the scalar once (each .item() call synchronizes with the device)
                loss_value = loss.item()

                # Stop criterion 1: time budget of the layer is exhausted
                elapsed_time = time.time() - start_time
                if elapsed_time > max_time_per_layer:
                    print(f"[!] stopping layer {layer_number} training after {elapsed_time:.2f}s (> {max_time_per_layer}s)")
                    break

                # Count consecutive iterations whose loss barely changed
                if last_loss is not None and abs(last_loss - loss_value) < loss_tolerance:
                    stab += 1
                else:
                    stab = 0
                last_loss = loss_value

                # Refresh the bar before leaving so that it shows the final state
                pb.set_postfix({"loss": loss_value, "stab": stab})
                pb.update(1)

                # Stop criterion 2: the loss has been stable for long enough
                if stab >= stab_tolerance:
                    break

        # The codes of this layer become the training data of the next one
        with torch.no_grad():
            current_x_train, _ = autoencoder(current_x_train)

    try:
        # Evaluate the model
        with torch.no_grad():

            # Get the encoded representations
            encoded, _ = model(x_train)
            encoded = encoded.to('cpu')

            # Raise on ConvergenceWarning locally so that the handler below works
            # regardless of the process-wide warning filters
            with warnings.catch_warnings():
                warnings.simplefilter("error", category=sklearn.exceptions.ConvergenceWarning)
                y_pred = sklearn.cluster.KMeans(n_clusters=n_clusters, n_init=100, random_state=97).fit_predict(encoded.numpy())

            nmi = sklearn.metrics.normalized_mutual_info_score(y, y_pred)
            ncut = compute_ncut(nts, y_pred)

    except sklearn.exceptions.ConvergenceWarning:
        print("[!] KMeans did not converge (not enough distinct points) --> Returning inf for ncut")
        return float('inf')

    # Print nmi and ncut of this trial
    print("[*] nmi =", nmi)
    print("[*] ncut =", ncut)

    # Track the best nmi seen so far
    if nmi > best_nmi:
        best_nmi = nmi

    # Track the best (lowest) ncut seen so far together with the nmi of that trial
    if ncut < best_ncut:
        best_ncut = ncut
        best_ncut_nmi = nmi

    # Return ncut as the objective to minimize
    return ncut


# Set global parameters
# NOTE: nb_epochs_per_layer_pool and nb_kmeans_tests are not read by any code
# executed in this section; they are kept so that existing references keep working
nb_epochs_per_layer_pool = [10, 100, 500, 1000, 2500, 5000]
nb_kmeans_tests = 100
nb_trials = 20
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("[*] using device:", device)
x_train = torch.tensor(nts, dtype=torch.float32).to(device)
batch_size = x_train.shape[0]  # full-batch training: one batch holds every row
max_time_per_layer = 3 * 60  # seconds
loss_tolerance = 1e-4  # loss change below which an iteration counts as stable
stab_tolerance = 5  # number of consecutive stable iterations that stops a layer

# Set globals to track best results (updated by objective)
best_nmi = 0.0
best_ncut = float('inf')
best_ncut_nmi = 0.0

# Lower optuna's verbosity BEFORE creating the study, otherwise the
# "A new study created" INFO message is still logged
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run optuna study
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(objective, n_trials=nb_trials)

# Display the best results
print("========================================================")
print("========================================================")
print("[*] best nmi =", best_nmi)
print("[*] best ncut =", best_ncut)
print("[*] best ncut nmi =", best_ncut_nmi)

[*] using device: cuda


[I 2025-12-01 12:57:34,183] A new study created in memory with name: no-name-b9e92d1f-4120-4127-b592-b1bed28a300b



trial 0----------------------------
> hidden dims = [124, 86, 60, 42, 29, 20, 14, 9, 6, 4]
> rho = 0.015702970884055395
> beta = 9.846738873614559


layer: 0: 11042it [00:47, 232.68it/s, loss=0.064, stab=4] 
layer: 1: 9565it [00:24, 387.67it/s, loss=0.0263, stab=4]
layer: 2: 9423it [00:23, 398.48it/s, loss=0.0312, stab=4]
layer: 3: 9310it [00:23, 394.58it/s, loss=0.0353, stab=4]
layer: 4: 9170it [00:22, 414.55it/s, loss=0.0388, stab=4]
layer: 5: 8963it [00:19, 456.75it/s, loss=0.0455, stab=4]
layer: 6: 8883it [00:42, 207.67it/s, loss=0.0522, stab=4]
layer: 7: 8756it [00:29, 292.35it/s, loss=0.059, stab=4] 
layer: 8: 8709it [00:30, 283.36it/s, loss=0.0629, stab=4]
layer: 9: 8534it [00:30, 281.12it/s, loss=0.0668, stab=4]


[*] nmi = 0.4641306752137041
[*] ncut = 1.9130839726751143

trial 1----------------------------
> hidden dims = [115]
> rho = 0.0396760507705299
> beta = 10.129197956845726


layer: 0: 6585it [00:23, 282.58it/s, loss=0.0597, stab=4]


[*] nmi = 0.6211184082714877
[*] ncut = 1.8984274850308323

trial 2----------------------------
> hidden dims = [106, 63, 37, 22, 13, 7, 4]
> rho = 0.03142880890840111
> beta = 0.1152644954031561


layer: 0: 14791it [00:52, 282.32it/s, loss=0.164, stab=4]
layer: 1: 11256it [00:38, 295.25it/s, loss=0.191, stab=4]
layer: 2: 10298it [00:32, 319.11it/s, loss=0.227, stab=4]
layer: 3: 8777it [00:27, 324.28it/s, loss=0.264, stab=4]
layer: 4: 6605it [00:20, 321.53it/s, loss=0.291, stab=4]
layer: 5: 4626it [00:14, 328.21it/s, loss=0.299, stab=4]
layer: 6: 2281it [00:07, 321.19it/s, loss=0.228, stab=4]


[*] nmi = 0.5115523458936237
[*] ncut = 1.908028227185197

trial 3----------------------------
> hidden dims = [115, 74, 48]
> rho = 0.0037520558551242854
> beta = 1.4445251022763053


layer: 0: 13563it [00:42, 320.84it/s, loss=0.0278, stab=4]
layer: 1: 13323it [00:40, 330.16it/s, loss=0.0291, stab=4]
layer: 2: 12988it [00:38, 333.75it/s, loss=0.0356, stab=4]


[*] nmi = 0.5051111287733552
[*] ncut = 1.9121122309794387

trial 4----------------------------
> hidden dims = [142, 113, 90]
> rho = 0.0007523742884534858
> beta = 0.6789053271698483


layer: 0: 13776it [00:41, 333.49it/s, loss=0.0235, stab=4]
layer: 1: 14229it [00:42, 333.89it/s, loss=0.0423, stab=4]
layer: 2: 14079it [00:42, 332.97it/s, loss=0.0439, stab=4]


[*] nmi = 0.5104618482605399
[*] ncut = 1.9064982738158145

trial 5----------------------------
> hidden dims = [151, 128, 108, 91, 77]
> rho = 0.003489018845491387
> beta = 9.163741808778772


layer: 0: 12782it [00:40, 313.79it/s, loss=0.055, stab=4] 
layer: 1: 13207it [00:40, 322.69it/s, loss=0.0689, stab=4]
layer: 2: 13038it [00:39, 326.02it/s, loss=0.0748, stab=4]
layer: 3: 12959it [00:40, 317.81it/s, loss=0.0783, stab=4]
layer: 4: 12830it [00:39, 324.07it/s, loss=0.0843, stab=4]


[*] nmi = 0.3927904612147574
[*] ncut = 1.929871489545791

trial 6----------------------------
> hidden dims = [142, 113, 90]
> rho = 0.00015673095467235422
> beta = 555.1721685244722


layer: 0: 8298it [00:25, 322.90it/s, loss=0.0285, stab=4]
layer: 1: 10751it [00:33, 323.29it/s, loss=0.02, stab=4]  
layer: 2: 10428it [00:32, 322.62it/s, loss=0.0266, stab=4]


[*] nmi = 0.2552908971517697
[*] ncut = 1.9583589425809549

trial 7----------------------------
> hidden dims = [151, 128, 108, 91, 77, 65, 55]
> rho = 0.00019634341572933326
> beta = 26.373339933815235


layer: 0: 9589it [00:29, 324.64it/s, loss=0.0652, stab=4]
layer: 1: 12046it [00:37, 325.16it/s, loss=0.0925, stab=4]
layer: 2: 11873it [00:36, 325.90it/s, loss=0.0944, stab=4]
layer: 3: 11626it [00:35, 323.83it/s, loss=0.101, stab=4]
layer: 4: 11434it [00:35, 324.33it/s, loss=0.103, stab=4]
layer: 5: 11256it [00:34, 328.61it/s, loss=0.1, stab=4]  
layer: 6: 11033it [00:33, 329.93it/s, loss=0.106, stab=4]


[!] KMeans did not converge (not enough distinct points) --> Returning inf for ncut

trial 8----------------------------
> hidden dims = [106, 63, 37, 22]
> rho = 0.00012681352169084607
> beta = 352.0481045526035


layer: 0: 13311it [00:40, 326.91it/s, loss=0.0465, stab=4]
layer: 1: 16095it [00:49, 325.41it/s, loss=0.0457, stab=4]
layer: 2: 15684it [00:48, 324.68it/s, loss=0.0564, stab=4]
layer: 3: 15357it [00:47, 325.56it/s, loss=0.0593, stab=4]


[!] KMeans did not converge (not enough distinct points) --> Returning inf for ncut

trial 9----------------------------
> hidden dims = [142, 113, 90, 72, 57]
> rho = 0.0036324869566766076
> beta = 5.414413211338521


layer: 0: 12168it [00:37, 325.17it/s, loss=0.0418, stab=4]
layer: 1: 12207it [00:37, 322.68it/s, loss=0.0538, stab=4]
layer: 2: 12206it [00:40, 303.64it/s, loss=0.0546, stab=4]
layer: 3: 11701it [00:41, 279.62it/s, loss=0.0656, stab=4]
layer: 4: 11736it [00:42, 278.50it/s, loss=0.0686, stab=4]


[*] nmi = 0.4264033494171922
[*] ncut = 1.9163121360437465

trial 10----------------------------
> hidden dims = [124]
> rho = 0.08102356207766644
> beta = 0.012297288957910173


layer: 0: 76it [00:00, 270.40it/s, loss=0.923, stab=4] 


[*] nmi = 0.6351524906645799
[*] ncut = 1.8980409224697135

trial 11----------------------------
> hidden dims = [124]
> rho = 0.0869821884209373
> beta = 0.0267870779847426


layer: 0: 70it [00:00, 301.45it/s, loss=1.57, stab=4]  


[*] nmi = 0.6351524906645799
[*] ncut = 1.8980409224697135

trial 12----------------------------
> hidden dims = [124]
> rho = 0.08332447280612446
> beta = 0.014747073255776684


layer: 0: 76it [00:00, 283.86it/s, loss=0.954, stab=4] 


[*] nmi = 0.6351524906645799
[*] ncut = 1.8980409224697135

trial 13----------------------------
> hidden dims = [124]
> rho = 0.09928034768566334
> beta = 0.010509393517283788


layer: 0: 375it [00:01, 286.18it/s, loss=0.613, stab=4]


[*] nmi = 0.6351524906645799
[*] ncut = 1.8980409224697135

trial 14----------------------------
> hidden dims = [133, 99, 74, 55, 41, 30, 22, 16, 12, 9, 6, 4]
> rho = 0.01212855396856456
> beta = 0.10352283573154424


layer: 0: 10139it [00:35, 285.75it/s, loss=0.183, stab=4]
layer: 1: 8390it [00:25, 325.92it/s, loss=0.196, stab=4]
layer: 2: 8050it [00:24, 326.15it/s, loss=0.179, stab=4]
layer: 3: 7588it [00:23, 324.30it/s, loss=0.2, stab=4]  
layer: 4: 6893it [00:21, 316.42it/s, loss=0.213, stab=4]
layer: 5: 6182it [00:19, 318.13it/s, loss=0.227, stab=4]
layer: 6: 5283it [00:16, 318.00it/s, loss=0.247, stab=4]
layer: 7: 3956it [00:11, 331.37it/s, loss=0.234, stab=4]
layer: 8: 3116it [00:09, 338.96it/s, loss=0.211, stab=4]
layer: 9: 2813it [00:08, 337.69it/s, loss=0.198, stab=4]
layer: 10: 2128it [00:06, 340.38it/s, loss=0.184, stab=4]
layer: 11: 1577it [00:04, 341.02it/s, loss=0.126, stab=4]


[*] nmi = 0.5115523458936237
[*] ncut = 1.908028227185197

trial 15----------------------------
> hidden dims = [133, 99, 74, 55, 41, 30, 22, 16, 12]
> rho = 0.011593469235899505
> beta = 0.07178265086709312


layer: 0: 12360it [00:35, 346.60it/s, loss=0.215, stab=4]
layer: 1: 10253it [00:29, 347.69it/s, loss=0.227, stab=4]
layer: 2: 9527it [00:28, 336.68it/s, loss=0.247, stab=4]
layer: 3: 8654it [00:28, 306.37it/s, loss=0.282, stab=4]
layer: 4: 7559it [00:23, 319.78it/s, loss=0.304, stab=4]
layer: 5: 6376it [00:18, 346.37it/s, loss=0.321, stab=4]
layer: 6: 5072it [00:14, 346.05it/s, loss=0.33, stab=4] 
layer: 7: 3607it [00:10, 347.72it/s, loss=0.289, stab=4]
layer: 8: 2849it [00:08, 348.38it/s, loss=0.266, stab=4]


[*] nmi = 0.5284800321535952
[*] ncut = 1.9070190070349016

trial 16----------------------------
> hidden dims = [160, 144, 129, 116, 104, 93, 83, 74, 66, 59, 53, 47, 42, 37, 33, 29, 26, 23, 20, 18, 16, 14, 12, 10, 9]
> rho = 0.040288451760735484
> beta = 0.31923936674526876


layer: 0: 8790it [00:25, 343.68it/s, loss=0.0877, stab=4]
layer: 1: 6375it [00:18, 346.60it/s, loss=0.0708, stab=4]
layer: 2: 6388it [00:18, 344.30it/s, loss=0.0704, stab=4]
layer: 3: 6390it [00:18, 343.69it/s, loss=0.0713, stab=4]
layer: 4: 6398it [00:19, 330.57it/s, loss=0.0757, stab=4]
layer: 5: 6455it [00:19, 339.25it/s, loss=0.0833, stab=4]
layer: 6: 6684it [00:19, 339.60it/s, loss=0.077, stab=4] 
layer: 7: 6643it [00:19, 337.68it/s, loss=0.0719, stab=4]
layer: 8: 6711it [00:19, 342.78it/s, loss=0.0737, stab=4]
layer: 9: 6560it [00:19, 338.69it/s, loss=0.0771, stab=4]
layer: 10: 6421it [00:18, 339.75it/s, loss=0.0744, stab=4]
layer: 11: 6503it [00:19, 338.69it/s, loss=0.0759, stab=4]
layer: 12: 6287it [00:18, 335.49it/s, loss=0.0685, stab=4]
layer: 13: 6315it [00:19, 332.09it/s, loss=0.068, stab=4] 
layer: 14: 6252it [00:18, 337.22it/s, loss=0.0657, stab=4]
layer: 15: 6155it [00:17, 347.92it/s, loss=0.068, stab=4] 
layer: 16: 5928it [00:17, 346.52it/s, loss=0.0724, stab=4]
layer: 

[*] nmi = 0.5644552632922665
[*] ncut = 1.91162576645247

trial 17----------------------------
> hidden dims = [115, 74]
> rho = 0.0008674626461838451
> beta = 0.03085845033476696


layer: 0: 11267it [00:32, 344.87it/s, loss=0.346, stab=4]
layer: 1: 6637it [00:19, 343.71it/s, loss=0.417, stab=4]


[*] nmi = 0.5226144060083494
[*] ncut = 1.909236221347139

trial 18----------------------------
> hidden dims = [124, 86]
> rho = 0.02655832474833674
> beta = 89.0898940920518


layer: 0: 4703it [00:13, 340.65it/s, loss=0.107, stab=4] 
layer: 1: 2638it [00:08, 316.18it/s, loss=0.0208, stab=4]


[*] nmi = 0.14120765106733965
[*] ncut = 1.9729063208581121

trial 19----------------------------
> hidden dims = [133, 99, 74, 55, 41, 30]
> rho = 0.08087411777442799
> beta = 0.30801066727344234


layer: 0: 8761it [00:27, 317.39it/s, loss=0.102, stab=4]
layer: 1: 5273it [00:16, 323.59it/s, loss=0.0838, stab=4]
layer: 2: 5322it [00:17, 298.74it/s, loss=0.0876, stab=4]
layer: 3: 5416it [00:17, 302.03it/s, loss=0.092, stab=4] 
layer: 4: 5433it [00:18, 299.91it/s, loss=0.0965, stab=4]
layer: 5: 5434it [00:18, 301.72it/s, loss=0.102, stab=4]


[*] nmi = 0.5134164763045211
[*] ncut = 1.9081071156541918
[*] best nmi = 0.6351524906645799
[*] best ncut = 1.8980409224697135
[*] best ncut nmi = 0.6351524906645799
