In [1]:
import optuna
import torch
import numpy as np
import tqdm
import sklearn
import networkx as nx
import random
import warnings
import time

  from .autonotebook import tqdm as notebook_tqdm


# 1. Model definition

In [2]:
class AutoEncoder(torch.nn.Module):
    """
    One-hidden-layer autoencoder with a sigmoid after each linear map.
    encoder : linear map from input_dim features to hidden_dim features
    decoder : linear map from hidden_dim features back to input_dim features
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # The encoder is created before the decoder; keeping this order keeps
        # seeded weight initialisation identical.
        self.encoder = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.decoder = torch.nn.Linear(in_features=hidden_dim, out_features=input_dim)

    def forward(self, x):
        """Return the pair (hidden code, reconstruction) computed from x."""
        latent = self.encoder(x).sigmoid()
        reconstruction = self.decoder(latent).sigmoid()
        return latent, reconstruction

In [3]:
class GraphEncoder(torch.nn.Module):
    """
    Stack of AutoEncoder layers chained input_dim -> hidden_dims[0] -> hidden_dims[1] -> ...
    autoencoders : ModuleList holding one AutoEncoder per entry of hidden_dims, in order
    """

    def __init__(self, input_dim, hidden_dims):
        super().__init__()
        # Consecutive pairs of this list are the (input, output) sizes of each layer
        layer_dims = [input_dim, *hidden_dims]
        self.autoencoders = torch.nn.ModuleList(
            [AutoEncoder(n_in, n_out) for n_in, n_out in zip(layer_dims[:-1], layer_dims[1:])]
        )

    def forward(self, x):
        """Return (encoded, decoded): the deepest code and its reconstruction through the stack."""
        hidden = x
        # Encode through every layer, first to last
        for layer in self.autoencoders:
            hidden = layer.encoder(hidden).sigmoid()
        encoded = hidden
        # Decode through every layer, last to first
        for layer in reversed(self.autoencoders):
            hidden = layer.decoder(hidden).sigmoid()
        return encoded, hidden

# 2. Test on benchmark "email"

In [4]:
def compute_ncut(s, labels):
    r"""
    Compute the normalized cut of a clustering of the nodes of s:
      Ncut = sum_k cut(C_k, V\C_k) / assoc(C_k, V)
    where
      cut(C, V\C) = sum_{i in C, j not in C} s[i,j]
      assoc(C, V) = sum_{i in C, j in V} s[i,j]  (i.e., volume of C)
    s : square adjacency/similarity numpy array (one row and one column per node)
    labels : length-n sequence (list or numpy array) of cluster labels, one per row of s
    Returns the Ncut value. Clusters whose volume is zero contribute 0.
    Raises ValueError if the number of labels differs from the number of rows of s.
    """

    # Coerce to an array: with a plain list, `labels == lab` would be a single
    # False instead of an element-wise mask and every cluster would look empty.
    labels = np.asarray(labels).ravel()
    if labels.shape[0] != s.shape[0]:
        raise ValueError(
            f"compute_ncut: got {labels.shape[0]} labels for {s.shape[0]} nodes."
        )

    # Precompute degrees (row sums), i.e. the volume contributed by each node
    degrees = s.sum(axis=1)

    ncut = 0.0

    # np.unique only yields labels that occur, so no cluster can be empty here
    for lab in np.unique(labels):

        # Indices of the nodes assigned to cluster lab
        idx = np.flatnonzero(labels == lab)

        # Volume = sum of the degrees of the cluster's nodes
        volume = degrees[idx].sum()

        # A zero-volume cluster has nothing to cut: skip it (contribution 0)
        if volume == 0:
            continue

        # Link to the rest of the graph = volume - weight kept inside the cluster
        internal_connections = s[np.ix_(idx, idx)].sum()
        ncut += (volume - internal_connections) / volume

    return ncut

# Raise scikit-learn's ConvergenceWarning as an exception so that it can be caught with try/except
warnings.filterwarnings(
    action="error",
    category=sklearn.exceptions.ConvergenceWarning,
)
# Silence the UserWarning whose message starts with the text below
warnings.filterwarnings(
    action="ignore",
    message="Graph is not fully connected, spectral embedding may not work as expected.",
    category=UserWarning,
)

## 2.1. Data loading

In [5]:
# Read the email benchmark (gml file) into a networkx graph
nxg = nx.read_gml("../datasets/real/email/email.gml")
# Ground-truth community label of every node, in node order
y = [nxg.nodes[node]["value"] for node in nxg.nodes]
n_nodes = nxg.number_of_nodes()
n_clusters = len(set(y))

# Similarity matrix = adjacency matrix plus self-loops
# (self-loops are not indicated in the original paper but improve performance)
s = nx.to_numpy_array(nxg) + np.eye(n_nodes)
# Normalized training set: every row of s divided by its row sum
nts = s / s.sum(axis=1, keepdims=True)
print("[*] nts.shape:", nts.shape)
print("[*] number of clusters:", n_clusters)

# Baseline: KMeans applied directly to the rows of the normalized matrix
kmeans = sklearn.cluster.KMeans(n_clusters=n_clusters, n_init=100, random_state=97)
y_pred = kmeans.fit_predict(nts)
nmi = sklearn.metrics.normalized_mutual_info_score(y, y_pred)
ncut = compute_ncut(nts, y_pred)
print("[*] nmi:", nmi)
print("[*] ncut:", ncut)

[*] nts.shape: (1005, 1005)
[*] number of clusters: 42
[*] nmi: 0.40867148927590957
[*] ncut: 11.101097091130402


In [6]:
# Baseline: spectral clustering on the (un-normalized) similarity matrix s,
# scored with the same NMI / Ncut measures as the KMeans baseline
spectral = sklearn.cluster.SpectralClustering(
    n_clusters=len(set(y)),
    affinity='precomputed',
    assign_labels='kmeans',
    n_init=100,
    random_state=97,
)
y_pred = spectral.fit_predict(s)
nmi = sklearn.metrics.normalized_mutual_info_score(y, y_pred)
ncut = compute_ncut(nts, y_pred)
print("[*] nmi:", nmi)
print("[*] ncut:", ncut)

[*] nmi: 0.487196951731793
[*] ncut: 20.625335338205108


## 2.3. Model training with hyper-parameter tuning 

In [None]:
def _sparse_autoencoder_loss(encoded, decoded, target, rho, beta, eps=1e-6):
    """
    Loss of one autoencoder layer: summed squared reconstruction error plus a
    KL sparsity penalty on the mean hidden activations.
    encoded : hidden activations returned by the layer (dimension 0 is the batch)
    decoded : reconstruction returned by the layer
    target  : tensor the reconstruction is compared against
    rho     : target mean activation of each hidden unit
    beta    : weight of the sparsity penalty
    eps     : the mean activation is clamped to [eps, 1 - eps]; without it a unit
              whose mean saturates to exactly 0 or 1 makes the penalty inf/NaN.
              Inside that interval the clamp changes neither values nor gradients.
    Returns the scalar loss tensor.
    """
    reconstruction = torch.nn.functional.mse_loss(decoded, target, reduction='sum')
    rho_hat = torch.mean(encoded, dim=0).clamp(eps, 1.0 - eps)
    sparsity = torch.sum(
        rho * torch.log(rho / rho_hat)
        + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat))
    )
    return reconstruction + beta * sparsity


def objective(trial):
    """
    Optuna objective: train a stacked sparse autoencoder layer by layer, cluster
    its deepest code with KMeans and return the resulting Ncut (to be minimized).

    Hyper-parameters suggested through trial: dim_decay_rate, n_layers, rho, beta, lr.
    Reads the notebook globals x_train, y, nts, device, batch_size,
    max_time_per_layer, loss_tolerance and stab_tolerance.
    Updates the globals best_nmi, best_ncut and best_ncut_nmi.
    Returns the Ncut of the KMeans clustering, or inf when KMeans raises
    ConvergenceWarning.
    """

    # Only the running-best trackers are rebound here; every other notebook
    # global is read-only and needs no declaration
    global best_nmi, best_ncut, best_ncut_nmi

    # Print trial number
    print(f"\ntrial {trial.number}----------------------------")

    # Set random seeds (same seeds for every trial)
    torch.manual_seed(97)
    np.random.seed(97)
    random.seed(97)

    # Number of ground-truth communities: lower bound for the layer widths and
    # number of clusters asked from KMeans
    n_clusters = len(set(y))

    # Suggest a decay rate for hidden dimensions
    dim_decay_rate = trial.suggest_float("dim_decay_rate", 0.6, 0.9, step=0.05)

    # Compute the hidden dimensions: keep shrinking by the decay rate while the
    # next width would still be at least n_clusters
    latent_dim = int(x_train.shape[1] * dim_decay_rate)
    hidden_dims = [latent_dim]
    while latent_dim * dim_decay_rate >= n_clusters:
        latent_dim = int(latent_dim * dim_decay_rate)
        hidden_dims.append(latent_dim)

    # Suggest the number of layers and keep that many of the widths
    n_layers = trial.suggest_int("n_layers", 1, len(hidden_dims), step=1)
    hidden_dims = hidden_dims[:n_layers]

    # Create the model using the hidden dimensions
    model = GraphEncoder(input_dim=x_train.shape[1], hidden_dims=hidden_dims).to(device)

    # Suggest rho and beta for the sparsity constraint
    rho = trial.suggest_float("rho", 1e-4, 1e-1, log=True)
    beta = trial.suggest_float("beta", 1e-2, 1e3, log=True)

    # Suggest a learning rate for the optimizer and create the optimizer
    lr = trial.suggest_float("lr", 1e-3, 1e-2, log=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)

    # Print some hyper parameters
    print("> hidden dims =", hidden_dims)
    print("> rho =", rho)
    print("> beta =", beta)

    # Input of the layer currently being trained (the raw data for layer 0)
    current_x_train = x_train.clone().to(device)

    # Launch the training loop
    # For each layer in the stacked autoencoder: train the layer on the codes
    # produced by the layers before it
    for layer_number, autoencoder in enumerate(model.autoencoders):
        dataloader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(current_x_train),
            batch_size=batch_size,
            shuffle=True
        )
        dataloader_iter = iter(dataloader)

        last_loss = None
        stab = 0  # consecutive iterations whose loss changed by less than loss_tolerance
        start_time = time.time()

        # The context manager closes the bar on every exit path (timeout,
        # convergence or exception)
        with tqdm.tqdm(desc=f"layer: {layer_number}") as pb:
            while True:
                # Cycle through the dataloader indefinitely
                try:
                    (x_batch,) = next(dataloader_iter)
                except StopIteration:
                    dataloader_iter = iter(dataloader)
                    (x_batch,) = next(dataloader_iter)
                x_batch = x_batch.to(device)

                optimizer.zero_grad()
                encoded, decoded = autoencoder(x_batch)
                loss = _sparse_autoencoder_loss(encoded, decoded, x_batch, rho, beta)
                loss.backward()
                optimizer.step()

                # Read the loss once: every .item() call synchronizes with the device
                loss_value = loss.item()

                # Stop criterion 1: time budget of the layer exhausted
                elapsed_time = time.time() - start_time
                if elapsed_time > max_time_per_layer:
                    print(f"[!] stopping layer {layer_number} training after {elapsed_time:.2f}s (> {max_time_per_layer}s)")
                    break

                # Stop criterion 2: loss stable for stab_tolerance iterations in a row
                if last_loss is not None:
                    if abs(last_loss - loss_value) < loss_tolerance:
                        stab += 1
                        if stab == stab_tolerance:
                            break
                    else:
                        stab = 0
                last_loss = loss_value

                pb.set_postfix({"loss": loss_value, "stab": stab})
                pb.update(1)

        # The codes of this layer become the training data of the next one
        with torch.no_grad():
            current_x_train, _ = autoencoder(current_x_train)

    try:
        # Evaluate the model
        with torch.no_grad():

            # Get the encoded representations of the full training set
            encoded, _ = model(x_train)
            encoded = encoded.to('cpu')

            y_pred = sklearn.cluster.KMeans(n_clusters=n_clusters, n_init=100, random_state=97).fit_predict(encoded.numpy())
            nmi = sklearn.metrics.normalized_mutual_info_score(y, y_pred)
            ncut = compute_ncut(nts, y_pred)

            # Print nmi and ncut of this trial
            print("[*] nmi =", nmi)
            print("[*] ncut =", ncut)

            # Track the best nmi seen over all trials
            if nmi > best_nmi:
                best_nmi = nmi

            # Track the best ncut seen over all trials and the nmi obtained with it
            if ncut < best_ncut:
                best_ncut = ncut
                best_ncut_nmi = nmi

    except sklearn.exceptions.ConvergenceWarning:
        print("[!] KMeans did not converge (not enough distinct points) --> Returning inf for ncut")
        ncut = float('inf')

    # Return ncut as the objective to minimize
    return ncut


# Set global parameters
# NOTE(review): nb_epochs_per_layer_pool and nb_kmeans_tests are not referenced
# by any live code visible in this notebook; kept in case other cells use them
nb_epochs_per_layer_pool = [10, 100, 500, 1000, 2500, 5000]
nb_kmeans_tests = 100
nb_trials = 20
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("[*] using device:", device)
x_train = torch.tensor(nts, dtype=torch.float32).to(device)
batch_size = x_train.shape[0]  # full-batch training: one batch holds every row
max_time_per_layer = 3 * 60  # seconds
loss_tolerance = 1e-4  # loss change below which an iteration counts as stable
stab_tolerance = 5  # number of consecutive stable iterations that stops a layer

# Set globals to track best results (reset every time this cell is run)
best_nmi = 0.0
best_ncut = float('inf')
best_ncut_nmi = 0.0

# Run optuna study
# Verbosity is lowered BEFORE the study is created; otherwise the
# "A new study created in memory" INFO line is still emitted
optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(objective, n_trials=nb_trials)

# Display the best results
print("========================================================")
print("========================================================")
print("[*] best nmi =", best_nmi)
print("[*] best ncut =", best_ncut)
print("[*] best ncut nmi =", best_ncut_nmi)

[I 2025-12-01 13:01:06,016] A new study created in memory with name: no-name-47730f95-8094-493c-9533-d456c5b48ec5


[*] using device: cuda

trial 0----------------------------
> hidden dims = [703, 492, 344, 240, 168, 117, 81, 56]
> rho = 0.015702970884055395
> beta = 9.846738873614559


layer: 0: 23900it [03:00, 132.78it/s, loss=0.237, stab=0] 


[!] stopping layer 0 training after 180.00s (> 180s)


layer: 1: 21608it [02:22, 151.75it/s, loss=0.729, stab=4] 
layer: 2: 23093it [02:28, 155.77it/s, loss=0.961, stab=4] 
layer: 3: 24579it [02:39, 154.06it/s, loss=2.6, stab=4]   
layer: 4: 26450it [02:53, 152.56it/s, loss=5.06, stab=4]  
layer: 5: 27590it [03:00, 153.28it/s, loss=8.34, stab=0]  


[!] stopping layer 5 training after 180.00s (> 180s)


layer: 6: 27899it [03:00, 154.99it/s, loss=11.6, stab=0] 


[!] stopping layer 6 training after 180.01s (> 180s)


layer: 7: 27695it [03:00, 153.86it/s, loss=17.6, stab=0] 


[!] stopping layer 7 training after 180.01s (> 180s)
[*] nmi = 0.10900818828395596
[*] ncut = 32.548597640405525

trial 1----------------------------
> hidden dims = [653]
> rho = 0.0396760507705299
> beta = 10.129197956845726


layer: 0: 17135it [02:08, 133.35it/s, loss=0.366, stab=4]


[*] nmi = 0.6659256695667087
[*] ncut = 21.78386129964417

trial 2----------------------------
> hidden dims = [603, 361, 216, 129, 77, 46]
> rho = 0.03142880890840111
> beta = 0.1152644954031561


layer: 0: 22497it [02:32, 147.62it/s, loss=0.18, stab=4] 
layer: 1: 22730it [02:25, 156.39it/s, loss=4.67, stab=4]  
layer: 2: 28281it [02:59, 157.12it/s, loss=155, stab=0]   


[!] stopping layer 2 training after 180.00s (> 180s)


layer: 3: 28305it [03:00, 157.25it/s, loss=1.97e+3, stab=0]


[!] stopping layer 3 training after 180.00s (> 180s)


layer: 4: 28413it [02:57, 159.82it/s, loss=800, stab=4]   
layer: 5: 12470it [01:17, 160.79it/s, loss=312, stab=4]  


[*] nmi = 0.4725404216672759
[*] ncut = 27.262347330186337

trial 3----------------------------
> hidden dims = [653, 424, 275]
> rho = 0.0037520558551242854
> beta = 1.4445251022763053


layer: 0: 26534it [03:00, 147.41it/s, loss=1.49, stab=0]


[!] stopping layer 0 training after 180.00s (> 180s)


layer: 1: 24180it [02:14, 179.50it/s, loss=0.39, stab=4]  
layer: 2: 25776it [02:20, 183.44it/s, loss=0.445, stab=4] 


[*] nmi = 0.22437918184212768
[*] ncut = 29.912024298008987

trial 4----------------------------
> hidden dims = [804, 643]
> rho = 0.0007523742884534858
> beta = 0.6789053271698483


layer: 0: 26915it [02:52, 156.04it/s, loss=1.64, stab=4]
layer: 1: 22715it [02:17, 165.58it/s, loss=0.886, stab=4]


[*] nmi = 0.26678221889744097
[*] ncut = 22.453191262176478

trial 5----------------------------
> hidden dims = [854, 725, 616, 523]
> rho = 0.003489018845491387
> beta = 9.163741808778772


layer: 0: 27510it [03:00, 152.83it/s, loss=0.966, stab=0] 


[!] stopping layer 0 training after 180.00s (> 180s)


layer: 1: 25085it [02:37, 159.26it/s, loss=0.427, stab=4] 
layer: 2: 25038it [02:26, 170.75it/s, loss=0.377, stab=4] 
layer: 3: 24585it [02:20, 174.50it/s, loss=0.341, stab=4] 


[*] nmi = 0.16572609312672865
[*] ncut = 28.52338374792191

trial 6----------------------------
> hidden dims = [804, 643, 514]
> rho = 0.00015673095467235422
> beta = 555.1721685244722


layer: 0: 9980it [01:05, 153.33it/s, loss=132, stab=4]    
layer: 1: 11638it [01:11, 163.86it/s, loss=0.2, stab=4]   
layer: 2: 11096it [01:04, 171.92it/s, loss=0.221, stab=4] 


[*] nmi = 0.23024387223367754
[*] ncut = 27.42581561676402

trial 7----------------------------
> hidden dims = [854, 725, 616, 523, 444, 377]
> rho = 0.00019634341572933326
> beta = 26.373339933815235


layer: 0: 28880it [03:00, 160.44it/s, loss=37.2, stab=0]  


[!] stopping layer 0 training after 180.00s (> 180s)


layer: 1: 18203it [01:50, 164.90it/s, loss=65, stab=4]    
layer: 2: 15968it [01:31, 175.19it/s, loss=0.443, stab=4] 
layer: 3: 15892it [01:28, 179.67it/s, loss=0.736, stab=4] 
layer: 4: 15409it [01:24, 182.30it/s, loss=0.51, stab=4]  
layer: 5: 15436it [01:23, 184.37it/s, loss=0.366, stab=4] 


[*] nmi = 0.18520481559803229
[*] ncut = 27.981113420424478

trial 8----------------------------
> hidden dims = [603, 361, 216]
> rho = 0.00012681352169084607
> beta = 352.0481045526035


layer: 0: 14756it [01:28, 166.71it/s, loss=132, stab=4]   
layer: 1: 16894it [01:32, 181.84it/s, loss=0.16, stab=4]  
layer: 2: 16792it [01:31, 183.59it/s, loss=0.128, stab=4] 


[*] nmi = 0.2246669352003012
[*] ncut = 26.50421795145601

trial 9----------------------------
> hidden dims = [804, 643, 514, 411, 328]
> rho = 0.0036324869566766076
> beta = 5.414413211338521


layer: 0: 28777it [03:00, 159.87it/s, loss=0.349, stab=0] 


[!] stopping layer 0 training after 180.00s (> 180s)


layer: 1: 21888it [02:09, 169.13it/s, loss=0.482, stab=4] 
layer: 2: 23127it [02:10, 177.20it/s, loss=0.277, stab=4] 
layer: 3: 24255it [02:14, 180.75it/s, loss=0.3, stab=4]   
layer: 4: 23769it [02:12, 179.38it/s, loss=0.654, stab=4] 


[*] nmi = 0.1817473453929935
[*] ncut = 31.324465992714174

trial 10----------------------------
> hidden dims = [703]
> rho = 0.08102356207766644
> beta = 0.012297288957910173


layer: 0: 54it [00:00, 119.21it/s, loss=138, stab=4]  


[*] nmi = 0.26668177924640746
[*] ncut = 10.816101219789385

trial 11----------------------------
> hidden dims = [703]
> rho = 0.0869821884209373
> beta = 0.0267870779847426


layer: 0: 57it [00:00, 166.24it/s, loss=142, stab=4]  


[*] nmi = 0.41438994582445154
[*] ncut = 10.22166811805229

trial 12----------------------------
> hidden dims = [703]
> rho = 0.08332447280612446
> beta = 0.014747073255776684


layer: 0: 56it [00:00, 157.36it/s, loss=138, stab=4] 


[*] nmi = 0.39824210663672405
[*] ncut = 10.399832664811305

trial 13----------------------------
> hidden dims = [703]
> rho = 0.0987188295669722
> beta = 0.01094257583586978


layer: 0: 64it [00:00, 167.65it/s, loss=137, stab=4]   


[*] nmi = 0.28370623558085745
[*] ncut = 11.292569790447335

trial 14----------------------------
> hidden dims = [753, 564, 423, 317, 237, 177, 132, 99, 74]
> rho = 0.011710530134441607
> beta = 0.07923398067978257


layer: 0: 19045it [02:01, 157.38it/s, loss=0.5, stab=4]  
layer: 1: 10073it [01:00, 167.85it/s, loss=9.31, stab=4]  
layer: 2: 31125it [03:00, 172.91it/s, loss=38.6, stab=0] 


[!] stopping layer 2 training after 180.00s (> 180s)


layer: 3: 31670it [03:00, 175.94it/s, loss=256, stab=0]   


[!] stopping layer 3 training after 180.00s (> 180s)


layer: 4: 31722it [03:00, 176.22it/s, loss=904, stab=0]    


[!] stopping layer 4 training after 180.01s (> 180s)


layer: 5: 31684it [03:00, 176.02it/s, loss=949, stab=0]    


[!] stopping layer 5 training after 180.00s (> 180s)


layer: 6: 31714it [03:00, 176.19it/s, loss=557, stab=0]  


[!] stopping layer 6 training after 180.00s (> 180s)


layer: 7: 31918it [03:00, 177.32it/s, loss=339, stab=0]  


[!] stopping layer 7 training after 180.00s (> 180s)


layer: 8: 12702it [01:11, 177.36it/s, loss=238, stab=4]  


[*] nmi = 0.36557914276460624
[*] ncut = 30.26994150476641

trial 15----------------------------
> hidden dims = [753, 564]
> rho = 0.012591180377658155
> beta = 0.08662897127781095


layer: 0: 17435it [01:50, 157.59it/s, loss=0.585, stab=4]
layer: 1: 30859it [03:00, 171.44it/s, loss=0.438, stab=0]


[!] stopping layer 1 training after 180.00s (> 180s)
[*] nmi = 0.45009339528211467
[*] ncut = 19.29291800338486

trial 16----------------------------
> hidden dims = [904, 813, 731, 657, 591, 531, 477, 429, 386, 347, 312, 280, 252, 226, 203, 182, 163, 146, 131, 117, 105, 94, 84, 75]
> rho = 0.03936865284316899
> beta = 0.29422792524628416


layer: 0: 15854it [01:44, 151.80it/s, loss=0.262, stab=4]
layer: 1: 23283it [02:32, 152.26it/s, loss=0.334, stab=4]
layer: 2: 28392it [03:00, 157.73it/s, loss=2.1, stab=0]  


[!] stopping layer 2 training after 180.00s (> 180s)


layer: 3: 28983it [03:00, 161.01it/s, loss=2.22, stab=0] 


[!] stopping layer 3 training after 180.00s (> 180s)


layer: 4: 31040it [03:00, 172.44it/s, loss=4.3, stab=0]  


[!] stopping layer 4 training after 180.00s (> 180s)


layer: 5: 31499it [03:00, 174.99it/s, loss=7.85, stab=0] 


[!] stopping layer 5 training after 180.00s (> 180s)


layer: 6: 31752it [03:00, 176.40it/s, loss=18.3, stab=0] 


[!] stopping layer 6 training after 180.01s (> 180s)


layer: 7: 32223it [03:00, 179.01it/s, loss=52.2, stab=0] 


[!] stopping layer 7 training after 180.00s (> 180s)


layer: 8: 32221it [03:00, 179.00it/s, loss=129, stab=0]  


[!] stopping layer 8 training after 180.00s (> 180s)


layer: 9: 32280it [03:00, 179.33it/s, loss=241, stab=0]   


[!] stopping layer 9 training after 180.00s (> 180s)


layer: 10: 32658it [03:00, 181.43it/s, loss=368, stab=0]   


[!] stopping layer 10 training after 180.00s (> 180s)


layer: 11: 32638it [03:00, 181.32it/s, loss=408, stab=0]   


[!] stopping layer 11 training after 180.00s (> 180s)


layer: 12: 32686it [03:00, 181.59it/s, loss=423, stab=0]  


[!] stopping layer 12 training after 180.00s (> 180s)


layer: 13: 32252it [03:00, 179.17it/s, loss=368, stab=0]  


[!] stopping layer 13 training after 180.00s (> 180s)


layer: 14: 31931it [03:00, 177.39it/s, loss=293, stab=0]   


[!] stopping layer 14 training after 180.00s (> 180s)


layer: 15: 32148it [03:00, 178.60it/s, loss=227, stab=0]  


[!] stopping layer 15 training after 180.00s (> 180s)


layer: 16: 32332it [03:00, 179.62it/s, loss=200, stab=0]  


[!] stopping layer 16 training after 180.00s (> 180s)


layer: 17: 32149it [03:00, 178.60it/s, loss=155, stab=0]  


[!] stopping layer 17 training after 180.00s (> 180s)


layer: 18: 30919it [03:00, 171.77it/s, loss=119, stab=0]  


[!] stopping layer 18 training after 180.00s (> 180s)


layer: 19: 31076it [03:00, 172.64it/s, loss=85.6, stab=0] 


[!] stopping layer 19 training after 180.00s (> 180s)


layer: 20: 30925it [03:00, 171.80it/s, loss=63.3, stab=0] 


[!] stopping layer 20 training after 180.00s (> 180s)


layer: 21: 31084it [03:00, 172.69it/s, loss=51.8, stab=0]


[!] stopping layer 21 training after 180.00s (> 180s)


layer: 22: 31984it [03:00, 177.69it/s, loss=41.7, stab=0]


[!] stopping layer 22 training after 180.00s (> 180s)


layer: 23: 5446it [00:30, 180.47it/s, loss=30.1, stab=4]


[*] nmi = 0.20354354721160833
[*] ncut = 34.30373032351829

trial 17----------------------------
> hidden dims = [653, 424]
> rho = 0.0008134650578175098
> beta = 0.027553913125744383


layer: 0: 54it [00:00, 164.41it/s, loss=147, stab=4]  
layer: 1: 134it [00:00, 159.90it/s, loss=1.92, stab=4]


[*] nmi = 0.21648643398318013
[*] ncut = 23.68756028831647

trial 18----------------------------
> hidden dims = [703]
> rho = 0.022984705474655778
> beta = 0.04022710905341625


layer: 0: 70it [00:00, 125.84it/s, loss=151, stab=4]   


[*] nmi = 0.38411973586187215
[*] ncut = 11.113397719223542

trial 19----------------------------
> hidden dims = [753, 564, 423, 317, 237, 177, 132, 99]
> rho = 0.08087411777442799
> beta = 0.2207543157923934


layer: 0: 15792it [01:43, 152.46it/s, loss=0.198, stab=4]
layer: 1: 30294it [03:00, 168.30it/s, loss=8.25, stab=0] 


[!] stopping layer 1 training after 180.00s (> 180s)


layer: 2: 31054it [03:00, 172.52it/s, loss=45.9, stab=0]  


[!] stopping layer 2 training after 180.00s (> 180s)


layer: 3: 9266it [00:58, 168.30it/s, loss=478, stab=0]    