In [1]:
import optuna
import torch
import numpy as np
import tqdm
import sklearn
import networkx as nx
import random
import warnings

  from .autonotebook import tqdm as notebook_tqdm


# 1. Model definition

In [2]:
class AutoEncoder(torch.nn.Module):
    """Single-hidden-layer autoencoder with sigmoid activations.

    ``encoder`` is a linear map from ``input_dim`` to ``hidden_dim`` features and
    ``decoder`` is a linear map from ``hidden_dim`` back to ``input_dim``.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # The encoder is created before the decoder so that weight
        # initialisation draws from the torch RNG in a fixed order.
        self.encoder = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.decoder = torch.nn.Linear(in_features=hidden_dim, out_features=input_dim)

    def forward(self, x):
        """Return the pair ``(code, reconstruction)`` computed from ``x``."""
        code = self.encoder(x).sigmoid()
        reconstruction = self.decoder(code).sigmoid()
        return code, reconstruction

In [3]:
class GraphEncoder(torch.nn.Module):
    """Stacked autoencoder made of a chain of ``AutoEncoder`` stages.

    The first stage takes ``input_dim`` features; every following stage takes
    the hidden width of the stage before it. Stage ``k`` has hidden width
    ``hidden_dims[k]``. The stages are stored in ``self.autoencoders``.
    """

    def __init__(self, input_dim, hidden_dims):
        super().__init__()
        # widths[k] -> widths[k + 1] is the mapping learnt by stage k.
        widths = [input_dim, *hidden_dims]
        self.autoencoders = torch.nn.ModuleList(
            [AutoEncoder(in_dim, out_dim) for in_dim, out_dim in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the input ``x``.

        ``encoded`` is the output of the last encoder; ``decoded`` is obtained
        by running the decoders in reverse stage order from ``encoded``.
        """
        stages = list(self.autoencoders)

        # Encoding pass: first stage to last stage.
        latent = x
        for stage in stages:
            latent = stage.encoder(latent).sigmoid()

        # Decoding pass: last stage back to the first one.
        reconstruction = latent
        for stage in stages[::-1]:
            reconstruction = stage.decoder(reconstruction).sigmoid()

        return latent, reconstruction

# 2. Test on synthetic benchmark "lfr_0.40"

In [2]:
def compute_ncut(s, labels):
    r"""
    Compute the normalized cut for a similarity matrix s and cluster labels:
      Ncut = sum_k cut(C_k, V\C_k) / assoc(C_k, V)
    where
      cut(C, V\C) = sum_{i in C, j not in C} s[i,j]
      assoc(C, V) = sum_{i in C, j in V} s[i,j]  (i.e., volume of C)

    s : square (n, n) adjacency/similarity numpy array
    labels : length-n sequence or array of cluster labels (one per row of s)

    Clusters whose volume is zero contribute 0 to the sum.

    Returns the Ncut value as a float.
    Raises ValueError if labels is not one-dimensional with one entry per row of s.
    """

    # Accept plain Python sequences as well as arrays
    labels = np.asarray(labels)

    # A length mismatch would otherwise silently produce a meaningless value
    n_nodes = np.shape(s)[0]
    if labels.ndim != 1 or labels.shape[0] != n_nodes:
        raise ValueError(
            f"compute_ncut: labels must be 1-D with {n_nodes} entries, got shape {labels.shape}."
        )

    # Precompute degrees (degree/volume per node)
    degrees = s.sum(axis=1)

    # Initialize ncut
    ncut = 0.0

    # For each cluster compute link and volume, then sum up to get ncut.
    # Labels come from np.unique, so every cluster has at least one node.
    for lab in np.unique(labels):

        # Get the indices of nodes in cluster lab
        idx = np.flatnonzero(labels == lab)

        # Compute volume = sum of degrees of nodes in idx
        volume = degrees[idx].sum()

        # If volume is not zero, add the local cut; otherwise skip (i.e. cut = 0)
        if volume != 0:

            # link = sum over i in C, j not in C, of s[i,j]
            #      = volume - internal connections
            internal_connections = s[np.ix_(idx, idx)].sum()
            link = volume - internal_connections

            # Sum the local cut contribution to ncut
            ncut += link / volume

    return ncut

# Turn scikit-learn's ConvergenceWarning into a raised exception, so that code
# wrapping KMeans in `try ... except sklearn.exceptions.ConvergenceWarning`
# (as `objective` does) can detect a clustering that did not converge.
# The filter is process-wide: any later cell that triggers this warning will raise.
warnings.filterwarnings("error", category=sklearn.exceptions.ConvergenceWarning)

## 2.1. Data loading

In [3]:
# Load the benchmark graph into a networkx graph.
# NOTE: the file loaded here is the synthetic "lfr_0.40" graph, not the football network.
nxg = nx.read_gml("../datasets/synthetic/lfr_0.40.gml")
y = [nxg.nodes[n]["value"] for n in nxg.nodes] # extract the ground-truth community labels (node iteration order)
s = nx.to_numpy_array(nxg) # generate the similarity matrix (dense adjacency)
s = s + np.eye(nxg.number_of_nodes()) # we add self-loops (not indicated in the original paper but improves performance)
nts = s / np.sum(s, axis=1, keepdims=True) # generate the normalized training set (each row divided by its sum)
print("[*] nts.shape:", nts.shape)
print("[*] number of clusters:", len(set(y)))

# Baseline: KMeans on the rows of nts (original space), averaged over
# nb_kmeans_tests runs whose seeds are drawn from a seeded `random` stream.
cumulated_nmi = 0
cumulated_ncut = 0
nb_kmeans_tests = 100
random.seed(0)
for _ in tqdm.tqdm(range(nb_kmeans_tests)):
    # n_init='auto' matches the KMeans configuration used inside `objective`,
    # so the baseline and the latent-space evaluation are directly comparable.
    kmeans = sklearn.cluster.KMeans(n_clusters=len(set(y)), algorithm="lloyd", random_state=random.randint(0, 10000), n_init='auto')
    y_pred_origspace = kmeans.fit_predict(nts)
    cumulated_nmi += sklearn.metrics.normalized_mutual_info_score(y, y_pred_origspace)
    cumulated_ncut += compute_ncut(s, y_pred_origspace)
print("[*] original space average nmi:", cumulated_nmi / nb_kmeans_tests)
print("[*] original space average ncut:", cumulated_ncut / nb_kmeans_tests)

[*] nts.shape: (250, 250)
[*] number of clusters: 3


100%|██████████| 100/100 [00:00<00:00, 115.11it/s]

[*] original space average nmi: 0.01319438903360806
[*] original space average ncut: 1.221212088695503





In [4]:
# Baseline: spectral clustering applied directly to the similarity matrix s
# (affinity='precomputed'), averaged over nb_kmeans_tests seeded runs.
# The "not fully connected" UserWarning is silenced for this experiment.
warnings.filterwarnings("ignore", category=UserWarning, message="Graph is not fully connected, spectral embedding may not work as expected.")
cumulated_nmi = 0
cumulated_ncut = 0
nb_kmeans_tests = 100
random.seed(0)
for _ in tqdm.tqdm(range(nb_kmeans_tests)):
    spectral = sklearn.cluster.SpectralClustering(
        n_clusters=len(set(y)),
        affinity='precomputed',
        assign_labels='kmeans',
        random_state=random.randint(0, 10000),
    )
    y_pred_origspace = spectral.fit_predict(s)
    cumulated_nmi += sklearn.metrics.normalized_mutual_info_score(y, y_pred_origspace)
    cumulated_ncut += compute_ncut(s, y_pred_origspace)
print("[*] original space average nmi:", cumulated_nmi / nb_kmeans_tests)
print("[*] original space average ncut:", cumulated_ncut / nb_kmeans_tests)

100%|██████████| 100/100 [00:03<00:00, 27.86it/s]

[*] original space average nmi: 0.0097685375855284
[*] original space average ncut: 0.5515753822934782





## 2.3. Model training with hyper-parameter tuning 

In [6]:
def _kl_sparsity_penalty(rho, rho_hat, eps=1e-6):
    """Sum over hidden units of KL(rho || rho_hat) between Bernoulli means.

    rho_hat is clamped to [eps, 1 - eps] so that a fully saturated sigmoid
    (mean activation exactly 0 or 1) cannot produce log(0) or a division by
    zero, which would turn the loss into inf/NaN.
    """
    rho_hat = rho_hat.clamp(min=eps, max=1.0 - eps)
    return torch.sum(rho * torch.log(rho / rho_hat) + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat)))


def objective(trial):
    """Optuna objective: train a stacked sparse autoencoder and score its embedding.

    The function samples the hyper-parameters from `trial`, trains the
    autoencoders of a `GraphEncoder` one layer at a time (each layer on the
    codes produced by the previous one), then clusters the final codes with
    KMeans `nb_kmeans_tests` times and averages NMI and Ncut.

    Reads the module-level names `x_train`, `y`, `s`, `device`, `batch_size`,
    `nb_epochs_per_layer_pool` and `nb_kmeans_tests`, and updates the globals
    `best_avg_nmi`, `best_avg_ncut` and `best_avg_ncut_avg_nmi`.

    Returns the average Ncut (to be minimized), or float('inf') when KMeans
    raises `sklearn.exceptions.ConvergenceWarning`.
    """

    # Print trial number
    print(f"\ntrial {trial.number}----------------------------")

    # Set globals
    global best_avg_nmi
    global best_avg_ncut
    global best_avg_ncut_avg_nmi

    # Set random seeds (identical for every trial, so trials differ only by hyper-parameters)
    torch.manual_seed(97)
    np.random.seed(97)
    random.seed(97)

    # Suggest a decay rate for hidden dimensions
    dim_decay_rate = trial.suggest_float("dim_decay_rate", 0.6, 0.9, step=0.05)

    # Compute the hidden dimensions: shrink geometrically while the next width
    # would still be at least the number of ground-truth clusters
    latent_dim = int(x_train.shape[1] * dim_decay_rate)
    hidden_dims = []
    hidden_dims.append(latent_dim)
    while latent_dim * dim_decay_rate >= len(set(y)):
        latent_dim = int(latent_dim * dim_decay_rate)
        hidden_dims.append(latent_dim)

    # Suggest the number of layers
    n_layers = trial.suggest_int("n_layers", 1, len(hidden_dims), step=1)
    hidden_dims = hidden_dims[:n_layers]

    # Create the model using the hidden dimensions
    model = GraphEncoder(input_dim=x_train.shape[1], hidden_dims=hidden_dims).to(device)

    # Suggest rho and beta for the sparsity constraint
    rho = trial.suggest_float("rho", 1e-4, 1e-1, log=True)
    beta = trial.suggest_float("beta", 1e-2, 1e3, log=True)

    # Suggest a learning rate for the optimizer (one optimizer is created per layer below)
    lr = trial.suggest_float("lr", 1e-3, 1e-2, log=True)

    # Create initial dataloader
    current_x_train = x_train.clone().to(device)
    dataloader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(current_x_train),
        batch_size=batch_size,
        shuffle=True
    )
    dataloader_iter = iter(dataloader)

    # Suggest nb_epochs_per_layer
    nb_epochs_per_layer = nb_epochs_per_layer_pool[trial.suggest_int("nb_epochs_per_layer", 0, len(nb_epochs_per_layer_pool)-1)]
    nb_train_iters = nb_epochs_per_layer * len(dataloader)

    # Print some hyper parameters
    print("> hidden dims =", hidden_dims)
    print("> rho =", rho)
    print("> beta =", beta)

    # Launch the training loop
    # For each layer in the stacked autoencoder: train the layer
    for layer_number in range(len(model.autoencoders)):
        layer = model.autoencoders[layer_number]

        # The optimizer only sees the layer being trained, so already-trained
        # layers can never be modified (by weight decay or leftover Adam
        # momentum), whatever way zero_grad() clears gradients.
        optimizer = torch.optim.AdamW(layer.parameters(), lr=lr, weight_decay=1e-4)

        for _ in (pb := tqdm.tqdm(range(nb_train_iters), desc=f"layer: {layer_number}")):
            try:
                (x_batch,) = next(dataloader_iter)
            except StopIteration:
                # Epoch exhausted: restart the (reshuffled) dataloader
                dataloader_iter = iter(dataloader)
                (x_batch,) = next(dataloader_iter)
            x_batch = x_batch.to(device)
            optimizer.zero_grad()
            encoded, decoded = layer(x_batch)
            # Reconstruction error (summed, not averaged)
            loss_1 = torch.nn.functional.mse_loss(decoded, x_batch, reduction='sum')
            # Sparsity penalty on the mean activation of each hidden unit over the batch
            rho_hat = torch.mean(encoded, dim=0)
            loss_2 = _kl_sparsity_penalty(rho, rho_hat)
            loss = loss_1 + beta * loss_2
            loss.backward()
            optimizer.step()
            pb.set_postfix({"loss": loss.item()})

        # Create new dataloader on the latent representations
        with torch.no_grad():
            current_x_train, _ = layer(current_x_train)
            dataloader = torch.utils.data.DataLoader(
                torch.utils.data.TensorDataset(current_x_train),
                batch_size=batch_size,
                shuffle=True
            )
            dataloader_iter = iter(dataloader)

    try:
        # Evaluate the model
        with torch.no_grad():

            # Get the encoded representations as a NumPy array for scikit-learn
            encoded, _ = model(x_train)
            encoded = encoded.cpu().numpy()

            # Evaluate average nmi and ncut over several kmeans runs
            cumulated_nmi = 0
            cumulated_ncut = 0
            for _ in tqdm.tqdm(range(nb_kmeans_tests), desc="computing avg nmi and ncut"):
                kmeans = sklearn.cluster.KMeans(n_clusters=len(set(y)), algorithm="lloyd", random_state=random.randint(0, 10000), n_init='auto')
                y_pred = kmeans.fit_predict(encoded)
                cumulated_nmi += sklearn.metrics.normalized_mutual_info_score(y, y_pred)
                cumulated_ncut += compute_ncut(s, y_pred)
            avg_nmi = cumulated_nmi / nb_kmeans_tests
            avg_ncut = cumulated_ncut / nb_kmeans_tests

            # Print average nmi and ncut
            print("[*] average nmi =", avg_nmi)
            print("[*] average ncut =", avg_ncut)

            # If average nmi is better than the best so far, update best_avg_nmi
            if avg_nmi > best_avg_nmi:
                best_avg_nmi = avg_nmi

            # If average ncut is better than the best so far, update best_avg_ncut and its corresponding average nmi (i.e. best_avg_ncut_avg_nmi)
            if avg_ncut < best_avg_ncut:
                best_avg_ncut = avg_ncut
                best_avg_ncut_avg_nmi = avg_nmi

    except sklearn.exceptions.ConvergenceWarning:
        # Only reachable when ConvergenceWarning has been promoted to an error by a warnings filter
        print("[!] KMeans did not converge (not enough distinct points) --> Returning inf for avg_ncut")
        avg_ncut = float('inf')

    # Return avg_ncut as the objective to minimize
    return avg_ncut


# Set global parameters (read by `objective`)
nb_epochs_per_layer_pool = [10, 100, 500, 1000, 2500, 5000]  # candidate numbers of training epochs per layer
nb_kmeans_tests = 100  # number of KMeans runs averaged when scoring an embedding
nb_trials = 20  # number of Optuna trials
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("[*] using device:", device)
x_train = torch.tensor(nts, dtype=torch.float32).to(device)
batch_size = x_train.shape[0]  # full-batch training: one batch holds every row of x_train

# Set globals to track best results
best_avg_nmi = 0.0
best_avg_ncut = float('inf')
best_avg_ncut_avg_nmi = 0.0

# Run optuna study
# Verbosity is lowered BEFORE the study is created; otherwise the INFO message
# "A new study created in memory ..." is still emitted.
optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(objective, n_trials=nb_trials)

# Display the best results
print("========================================================")
print("========================================================")
print("[*] best avg nmi =", best_avg_nmi)
print("[*] best avg ncut =", best_avg_ncut)
print("[*] best avg ncut avg nmi =", best_avg_ncut_avg_nmi)

[*] using device: cuda


[I 2025-11-15 09:25:17,122] A new study created in memory with name: no-name-32d11d69-2514-4363-9071-f82cb3d25511



trial 0----------------------------
> hidden dims = [175, 122, 85, 59, 41, 28, 19, 13, 9, 6, 4]
> rho = 0.015702970884055395
> beta = 9.846738873614559


layer: 0: 100%|██████████| 10/10 [00:00<00:00, 17.12it/s, loss=5.29e+3]
layer: 1: 100%|██████████| 10/10 [00:00<00:00, 113.23it/s, loss=300]
layer: 2: 100%|██████████| 10/10 [00:00<00:00, 113.55it/s, loss=1.2e+3]
layer: 3: 100%|██████████| 10/10 [00:00<00:00, 118.07it/s, loss=310]
layer: 4: 100%|██████████| 10/10 [00:00<00:00, 118.64it/s, loss=328]
layer: 5: 100%|██████████| 10/10 [00:00<00:00, 123.83it/s, loss=243]
layer: 6: 100%|██████████| 10/10 [00:00<00:00, 119.78it/s, loss=166]
layer: 7: 100%|██████████| 10/10 [00:00<00:00, 100.55it/s, loss=108]
layer: 8: 100%|██████████| 10/10 [00:00<00:00, 98.58it/s, loss=75.2]
layer: 9: 100%|██████████| 10/10 [00:00<00:00, 112.38it/s, loss=57]
layer: 10: 100%|██████████| 10/10 [00:00<00:00, 117.63it/s, loss=43.8]
computing avg nmi and ncut:   0%|          | 0/100 [00:00<?, ?it/s]


[!] KMeans did not converge (not enough distinct points) --> Returning inf for avg_ncut

trial 1----------------------------
> hidden dims = [150, 90, 54, 32, 19, 11, 6]
> rho = 0.006358358856676255
> beta = 34.70266988650411


layer: 0: 100%|██████████| 5000/5000 [00:44<00:00, 113.55it/s, loss=198]    
layer: 1: 100%|██████████| 5000/5000 [00:16<00:00, 305.53it/s, loss=42.2]  
layer: 2: 100%|██████████| 5000/5000 [00:14<00:00, 334.25it/s, loss=57]    
layer: 3: 100%|██████████| 5000/5000 [00:15<00:00, 332.55it/s, loss=38]    
layer: 4: 100%|██████████| 5000/5000 [00:15<00:00, 330.81it/s, loss=23.8] 
layer: 5: 100%|██████████| 5000/5000 [00:15<00:00, 312.72it/s, loss=17]   
layer: 6: 100%|██████████| 5000/5000 [00:15<00:00, 317.90it/s, loss=10.5]
computing avg nmi and ncut: 100%|██████████| 100/100 [00:00<00:00, 400.66it/s]


[*] average nmi = 0.006363918161098709
[*] average ncut = 1.4877014035979175

trial 2----------------------------
> hidden dims = [212, 180, 153, 130, 110]
> rho = 0.0003511356313970409
> beta = 0.08260808399079603


layer: 0: 100%|██████████| 1000/1000 [00:02<00:00, 334.41it/s, loss=64.4]
layer: 1: 100%|██████████| 1000/1000 [00:03<00:00, 311.95it/s, loss=1.93]
layer: 2: 100%|██████████| 1000/1000 [00:02<00:00, 335.67it/s, loss=10.8]
layer: 3: 100%|██████████| 1000/1000 [00:02<00:00, 348.31it/s, loss=0.486]
layer: 4: 100%|██████████| 1000/1000 [00:02<00:00, 349.87it/s, loss=8.7]
computing avg nmi and ncut: 100%|██████████| 100/100 [00:00<00:00, 364.56it/s]


[*] average nmi = 0.004439682635058223
[*] average ncut = 1.3218674932922823

trial 3----------------------------
> hidden dims = [187, 140, 105, 78, 58]
> rho = 0.006847920095574782
> beta = 0.04982752357076448


layer: 0: 100%|██████████| 500/500 [00:01<00:00, 344.61it/s, loss=61.1]
layer: 1: 100%|██████████| 500/500 [00:01<00:00, 320.36it/s, loss=1.7] 
layer: 2: 100%|██████████| 500/500 [00:01<00:00, 337.30it/s, loss=4.67]
layer: 3: 100%|██████████| 500/500 [00:01<00:00, 305.22it/s, loss=0.862]
layer: 4: 100%|██████████| 500/500 [00:01<00:00, 318.08it/s, loss=2.78]
computing avg nmi and ncut: 100%|██████████| 100/100 [00:00<00:00, 308.60it/s]


[*] average nmi = 0.009438542752586815
[*] average ncut = 1.3702278439062812

trial 4----------------------------
> hidden dims = [187, 140, 105, 78, 58, 43, 32, 24, 18, 13, 9]
> rho = 0.0003972110727381913
> beta = 3.725393839578884


layer: 0: 100%|██████████| 10/10 [00:00<00:00, 311.45it/s, loss=755]
layer: 1: 100%|██████████| 10/10 [00:00<00:00, 365.17it/s, loss=40.1]
layer: 2: 100%|██████████| 10/10 [00:00<00:00, 359.88it/s, loss=657]
layer: 3: 100%|██████████| 10/10 [00:00<00:00, 370.33it/s, loss=56]
layer: 4: 100%|██████████| 10/10 [00:00<00:00, 375.97it/s, loss=502]
layer: 5: 100%|██████████| 10/10 [00:00<00:00, 323.08it/s, loss=58]
layer: 6: 100%|██████████| 10/10 [00:00<00:00, 383.29it/s, loss=272]
layer: 7: 100%|██████████| 10/10 [00:00<00:00, 341.47it/s, loss=62.3]
layer: 8: 100%|██████████| 10/10 [00:00<00:00, 338.33it/s, loss=117]
layer: 9: 100%|██████████| 10/10 [00:00<00:00, 355.54it/s, loss=90.3]
layer: 10: 100%|██████████| 10/10 [00:00<00:00, 339.77it/s, loss=45]
computing avg nmi and ncut:   0%|          | 0/100 [00:00<?, ?it/s]


[!] KMeans did not converge (not enough distinct points) --> Returning inf for avg_ncut

trial 5----------------------------
> hidden dims = [200, 160, 128, 102]
> rho = 0.00015673095467235422
> beta = 555.1721685244722


layer: 0: 100%|██████████| 2500/2500 [00:07<00:00, 334.55it/s, loss=182]   
layer: 1: 100%|██████████| 2500/2500 [00:07<00:00, 337.14it/s, loss=221]    
layer: 2: 100%|██████████| 2500/2500 [00:07<00:00, 334.88it/s, loss=143]   
layer: 3: 100%|██████████| 2500/2500 [00:07<00:00, 343.49it/s, loss=140]   
computing avg nmi and ncut: 100%|██████████| 100/100 [00:00<00:00, 342.32it/s]


[*] average nmi = 0.012651105129518376
[*] average ncut = 1.5279956223581628

trial 6----------------------------
> hidden dims = [175, 122]
> rho = 0.01129013355909268
> beta = 1.587678152692399


layer: 0: 100%|██████████| 500/500 [00:01<00:00, 302.83it/s, loss=234]
layer: 1: 100%|██████████| 500/500 [00:01<00:00, 332.18it/s, loss=1]   
computing avg nmi and ncut: 100%|██████████| 100/100 [00:00<00:00, 349.18it/s]


[*] average nmi = 0.008349802906416992
[*] average ncut = 1.3404911067665544

trial 7----------------------------
> hidden dims = [150, 90, 54, 32, 19, 11, 6, 3]
> rho = 0.0005975027999960298
> beta = 20.54051942538844


layer: 0: 100%|██████████| 1000/1000 [00:02<00:00, 339.45it/s, loss=561]   
layer: 1: 100%|██████████| 1000/1000 [00:02<00:00, 338.73it/s, loss=95.4]
layer: 2: 100%|██████████| 1000/1000 [00:03<00:00, 322.48it/s, loss=220]  
layer: 3: 100%|██████████| 1000/1000 [00:03<00:00, 305.14it/s, loss=79.7]
layer: 4: 100%|██████████| 1000/1000 [00:03<00:00, 324.41it/s, loss=93.3]
layer: 5: 100%|██████████| 1000/1000 [00:02<00:00, 349.99it/s, loss=46.3]
layer: 6: 100%|██████████| 1000/1000 [00:02<00:00, 337.59it/s, loss=36.5]
layer: 7: 100%|██████████| 1000/1000 [00:02<00:00, 335.25it/s, loss=8.36]
computing avg nmi and ncut: 100%|██████████| 100/100 [00:00<00:00, 407.16it/s]


[*] average nmi = 0.007475920462929459
[*] average ncut = 1.4897763014982055

trial 8----------------------------
> hidden dims = [187, 140, 105]
> rho = 0.08105016126411585
> beta = 75.10418138777538


layer: 0: 100%|██████████| 5000/5000 [00:14<00:00, 341.49it/s, loss=47]   
layer: 1: 100%|██████████| 5000/5000 [00:14<00:00, 340.85it/s, loss=2.27]
layer: 2: 100%|██████████| 5000/5000 [00:14<00:00, 342.87it/s, loss=0.287]
computing avg nmi and ncut: 100%|██████████| 100/100 [00:00<00:00, 392.57it/s]


[*] average nmi = 0.008227392267905712
[*] average ncut = 1.5129625666932238

trial 9----------------------------
> hidden dims = [200, 160, 128, 102, 81, 64, 51, 40, 32, 25, 20, 16, 12, 9, 7, 5, 4]
> rho = 0.00018427970406864567
> beta = 0.09548041810464164


layer: 0: 100%|██████████| 100/100 [00:00<00:00, 348.98it/s, loss=130]
layer: 1: 100%|██████████| 100/100 [00:00<00:00, 350.63it/s, loss=3.29]
layer: 2: 100%|██████████| 100/100 [00:00<00:00, 363.90it/s, loss=11.6]
layer: 3: 100%|██████████| 100/100 [00:00<00:00, 353.08it/s, loss=4.89]
layer: 4: 100%|██████████| 100/100 [00:00<00:00, 374.20it/s, loss=7.59]
layer: 5: 100%|██████████| 100/100 [00:00<00:00, 336.02it/s, loss=5.81]
layer: 6: 100%|██████████| 100/100 [00:00<00:00, 363.38it/s, loss=6.55]
layer: 7: 100%|██████████| 100/100 [00:00<00:00, 371.69it/s, loss=6.75]
layer: 8: 100%|██████████| 100/100 [00:00<00:00, 363.93it/s, loss=12.1]
layer: 9: 100%|██████████| 100/100 [00:00<00:00, 382.28it/s, loss=27.5]
layer: 10: 100%|██████████| 100/100 [00:00<00:00, 258.21it/s, loss=29.4]
layer: 11: 100%|██████████| 100/100 [00:00<00:00, 327.40it/s, loss=24.6]
layer: 12: 100%|██████████| 100/100 [00:00<00:00, 342.08it/s, loss=57.4]
layer: 13: 100%|██████████| 100/100 [00:00<00:00, 337.26it/s, 

[!] KMeans did not converge (not enough distinct points) --> Returning inf for avg_ncut

trial 10----------------------------
> hidden dims = [225, 202, 181, 162, 145, 130, 117, 105, 94, 84, 75, 67, 60, 54, 48, 43, 38, 34, 30, 27, 24, 21, 18, 16, 14, 12]
> rho = 0.001276986535679652
> beta = 0.012297288957910173


layer: 0: 100%|██████████| 1000/1000 [00:03<00:00, 331.99it/s, loss=51.2]
layer: 1: 100%|██████████| 1000/1000 [00:02<00:00, 354.52it/s, loss=2.27]
layer: 2: 100%|██████████| 1000/1000 [00:02<00:00, 337.46it/s, loss=1.89]
layer: 3: 100%|██████████| 1000/1000 [00:02<00:00, 340.72it/s, loss=0.186]
layer: 4: 100%|██████████| 1000/1000 [00:02<00:00, 346.34it/s, loss=1.41]
layer: 5: 100%|██████████| 1000/1000 [00:03<00:00, 329.14it/s, loss=0.41]
layer: 6: 100%|██████████| 1000/1000 [00:03<00:00, 329.85it/s, loss=1.24]
layer: 7: 100%|██████████| 1000/1000 [00:02<00:00, 345.55it/s, loss=0.682]
layer: 8: 100%|██████████| 1000/1000 [00:02<00:00, 340.45it/s, loss=1.35]
layer: 9: 100%|██████████| 1000/1000 [00:02<00:00, 345.13it/s, loss=1.2]
layer: 10: 100%|██████████| 1000/1000 [00:02<00:00, 348.95it/s, loss=1.47]
layer: 11: 100%|██████████| 1000/1000 [00:03<00:00, 330.99it/s, loss=1.6]
layer: 12: 100%|██████████| 1000/1000 [00:02<00:00, 335.08it/s, loss=1.41]
layer: 13: 100%|██████████| 1000/10

[*] average nmi = 0.006546648262914634
[*] average ncut = 1.2460477633118068

trial 11----------------------------
> hidden dims = [225, 202, 181, 162, 145, 130, 117, 105, 94, 84, 75, 67, 60, 54, 48, 43, 38, 34, 30, 27, 24, 21, 18, 16, 14, 12, 10, 9, 8, 7, 6, 5]
> rho = 0.001194229221743439
> beta = 0.010443437508657413


layer: 0: 100%|██████████| 1000/1000 [00:03<00:00, 274.67it/s, loss=50.7]
layer: 1: 100%|██████████| 1000/1000 [00:03<00:00, 269.98it/s, loss=2.17]
layer: 2: 100%|██████████| 1000/1000 [00:03<00:00, 274.59it/s, loss=1.52]
layer: 3: 100%|██████████| 1000/1000 [00:03<00:00, 280.70it/s, loss=0.175]
layer: 4: 100%|██████████| 1000/1000 [00:03<00:00, 302.43it/s, loss=1.29]
layer: 5: 100%|██████████| 1000/1000 [00:03<00:00, 313.85it/s, loss=0.321]
layer: 6: 100%|██████████| 1000/1000 [00:03<00:00, 281.44it/s, loss=1.11]
layer: 7: 100%|██████████| 1000/1000 [00:02<00:00, 334.73it/s, loss=0.562]
layer: 8: 100%|██████████| 1000/1000 [00:02<00:00, 345.11it/s, loss=1.3]
layer: 9: 100%|██████████| 1000/1000 [00:03<00:00, 301.57it/s, loss=1.12]
layer: 10: 100%|██████████| 1000/1000 [00:03<00:00, 313.63it/s, loss=1.46]
layer: 11: 100%|██████████| 1000/1000 [00:02<00:00, 345.87it/s, loss=1.64]
layer: 12: 100%|██████████| 1000/1000 [00:03<00:00, 290.04it/s, loss=1.56]
layer: 13: 100%|██████████| 1000/

[*] average nmi = 0.005343161470294304
[*] average ncut = 1.2713821600461928

trial 12----------------------------
> hidden dims = [225, 202, 181, 162, 145, 130, 117, 105, 94, 84, 75, 67, 60, 54, 48, 43, 38, 34, 30, 27, 24, 21, 18, 16, 14, 12, 10, 9, 8, 7, 6, 5, 4, 3]
> rho = 0.0016740028870066716
> beta = 0.011302295738159825


layer: 0: 100%|██████████| 2500/2500 [00:08<00:00, 292.23it/s, loss=47.1]
layer: 1: 100%|██████████| 2500/2500 [00:08<00:00, 297.76it/s, loss=6.96]
layer: 2: 100%|██████████| 2500/2500 [00:08<00:00, 288.67it/s, loss=2.35]
layer: 3: 100%|██████████| 2500/2500 [00:09<00:00, 263.80it/s, loss=1.7] 
layer: 4: 100%|██████████| 2500/2500 [00:08<00:00, 289.90it/s, loss=1.92]
layer: 5: 100%|██████████| 2500/2500 [00:08<00:00, 296.41it/s, loss=1.72]
layer: 6: 100%|██████████| 2500/2500 [00:09<00:00, 273.12it/s, loss=2.09] 
layer: 7: 100%|██████████| 2500/2500 [00:08<00:00, 286.29it/s, loss=2.82] 
layer: 8: 100%|██████████| 2500/2500 [00:08<00:00, 284.80it/s, loss=2.71] 
layer: 9: 100%|██████████| 2500/2500 [00:09<00:00, 263.95it/s, loss=3.25] 
layer: 10: 100%|██████████| 2500/2500 [00:10<00:00, 241.42it/s, loss=3.01] 
layer: 11: 100%|██████████| 2500/2500 [00:10<00:00, 232.57it/s, loss=3.36]
layer: 12: 100%|██████████| 2500/2500 [00:11<00:00, 215.51it/s, loss=2.34]
layer: 13: 100%|██████████| 25

[*] average nmi = 0.020581842533657083
[*] average ncut = 1.0797572981718313

trial 13----------------------------
> hidden dims = [225, 202, 181, 162, 145, 130, 117, 105, 94, 84, 75, 67, 60, 54, 48, 43, 38, 34, 30, 27, 24, 21, 18, 16, 14, 12, 10, 9, 8, 7, 6, 5, 4]
> rho = 0.00180174544833436
> beta = 0.0108259210062387


layer: 0: 100%|██████████| 2500/2500 [00:09<00:00, 270.82it/s, loss=44.7]
layer: 1: 100%|██████████| 2500/2500 [00:07<00:00, 326.13it/s, loss=7.01]
layer: 2: 100%|██████████| 2500/2500 [00:07<00:00, 326.37it/s, loss=2.6] 
layer: 3: 100%|██████████| 2500/2500 [00:07<00:00, 327.82it/s, loss=1.53]
layer: 4: 100%|██████████| 2500/2500 [00:07<00:00, 330.31it/s, loss=1.84]
layer: 5: 100%|██████████| 2500/2500 [00:08<00:00, 308.27it/s, loss=2.25] 
layer: 6: 100%|██████████| 2500/2500 [00:08<00:00, 300.95it/s, loss=2.64] 
layer: 7: 100%|██████████| 2500/2500 [00:07<00:00, 332.30it/s, loss=3.6]  
layer: 8: 100%|██████████| 2500/2500 [00:07<00:00, 327.46it/s, loss=3.17] 
layer: 9: 100%|██████████| 2500/2500 [00:07<00:00, 318.42it/s, loss=3.64] 
layer: 10: 100%|██████████| 2500/2500 [00:08<00:00, 305.01it/s, loss=3.26] 
layer: 11: 100%|██████████| 2500/2500 [00:08<00:00, 289.57it/s, loss=2.65]
layer: 12: 100%|██████████| 2500/2500 [00:08<00:00, 292.38it/s, loss=2.29]
layer: 13: 100%|██████████| 2

[*] average nmi = 0.00773132391853962
[*] average ncut = 1.0703472280039739

trial 14----------------------------
> hidden dims = [225, 202, 181, 162, 145, 130, 117, 105, 94, 84, 75, 67, 60, 54, 48, 43, 38, 34, 30, 27, 24, 21, 18, 16, 14, 12, 10, 9, 8, 7, 6, 5, 4, 3]
> rho = 0.00234392752031698
> beta = 0.5176025906267282


layer: 0: 100%|██████████| 2500/2500 [00:07<00:00, 312.60it/s, loss=62]  
layer: 1: 100%|██████████| 2500/2500 [00:08<00:00, 308.29it/s, loss=17.7]
layer: 2: 100%|██████████| 2500/2500 [00:07<00:00, 325.27it/s, loss=14.5]
layer: 3: 100%|██████████| 2500/2500 [00:07<00:00, 322.36it/s, loss=12.5]
layer: 4: 100%|██████████| 2500/2500 [00:07<00:00, 341.36it/s, loss=14.3] 
layer: 5: 100%|██████████| 2500/2500 [00:07<00:00, 349.41it/s, loss=16.9]  
layer: 6: 100%|██████████| 2500/2500 [00:08<00:00, 305.01it/s, loss=19.5] 
layer: 7: 100%|██████████| 2500/2500 [00:08<00:00, 302.06it/s, loss=21.8] 
layer: 8: 100%|██████████| 2500/2500 [00:07<00:00, 316.27it/s, loss=24.1] 
layer: 9: 100%|██████████| 2500/2500 [00:08<00:00, 306.59it/s, loss=25.3]  
layer: 10: 100%|██████████| 2500/2500 [00:07<00:00, 328.30it/s, loss=26.7]  
layer: 11: 100%|██████████| 2500/2500 [00:07<00:00, 343.86it/s, loss=27.3] 
layer: 12: 100%|██████████| 2500/2500 [00:07<00:00, 345.43it/s, loss=31.4]
layer: 13: 100%|████████

[*] average nmi = 0.01255871174497439
[*] average ncut = 1.2991774485150323

trial 15----------------------------
> hidden dims = [212, 180, 153, 130, 110, 93, 79, 67, 56, 47, 39, 33, 28, 23, 19, 16, 13, 11, 9, 7, 5]
> rho = 0.0028610388377873375
> beta = 0.4496745823104299


layer: 0: 100%|██████████| 2500/2500 [00:08<00:00, 302.31it/s, loss=62.9]
layer: 1: 100%|██████████| 2500/2500 [00:08<00:00, 296.59it/s, loss=19.5]
layer: 2: 100%|██████████| 2500/2500 [00:07<00:00, 316.15it/s, loss=13.3]
layer: 3: 100%|██████████| 2500/2500 [00:08<00:00, 299.70it/s, loss=8.95]
layer: 4: 100%|██████████| 2500/2500 [00:07<00:00, 330.11it/s, loss=11.1]
layer: 5: 100%|██████████| 2500/2500 [00:08<00:00, 302.04it/s, loss=12]   
layer: 6: 100%|██████████| 2500/2500 [00:07<00:00, 332.69it/s, loss=15]   
layer: 7: 100%|██████████| 2500/2500 [00:07<00:00, 322.25it/s, loss=17]   
layer: 8: 100%|██████████| 2500/2500 [00:08<00:00, 303.55it/s, loss=17.6] 
layer: 9: 100%|██████████| 2500/2500 [00:08<00:00, 306.34it/s, loss=18.4]
layer: 10: 100%|██████████| 2500/2500 [00:08<00:00, 291.96it/s, loss=24.9]
layer: 11: 100%|██████████| 2500/2500 [00:08<00:00, 282.01it/s, loss=24.6]
layer: 12: 100%|██████████| 2500/2500 [00:08<00:00, 282.39it/s, loss=22.2]
layer: 13: 100%|██████████| 250

[*] average nmi = 0.007352642471465214
[*] average ncut = 1.0265124688216316

trial 16----------------------------
> hidden dims = [212, 180, 153, 130, 110, 93, 79, 67, 56, 47, 39, 33, 28, 23, 19, 16, 13, 11]
> rho = 0.03255881941776387
> beta = 0.36405069134231305


layer: 0: 100%|██████████| 5000/5000 [00:17<00:00, 280.89it/s, loss=7.01]
layer: 1: 100%|██████████| 5000/5000 [00:17<00:00, 292.48it/s, loss=7.1]   
layer: 2: 100%|██████████| 5000/5000 [00:16<00:00, 301.11it/s, loss=10.2]  
layer: 3: 100%|██████████| 5000/5000 [00:15<00:00, 316.25it/s, loss=14.5]  
layer: 4: 100%|██████████| 5000/5000 [00:16<00:00, 295.54it/s, loss=18.1]  
layer: 5: 100%|██████████| 5000/5000 [00:16<00:00, 303.14it/s, loss=23.7]  
layer: 6: 100%|██████████| 5000/5000 [00:15<00:00, 322.70it/s, loss=33.1]  
layer: 7: 100%|██████████| 5000/5000 [00:15<00:00, 326.62it/s, loss=42.3] 
layer: 8: 100%|██████████| 5000/5000 [00:15<00:00, 330.66it/s, loss=46]   
layer: 9: 100%|██████████| 5000/5000 [00:15<00:00, 321.67it/s, loss=38.8]
layer: 10: 100%|██████████| 5000/5000 [00:16<00:00, 294.34it/s, loss=29.9]
layer: 11: 100%|██████████| 5000/5000 [00:17<00:00, 286.96it/s, loss=17.9]
layer: 12: 100%|██████████| 5000/5000 [00:17<00:00, 282.04it/s, loss=12.2]
layer: 13: 100%|█████

[*] average nmi = 0.006439217810822169
[*] average ncut = 1.3685896986048605

trial 17----------------------------
> hidden dims = [212, 180, 153, 130, 110, 93, 79, 67, 56, 47, 39, 33, 28, 23, 19, 16, 13, 11, 9]
> rho = 0.0034335100607329173
> beta = 0.6827687354505085


layer: 0: 100%|██████████| 2500/2500 [00:07<00:00, 319.09it/s, loss=66.4]
layer: 1: 100%|██████████| 2500/2500 [00:07<00:00, 344.34it/s, loss=20.9]
layer: 2: 100%|██████████| 2500/2500 [00:08<00:00, 295.23it/s, loss=16]  
layer: 3: 100%|██████████| 2500/2500 [00:08<00:00, 294.39it/s, loss=10.4]
layer: 4: 100%|██████████| 2500/2500 [00:08<00:00, 290.15it/s, loss=13.2]
layer: 5: 100%|██████████| 2500/2500 [00:08<00:00, 311.54it/s, loss=13.7]
layer: 6: 100%|██████████| 2500/2500 [00:08<00:00, 305.77it/s, loss=16.3] 
layer: 7: 100%|██████████| 2500/2500 [00:08<00:00, 292.06it/s, loss=18.6] 
layer: 8: 100%|██████████| 2500/2500 [00:08<00:00, 309.21it/s, loss=20.5] 
layer: 9: 100%|██████████| 2500/2500 [00:07<00:00, 321.85it/s, loss=23.4]
layer: 10: 100%|██████████| 2500/2500 [00:07<00:00, 313.65it/s, loss=24.1]
layer: 11: 100%|██████████| 2500/2500 [00:08<00:00, 309.97it/s, loss=27.5]
layer: 12: 100%|██████████| 2500/2500 [00:07<00:00, 314.42it/s, loss=24.2]
layer: 13: 100%|██████████| 2500

[*] average nmi = 0.017086920588152435
[*] average ncut = 0.9799537492641504

trial 18----------------------------
> hidden dims = [212, 180, 153, 130, 110, 93, 79, 67, 56, 47, 39, 33, 28, 23, 19, 16, 13, 11]
> rho = 0.004241127277176174
> beta = 0.6687968305456375


layer: 0: 100%|██████████| 2500/2500 [00:08<00:00, 300.56it/s, loss=76.4]
layer: 1: 100%|██████████| 2500/2500 [00:07<00:00, 321.09it/s, loss=20.8]
layer: 2: 100%|██████████| 2500/2500 [00:07<00:00, 322.59it/s, loss=19.5]
layer: 3: 100%|██████████| 2500/2500 [00:08<00:00, 310.98it/s, loss=6.25]
layer: 4: 100%|██████████| 2500/2500 [00:08<00:00, 302.19it/s, loss=14.4]
layer: 5: 100%|██████████| 2500/2500 [00:08<00:00, 301.31it/s, loss=6.79]
layer: 6: 100%|██████████| 2500/2500 [00:08<00:00, 306.40it/s, loss=11.7]
layer: 7: 100%|██████████| 2500/2500 [00:08<00:00, 294.97it/s, loss=7.46]
layer: 8: 100%|██████████| 2500/2500 [00:07<00:00, 319.10it/s, loss=10.1]
layer: 9: 100%|██████████| 2500/2500 [00:07<00:00, 312.86it/s, loss=8.45]
layer: 10: 100%|██████████| 2500/2500 [00:07<00:00, 316.65it/s, loss=9.64]
layer: 11: 100%|██████████| 2500/2500 [00:07<00:00, 315.54it/s, loss=8.94]
layer: 12: 100%|██████████| 2500/2500 [00:08<00:00, 308.65it/s, loss=9.45]
layer: 13: 100%|██████████| 2500/25

[*] average nmi = 0.008342580867956393
[*] average ncut = 1.574586618931203

trial 19----------------------------
> hidden dims = [200, 160, 128, 102, 81, 64, 51, 40, 32, 25, 20, 16, 12, 9, 7]
> rho = 0.020766244840873534
> beta = 1.7318945902290381


layer: 0: 100%|██████████| 5000/5000 [00:15<00:00, 313.19it/s, loss=57.4]
layer: 1: 100%|██████████| 5000/5000 [00:16<00:00, 304.25it/s, loss=13.9]
layer: 2: 100%|██████████| 5000/5000 [00:16<00:00, 304.14it/s, loss=11.1]
layer: 3: 100%|██████████| 5000/5000 [00:16<00:00, 303.77it/s, loss=13.7]
layer: 4: 100%|██████████| 5000/5000 [00:16<00:00, 307.24it/s, loss=17.3]  
layer: 5: 100%|██████████| 5000/5000 [00:16<00:00, 296.15it/s, loss=21.6]  
layer: 6: 100%|██████████| 5000/5000 [00:17<00:00, 283.64it/s, loss=26.6]  
layer: 7: 100%|██████████| 5000/5000 [00:15<00:00, 323.94it/s, loss=33.6]
layer: 8: 100%|██████████| 5000/5000 [00:15<00:00, 318.56it/s, loss=39.3]
layer: 9: 100%|██████████| 5000/5000 [00:14<00:00, 342.73it/s, loss=35.3]
layer: 10: 100%|██████████| 5000/5000 [00:14<00:00, 356.04it/s, loss=25]  
layer: 11: 100%|██████████| 5000/5000 [00:16<00:00, 307.75it/s, loss=21.4]
layer: 12: 100%|██████████| 5000/5000 [00:16<00:00, 308.33it/s, loss=20.6]
layer: 13: 100%|██████████| 5

[*] average nmi = 0.003824194187410867
[*] average ncut = 1.103303976353766
[*] best avg nmi = 0.020581842533657083
[*] best avg ncut = 0.9799537492641504
[*] best avg ncut avg nmi = 0.017086920588152435



