In [1]:
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import optuna
import scipy
import sklearn
import sklearn.cluster
import sklearn.metrics
import torch
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# 1. Model definition

In [2]:
class AutoEncoder(torch.nn.Module):
    """One-hidden-layer autoencoder with sigmoid activations on both stages.

    Args:
        input_dim: size of the input vectors (and of their reconstruction).
        hidden_dim: size of the hidden code.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Creation order (encoder, then decoder) determines how the RNG is
        # consumed during weight initialisation, so it is kept as is.
        self.encoder = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.decoder = torch.nn.Linear(in_features=hidden_dim, out_features=input_dim)

    def forward(self, x):
        """Return the pair ``(code, reconstruction)`` computed from ``x``."""
        code = self.encoder(x).sigmoid()
        reconstruction = self.decoder(code).sigmoid()
        return code, reconstruction

In [3]:
class GraphEncoder(torch.nn.Module):
    """Stacked autoencoder built from a chain of ``AutoEncoder`` blocks.

    Args:
        input_dim: size of the input vectors.
        hidden_dims: iterable with the hidden size of each successive block;
            block ``i`` takes the output size of block ``i - 1`` as its input.
    """

    def __init__(self, input_dim, hidden_dims):
        super().__init__()
        # Consecutive pairs of sizes give (input, hidden) for each block.
        layer_sizes = [input_dim, *hidden_dims]
        self.autoencoders = torch.nn.ModuleList(
            [AutoEncoder(n_in, n_out) for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])]
        )

    def forward(self, x):
        """Return ``(encoded, decoded)`` after the full encode/decode pass."""
        hidden = x
        # Encode: apply every block's encoder from first to last.
        for block in self.autoencoders:
            hidden = block.encoder(hidden).sigmoid()
        encoded = hidden
        # Decode: undo the stack by applying the decoders from last to first.
        for block in reversed(self.autoencoders):
            hidden = block.decoder(hidden).sigmoid()
        decoded = hidden
        return encoded, decoded

# 2. Test on the "email" benchmark

## 2.1. Data loading

In [10]:
# Read the email GML file into a networkx graph
nxg = nx.read_gml("../datasets/reel/email/email.gml")
# Ground-truth community label of every node, in node iteration order
y = [attrs["value"] for _, attrs in nxg.nodes(data=True)]
# Similarity matrix = adjacency matrix plus self-loops
# (self-loops are not in the original paper but improve performance)
s = nx.to_numpy_array(nxg) + np.eye(nxg.number_of_nodes())
# Row-normalise the similarity matrix to obtain the training set
nts = s / s.sum(axis=1, keepdims=True)
print("[*] nts.shape:", nts.shape)
print("[*] number of clusters:", len(set(y)))

# Baseline: average NMI of k-means run directly on the normalised rows,
# over NB_KMEANS_TESTS differently seeded initialisations
NB_KMEANS_TESTS = 20
random.seed(0)
cum = 0
for _ in tqdm.tqdm(range(NB_KMEANS_TESTS)):
    kmeans = sklearn.cluster.KMeans(
        n_clusters=len(set(y)),
        algorithm="lloyd",
        random_state=random.randint(0, 10000),
    )
    y_pred_origspace = kmeans.fit_predict(nts)
    cum += sklearn.metrics.normalized_mutual_info_score(y, y_pred_origspace)
print("[*] original space average nmi:", cum / NB_KMEANS_TESTS)

[*] nts.shape: (1005, 1005)
[*] number of clusters: 42


100%|██████████| 20/20 [00:02<00:00,  7.21it/s]

[*] original space average nmi: 0.35702058173203777





## 2.2. Manual tuning

In [5]:
# Seed the RNGs so weight initialisation and batch shuffling are repeatable
torch.manual_seed(0)
np.random.seed(0)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device:", DEVICE)
BATCH_SIZE = 250
HIDDEN_DIMS =  [950, 850, 750, 650, 550, 300, 170, 90, 45]

# Create the model
model = GraphEncoder(input_dim=nts.shape[1], hidden_dims=HIDDEN_DIMS).to(DEVICE)

# Create the optimizer (a single Adam instance holding every layer's parameters)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Create initial dataloader
x_train = torch.tensor(nts, dtype=torch.float32).to(DEVICE)
current_x_train = x_train.clone()
dataloader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(current_x_train),
    batch_size=BATCH_SIZE,
    shuffle=True
)
dataloader_iter = iter(dataloader)

# Set some training parameters
NB_EPOCHS_PER_LAYER = 1000
nb_train_iters = NB_EPOCHS_PER_LAYER * len(dataloader)
rho=0.01   # target mean activation of each hidden unit
beta=1.0   # weight of the sparsity penalty relative to the reconstruction error

# Launch the training loop
# Greedy layer-wise training: each autoencoder is trained on the output of the previous one
for layer_number in range(len(model.autoencoders)):
    autoencoder = model.autoencoders[layer_number]
    for _ in tqdm.tqdm(range(nb_train_iters), desc=f"layer: {layer_number+1}"):
        # Cycle over the dataloader: restart it whenever an epoch is exhausted
        try:
            (x_batch,) = next(dataloader_iter)
        except StopIteration:
            dataloader_iter = iter(dataloader)
            (x_batch,) = next(dataloader_iter)
        # set_to_none=True is required for correctness, not just speed: the
        # optimizer skips parameters whose grad is None, so layers other than
        # the current one stay frozen. With zero-filled grads Adam would keep
        # moving previously trained layers through its leftover momentum.
        optimizer.zero_grad(set_to_none=True)
        encoded, decoded = autoencoder(x_batch)
        # Reconstruction error (summed squared error over the batch)
        loss_1 = torch.nn.functional.mse_loss(decoded, x_batch, reduction='sum')
        # KL-divergence sparsity penalty between rho and the mean activation of each hidden unit
        rho_hat = torch.mean(encoded, dim=0)
        loss_2 = torch.sum(rho * torch.log(rho / rho_hat) + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat)))
        loss = loss_1 + beta * loss_2
        loss.backward()
        optimizer.step()

    # Create new dataloader on the latent representations
    with torch.no_grad():
        latent_x_train, _ = autoencoder(current_x_train)
        dataloader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(latent_x_train),
            batch_size=BATCH_SIZE,
            shuffle=True
        )
        dataloader_iter = iter(dataloader)
        current_x_train = latent_x_train.clone()

device: cpu


layer: 1: 100%|██████████| 5000/5000 [01:08<00:00, 73.24it/s]
layer: 2: 100%|██████████| 5000/5000 [00:50<00:00, 99.04it/s] 
layer: 3: 100%|██████████| 5000/5000 [00:39<00:00, 125.26it/s]
layer: 4: 100%|██████████| 5000/5000 [00:33<00:00, 150.31it/s]
layer: 5: 100%|██████████| 5000/5000 [00:28<00:00, 175.34it/s]
layer: 6: 100%|██████████| 5000/5000 [00:16<00:00, 299.09it/s]
layer: 7: 100%|██████████| 5000/5000 [00:11<00:00, 435.23it/s]
layer: 8: 100%|██████████| 5000/5000 [00:09<00:00, 528.86it/s]
layer: 9: 100%|██████████| 5000/5000 [00:08<00:00, 601.82it/s]


In [11]:
# Test on the latent space
with torch.no_grad():
    latent, _ = model(x_train)
# scikit-learn works on host NumPy arrays: convert once, outside the loop
latent_np = latent.detach().cpu().numpy()
cum = 0
NB_KMEANS_TESTS = 20
# Seed the k-means initialisations the same way as the original-space baseline,
# so the reported average is reproducible from one run to the next
random.seed(0)
for _ in tqdm.tqdm(range(NB_KMEANS_TESTS)):
    kmeans = sklearn.cluster.KMeans(n_clusters=len(set(y)), algorithm="lloyd", random_state=random.randint(0, 10000))
    y_pred_latent = kmeans.fit_predict(latent_np)
    cum += sklearn.metrics.normalized_mutual_info_score(y, y_pred_latent)
print("[*] latent space nmi:", cum / NB_KMEANS_TESTS)

100%|██████████| 20/20 [00:00<00:00, 97.10it/s]

[*] latent space nmi: 0.24720822940449344





## 2.3. Model training with hyperparameter tuning

In [14]:
def objective(trial):

    # Print trial number
    print(f"\ntrial {trial.number}----------------------------")
    
    # Set globals
    global best_avg_nmi
    global best_loss
    global best_loss_avg_nmi
    
    # Set random seeds
    torch.manual_seed(0)
    np.random.seed(0)
    random.seed(0)

    # Suggest the number of layers and a decay rate for hidden dimensions
    n_layers = trial.suggest_int("n_layers", 1, 5, step=1)
    dim_decay_rate = trial.suggest_float("dim_decay_rate", 0.6, 0.8, step=0.1)

    # Compute the hidden dimensions
    hidden_dims = []
    prev_dim = x_train.shape[1]
    for _ in range(n_layers):
        next_dim = max(2, int(prev_dim * dim_decay_rate))
        hidden_dims.append(next_dim)
        prev_dim = next_dim
    
    # Create the model using the hidden dimensions
    model = GraphEncoder(input_dim=x_train.shape[1], hidden_dims=hidden_dims).to(DEVICE)

    # Suggest rho and beta for the sparsity constraint
    rho = trial.suggest_float("rho", 1e-4, 1e-1, log=True)
    beta = trial.suggest_float("beta", 1e-2, 1e3, log=True)
    
    # Suggest the optimizer (for now only AdamW is implemented)
    optimizer_name = trial.suggest_categorical("optimizer", ["AdamW"])
    
    # Create the optimizer based on the choice
    match optimizer_name:
        case "AdamW":
            
            # Suggest a learning rate
            lr = trial.suggest_float("lr", 1e-3, 1e-3, log=True)

            # Suggest weight_decay for AdamW
            weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-4, log=True)

            # Create the optimizer
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    # Suggest batch size
    batch_size = trial.suggest_int("batch_size", 34, 34)

    # Create initial dataloader
    current_x_train = x_train.clone().to(DEVICE)
    dataloader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(current_x_train),
        batch_size=batch_size,
        shuffle=True
    )
    dataloader_iter = iter(dataloader)

    nb_train_iters = NB_EPOCHS_PER_LAYER * len(dataloader)

    # Print some hyper parameters
    print("hidden dims =", hidden_dims)
    print("rho =", rho)
    print("beta =", beta)
    
    # Launch the training loop
    # For each layer in the stacked autoencoder: train the layer
    for layer_number in range(len(model.autoencoders)):
        for _ in (pb := tqdm.tqdm(range(nb_train_iters), desc=f"layer: {layer_number}")):
            try:
                (x_batch,) = next(dataloader_iter)
            except StopIteration:
                dataloader_iter = iter(dataloader)
                (x_batch,) = next(dataloader_iter)
            optimizer.zero_grad()
            encoded, decoded = model.autoencoders[layer_number](x_batch)
            loss_1 = torch.nn.functional.mse_loss(decoded, x_batch, reduction='sum')
            rho_hat = torch.mean(encoded, dim=0)
            loss_2 = torch.sum(rho * torch.log(rho / rho_hat) + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat)))
            loss = loss_1 + beta * loss_2
            loss.backward()
            optimizer.step()
            pb.set_postfix({"loss": loss.item()})

        # Create new dataloader on the latent representations
        with torch.no_grad():
            current_x_train, _ = model.autoencoders[layer_number](current_x_train)
            dataloader = torch.utils.data.DataLoader(
                torch.utils.data.TensorDataset(current_x_train),
                batch_size=batch_size,
                shuffle=True
            )
            dataloader_iter = iter(dataloader)
    
    # Evaluate the model
    with torch.no_grad():
        
        # Evaluate loss
        encoded, decoded = model(x_train)
        loss_1 = torch.nn.functional.mse_loss(decoded, x_train, reduction='sum').item()
        rho_hat = torch.mean(encoded, dim=0)
        loss_2 = torch.sum(rho * torch.log(rho / rho_hat) + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat))).item()
        loss = loss_1 + beta * loss_2
        print(f"[*] loss = {loss:.4f} (reconstruction: {loss_1:.4f}, sparsity: {loss_2:.4f})")
        
        # Evaluate average nmi
        cum = 0
        for _ in tqdm.tqdm(range(NB_KMEANS_TESTS), desc="average nmi"):
            kmeans = sklearn.cluster.KMeans(n_clusters=len(set(y)), algorithm="lloyd", random_state=random.randint(0, 10000))
            y_pred = kmeans.fit_predict(encoded)
            cum += sklearn.metrics.normalized_mutual_info_score(y, y_pred)
        avg_nmi = cum / NB_KMEANS_TESTS
        print("[*] average nmi =", avg_nmi)
        if avg_nmi > best_avg_nmi:
            best_avg_nmi = avg_nmi      
        if loss < best_loss:
            best_loss = loss
            best_loss_avg_nmi = avg_nmi

    return loss


# Settings read by `objective` through module-level globals
NB_KMEANS_TESTS = 20
NB_EPOCHS_PER_LAYER = 100
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
x_train = torch.tensor(nts, dtype=torch.float32)

# Running bests, updated by `objective` after every trial
best_avg_nmi = 0.0
best_loss = float('inf')
best_loss_avg_nmi = 0.0

# Build a loss-minimising study driven by a seeded TPE sampler
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# Silence optuna's per-trial logging, then run 10 trials
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=10)

# Report the best results
print("========================================================")
print("========================================================")
print("[*] best avg nmi =", best_avg_nmi)
print("[*] best loss =", best_loss)
print("[*] best loss avg nmi=", best_loss_avg_nmi)


trial 0----------------------------
hidden dims = [804, 643]
rho = 0.015702970884055395
beta = 9.846738873614559


layer: 0:   0%|          | 0/3000 [00:00<?, ?it/s]

layer: 0: 100%|██████████| 3000/3000 [00:21<00:00, 136.70it/s, loss=214]    
layer: 1: 100%|██████████| 3000/3000 [00:12<00:00, 230.90it/s, loss=21.3] 


[*] loss = 511.0034 (reconstruction: 490.7161, sparsity: 2.0603)


average nmi: 100%|██████████| 20/20 [00:02<00:00,  7.31it/s]


[*] average nmi = 0.20754237920575083

trial 1----------------------------
hidden dims = [603]
rho = 0.00014936568554617635
beta = 214.23021757741054


layer: 0: 100%|██████████| 3000/3000 [00:14<00:00, 202.36it/s, loss=6.49e+3]


[*] loss = 7787.6238 (reconstruction: 831.2121, sparsity: 32.4717)


average nmi: 100%|██████████| 20/20 [00:02<00:00,  8.07it/s]


[*] average nmi = 0.19856419945182924

trial 2----------------------------
hidden dims = [804, 643, 514, 411]
rho = 0.00011527987128232407
beta = 707.2114131472224


layer: 0: 100%|██████████| 3000/3000 [00:18<00:00, 166.00it/s, loss=3.01e+4]
layer: 1: 100%|██████████| 3000/3000 [00:12<00:00, 241.51it/s, loss=112]    
layer: 2: 100%|██████████| 3000/3000 [00:08<00:00, 341.85it/s, loss=3.09e+4]
layer: 3: 100%|██████████| 3000/3000 [00:06<00:00, 449.82it/s, loss=768]    


[*] loss = 2490.9825 (reconstruction: 2243.7710, sparsity: 0.3496)


average nmi: 100%|██████████| 20/20 [00:00<00:00, 54.43it/s]


[*] average nmi = 0.1608980721176483

trial 3----------------------------
hidden dims = [603, 361, 216, 129, 77]
rho = 0.0003511356313970409
beta = 0.08260808399079603


layer: 0: 100%|██████████| 3000/3000 [00:14<00:00, 206.65it/s, loss=19.8]
layer: 1: 100%|██████████| 3000/3000 [00:06<00:00, 452.71it/s, loss=1.51]
layer: 2: 100%|██████████| 3000/3000 [00:05<00:00, 585.65it/s, loss=5.42]
layer: 3: 100%|██████████| 3000/3000 [00:04<00:00, 695.00it/s, loss=1.01] 
layer: 4: 100%|██████████| 3000/3000 [00:03<00:00, 772.36it/s, loss=2.74]


[*] loss = 171.7106 (reconstruction: 169.0903, sparsity: 31.7199)


average nmi: 100%|██████████| 20/20 [00:00<00:00, 102.06it/s]

[*] average nmi = 0.2159634628835377

trial 4----------------------------





hidden dims = [703, 492]
rho = 0.0019762189340280074
beta = 0.2858549394196191


layer: 0: 100%|██████████| 3000/3000 [00:15<00:00, 188.79it/s, loss=32.7]
layer: 1: 100%|██████████| 3000/3000 [00:09<00:00, 326.62it/s, loss=4.27]


[*] loss = 195.7769 (reconstruction: 191.7770, sparsity: 13.9928)


average nmi: 100%|██████████| 20/20 [00:02<00:00,  9.63it/s]


[*] average nmi = 0.1895940464059702

trial 5----------------------------
hidden dims = [603, 361, 216, 129]
rho = 0.0007523742884534858
beta = 0.6789053271698483


layer: 0: 100%|██████████| 3000/3000 [00:14<00:00, 206.73it/s, loss=51.1]
layer: 1: 100%|██████████| 3000/3000 [00:06<00:00, 441.99it/s, loss=7.29]
layer: 2: 100%|██████████| 3000/3000 [00:04<00:00, 604.77it/s, loss=19.4]
layer: 3: 100%|██████████| 3000/3000 [00:04<00:00, 679.45it/s, loss=8.53]


[*] loss = 351.4947 (reconstruction: 343.2724, sparsity: 12.1110)


average nmi: 100%|██████████| 20/20 [00:00<00:00, 79.94it/s]


[*] average nmi = 0.18843784631434987

trial 6----------------------------
hidden dims = [804, 643, 514]
rho = 0.0003972110727381913
beta = 3.725393839578884


layer: 0: 100%|██████████| 3000/3000 [00:18<00:00, 160.46it/s, loss=191]   
layer: 1: 100%|██████████| 3000/3000 [00:14<00:00, 214.04it/s, loss=24.4]
layer: 2: 100%|██████████| 3000/3000 [00:09<00:00, 313.57it/s, loss=59.4] 


[*] loss = 2337.9276 (reconstruction: 2285.9788, sparsity: 13.9445)


average nmi: 100%|██████████| 20/20 [00:02<00:00,  9.22it/s]


[*] average nmi = 0.20820467713320276

trial 7----------------------------
hidden dims = [603, 361, 216]
rho = 0.006647135865318031
beta = 0.0712230583333387


layer: 0: 100%|██████████| 3000/3000 [00:14<00:00, 202.61it/s, loss=15.4]
layer: 1: 100%|██████████| 3000/3000 [00:06<00:00, 451.60it/s, loss=1.04] 
layer: 2: 100%|██████████| 3000/3000 [00:04<00:00, 601.35it/s, loss=4.06]


[*] loss = 158.1226 (reconstruction: 154.1049, sparsity: 56.4092)


average nmi: 100%|██████████| 20/20 [00:00<00:00, 68.40it/s]


[*] average nmi = 0.20762316391804805

trial 8----------------------------
hidden dims = [804]
rho = 0.07886714129990492
beta = 110.15056790269621


layer: 0: 100%|██████████| 3000/3000 [00:18<00:00, 163.32it/s, loss=208]    


[*] loss = 344.8956 (reconstruction: 251.4968, sparsity: 0.8479)


average nmi: 100%|██████████| 20/20 [00:02<00:00,  9.37it/s]


[*] average nmi = 0.19867550755885655

trial 9----------------------------
hidden dims = [603, 361]
rho = 0.01129013355909268
beta = 1.587678152692399


layer: 0: 100%|██████████| 3000/3000 [00:14<00:00, 208.50it/s, loss=59.8]
layer: 1: 100%|██████████| 3000/3000 [00:06<00:00, 472.61it/s, loss=9.9] 


[*] loss = 325.4538 (reconstruction: 315.6394, sparsity: 6.1816)


average nmi: 100%|██████████| 20/20 [00:00<00:00, 47.29it/s]

[*] average nmi = 0.21434911949799323
[*] best avg nmi = 0.2159634628835377
[*] best loss = 158.12258859370095
[*] best loss avg nmi= 0.20762316391804805



