In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse as sparse
from sklearn.cluster import KMeans
from itertools import permutations
from copy import deepcopy

def ScBM_generator(row_label, column_label, link_matrices, density=1):
    """Sample a multi-layer stochastic co-blockmodel graph (L*n*n).

    Every block matrix in ``link_matrices`` is expanded to node level by
    indexing its rows with ``row_label`` and its columns with ``column_label``.
    The diagonal of each expanded layer is then set to zero (no self-loops),
    the probabilities are multiplied by ``density`` and each entry is drawn as
    an independent Bernoulli variable from NumPy's global random state.

    Returns one sampled 0/1 adjacency matrix per layer.
    """
    node_probs = link_matrices[:, row_label][:, :, column_label]
    # Rebuild each layer's diagonal as a diagonal matrix so that subtracting it
    # zeroes the self-loop probabilities and leaves everything else untouched.
    layer_diagonals = np.diagonal(node_probs, axis1=1, axis2=2)
    self_loop_probs = layer_diagonals[:, :, np.newaxis] * np.eye(node_probs.shape[2])
    edge_probs = node_probs - self_loop_probs
    return np.random.binomial(1, density * edge_probs)


def dsog(scbm, K, K_clusters, method="DSoG", row=True):
    """
    Spectral co-clustering based on the DSoG or SoG
    param scbm: a multi-layer directed network, L*n*n
    param K: embedding dimension (number of leading eigenvectors kept)
    param K_clusters: the number of clusters
    param method: {"SoG", "DSoG"}; "SoG" uses the plain sum of Gram matrices,
        any other value uses the bias-adjusted sum
    param row: True clusters the row nodes from sum_l A_l A_l^T,
        False clusters the column nodes from sum_l A_l^T A_l
    return: the k-means label of every node
    """
    # Sum of the per-layer Gram matrices, accumulated in a single matrix.
    side = scbm.shape[1] if row else scbm.shape[2]
    gram_sum = np.zeros((side, side), dtype=scbm.dtype)
    for l in range(scbm.shape[0]):
        Al_sparse = sparse.coo_matrix(scbm[l])
        gram = Al_sparse.dot(Al_sparse.T) if row else Al_sparse.T.dot(Al_sparse)
        gram_sum += gram.toarray()

    if method != "SoG":
        # The bias-adjusted sum of Gram matrices. For a 0/1 adjacency matrix the
        # diagonal of A A^T holds the row sums and the diagonal of A^T A holds
        # the column sums, so the degrees removed must follow `row`.
        degrees = np.sum(scbm, axis=0).sum(axis=1 if row else 0)
        gram_sum = gram_sum - np.diag(degrees)

    # K leading eigenvectors: eigh returns eigenvalues in ascending order, so
    # the columns are reversed before the first K are taken.
    _, v = np.linalg.eigh(gram_sum)
    vs = v[:, ::-1][:, 0:K]

    # k-means
    k_means = KMeans(init="k-means++", n_clusters=K_clusters, n_init=scbm.shape[1])
    k_means.fit(vs)
    return k_means.labels_


def sum_adjs(scbm, K, K_clusters, row=True):
    """Spectral co-clustering based on the Sum of the layers.

    The layers of ``scbm`` are added up and the result is decomposed by SVD.
    The first ``K`` left singular vectors (``row=True``) or right singular
    vectors (``row=False``) are clustered by k-means into ``K_clusters``
    groups; the k-means labels are returned.
    """
    aggregated = np.sum(scbm, axis=0)
    left, _, right_t = np.linalg.svd(aggregated)
    if row:
        embedding = left[:, :K]
    else:
        # svd returns the right singular vectors as rows
        embedding = right_t[:K].T

    clusterer = KMeans(init="k-means++", n_clusters=K_clusters, n_init=scbm.shape[1])
    clusterer.fit(embedding)
    return clusterer.labels_


def mase(scbm, K_ase, K_mase, K_clusters, row=True):
    """
    Multiple adjacency spectral embedding. see Arroyo et.al. JMLR 2021
    scbm : a multi-layer directed network, L*n*n
    K_ase : embedding dimension of single-layer
    K_mase : embedding dimension of concatenated eigenvectors
    K_clusters : the number of clusters
    row : True embeds with left singular vectors, False with right ones
    Returns the k-means label of every node.
    """
    try:
        # Batched SVD: one decomposition per layer.
        u, _, v = np.linalg.svd(scbm)
        # Per-layer embedding of shape (L, n, K_ase).
        vs = u[:, :, 0:K_ase] if row else v[:, 0:K_ase].transpose((0,2,1))
        # Lay the L per-layer embeddings side by side as an (n, L*K_ase) matrix
        # and decompose it again.
        u_e, _, _ = np.linalg.svd(vs.transpose(0,2,1).reshape(-1,vs.shape[1]).T) #embedding
        vs = u_e[:, 0:K_mase] # K_mase leading eigenvectors
    except np.linalg.LinAlgError:
        # The SVD did not converge: fall back to a random embedding made of
        # K_clusters distinct points, so k-means can still form the requested
        # number of clusters (this used to be fixed at 3).
        unique_rows = np.random.rand(K_clusters, K_mase)
        vs = unique_rows[np.random.choice(K_clusters, scbm.shape[1])]

    # k-means
    k_means = KMeans(init="k-means++", n_clusters=K_clusters, n_init=scbm.shape[1])
    k_means.fit(vs)
    return k_means.labels_
    

def min_mis_error(true_label, kmeans_labels, k_clusters):
    """Misclassification rate under the optimal permutation of cluster names.

    Cluster names produced by k-means are arbitrary, so every permutation of
    ``range(k_clusters)`` is tried as a renaming of ``kmeans_labels`` and the
    smallest fraction of nodes whose renamed label differs from ``true_label``
    is returned.

    true_label : 1-D array-like of reference labels
    kmeans_labels : 1-D array-like of estimated labels, same length
    k_clusters : number of cluster names that are permuted
    Returns the minimal misclassification rate, a value in [0, 1].
    Raises ValueError if the inputs are not 1-D, differ in length or are empty.
    """
    truth = np.asarray(true_label)
    estimate = np.asarray(kmeans_labels)
    if truth.ndim != 1 or truth.shape != estimate.shape:
        raise ValueError("true_label and kmeans_labels must be 1-D and of equal length")
    n_labels = truth.shape[0]
    if n_labels == 0:
        raise ValueError("cannot compute a misclassification rate for empty labels")

    cluster_ids = range(k_clusters)
    # overlap[j, r]: number of nodes in estimated cluster j whose true label is r.
    # Built once, so each permutation costs k_clusters lookups instead of a
    # full pass over the labels.
    overlap = np.array(
        [[np.count_nonzero((estimate == j) & (truth == r)) for r in cluster_ids]
         for j in cluster_ids],
        dtype=np.int64,
    ).reshape(k_clusters, k_clusters)

    # Estimated labels outside range(k_clusters) are never renamed; they count
    # as correct only when they already equal the true label.
    untouched = ~np.isin(estimate, np.arange(k_clusters))
    fixed_hits = np.count_nonzero(untouched & (estimate == truth))

    best_hits = max(
        sum(overlap[j, r] for j, r in enumerate(perm))
        for perm in permutations(cluster_ids)
    )
    return np.int64(n_labels - fixed_hits - best_hits) / n_labels


def repeat(row_label, column_label, link_matrices, K, K_sum, K_ase, K_mase, K_clusters, densities, n_trials=100, row=True):
    """Run the Monte-Carlo comparison of the four co-clustering methods.

    For every trial and every value in ``densities`` a fresh network is drawn
    with ``ScBM_generator`` and clustered by Sum, SoG, DSoG and MASE (in that
    order). Each result is scored with ``min_mis_error`` against the row
    labels (``row=True``) or the column labels (``row=False``).

    Returns an array of shape (4, n_trials, n_densities) of misclassification
    rates, with methods ordered ["Sum", "SoG", "DSoG", "MASE"].
    """
    densities = np.atleast_1d(densities)
    mis_errors = np.zeros((4, n_trials, densities.shape[0]))
    reference = row_label if row else column_label

    # One callable per method; the position in the tuple is the first index
    # of `mis_errors`.
    estimators = (
        lambda graph: sum_adjs(graph, K_sum, K_clusters, row=row),
        lambda graph: dsog(graph, K, K_clusters, method="SoG", row=row),
        lambda graph: dsog(graph, K, K_clusters, method="DSoG", row=row),
        lambda graph: mase(graph, K_ase, K_mase, K_clusters, row=row),
    )

    for trial in range(n_trials):
        for d, rho in enumerate(densities):
            print("Trial: %s,  Density: %.3f;    " %(trial+1, rho), end="")
            scbm = ScBM_generator(row_label, column_label, link_matrices, density=rho)
            for m, estimate in enumerate(estimators):
                mis_errors[m, trial, d] = min_mis_error(reference, estimate(scbm), K_clusters)

    return mis_errors

## Experiment 1


In [None]:
# full rank: both layer types share the bases U and V and differ only in the
# sign of the last diagonal entry of Lambda
U = np.array([
    [1/2, 1/2, -np.sqrt(2)/2],
    [1/2, 1/2, np.sqrt(2)/2],
    [np.sqrt(2)/2, -np.sqrt(2)/2, 0],
])
V = np.array([
    [np.sqrt(2)/2, -np.sqrt(2)/2, 0],
    [1/2, 1/2, -np.sqrt(2)/2],
    [1/2, 1/2, np.sqrt(2)/2],
])
Lambda1 = np.diag([1.5, 0.2, 0.4])
Lambda2 = np.diag([1.5, 0.2, -0.4])
B_1, B_2 = U @ Lambda1 @ V.T, U @ Lambda2 @ V.T

layer = 50
n_nodes = 500
size_communities = [[200, 100, 200], [150, 200, 150]]
num_communities = [3, 3]
K_sum = 2
K = K_ase = K_mase = 3
K_clusters = 3

# Drawing all labels without replacement yields a random arrangement of the
# community labels with the sizes given above.
np.random.seed(2022)
true_row_label = np.random.choice(
    np.repeat(range(num_communities[0]), size_communities[0]),
    replace=False, size=np.sum(size_communities[0]))
true_column_label = np.random.choice(
    np.repeat(range(num_communities[1]), size_communities[1]),
    replace=False, size=np.sum(size_communities[1]))
# First half of the layers uses B_1, second half uses B_2.
link_matrices = np.concatenate((
    np.tile(B_1, (layer // 2, 1, 1)),
    np.tile(B_2, (layer // 2, 1, 1)),
))

densities = np.arange(0.02, 0.165, 0.005)
# row community reconstruction
mis_errors_r = repeat(true_row_label, true_column_label, link_matrices,
                      K, K_sum, K_ase, K_mase, K_clusters, densities,
                      n_trials=50)
# column community reconstruction
mis_errors_c = repeat(true_row_label, true_column_label, link_matrices,
                      K, K_sum, K_ase, K_mase, K_clusters, densities,
                      n_trials=50, row=False)

In [None]:
sns.set_theme(style="ticks")
font = {"family": "Times New Roman", "weight": "normal", "size": 13}
color_list = ["#534439", "#9A5F30", "#62B17C", "#BAA89B"]

def plot_mis(mis_errors, methods_list=["Sum", "SoG", "DSoG", "MASE"]):
    """Plot the trial-averaged misclassification rate of each method.

    mis_errors is a (n_methodes, n_trials, n_densities) ndarray. The rates are
    averaged over trials and rows 2 to 28 are drawn against the matching
    entries of the global ``densities``, one line per name in methods_list.
    """
    mean_errors = mis_errors.mean(axis=1).T[2:29]
    curves = pd.DataFrame(data=mean_errors, index=densities[2:29], columns=methods_list)

    fig, ax = plt.subplots(figsize=(6, 3.6))
    sns.lineplot(data=curves, markers=["o", "X", "p", "^"], dashes=False,
                 palette=color_list, linewidth=2, ax=ax)
    ax.set_xlabel(r"Overall edge density $\rho$ ", fontdict=font)
    ax.set_ylabel("Misclassification rate", fontdict=font)
    ax.legend(ncol=2, loc=1, prop=font)
    fig.tight_layout()
    plt.show()

# plot: row communities first, then column communities
plot_mis(mis_errors_r)
plot_mis(mis_errors_c)

## Experiment 2


In [None]:
# low rank: the last diagonal entry of both Lambda matrices is 0
U = np.array([
    [0.5, 0.8446232, -0.19134172],
    [0.5, -0.46193977, -0.73253782],
    [0.70710678, -0.27059805, 0.65328148],
])

Lambda1 = np.diag([1.2, 0.4, 0])
Lambda2 = np.diag([1.2, -0.4, 0])
B_1 = U @ Lambda1 @ U.T
B_2 = U @ Lambda2 @ U.T

layer = 50
n_nodes = 500
size_communities = [[100, 150, 250], [100, 250, 150]]
num_communities = [3, 3]
K_sum = 1
K = K_ase = K_mase = 2
K_clusters = 3


# Drawing all labels without replacement yields a random arrangement of the
# community labels with the sizes given above.
np.random.seed(2022)
true_row_label = np.random.choice(
    np.repeat(range(num_communities[0]), size_communities[0]),
    replace=False, size=np.sum(size_communities[0]))
true_column_label = np.random.choice(
    np.repeat(range(num_communities[1]), size_communities[1]),
    replace=False, size=np.sum(size_communities[1]))
# First half of the layers uses B_1, second half uses B_2.
link_matrices = np.concatenate((
    np.tile(B_1, (layer // 2, 1, 1)),
    np.tile(B_2, (layer // 2, 1, 1)),
))

densities = np.arange(0.02, 0.165, 0.005)
# row community reconstruction
mis_errors_r = repeat(true_row_label, true_column_label, link_matrices,
                      K, K_sum, K_ase, K_mase, K_clusters, densities,
                      n_trials=50)
# column community reconstruction
mis_errors_c = repeat(true_row_label, true_column_label, link_matrices,
                      K, K_sum, K_ase, K_mase, K_clusters, densities,
                      n_trials=50, row=False)

In [None]:
sns.set_theme(style="ticks")
font = {"family": "Times New Roman", "weight": "normal", "size": 13}
color_list = ["#534439", "#9A5F30", "#62B17C", "#BAA89B"]

def plot_mis(mis_errors, methods_list=["Sum", "SoG", "DSoG", "MASE"]):
    """Plot the trial-averaged misclassification rate of each method.

    mis_errors is a (n_methodes, n_trials, n_densities) ndarray. The rates are
    averaged over trials and rows 2 to 28 are drawn against the matching
    entries of the global ``densities``, one line per name in methods_list.
    """
    mean_errors = mis_errors.mean(axis=1).T[2:29]
    curves = pd.DataFrame(data=mean_errors, index=densities[2:29], columns=methods_list)

    fig, ax = plt.subplots(figsize=(6, 3.6))
    sns.lineplot(data=curves, markers=["o", "X", "p", "^"], dashes=False,
                 palette=color_list, linewidth=2, ax=ax)
    ax.set_xlabel(r"Overall edge density $\rho$ ", fontdict=font)
    ax.set_ylabel("Misclassification rate", fontdict=font)
    ax.legend(ncol=2, loc=1, prop=font)
    fig.tight_layout()
    plt.show()

# plot: row communities first, then column communities
plot_mis(mis_errors_r)
plot_mis(mis_errors_c)

## Experiment 3


In [None]:
def mase(scbm, K_ase1, K_ase2, K_mase, K_clusters, row=True, n_first_layers=10):
    """
    Multiple adjacency spectral embedding. see Arroyo et.al. JMLR 2021
    scbm : a multi-layer directed network, L*n*n
    K_ase1, K_ase2 : embedding dimension of single-layer, used for the first
        n_first_layers layers and for the remaining layers respectively
    K_mase : embedding dimension of concatenated eigenvectors
    K_clusters : the number of clusters
    row : True embeds with left singular vectors, False with right ones
    n_first_layers : number of leading layers embedded with K_ase1
        (default 10, the value that used to be hard-coded)
    Returns the k-means label of every node.
    """
    try:
        # Batched SVD: one decomposition per layer.
        u, _, v = np.linalg.svd(scbm)
        if row:
            vs1 = u[0:n_first_layers, :, 0:K_ase1]
            vs2 = u[n_first_layers:, :, 0:K_ase2]
        else:
            vs1 = v[0:n_first_layers, 0:K_ase1].transpose((0,2,1))
            vs2 = v[n_first_layers:, 0:K_ase2].transpose((0,2,1))
        # Lay all per-layer embeddings side by side (n rows) and decompose again.
        stacked = np.concatenate((vs1.transpose(0,2,1).reshape(-1,vs1.shape[1]).T,
                                  vs2.transpose(0,2,1).reshape(-1,vs2.shape[1]).T), axis=1)
        u_e, _, _ = np.linalg.svd(stacked) #embedding
        vs = u_e[:, 0:K_mase] # K_mase leading eigenvectors
    except np.linalg.LinAlgError:
        # The SVD did not converge: fall back to a random embedding made of
        # K_clusters distinct points, so k-means can still form the requested
        # number of clusters (this used to be fixed at 3).
        unique_rows = np.random.rand(K_clusters, K_mase)
        vs = unique_rows[np.random.choice(K_clusters, scbm.shape[1])]

    # k-means
    k_means = KMeans(init="k-means++", n_clusters=K_clusters, n_init=scbm.shape[1])
    k_means.fit(vs)
    return k_means.labels_


def repeat(row_label, column_label, link_matrices, K, K_sum, K_ase1, K_ase2, K_mase, K_clusters, densities, n_trials=100, row=True):
    """Run the Monte-Carlo comparison of the four co-clustering methods.

    For every trial and every value in ``densities`` a fresh network is drawn
    with ``ScBM_generator`` and clustered by Sum, SoG, DSoG and MASE (in that
    order; MASE receives the two single-layer dimensions K_ase1 and K_ase2).
    Each result is scored with ``min_mis_error`` against the row labels
    (``row=True``) or the column labels (``row=False``).

    Returns an array of shape (4, n_trials, n_densities) of misclassification
    rates, with methods ordered ["Sum", "SoG", "DSoG", "MASE"].
    """
    densities = np.atleast_1d(densities)
    mis_errors = np.zeros((4, n_trials, densities.shape[0]))
    reference = row_label if row else column_label

    # One callable per method; the position in the tuple is the first index
    # of `mis_errors`.
    estimators = (
        lambda graph: sum_adjs(graph, K_sum, K_clusters, row=row),
        lambda graph: dsog(graph, K, K_clusters, method="SoG", row=row),
        lambda graph: dsog(graph, K, K_clusters, method="DSoG", row=row),
        lambda graph: mase(graph,  K_ase1, K_ase2, K_mase, K_clusters, row=row),
    )

    for trial in range(n_trials):
        for d, rho in enumerate(densities):
            print("Trial: %s,  Density: %.3f;    " %(trial+1, rho), end="")
            scbm = ScBM_generator(row_label, column_label, link_matrices, density=rho)
            for m, estimate in enumerate(estimators):
                mis_errors[m, trial, d] = min_mis_error(reference, estimate(scbm), K_clusters)

    return mis_errors


In [None]:
layer = 30
n_nodes = 300
size_communities = [[120, 100, 80], [80, 100, 120]]
num_communities = [3, 3]

# Three block matrices: B_1 is diagonal, B_2 strictly upper triangular and
# B_3 strictly lower triangular.
B_1 = np.array([[0.3, 0, 0], [0, 0.2, 0], [0, 0, 0.3]])
B_2 = np.array([[0, 0.2, 0.2], [0, 0, 0.2], [0, 0, 0]])
B_3 = np.array([[0, 0, 0], [0.3, 0, 0], [0.5, 0.3, 0]])


# Drawing all labels without replacement yields a random arrangement of the
# community labels with the sizes given above.
np.random.seed(2022)
true_row_label = np.random.choice(np.repeat(range(num_communities[0]), size_communities[0]), 
                                  replace=False, size=np.sum(size_communities[0]))
true_column_label = np.random.choice(np.repeat(range(num_communities[1]), size_communities[1]), 
                                     replace=False, size=np.sum(size_communities[1]))
# One third of the layers per block matrix, in the order B_1, B_2, B_3.
# The last group must be built from B_3; a second copy of B_2 here would
# leave B_3 unused.
link_matrices = np.concatenate((np.tile(B_1, (int(layer/3), 1, 1)), np.tile(B_2, (int(layer/3), 1, 1)), np.tile(B_3, (int(layer/3), 1, 1))))


densities = np.arange(0.02, 0.165, 0.005)
K = K_sum = K_mase = 3
K_ase1 = 3 # B_1
K_ase2 = 2 # B_2 B_3
K_clusters = 3

# row community reconstruction
np.random.seed(2030)
mis_errors_r = repeat(true_row_label, true_column_label, link_matrices, K, K_sum, K_ase1, K_ase2, K_mase, K_clusters, densities, n_trials=50)
# column community reconstruction (reseeded, so it starts from the same
# random state as the row run)
np.random.seed(2030)
mis_errors_c = repeat(true_row_label, true_column_label, link_matrices, K, K_sum, K_ase1, K_ase2, K_mase, K_clusters, densities, n_trials=50, row=False)


In [None]:
sns.set_theme(style="ticks")
font = {"family": "Times New Roman", "weight": "normal", "size": 13}
color_list = ["#534439", "#9A5F30", "#62B17C", "#BAA89B"]

def plot_mis(mis_errors, methods_list=["Sum", "SoG", "DSoG", "MASE"]):
    """Plot the trial-averaged misclassification rate of each method.

    mis_errors is a (n_methodes, n_trials, n_densities) ndarray. The rates are
    averaged over trials and rows 2 to 30 (as far as they exist) are drawn
    against the matching entries of the global ``densities``, one line per
    name in methods_list.
    """
    # from 0.03 to 0.17
    mean_errors = mis_errors.mean(axis=1).T[2:31]
    curves = pd.DataFrame(data=mean_errors, index=densities[2:31], columns=methods_list)

    fig, ax = plt.subplots(figsize=(6, 3.6))
    sns.lineplot(data=curves, markers=["o", "X", "p", "^"], dashes=False,
                 palette=color_list, linewidth=2, ax=ax)
    ax.set_xlabel(r"Overall edge density $\rho$ ", fontdict=font)
    ax.set_ylabel("Misclassification rate", fontdict=font)
    ax.legend(ncol=2, loc=1, prop=font)
    fig.tight_layout()
    plt.show()


# plot: row communities first, then column communities
plot_mis(mis_errors_r)
plot_mis(mis_errors_c)