In [1]:
import numpy as np
from scipy.linalg import eigh as largest_eigh
import scipy.io as scio
from scipy import sparse
from sklearn import cluster
from sklearn.preprocessing import normalize
from sklearn.neighbors import kneighbors_graph


In [2]:
pip install munkres

Collecting munkres
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres
Successfully installed munkres-1.1.4


In [3]:
%%writefile metrics.py



from munkres import Munkres
import numpy as np
from sklearn import metrics


def cal_clustering_acc(true_label, pred_label):
    # code from https://github.com/hyzhang98/AdaGAE
    l1 = list(set(true_label))
    numclass1 = len(l1)

    l2 = list(set(pred_label))
    numclass2 = len(l2)
    if numclass1 != numclass2:
        print('Class Not equal, Error!!!!')
        return 0

    cost = np.zeros((numclass1, numclass2), dtype=int)
    for i, c1 in enumerate(l1):
        mps = [i1 for i1, e1 in enumerate(true_label) if e1 == c1]
        for j, c2 in enumerate(l2):
            mps_d = [i1 for i1 in mps if pred_label[i1] == c2]

            cost[i][j] = len(mps_d)

    # match two clustering results by Munkres algorithm
    m = Munkres()
    cost = cost.__neg__().tolist()

    indexes = m.compute(cost)

    # get the match results
    new_predict = np.zeros(len(pred_label))
    for i, c in enumerate(l1):
        # correponding label in l2:
        c2 = l2[indexes[i][1]]

        # ai is the index with label==c2 in the pred_label list
        ai = [ind for ind, elm in enumerate(pred_label) if elm == c2]
        new_predict[ai] = c

    acc = metrics.accuracy_score(true_label, new_predict)

    return acc

Writing metrics.py


In [4]:
%%writefile metrics_extended.py
import numpy as np
from sklearn import metrics
from munkres import Munkres

# -------------------------------
# 1. Clustering Accuracy (same as before)
# -------------------------------
def cal_clustering_acc(true_label, pred_label):
    l1 = list(set(true_label))
    numclass1 = len(l1)
    l2 = list(set(pred_label))
    numclass2 = len(l2)
    if numclass1 != numclass2:
        print('⚠️ Class numbers differ, Hungarian matching may be partial.')
    cost = np.zeros((numclass1, numclass2), dtype=int)
    for i, c1 in enumerate(l1):
        mps = [i1 for i1, e1 in enumerate(true_label) if e1 == c1]
        for j, c2 in enumerate(l2):
            mps_d = [i1 for i1 in mps if pred_label[i1] == c2]
            cost[i][j] = len(mps_d)
    m = Munkres()
    cost = (-cost).tolist()
    indexes = m.compute(cost)
    new_predict = np.zeros(len(pred_label))
    for i, c in enumerate(l1):
        c2 = l2[indexes[i][1]]
        ai = [ind for ind, elm in enumerate(pred_label) if elm == c2]
        new_predict[ai] = c
    acc = metrics.accuracy_score(true_label, new_predict)
    return acc

# -------------------------------
# 2. Normalized Mutual Information (NMI)
# -------------------------------
def cal_nmi(true_label, pred_label):
    return metrics.normalized_mutual_info_score(true_label, pred_label, average_method='arithmetic')

# -------------------------------
# 3. Adjusted Rand Index (ARI)
# -------------------------------
def cal_ari(true_label, pred_label):
    return metrics.adjusted_rand_score(true_label, pred_label)

# -------------------------------
# 4. Purity
# -------------------------------
def cal_purity(true_label, pred_label):
    # contingency matrix (rows=true, cols=pred)
    contingency_matrix = metrics.cluster.contingency_matrix(true_label, pred_label)
    # Purity = sum of max value in each predicted cluster / total samples
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)

# -------------------------------
# 5. Precision, Recall, F1-score
# -------------------------------
def cal_prf(true_label, pred_label):
    # You need to first align predicted labels using accuracy matching
    acc = cal_clustering_acc(true_label, pred_label)
    # For precision/recall/F1, we can compute macro-averaged
    precision = metrics.precision_score(true_label, pred_label, average='macro', zero_division=0)
    recall = metrics.recall_score(true_label, pred_label, average='macro', zero_division=0)
    f1 = metrics.f1_score(true_label, pred_label, average='macro', zero_division=0)
    return precision, recall, f1

# -------------------------------
# 6. All metrics together (main utility)
# -------------------------------
def evaluate_clustering(true_label, pred_label):
    acc = cal_clustering_acc(true_label, pred_label)
    nmi = cal_nmi(true_label, pred_label)
    ari = cal_ari(true_label, pred_label)
    purity = cal_purity(true_label, pred_label)
    precision, recall, f1 = cal_prf(true_label, pred_label)

    results = {
        'ACC': acc,
        'NMI': nmi,
        'ARI': ari,
        'Purity': purity,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    }
    return results

if __name__ == "__main__":
    # Example test
    true = np.array([1,1,2,2,3,3])
    pred = np.array([2,2,1,1,3,3])  # permuted clusters
    res = evaluate_clustering(true, pred)
    for k,v in res.items():
        print(f"{k}: {v:.4f}")


Writing metrics_extended.py


In [5]:
%%writefile cgl.py

import numpy as np
from metrics import cal_clustering_acc
from metrics_extended import evaluate_clustering
from munkres import Munkres
from scipy.linalg import eigh as largest_eigh
import scipy.io as scio
from scipy import sparse
from sklearn import cluster

from sklearn.preprocessing import normalize
from sklearn.neighbors import kneighbors_graph

def prox_weight_tensor_nuclear_norm(Y, C):
    # calculate the weighted tensor nuclear norm
    # min_X ||X||_w* + 0.5||X - Y||_F^2
    n1, n2, n3 = np.shape(Y)
    X = np.zeros((n1, n2, n3), dtype=complex)
    # Y = np.fft.fft(Y, n3)
    Y = np.fft.fftn(Y)
    # Y = np.fft.fftn(Y, s=[n1, n2, n3])
    eps = 1e-6
    for i in range(n3):
        U, S, V = np.linalg.svd(Y[:, :, i], full_matrices=False)
        temp = np.power(S - eps, 2) - 4 * (C - eps * S)
        ind = np.where(temp > 0)
        ind = np.array(ind)
        r = np.max(ind.shape)
        if np.min(ind.shape) == 0:
            r = 0
        if r >= 1:
            temp2 = S[ind] - eps + np.sqrt(temp[ind])
            S = temp2.reshape(temp2.size, )
            X[:, :, i] = np.dot(np.dot(U[:, 0:r], np.diag(S)), V[:, 0:r].T)
    newX = np.fft.ifftn(X)
    # newX = np.fft.ifftn(X, s=[n1, n2, n3])
    # newX = np.fft.ifft(X, n3)

    return np.real(newX)


def cal_knn_graph(distance, neighbor_num):
    # construct a knn graph
    neighbors_graph = kneighbors_graph(
        distance, neighbor_num, mode='connectivity', include_self=False)
    W = 0.5 * (neighbors_graph + neighbors_graph.T)
    return W



def consensus_graph_learning(A, cluster_num, lambda_1, rho, iteration_num):
    # optimize the consensus graph learning problem
    # min_H, Z 0.5||A - H'H||_F^2 + 0.5||Z - hatHhatH'||_F^2 + ||Z||_w*
    # s.t. H'H = I_k
    sample_num, sample_num, view_num = np.shape(A)
    # initial variables
    H = np.zeros((sample_num, cluster_num, view_num))
    HH = np.zeros((sample_num, sample_num, view_num))
    hatH = np.zeros((sample_num, cluster_num, view_num))
    hatHH = np.zeros((sample_num, sample_num, view_num))
    Q = np.zeros((sample_num, sample_num, view_num))
    Z = np.zeros((sample_num, sample_num, view_num))
    obj = np.zeros((iteration_num, 1))
    # loop
    for iter in range(iteration_num):
        # update H
        temp = np.zeros((sample_num, sample_num, view_num))
        G = np.zeros((sample_num, sample_num, view_num))
        for view in range(view_num):
            temp[:, :, view] = np.dot(
                np.dot(Q[:, :, view], 0.5 * (Z[:, :, view] + Z[:, :, view].T) - 0.5 * hatHH[:, :, view])
                , Q[:, :, view]
            )
            G[:, :, view] = lambda_1 * A[:, :, view] + temp[:, :, view]
            _, H[:, :, view] = largest_eigh(
                G[:, :, view], subset_by_index=[sample_num - cluster_num, sample_num - 1]
            )
            HH[:, :, view] = np.dot(H[:, :, view], H[:, :, view].T)
            Q[:, :, view] = np.diag(1 / np.sqrt(np.diag(HH[:, :, view])))
            hatH[:, :, view] = np.dot(Q[:, :, view], H[:, :, view])
            hatHH[:, :, view] = np.dot(hatH[:, :, view], hatH[:, :, view].T)
        # update Z
        hatHH2 = hatHH.transpose((0, 2, 1))
        Z2 = prox_weight_tensor_nuclear_norm(hatHH2, rho)
        Z = Z2.transpose((0, 2, 1))
        # update obj
        f = np.zeros((view_num, 1))
        for view in range(view_num):
            f[view] = 0.5 * lambda_1 * np.linalg.norm(A[:, :, view] - HH[:, :, view], ord='fro') + \
                      0.5 * np.linalg.norm(Z[:, :, view] - hatHH[:, :, view], ord='fro')
        obj[iter] = np.sum(f)

    # construct knn graph
    distance = np.zeros((sample_num, sample_num))
    for view in range(view_num):
        distance += hatHH[:, :, view]
    W = cal_knn_graph(1 - distance, 15)
    # perform spectral clustering
    laplacian = sparse.csgraph.laplacian(W, normed=True)
    _, vec = sparse.linalg.eigsh(sparse.identity(
        laplacian.shape[0]) - laplacian, cluster_num, sigma=None, which='LA')
    embedding = normalize(vec)
    est = cluster.KMeans(n_clusters=cluster_num, n_init="auto").fit(embedding)
    # reture results
    return W, est.labels_


if __name__ == '__main__':
    precomputed = 1
    if precomputed != 1:
        # precomputed graph with the neighbor graph learning method
        data_matrix = scio.loadmat('MSRCV1_A.mat')
        ground_truth = data_matrix['Y']
        A = data_matrix['A']
    else:
        # knn graph via sklearn.neighbors.kneighbors_graph
        data_matrix = scio.loadmat('MSRCV1_X.mat')
        ground_truth = data_matrix['Y']
        features = data_matrix['X']
        # construct view_specific knn graph
        view_num = features.shape[1]
        sample_num, _ = features[0][0].shape
        A = np.zeros((sample_num, sample_num, view_num))
        for view in range(view_num):
            knn_graph = cal_knn_graph(features[0][view], neighbor_num=15)
            S = sparse.identity(knn_graph.shape[0]) - sparse.csgraph.laplacian(knn_graph, normed=True).toarray()
            A[:, :, view] = S

    ground_truth = ground_truth.reshape(np.max(ground_truth.shape), )
    cluster_num = 7
    parameter_lambda = [1, 5, 10, 50, 100, 500, 1000, 5000]
    parameter_rho = [1, 5, 10, 50, 100, 500, 1000, 5000]
    ACC = np.zeros((8, 8))
    for i in range(8):
        for j in range(8):
            W, predict_label = consensus_graph_learning(A, cluster_num, parameter_lambda[i], parameter_rho[j], 100)
            predict_label = predict_label + 1


            results = evaluate_clustering(ground_truth, predict_label)
            print(f"λ={parameter_lambda[i]}, ρ={parameter_rho[j]} -> "
                  f"ACC={results['ACC']:.4f}, NMI={results['NMI']:.4f}, "
                  f"Purity={results['Purity']:.4f}, ARI={results['ARI']:.4f}, "
                  f"F1={results['F1-score']:.4f}")
            ACC[i, j] = results['ACC']

    print('max clustering accuracy: {}'.format(ACC.max()))

Writing cgl.py


In [6]:
!python cgl.py

λ=1, ρ=1 -> ACC=0.8667, NMI=0.7609, Purity=0.8667, ARI=0.7301, F1=0.0182
λ=1, ρ=5 -> ACC=0.8667, NMI=0.7613, Purity=0.8667, ARI=0.7311, F1=0.0097
λ=1, ρ=10 -> ACC=0.8667, NMI=0.7585, Purity=0.8667, ARI=0.7299, F1=0.1475
λ=1, ρ=50 -> ACC=0.8667, NMI=0.7613, Purity=0.8667, ARI=0.7311, F1=0.0189
λ=1, ρ=100 -> ACC=0.8667, NMI=0.7630, Purity=0.8667, ARI=0.7302, F1=0.0091
λ=1, ρ=500 -> ACC=0.8667, NMI=0.7634, Purity=0.8667, ARI=0.7292, F1=0.0186
λ=1, ρ=1000 -> ACC=0.8667, NMI=0.7613, Purity=0.8667, ARI=0.7311, F1=0.2608
λ=1, ρ=5000 -> ACC=0.8667, NMI=0.7613, Purity=0.8667, ARI=0.7311, F1=0.3726
λ=5, ρ=1 -> ACC=0.8667, NMI=0.7613, Purity=0.8667, ARI=0.7311, F1=0.1597
λ=5, ρ=5 -> ACC=0.8667, NMI=0.7585, Purity=0.8667, ARI=0.7299, F1=0.0333
λ=5, ρ=10 -> ACC=0.8714, NMI=0.7650, Purity=0.8714, ARI=0.7372, F1=0.0300
λ=5, ρ=50 -> ACC=0.8667, NMI=0.7609, Purity=0.8667, ARI=0.7301, F1=0.0245
λ=5, ρ=100 -> ACC=0.8667, NMI=0.7634, Purity=0.8667, ARI=0.7292, F1=0.1295
λ=5, ρ=500 -> ACC=0.8667, NMI=0.760