In [None]:
import numpy as np
import pandas as pd
from RMKMC import *

Implement the clustering evaluation metrics 'Accuracy' and 'Purity'. NMI is already provided by scikit-learn.

In [None]:
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import normalized_mutual_info_score

try:
    # Legacy helper: deprecated in scikit-learn 0.21 and removed in 0.23.
    from sklearn.utils.linear_assignment_ import linear_assignment
except ImportError:
    linear_assignment = None  # superseded by scipy.optimize.linear_sum_assignment

# Accuracy (inspired by https://smorbieu.gitlab.io/accuracy-from-classification-to-clustering-evaluation/)
def accu(y_true, y_pred):
    """Clustering accuracy under the best one-to-one cluster/class matching.

    Cluster ids are arbitrary, so the predicted labels are first matched to
    the true labels with the assignment that maximises the number of agreeing
    samples; the accuracy is the share of samples on the diagonal of the
    re-ordered confusion matrix.

    Parameters
    ----------
    y_true : ground-truth labels, passed to ``confusion_matrix`` as-is.
    y_pred : predicted cluster labels, passed to ``confusion_matrix`` as-is.

    Returns
    -------
    The trace of the column-permuted confusion matrix divided by the total
    number of samples.

    Side effects
    ------------
    Prints the column-permuted confusion matrix.
    """
    cm = confusion_matrix(y_true, y_pred)

    # The solver minimises total cost, so convert counts to costs by
    # subtracting them from the largest count.
    cost = np.max(cm) - cm

    # scipy returns the row indices in sorted order, so `col_ind` is directly
    # the column order that moves every matched pair onto the diagonal.
    # (Replaces sklearn.utils.linear_assignment_, removed in scikit-learn 0.23.)
    _, col_ind = linear_sum_assignment(cost)

    cm2 = cm[:, col_ind]
    print(cm2)
    return np.trace(cm2) / np.sum(cm)

# Purity 
def purity(y_true, y_pred):
    """Cluster purity of ``y_pred`` with respect to ``y_true``.

    For every predicted cluster (a column of the confusion matrix) take the
    count of its most frequent true class, add these counts up and divide by
    the total number of samples.
    """
    counts = confusion_matrix(y_true, y_pred)
    # Column-wise maximum: the dominant true class inside each predicted cluster.
    dominant_per_cluster = counts.max(axis=0)
    return dominant_per_cluster.sum() / counts.sum()

# Utility function to output the measurement results
def my_output(data_name, gamma, y_true, y_pred):
    """Compute, print and return accuracy, purity and NMI for one clustering.

    Parameters
    ----------
    data_name : dataset name, used only in the printed messages.
    gamma     : gamma value of the run, used only in the printed messages.
    y_true    : ground-truth labels.
    y_pred    : predicted cluster labels.

    Returns
    -------
    tuple ``(accuracy, purity, nmi)``.
    """
    # (message prefix, scoring function) in reporting order.
    metrics = (
        ("Accuracy of RMKMC on", accu),
        ("Purity of RMKMC on", purity),
        ("NMI of RMKMC on", normalized_mutual_info_score),
    )

    results = []
    for prefix, score_fn in metrics:
        value = score_fn(y_true, y_pred)
        print(prefix, data_name, "with gamma=", gamma, "is", value)
        results.append(value)
    return tuple(results)

## Handwritten Numerals

Read raw datasets. The 6 views are already separated. There are exactly 200 samples in each class.

In [None]:
# One whitespace-separated feature file per view, without a header row.
# The separator ' +' is a regular expression, which only pandas' Python parser
# supports; naming the engine explicitly avoids the ParserWarning that pandas
# emits when it has to fall back from the default C parser.
fou, fac, kar, pix, zer, mor = (
    pd.read_csv(path, sep=' +', header=None, engine='python')
    for path in ('mfeat-fou', 'mfeat-fac', 'mfeat-kar',
                 'mfeat-pix', 'mfeat-zer', 'mfeat-mor')
)

Create the true labels.

In [None]:
# Ground truth: classes 0..9 in file order, 200 consecutive samples per class.
labels = [digit for digit in range(10) for _ in range(200)]

# One raw feature matrix per view, in a fixed order.
Xs = [view.values for view in (fou, fac, kar, pix, zer, mor)]

Run the algorithm using random initialization.

In [None]:
# Settings handed to RMKMC for this run.
k, gamma, n_iterations = 10, 3, 300

# No `initialization` argument here, i.e. RMKMC's default is used.
ind_matrix, Fs, aa = RMKMC(Xs, k, gamma, n_iter=n_iterations)
print("weight distribution is", aa)

# NOTE(review): deonehot presumably turns the indicator matrix into flat
# cluster labels -- confirm against the RMKMC module.
my_labels = deonehot(ind_matrix)

Report performance.

In [None]:
# Score the labels produced by the run above against the ground truth.
my_output(data_name="digits", gamma=gamma, y_true=labels, y_pred=my_labels)

Run the algorithm using k-means++ initialization.

In [None]:
# Same settings as the previous run; only the initialization differs.
k, gamma, n_iterations = 10, 3, 300

# initialization='pp' is the option this notebook uses for the k-means++ run.
ind_matrix, Fs, aa = RMKMC(Xs, k, gamma, n_iter=n_iterations, initialization='pp')
print("weight distribution is", aa)

# NOTE(review): deonehot presumably turns the indicator matrix into flat
# cluster labels -- confirm against the RMKMC module.
my_labels = deonehot(ind_matrix)

Report performance.

In [None]:
# Score the labels produced by the run above against the ground truth.
my_output(data_name="digits", gamma=gamma, y_true=labels, y_pred=my_labels)