In [None]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as spi
import sklearn.preprocessing as skp
import sklearn.neighbors as skn
import sklearn.metrics as skmt
import scipy.sparse as spr
# Random generator seeded with 0; add_noise draws its synthetic rows from it.
rng = np.random.default_rng(0)
# Shared standardizer instance; it is refit through fit_transform each time it is used.
scaler = skp.StandardScaler()

In [None]:
# Base name of the dataset: selects keel_datasets/<dataset>.dat as input and
# <dataset>.csv as the results file.
dataset = "newthyroid"

In [None]:
# Start a fresh results file containing only the header row.
# The context manager closes the handle even if the write fails, and
# newline="" lets the csv module control line endings itself.
with open(dataset + ".csv", "w", newline = "") as file:
    csvwriter = csv.writer(file)
    csvwriter.writerow(["k*", "ari", "nnmi"])

In [None]:
# Read the dataset as a table of comma-separated strings; "@" is passed as the
# comment marker so lines beginning with it are not parsed as data.
temp_data = np.loadtxt("keel_datasets/" + dataset + ".dat", dtype = str, delimiter = ",", comments = "@")
# Every column except the last is converted to float and used as the feature matrix.
data = np.array(temp_data[:, :-1], dtype = float)
# The last column is encoded to integer class ids and used as ground truth for scoring.
label = skp.LabelEncoder().fit_transform(temp_data[:, -1])

In [None]:
# Number of samples and number of features in the loaded dataset.
n, d = data.shape
# Noise ratio: add_noise appends int(p * rows) synthetic rows.
p = 0.1

# Indices 0..n-1 of the original samples; the experiment loop uses them to keep
# only the labels predicted for real (non-noise) rows.
t = np.arange(n)

In [None]:
def add_noise(X):
    """Append uniform noise rows to X and standardize the combined matrix.

    int(p * rows) extra rows are drawn from the module-level ``rng``; each
    feature of a noise row lies between that feature's minimum and maximum
    in X. The stacked matrix (original rows first, noise rows last) is then
    passed through the module-level ``scaler``.

    Parameters:
        X: 2-D array of samples (rows) by features (columns).

    Returns:
        The standardized matrix with rows + int(p * rows) rows.
    """
    n_rows, n_cols = X.shape
    n_noise = int(p * n_rows)
    unit_draws = rng.uniform(0, 1, (n_noise, n_cols))
    # Stretch the unit-interval draws onto each feature's observed range.
    noise_rows = X.min(0) + (X.max(0) - X.min(0)) * unit_draws
    stacked = np.concatenate((X, noise_rows))
    return scaler.fit_transform(stacked)

In [None]:
def calculate_E(X):
    """Build the dense 10-nearest-neighbour connectivity matrix of X.

    Neighbours are found with the cosine metric. The sparse graph returned by
    scikit-learn is converted to a dense array because the later steps index
    and modify it element-wise.

    Parameters:
        X: 2-D array of samples (rows) by features (columns).

    Returns:
        Dense square array with one row/column per sample of X.
    """
    E = skn.kneighbors_graph(X, n_neighbors = 10, metric = "cosine")
    return E.toarray()

In [None]:
def calculate_I(E):
    """Symmetrize E in place and list its mutual edges.

    Every off-diagonal pair (i, j) is replaced by min(E[i, j], E[j, i]), so an
    edge survives only if it is present in both directions. The update is
    written back into E itself because callers keep using E afterwards.

    Parameters:
        E: square 2-D numpy array; modified in place.

    Returns:
        Integer array of shape (s, 2) holding the pairs (i, j) with i < j whose
        symmetrized entry equals 1, in row-major order. When there is no such
        pair the shape is (0, 2), so column indexing still works.
    """
    # Build the symmetrized matrix first, then copy it back, so the write
    # never reads from memory it has already overwritten.
    mutual = np.minimum(E, E.T)
    E[...] = mutual

    # Strict upper triangle: each undirected edge is reported once, no self-pairs.
    I = np.argwhere(np.triu(mutual, k = 1) == 1)

    return I

In [None]:
def calculate_W(E, I):
    """Compute one weight per edge from the node degrees of E.

    For edge (a, b) the weight is
        sum(deg) / (n * sqrt(deg[a] * deg[b]))
    where deg is the column sum of E and n is the number of nodes.

    Parameters:
        E: square adjacency matrix.
        I: array of edges, one (a, b) index pair per row.

    Returns:
        1-D float array with one weight per row of I.
    """
    node_count = E.shape[0]
    degree = E.sum(0)

    weights = np.zeros(I.shape[0])
    for idx, (a, b) in enumerate(I):
        weights[idx] = 1 / (node_count * np.sqrt(degree[a] * degree[b]))

    # Common rescaling by the total degree, applied once to all edges.
    return weights * degree.sum()

In [None]:
def calculate_cost(X, I, U, gamma, mu, weights = None):
    """Evaluate the clustering objective for the representatives U.

    The cost is
        0.5 * sum_i ||X[i] - U[i]||^2
        + (gamma / 2) * sum_l weights[l] * mu * t_l / (mu + t_l)
    where t_l is the squared distance between the two representatives joined
    by edge l.

    Parameters:
        X: 2-D array of samples.
        I: array of edges, one (a, b) index pair per row (may be empty).
        U: 2-D array of representatives, same shape as X.
        gamma: weight of the pairwise term.
        mu: scale parameter of the pairwise penalty.
        weights: per-edge weights. When omitted, the notebook-level global
            ``W`` is used, which is what the original implementation always
            did implicitly.

    Returns:
        The scalar cost.
    """
    if weights is None:
        # Backward-compatible fallback to the global set by the experiment loop.
        weights = W
    weights = np.asarray(weights, dtype = float)

    # Normalise to an (s, 2) integer table so an empty edge list is handled too.
    edges = np.asarray(I, dtype = int).reshape(-1, 2)

    fit_term = 0.5 * ((X - U) ** 2).sum()

    gap = U[edges[:, 0]] - U[edges[:, 1]]
    sq_dist = (gap * gap).sum(axis = 1)
    pair_term = (weights * ((mu * sq_dist) / (mu + sq_dist))).sum()

    return fit_term + (gamma / 2) * pair_term

In [None]:
def calculate_L(I, U, mu):
    """Compute the per-edge value (mu / (mu + t))**2.

    Here t is the squared Euclidean distance between the two rows of U that
    the edge connects.

    Parameters:
        I: array of edges, one (a, b) index pair per row.
        U: 2-D array of representatives.
        mu: scale parameter.

    Returns:
        1-D float array with one value per row of I.
    """
    values = np.zeros(I.shape[0])
    for idx, (a, b) in enumerate(I):
        gap = U[a] - U[b]
        sq_dist = (gap * gap).sum()
        values[idx] = (mu / (mu + sq_dist)) ** 2
    return values

In [None]:
def calculate_A(n, I, L, weights = None):
    """Assemble the weighted graph matrix sum_l w_l * L_l * (e_a - e_b)(e_a - e_b)^T.

    Each edge (a, b) adds its coefficient to the diagonal entries (a, a) and
    (b, b) and subtracts it from (a, b) and (b, a). Only those four entries
    are touched per edge, instead of forming a full n-by-n outer product.

    Parameters:
        n: number of nodes; the result is n-by-n.
        I: array of edges, one (a, b) index pair per row (may be empty).
        L: per-edge multipliers, one per row of I.
        weights: per-edge weights. When omitted, the notebook-level global
            ``W`` is used, which is what the original implementation always
            did implicitly.

    Returns:
        Dense n-by-n float array.
    """
    if weights is None:
        # Backward-compatible fallback to the global set by the experiment loop.
        weights = W

    # Normalise to an (s, 2) integer table so an empty edge list is handled too.
    edges = np.asarray(I, dtype = int).reshape(-1, 2)
    first, second = edges[:, 0], edges[:, 1]
    coeff = np.asarray(weights, dtype = float) * np.asarray(L, dtype = float)

    A = np.zeros([n, n])
    # np.add.at accumulates correctly when a node index repeats across edges,
    # which plain fancy-index "+=" would not.
    np.add.at(A, (first, first), coeff)
    np.add.at(A, (second, second), coeff)
    np.add.at(A, (first, second), -coeff)
    np.add.at(A, (second, first), -coeff)
    return A

In [None]:
def calculate_U(X, A, gamma):
    """Solve (Identity + gamma * A) U = X for the representatives U.

    The linear system is solved directly rather than by forming the explicit
    inverse, which is cheaper and numerically more accurate.

    Parameters:
        X: 2-D array of samples, one row per node.
        A: square matrix with one row/column per row of X.
        gamma: weight applied to A.

    Returns:
        2-D array U with the same shape as X.

    Raises:
        numpy.linalg.LinAlgError: if the system matrix is singular.
    """
    M = np.identity(X.shape[0]) + gamma * A
    return np.linalg.solve(M, X)

In [None]:
# Experiment settings.
N_RUNS = 15                 # independent noisy repetitions
MAX_ITER = 100              # hard cap on optimisation iterations per run
COST_TOL = 0.1              # cost change below this counts as converged
SHORT_EDGE_FRACTION = 0.02  # share of shortest graph edges averaged into delta

# Per-run results: number of clusters found, adjusted Rand index, adjusted mutual information.
k_star = []
ari = []
nnmi = []

for m in range(N_RUNS):

    # add_noise already returns standardized data, so no second scaling pass is needed.
    X = add_noise(data)

    E = calculate_E(X)
    I = calculate_I(E)       # also symmetrizes E in place
    W = calculate_W(E, I)    # must remain a global called W: calculate_cost / calculate_A read it
    n, d = X.shape
    s = I.shape[0]

    chi = np.sqrt((X * X).sum())

    # Distances restricted to graph edges (zero everywhere else).
    edge_dist = E * skmt.pairwise_distances(X)

    # delta = mean length of the shortest edges. Only real edges are considered:
    # averaging over the whole n-by-n matrix would pick up the zeros of the
    # non-edges and always give delta == 0.
    edge_len = np.sort(edge_dist[np.triu(E, 1) > 0])
    r = max(1, int(SHORT_EDGE_FRACTION * edge_len.shape[0]))
    delta = edge_len[:r].mean()

    U = np.copy(X)
    L = np.ones(s)
    mu = 3 * (np.max(edge_dist) ** 2)
    A = calculate_A(n, I, L)
    gamma = chi / np.sqrt((A * A).sum())
    cost = calculate_cost(X, I, U, gamma, mu)

    condition = True
    i = 0
    while condition:
        prev_cost = cost
        L = calculate_L(I, U, mu)
        A = calculate_A(n, I, L)
        U = calculate_U(X, A, gamma)
        i = i + 1
        if i % 4 == 0:
            # Every fourth iteration: shrink mu towards its floor and rebalance gamma.
            mu = np.max([mu / 2, delta / 2])
            gamma = chi / np.sqrt((A * A).sum())
        cost = calculate_cost(X, I, U, gamma, mu)

        # Keep iterating until the iteration cap is hit, or until the cost has
        # stopped changing AND mu has reached its floor. The previous test
        # ("change < tol or i < cap") kept looping once converged and could
        # therefore run forever.
        converged = np.absolute(cost - prev_cost) < COST_TOL
        annealed = mu <= delta / 2
        condition = (i < MAX_ITER) and not (converged and annealed)

    # Link representatives that ended up close together; clusters are the
    # connected components of that graph.
    # NOTE(review): squared distances are compared against the unsquared
    # delta here - confirm this is the intended threshold.
    W_U = skmt.pairwise_distances(U) ** 2
    adj = (W_U <= delta).astype(int)
    r, Z = spr.csgraph.connected_components(spr.csr_matrix(adj), directed = False)

    # Score only the original samples; the appended noise rows have no ground truth.
    Z = Z[t]

    k_star.append(np.unique(Z).shape[0])
    ari.append(skmt.adjusted_rand_score(label, Z))
    nnmi.append(skmt.adjusted_mutual_info_score(label, Z))

    print((m + 1), end = " ")

In [None]:
# Append one CSV row per completed run.
# Iterating over the result lists writes every run; the earlier
# "for m in range(m)" reused the leftover loop index and dropped the last one.
with open(dataset + ".csv", "a", newline = "") as file:
    csvwriter = csv.writer(file)
    csvwriter.writerows(zip(k_star, ari, nnmi))