In [None]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as spi
import sklearn.preprocessing as skp
import sklearn.neighbors as skn
import sklearn.metrics as skmt
import scipy.sparse as spr
# Seeded NumPy generator shared by the notebook: it draws the synthetic noise
# in add_noise and the index permutations in the optimisation loop.
rng = np.random.default_rng(0)
# One StandardScaler instance that is re-fitted every time fit_transform is
# called on it (inside add_noise and again in the experiment loop).
scaler = skp.StandardScaler()

In [None]:
# Base name of the dataset: used both for the input file
# ("keel_datasets/<dataset>.dat") and for the results file ("<dataset>.csv").
dataset = "newthyroid"

In [None]:
# Start a fresh results file (mode "w" truncates any previous content) and
# write the column header. The context manager closes the handle even if the
# write fails, and newline="" lets the csv module control line endings itself
# instead of having them translated a second time by the text layer.
with open(dataset + ".csv", "w", newline="") as file:
    csvwriter = csv.writer(file)
    csvwriter.writerow(["k*", "ari", "nnmi"])

In [None]:
# Constants of the moment-based update performed in the experiment loop.
# Step size multiplying the corrected first-moment estimate.
alpha = 0.5
# Decay rate of the running average of the gradient (M).
beta1 = 0.9
# Decay rate of the running average of the squared gradient magnitude (V).
beta2 = 0.999
# Small constant added under the square root in the update denominator.
epsilon = 10 ** (-8)

In [None]:
# Read the whole dataset file as text. Lines starting with "@" are skipped as
# comments and the fields are comma-separated.
temp_data = np.loadtxt(
    "keel_datasets/" + dataset + ".dat",
    dtype = str,
    delimiter = ",",
    comments = "@",
)
# Every column except the last holds a numeric feature; the last column holds
# the class name, which is encoded to integer labels.
feature_text, class_text = temp_data[:, :-1], temp_data[:, -1]
data = np.array(feature_text, dtype = float)
label = skp.LabelEncoder().fit_transform(class_text)

In [None]:
# Number of samples and features of the loaded dataset (before noise is added).
n, d = data.shape
# Fraction of extra noise rows appended by add_noise: int(p * n) rows.
p = 0.1
# Factor multiplying the graph weight in the pairwise term of calculate_grad.
gamma = 50000
# Squared-distance threshold: calculate_grad only applies the pairwise term
# to a pair whose rows of U are closer than this.
mu = 100
# Number of neighbours requested from kneighbors_graph in calculate_W.
k = 45
# Decay rate of the exp(-phi * distance) weighting in calculate_W.
phi = 0.01
# Number of blocks each index permutation is split into in the experiment loop.
l = n // 20
# Number of indices per block; also the divisor of the data-fit gradient term.
b = n // l
# Number of optimisation iterations per run.
N = 100
# Scales the mean squared pairwise distance of U to get the linkage threshold.
tol1 = 0.02
# Scales n / r (samples per component) to get the minimum component size;
# components of that size or smaller are relabelled -1.
tol2 = 0.5

# Row indices of the original (noise-free) samples, used to drop the labels of
# the appended noise rows before scoring.
t = np.arange(n)

In [None]:
def add_noise(X):
    """Append uniformly sampled noise rows to X and standardise the result.

    int(p * rows) new rows are drawn from the notebook-level ``rng``; each
    coordinate is uniform between the minimum and maximum of the matching
    column of X. The noise rows are stacked below X and the stacked array is
    passed through the notebook-level ``scaler`` (which is re-fitted on it).

    Parameters
    ----------
    X : 2-D array
        Input samples, one per row. It is not modified.

    Returns
    -------
    2-D array
        The output of ``scaler.fit_transform`` on X followed by the noise rows.
    """
    num_rows, num_cols = X.shape
    num_noise = int(p * num_rows)
    col_low = X.min(0)
    col_high = X.max(0)
    # Draw in the unit cube first, then stretch to the per-column value range.
    unit_draws = rng.uniform(0, 1, (num_noise, num_cols))
    noise_rows = col_low + (col_high - col_low) * unit_draws
    return scaler.fit_transform(np.concatenate((X, noise_rows)))

In [None]:
def calculate_W(X):
    """Build a symmetric affinity matrix from the k-nearest-neighbour graph of X.

    The neighbour graph (``k`` neighbours, distance-valued edges) is made
    symmetric by keeping an edge between i and j whenever either row lists the
    other as a neighbour. Every kept edge with a positive distance ``dist``
    receives the weight ``exp(-phi * dist)``; all other entries are 0.

    Symmetrising before applying the kernel guarantees that no entry of the
    result is left as a raw distance, which is what happened to edges that
    were only present below the diagonal when only the upper triangle was
    converted.

    Parameters
    ----------
    X : 2-D array
        Samples, one per row.

    Returns
    -------
    2-D array of shape (rows, rows)
        Symmetric matrix of affinities. ``k`` and ``phi`` are read from the
        notebook-level settings.
    """
    dist = skn.kneighbors_graph(X, n_neighbors = k, mode = "distance").toarray()

    # The k-NN relation is not symmetric: take the union of both directions so
    # that an edge found from either endpoint is kept.
    dist = np.maximum(dist, dist.T)

    # Only strictly positive distances are treated as edges, as before.
    linked = dist > 0
    W = np.zeros_like(dist)
    W[linked] = np.exp((-phi) * dist[linked])

    return W

In [None]:
def find_median(X, U, B):
    """Return the index of the block of B whose cost is the median block cost.

    The cost of a block is the sum, over the row indices it contains, of the
    squared distance between the matching rows of U and X. For an odd number
    of blocks the returned block has exactly the median cost; for an even
    number it is the block with the smallest cost that is not below the
    median. Ties are resolved in favour of the lowest block index.

    Selecting merely the first block whose cost reaches the median is not
    enough: with costs [9, 1, 4] that would pick the most expensive block.

    Parameters
    ----------
    X, U : 2-D arrays
        Arrays indexed by the row indices stored in B.
    B : 2-D integer array
        One block of row indices per row.

    Returns
    -------
    integer
        Position of the selected block along the first axis of B.
    """
    B = np.asarray(B)
    # Shape (blocks, block_size, features): only rows referenced by B are read.
    diff = U[B] - X[B]
    cost = (diff * diff).sum(axis=(1, 2))
    # Among the blocks at or above the median, take the cheapest one.
    candidates = np.where(cost >= np.median(cost))[0]
    lt = candidates[np.argmin(cost[candidates])]
    return lt

In [None]:
def calculate_grad(X, U, BI):
    """Assemble the gradient array used by the optimisation loop.

    For every row j of U the result accumulates, in this order:

    * ``(U[j] - X[j]) / b`` when j is one of the indices in BI;
    * ``gamma * W[i, j] * (U[j] - U[i])`` for each i with ``W[j, i] > 0``
      whose squared distance between U[j] and U[i] is below ``mu``.

    ``W``, ``b``, ``gamma`` and ``mu`` are read from the notebook-level state.

    Parameters
    ----------
    X, U : 2-D arrays
        Reference rows and current rows.
    BI : array of integers
        Row indices that receive the data-fit term.

    Returns
    -------
    2-D array with the shape of U.
    """
    num_rows, num_cols = U.shape
    grad = np.zeros([num_rows, num_cols])
    # Membership mask, equivalent to testing "j in BI" for each row index.
    in_block = np.isin(np.arange(num_rows), BI)

    for j in range(num_rows):
        if in_block[j]:
            grad[j] += (U[j] - X[j]) / b
        for i in np.flatnonzero(W[j] > 0):
            gap = U[j] - U[i]
            if (gap * gap).sum() >= mu:
                # Rows that are too far apart do not pull on each other.
                continue
            grad[j] += (gamma * W[i, j]) * gap
    return grad

In [None]:
# Per-run results: number of distinct labels assigned to the original samples,
# adjusted Rand score and adjusted mutual information score against `label`.
k_star = []
ari = []
nnmi = []

# 15 independent runs; each one draws fresh noise from the shared rng.
for m in range(15):
    
    X = add_noise(data)
    # NOTE(review): add_noise already returns scaler.fit_transform output, so
    # this is a second standardisation pass — confirm it is intended.
    X = scaler.fit_transform(X)
    
    # Rebinds the notebook-level n and d to the size of the noisy sample;
    # t, l and b were computed earlier from the original sample count.
    n, d = X.shape
    # W is read as a notebook-level name by calculate_grad.
    W = calculate_W(X)
    
    # U starts as a copy of X; M and V are the running moment accumulators.
    U = np.copy(X)
    M = np.zeros([n, d])
    V = np.zeros([n, d])

    for i in range(1, N + 1):
        # Shuffle the first l * b row indices and cut them into l equal blocks.
        temp = rng.permutation(np.arange(l * b))
        B = np.array(np.split(temp, l))
        # Block selected by find_median supplies the data-fit term of the gradient.
        lt = find_median(X, U, B)
        G = calculate_grad(X, U, B[lt])
        M = beta1 * M + (1 - beta1) * G
        # NOTE(review): (G * G).sum() is a scalar, so every entry of V holds
        # the same value (a running average of the squared gradient norm)
        # rather than a per-coordinate second moment — confirm this is intended.
        V = beta2 * V + (1 - beta2) * (G * G).sum()
        # Divide by (1 - beta ** i) to offset the zero initialisation of M and V.
        M_hat = M / (1 - beta1 ** i)
        V_hat = V / (1 - beta2 ** i)
        U = U - (alpha * M_hat) / np.sqrt(V_hat + epsilon)

    # Squared pairwise distances between the rows of U.
    W_U = skmt.pairwise_distances(U) ** 2
    # Linkage threshold: tol1 times the mean entry of W_U.
    eta = tol1 * (W_U.sum() / (n ** 2))

    # Link two rows when their squared distance is within the threshold and
    # label each row by the connected component it falls in.
    adj = (W_U <= eta).astype(int)
    temp = spr.csr_matrix(adj)
    r, Z = spr.csgraph.connected_components(temp, directed = False)

    # r is recomputed from the distinct labels; eta is reused as the size
    # threshold tol2 * (rows per component).
    temp_Z = np.unique(Z)
    r = temp_Z.shape[0]
    eta = tol2 * (n / r)

    # Components with at most eta members are relabelled -1.
    for i in range(r):
        temp = np.where(Z == temp_Z[i])[0]
        if temp.shape[0] <= eta:
            Z[temp] = -1

    # Keep only the labels of the original samples (noise rows were appended last).
    Z = Z[t]

    # NOTE(review): the count below includes the -1 label when it is present —
    # confirm that it should be counted as a cluster.
    k_star.append(np.unique(Z).shape[0])
    ari.append(skmt.adjusted_rand_score(label, Z))
    nnmi.append(skmt.adjusted_mutual_info_score(label, Z))

    # Progress indicator: 1-based run number.
    print((m + 1), end = " ")

In [None]:
# Append one row per completed run to the results file. Iterating over the
# result lists themselves writes every run; looping over range(m) with the
# leftover loop variable m would stop one row short of the number of runs.
# The context manager closes the handle even if a write fails, and newline=""
# leaves line endings to the csv module.
with open(dataset + ".csv", "a", newline="") as file:
    csvwriter = csv.writer(file)
    csvwriter.writerows(zip(k_star, ari, nnmi))