In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import normalized_mutual_info_score as NMI
import seaborn as sns
from predecon import PreDeCon

from IPython.display import Image
import os
import sys

# Relative locations of the data folders used by this notebook. Both live one
# directory above the notebook, so the shared '..' prefix is applied once and
# each path is assembled with os.path.join to stay platform independent.
base_path_testing, base_path_imdb = (
    os.path.join('..', *sub_dirs)
    for sub_dirs in (
        ('algorithm_verification', 'datasets'),
        ('graph_representations', 'without_labels'),
    )
)

# csvs = []
# for i in range(1,6):
#     csvs.append(f'IMDB-BINARY_gram_matrix_wl{i}.csv')

# PreDeCon(minPts=3, eps=1.0, delta=0.25, lambda_=1, kappa=100)

#### The comparison with ELKI uses the `_unlabeled.csv` files for clustering, while for our own implementation we load the labeled versions so that we can compute metrics such as the NMI. Higher-dimensional datasets yielded very strange results even with the ELKI algorithm, so we stuck with simpler datasets and compared those instead. As one can see, the results of the ELKI algorithm and our implementation match very well, except for a few individual data points between clusters.

#### We also tested our implementation with the example data from exercise sheet 2. While this dataset may not be very representative in itself, we think it is useful to include it, since everyone in this course has worked with it.

In [None]:
# --- Verification on the 'multiple-gaussian-2d' dataset ---------------------
# The CSV is space-delimited: the first `labs` columns are the features that
# get clustered, column `labs` is the ground-truth label used only for
# coloring the reference plot and for computing the NMI.
dataset = 'multiple-gaussian-2d'
dataset_dir = os.path.join(base_path_testing, dataset)  # built once, reused below
data = np.loadtxt(os.path.join(dataset_dir, dataset + '.csv'), delimiter=' ')
labs = 2
X, lab = data[:, :labs], data[:, labs]

# Ground truth. plt.subplots() leaves room for ticks, labels and the title,
# unlike an axes spanning the whole canvas, which pushes them off the figure.
fig, ax = plt.subplots()
ax.scatter(x=X[:, 0], y=X[:, 1], c=lab)
ax.set(title=f"{dataset}: ground-truth labels", xlabel="attribute 0", ylabel="attribute 1")
plt.show()

# Cluster with our implementation; the parameters match the ones encoded in
# the ELKI result file names displayed at the end of this cell.
predecon = PreDeCon(minPts=8, eps=1, delta=0.5, lambda_=2, kappa=100)
predecon.fit(X)

print("\nClusterIDs of data-points:", predecon.labels)
print("\nDifferent ClusterIDs:", len(set(predecon.labels)))
print("\nNMI:", NMI(lab, predecon.labels))

# Same points, colored by the cluster ID assigned by PreDeCon.
fig, ax = plt.subplots()
ax.scatter(x=X[:, 0], y=X[:, 1], c=predecon.labels)
ax.set(title=f"{dataset}: PreDeCon cluster labels", xlabel="attribute 0", ylabel="attribute 1")
plt.show()

# display results of elki (pre-rendered images stored next to the dataset)
print()
display(Image(os.path.join(dataset_dir, '1__eps1__minPts8__delta0_5__kappa100__lambda2__legend.png')))
display(Image(os.path.join(dataset_dir, '1__eps1__minPts8__delta0_5__kappa100__lambda2.png')))

In [None]:
# --- Verification on the 'pov' dataset --------------------------------------
# The CSV is space-delimited: the first `labs` columns are the features that
# get clustered, column `labs` is the ground-truth label used only for
# coloring the reference plot and for computing the NMI.
dataset = 'pov'
dataset_dir = os.path.join(base_path_testing, dataset)  # built once, reused below
data = np.loadtxt(os.path.join(dataset_dir, dataset + '.csv'), delimiter=' ')
labs = 2
X, lab = data[:, :labs], data[:, labs]

# Ground truth. plt.subplots() leaves room for ticks, labels and the title,
# unlike an axes spanning the whole canvas, which pushes them off the figure.
fig, ax = plt.subplots()
ax.scatter(x=X[:, 0], y=X[:, 1], c=lab)
ax.set(title=f"{dataset}: ground-truth labels", xlabel="attribute 0", ylabel="attribute 1")
plt.show()

# Cluster with our implementation; the parameters match the ones encoded in
# the ELKI result file names displayed at the end of this cell.
predecon = PreDeCon(minPts=8, eps=0.7, delta=0.5, lambda_=2, kappa=100)
predecon.fit(X)

print("\nClusterIDs of data-points:", predecon.labels)
print("\nDifferent ClusterIDs:", len(set(predecon.labels)))
print("\nNMI:", NMI(lab, predecon.labels))

# Same points, colored by the cluster ID assigned by PreDeCon.
fig, ax = plt.subplots()
ax.scatter(x=X[:, 0], y=X[:, 1], c=predecon.labels)
ax.set(title=f"{dataset}: PreDeCon cluster labels", xlabel="attribute 0", ylabel="attribute 1")
plt.show()

# display results of elki (pre-rendered images stored next to the dataset)
print()
display(Image(os.path.join(dataset_dir, '1__eps0_7__minPts8__delta0_5__kappa100__lambda2__legend.png')))
display(Image(os.path.join(dataset_dir, '1__eps0_7__minPts8__delta0_5__kappa100__lambda2.png')))

In [None]:
# --- Verification on the 'mouse' dataset ------------------------------------
# FIXME: does not work properly on this dataset (reason not documented yet;
# compare the PreDeCon plot with the ELKI images below).
# The CSV is space-delimited: the first `labs` columns are the features that
# get clustered, column `labs` is the ground-truth label used only for
# coloring the reference plot and for computing the NMI.
dataset = 'mouse'
dataset_dir = os.path.join(base_path_testing, dataset)  # built once, reused below
data = np.loadtxt(os.path.join(dataset_dir, dataset + '.csv'), delimiter=' ')
labs = 2
X, lab = data[:, :labs], data[:, labs]

# Ground truth. plt.subplots() leaves room for ticks, labels and the title,
# unlike an axes spanning the whole canvas, which pushes them off the figure.
fig, ax = plt.subplots()
ax.scatter(x=X[:, 0], y=X[:, 1], c=lab)
ax.set(title=f"{dataset}: ground-truth labels", xlabel="attribute 0", ylabel="attribute 1")
plt.show()

# Cluster with our implementation; the parameters match the ones encoded in
# the ELKI result file names displayed at the end of this cell.
predecon = PreDeCon(minPts=25, eps=0.725, delta=0.3, lambda_=2, kappa=100)
predecon.fit(X)

print("\nClusterIDs of data-points:", predecon.labels)
print("\nDifferent ClusterIDs:", len(set(predecon.labels)))
print("\nNMI:", NMI(lab, predecon.labels))

# Same points, colored by the cluster ID assigned by PreDeCon.
fig, ax = plt.subplots()
ax.scatter(x=X[:, 0], y=X[:, 1], c=predecon.labels)
ax.set(title=f"{dataset}: PreDeCon cluster labels", xlabel="attribute 0", ylabel="attribute 1")
plt.show()

# display results of elki (pre-rendered images stored next to the dataset)
print()
display(Image(os.path.join(dataset_dir, '1__eps0_725__minPts25__delta0_3__kappa100__lambda2__legend.png')))
display(Image(os.path.join(dataset_dir, '1__eps0_725__minPts25__delta0_3__kappa100__lambda2.png')))

In [None]:
# Exercise 2 Dataset
#
# Twelve 2-d points; row i holds point p_(i+1), so p_2 = X[1], p_3 = X[2]
# and p_6 = X[5]:
#   * rows 0..5  : horizontal run, x = 0..5 at y = 3
#   * rows 6..11 : vertical run,   x = 6 with y going from 5 down to 0
X = np.array(
    [[x_coord, 3] for x_coord in range(6)]
    + [[6, y_coord] for y_coord in range(5, -1, -1)]
)
print(X, '\n', X.shape)

# Fit PreDeCon with its default parameters on the exercise dataset.
predecon = PreDeCon()
predecon.fit(X)

# Points referenced by the exercise (row i of X holds p_(i+1)).
p2, p3, p6, p9 = (X[row_idx] for row_idx in (1, 2, 5, 8))

# Neighborhood of p3 as computed by the fitted model.
N_p3 = predecon._neighborhood_of_point(p3)
print("p3:   ", p3)
print("N(p3):", N_p3)

# Variance along each of the two attributes, evaluated for p3.
var_A0, var_A1 = (predecon._variance_along_attribute(p3, attr) for attr in (0, 1))
print("VAR_A0 for p3's neighborhood:", var_A0)
print("VAR_A1 for p3's neighborhood:", var_A1)

# The remaining per-point checks are reported for p3 and p6, in that order.
inspected_points = (("p3", p3), ("p6", p6))

for tag, point in inspected_points:
    print(f"w_{tag}:", predecon._subspace_preference_vector(point))

for tag, point in inspected_points:
    print(f"PDim for {tag}:", predecon._subspace_preference_dimensionality(point))

# Evaluate the preference weighted measure in both argument orders.
dist = predecon._preference_weighted_similarity_measure
print("dist(p6, p9) =", dist(p6, p9))
print("dist(p9, p6) =", dist(p9, p6))

dist_pref = predecon._general_preference_weighted_similarity_measure
print("dist_pref(p6, p9) =", dist_pref(p6, p9))

for tag, point in inspected_points:
    print(f"N_w for {tag}:", predecon._pref_neighborhood_of_point(point), sep='\n')

for tag, point in inspected_points:
    print(f"Is {tag} a core point?", predecon._is_core_point(point))

reachable = predecon._is_directly_preference_weighted_reachable
print("Is p2 directly preference weighted reachable from p3?", reachable(p3, p2))
print("Is p6 directly preference weighted reachable from p3?", reachable(p3, p6))

# Cluster assignments: the internal mapping is keyed by the raw bytes of a
# point, while `labels` is indexed by the point's row in X.
print("\nClusterIDs of data-points: \n", predecon._cluster_of_points)
print("\nClusterIDs of data-points:", predecon.labels)
for tag, point, row_idx in (("p3", p3, 2), ("p6", p6, 5)):
    print(f"ID of {tag}:", predecon._cluster_of_points[point.tobytes()])
    print(f"ID of {tag}:", predecon.labels[row_idx])
    print(f"{tag} is noise: ", predecon._is_noise_point(point))

# predecon._is_noise_point(np.array([7,0])) raises KeyError

# Scatter plot of the exercise points, colored by the cluster ID PreDeCon
# assigned. plt.subplots() keeps ticks, axis labels and the title inside the
# figure; an axes spanning the whole canvas would push them off the figure.
fig, ax = plt.subplots()
ax.scatter(x=X[:, 0], y=X[:, 1], c=predecon.labels)
ax.set(title="Exercise 2 dataset: PreDeCon cluster labels", xlabel="A0", ylabel="A1")
plt.show()