In [76]:
import hnswlib
import numpy as np
import pickle


# Load the vectors to index from disk; each row is added to the index as one item.
dataset = np.load('Data/dataset.npy')

# Take the index sizes from the loaded array instead of hard-coding them, so
# they cannot drift out of sync with the file on disk. For the recorded run
# this yields num_elements = 120 and dim = 1024 (see the printed output).
# Unpacking raises ValueError early if the array is not 2-D.
num_elements, dim = dataset.shape

# One integer label per row: 0 .. num_elements - 1.
ids = np.arange(num_elements)

# Declaring index
p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip

# Initializing index - the maximum number of elements should be known beforehand
p.init_index(max_elements=num_elements, ef_construction=200, M=48)

# Element insertion (can be called several times):
p.add_items(dataset, ids)

# Controlling the recall by setting ef:
p.set_ef(50)  # ef should always be > k

# Query dataset, k - number of the closest elements (returns 2 numpy arrays)
labels, distances = p.knn_query(dataset, k=1)

# Index objects support pickling
# WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method!
# Note: ef parameter is included in serialization; random number generator is initialized with random_seed on Index load
# The bytes unpickled here were produced in-process on the line itself, so no
# untrusted data is being deserialized.
p_copy = pickle.loads(pickle.dumps(p))  # creates a copy of index p using pickle round-trip

### Index parameters are exposed as class properties:
# Read everything back from the copy to show the settings survived the round-trip.
print(f"Parameters passed to constructor:  space={p_copy.space}, dim={p_copy.dim}") 
print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}")
print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}")
print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}")

Parameters passed to constructor:  space=l2, dim=1024
Index construction: M=48, ef_construction=200
Index size is 120 and index capacity is 120
Search speed/quality trade-off parameter: ef=50


In [85]:
# Re-run the search asking for the two closest entries per vector. The query
# vectors are the same rows that were inserted into the index, so column 0 is
# expected to be each vector matching itself (the recorded output shows 0.
# there); column 1 is then the closest *other* entry.
labels, distances = p.knn_query(dataset, k=2)

# NOTE(review): this prints the whole distance array; consider distances[:5].
print(distances)

# Persist both result arrays next to the input data.
np.save(file="Data/Labels.npy", arr=labels)
np.save(file="Data/Distances.npy", arr=distances)

[[0.         0.5919952 ]
 [0.         0.89309025]
 [0.         0.51948583]
 [0.         0.8137605 ]
 [0.         0.67417014]
 [0.         0.5910418 ]
 [0.         0.679408  ]
 [0.         0.66002214]
 [0.         0.5141001 ]
 [0.         0.7722529 ]
 [0.         0.82487357]
 [0.         0.29894763]
 [0.         0.44469672]
 [0.         0.9294003 ]
 [0.         0.72425526]
 [0.         0.5919952 ]
 [0.         0.56506383]
 [0.         0.52317095]
 [0.         0.5867695 ]
 [0.         0.70667106]
 [0.         0.6197484 ]
 [0.         0.488077  ]
 [0.         0.67417014]
 [0.         0.5583832 ]
 [0.         0.44469672]
 [0.         0.7136678 ]
 [0.         0.488077  ]
 [0.         0.62004447]
 [0.         0.51321244]
 [0.         0.5141001 ]
 [0.         1.1588746 ]
 [0.         0.6186466 ]
 [0.         0.63940394]
 [0.         0.72425526]
 [0.         0.7593664 ]
 [0.         0.37033266]
 [0.         1.0704563 ]
 [0.         0.74985087]
 [0.         0.5910418 ]
 [0.         0.8540947 ]


In [96]:
# NOTE(review): NUM_CLUSTER and len_dataset are not referenced anywhere else in
# this cell; the two cut points below are hard-coded for three classes.
NUM_CLUSTER = 3
len_dataset = dataset.shape[0]

# Mean of column 1 of the k=2 distance array.
# NOTE(review): threshold is computed but not used in this cell.
threshold = distances[:, 1].mean()

# NOTE(review): the labels below are derived from column 1 of the raw dataset,
# whereas `threshold` above is derived from distances[:, 1]. Confirm which
# quantity was meant to drive the classification.
feature_values = dataset[:, 1]

# Despite the name, these are the 33rd and 66th percentiles (roughly the
# tercile boundaries), not quartiles.
quartiles = np.percentile(feature_values, [33, 66])

# Bucket every value into class 1 / 2 / 3 by where it falls relative to the two
# cut points. np.select takes the first matching condition, so the second test
# only fires for values that are not below quartiles[0]. Values matching no
# condition (NaN) keep the default 0. The result is cast back to the input
# dtype, as np.zeros_like would produce.
class_labels = np.select(
    [
        feature_values < quartiles[0],
        feature_values < quartiles[1],
        feature_values >= quartiles[1],
    ],
    [1, 2, 3],
    default=0,
).astype(feature_values.dtype)

# Print the class labels
print(class_labels)

[2. 1. 2. 2. 3. 2. 3. 1. 3. 1. 2. 2. 3. 2. 1. 1. 3. 1. 2. 1. 2. 2. 2. 2.
 2. 1. 3. 1. 3. 2. 1. 2. 1. 1. 1. 3. 2. 2. 3. 1. 2. 2. 1. 1. 1. 1. 2. 3.
 1. 3. 1. 1. 2. 3. 2. 3. 3. 3. 3. 2. 3. 3. 3. 1. 1. 3. 1. 1. 2. 1. 1. 1.
 1. 2. 1. 1. 3. 1. 3. 3. 3. 1. 2. 2. 3. 3. 3. 3. 1. 2. 2. 2. 3. 2. 3. 1.
 3. 2. 1. 2. 3. 2. 3. 2. 1. 3. 3. 2. 1. 3. 3. 2. 3. 1. 3. 2. 3. 2. 1. 3.]
