In [30]:
import numpy as np
from tqdm import tqdm
import hnswlib
import os
import time

# Seed the random number generators that are driven from Python code.
def set_seed(seed):
    """Seed Python's ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: Integer seed handed to both generators.

    NOTE(review): hnswlib appears to take its own ``random_seed`` argument in
    ``Index.init_index``, so seeding here presumably does not influence index
    construction -- confirm against the hnswlib documentation.
    """
    import random  # local import: the notebook's import cell does not import it

    random.seed(seed)
    np.random.seed(seed)

set_seed(42)

In [31]:
# ====== Settings ======
# K is the number of neighbours requested per query; it is also embedded in
# the name of the output directory.
K = 10
MAIN_DIR = "../data/glove_6b_50d_split/"
DATA_PATH = f"{MAIN_DIR}X_data.txt"
QUERY_PATH = f"{MAIN_DIR}X_query.txt"
OUTPUT_DIR = f"{MAIN_DIR}hnswlib_k{K}/"
# Create the output directory up front; exist_ok makes re-running the cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [32]:
# ====== Utils ======
def load_vectors_only(path):
    """Load the numeric vectors from a whitespace-separated text file.

    Every non-blank line is split on whitespace; the first field is dropped
    and the remaining fields are parsed as float32 components. Blank lines
    are skipped.

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        A float32 ``np.ndarray`` with one row per non-blank line.

    Raises:
        ValueError: If the file holds no vectors, if a line has a different
            number of components than the first vector, or if a component
            cannot be parsed as a float.
    """
    vectors = []
    expected_dim = None
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # A blank line would otherwise become a zero-length vector
                # and make np.vstack fail with an opaque shape error.
                continue
            vec = np.array(fields[1:], dtype=np.float32)
            if expected_dim is None:
                expected_dim = vec.shape[0]
            elif vec.shape[0] != expected_dim:
                raise ValueError(
                    f"{path}: line {line_no} has {vec.shape[0]} components, "
                    f"expected {expected_dim}"
                )
            vectors.append(vec)
    if not vectors:
        raise ValueError(f"{path}: no vectors found")
    return np.vstack(vectors)

def save_knn_results(prefix, indices, distances):
    """Write neighbour indices and distances to two text files.

    Args:
        prefix: Path prefix; ``_indices.txt`` and ``_distances.txt`` are
            appended to it to form the two output file names.
        indices: Array written with integer formatting (``%d``).
        distances: Array written with six decimal places (``%.6f``).
    """
    indices_path = f"{prefix}_indices.txt"
    distances_path = f"{prefix}_distances.txt"
    # Indices are written first, then distances.
    outputs = (
        (indices_path, indices, "%d"),
        (distances_path, distances, "%.6f"),
    )
    for target, values, number_format in outputs:
        np.savetxt(target, values, fmt=number_format)
    print(f"Saved: {indices_path} and {distances_path}")

def recall_at_k(true_indices, pred_indices, k):
    """Return the fraction of shared neighbours within the first k columns.

    For each row, the first ``k`` entries of ``true_indices`` and
    ``pred_indices`` are compared as sets, and the overlap sizes are summed.

    Args:
        true_indices: 2-D array of reference neighbour ids, one row per query.
        pred_indices: 2-D array of predicted neighbour ids, same shape.
        k: Number of leading columns to compare in every row.

    Returns:
        Total overlap divided by ``number_of_rows * k``.
    """
    assert true_indices.shape == pred_indices.shape
    n_queries = true_indices.shape[0]
    row_pairs = zip(true_indices[:, :k], pred_indices[:, :k])
    hits = sum(
        len(set(truth_row).intersection(pred_row))
        for truth_row, pred_row in row_pairs
    )
    return hits / (n_queries * k)

In [33]:
print("Loading vectors...")
# Both files go through the same loader: the data file first, then the queries.
X_data = load_vectors_only(DATA_PATH)
X_query = load_vectors_only(QUERY_PATH)

# Report both matrix shapes in one call, one per line.
print(
    f"X_data shape: {X_data.shape}",
    f"X_query shape: {X_query.shape}",
    sep="\n",
)

Loading vectors...
X_data shape: (300000, 50)
X_query shape: (100000, 50)


In [34]:
print("Building HNSW index...")
dim = X_data.shape[1]
num_elements = X_data.shape[0]

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() differences can be skewed by system clock changes.
start_time = time.perf_counter()

p = hnswlib.Index(space='cosine', dim=dim)     # possible options are l2, cosine or ip
# ef_construction and M are hnswlib's build-time parameters; the index is
# sized to hold exactly the rows of X_data.
p.init_index(max_elements=num_elements, ef_construction=200, M=16)
p.add_items(X_data)     # add items to the index

end_time = time.perf_counter()

# ef is a query-time setting, so it is applied outside the timed build section.
p.set_ef(50)  # higher = better recall  (should be >= k per the hnswlib docs)

print(f"Index built in {end_time - start_time:.6f} seconds")

Building HNSW index...
Index built in 16.622266 seconds


In [35]:
print("Finding nearest neighbors using HNSW...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() differences can be skewed by system clock changes.
start_time = time.perf_counter()

# One batched call for all query vectors; returns (indices, distances),
# each with K columns per query row.
hnswlib_indices, hnswlib_distances = p.knn_query(X_query, k=K)  # query the index

end_time = time.perf_counter()
print(f"Nearest neighbors found in {end_time - start_time:.6f} seconds\n")

print("hnswlib_distances shape:", hnswlib_distances.shape)
print("hnswlib_indices shape:", hnswlib_indices.shape)

Finding nearest neighbors using HNSW...
Nearest neighbors found in 1.680057 seconds

hnswlib_distances shape: (100000, 10)
hnswlib_indices shape: (100000, 10)


In [36]:
print("Saving labels and distances...")
# save_knn_results appends "_indices.txt" / "_distances.txt" to this prefix.
hnswlib_prefix = os.path.join(OUTPUT_DIR, "query_hnswlib")
save_knn_results(
    prefix=hnswlib_prefix,
    indices=hnswlib_indices,
    distances=hnswlib_distances,
)

Saving labels and distances...
Saved: ../data/glove_6b_50d_split/hnswlib_k10/query_hnswlib_indices.txt and ../data/glove_6b_50d_split/hnswlib_k10/query_hnswlib_distances.txt


In [37]:
# Reload the saved results to verify that the files can be read back.
# The distances were written with six decimals, so the reloaded values are
# rounded compared with the in-memory ones they replace.
# ndmin=2 keeps the arrays two-dimensional even when a file holds a single
# column or a single row (e.g. K == 1); np.loadtxt would otherwise return a
# 1-D array, which the row/column indexing in recall_at_k cannot handle.
hnswlib_indices = np.loadtxt(os.path.join(OUTPUT_DIR, "query_hnswlib_indices.txt"), dtype=int, ndmin=2)
hnswlib_distances = np.loadtxt(os.path.join(OUTPUT_DIR, "query_hnswlib_distances.txt"), dtype=float, ndmin=2)

### Calculating recall of hnswlib against the KNN baseline

In [38]:
# Load the KNN results stored under MAIN_DIR for the same K.
# NOTE(review): presumably produced by a separate exact-KNN notebook -- confirm.
# ndmin=2 keeps the arrays two-dimensional even when a file holds a single
# column or a single row (e.g. K == 1); np.loadtxt would otherwise return a
# 1-D array, which the row/column indexing in recall_at_k cannot handle.
knn_indices = np.loadtxt(os.path.join(MAIN_DIR, f"knn_k{K}/query_knn_indices.txt"), dtype=int, ndmin=2)
knn_distances = np.loadtxt(os.path.join(MAIN_DIR, f"knn_k{K}/query_knn_distances.txt"), dtype=float, ndmin=2)

print("knn_distances shape:", knn_distances.shape)
print("knn_indices shape:", knn_indices.shape)

knn_distances shape: (100000, 10)
knn_indices shape: (100000, 10)


In [39]:
# calculate recall
# recall_at_k takes (true_indices, pred_indices, k): the KNN results are the
# reference and the hnswlib results are the predictions. The overlap count is
# symmetric, so the reported value is the same as with the arguments swapped,
# but the call now matches the function's signature.
recall = recall_at_k(knn_indices, hnswlib_indices, K)
print(f"Recall at k={K}: {recall:.6f}")

Recall at k=10: 0.893610
