In [1]:
import numpy as np
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
import os
import time

def set_seed(seed):
    """Seed NumPy's legacy global random number generator.

    Args:
        seed: Value forwarded unchanged to ``np.random.seed``.
    """
    np.random.seed(seed=seed)


# Seed the global NumPy generator once, up front, for the whole notebook.
set_seed(42)

In [None]:
# --- Notebook configuration ---
# Every path below is built by plain string concatenation onto MAIN_DIR, so
# MAIN_DIR must keep its trailing slash.
MAIN_DIR = "../data/glove_6b_50d_split/"
DATA_PATH = f"{MAIN_DIR}X_data.txt"    # vectors the k-NN model is fitted on
QUERY_PATH = f"{MAIN_DIR}X_query.txt"  # vectors whose neighbours are looked up

K = 10  # passed as n_neighbors to NearestNeighbors further down
OUTPUT_DIR = f"{MAIN_DIR}knn_k{K}/"

# Create the output directory now; exist_ok=True makes re-running harmless.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

In [None]:
def load_vectors_only(path):
    """Load the numeric vectors from a whitespace-separated embedding file.

    Each non-blank line is expected to hold a leading token (the word)
    followed by the vector components. The token is discarded and only the
    components are kept. Blank or whitespace-only lines are skipped.

    Args:
        path: Path to a UTF-8 encoded text file.

    Returns:
        A 2-D ``np.float32`` array with one row per non-blank line.

    Raises:
        ValueError: If the file contains no vectors, if a component cannot
            be parsed as a number, or if two lines disagree on the number
            of components.
    """
    vectors = []
    expected_dim = None
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line (e.g. a stray newline at end of file). Left in,
                # it would become a zero-length row and make the final
                # stack fail with an unhelpful shape error.
                continue
            vec = np.asarray(fields[1:], dtype=np.float32)  # skip the word
            if expected_dim is None:
                expected_dim = vec.size
            elif vec.size != expected_dim:
                # Fail here, with the location, instead of inside np.vstack.
                raise ValueError(
                    f"{path}: line {line_no} has {vec.size} components, "
                    f"expected {expected_dim}"
                )
            vectors.append(vec)
    if not vectors:
        raise ValueError(f"{path}: no vectors found")
    return np.vstack(vectors)

def save_knn_results(prefix, indices, distances):
    """Write k-NN indices and distances to two text files sharing a prefix.

    Args:
        prefix: Path prefix; ``_indices.txt`` and ``_distances.txt`` are
            appended to it to form the two output paths.
        indices: Array written with integer formatting (``%d``).
        distances: Array written with six decimal places (``%.6f``).
    """
    indices_path = f"{prefix}_indices.txt"
    distances_path = f"{prefix}_distances.txt"

    # Indices are written first, then distances, each with its own format.
    for target, values, number_format in (
        (indices_path, indices, "%d"),
        (distances_path, distances, "%.6f"),
    ):
        np.savetxt(target, values, fmt=number_format)

    print(f"Saved: {indices_path} and {distances_path}")

In [5]:
# Read both splits from disk (data first, then query) and report their shapes.
print("Loading vectors...")
X_data, X_query = (load_vectors_only(p) for p in (DATA_PATH, QUERY_PATH))

for label, matrix in (("X_data", X_data), ("X_query", X_query)):
    print(f"{label} shape: {matrix.shape}")

Loading vectors...
X_data shape: (320000, 50)
X_query shape: (80000, 50)


In [None]:
# Fit a brute-force, cosine-distance k-NN model on the data split and time it.
print("Fitting Nearest Neighbors using k-NN (brute)...")
# time.perf_counter() is the high-resolution clock meant for measuring
# intervals. time.time() follows the wall clock, which can be coarse or be
# adjusted while the measurement is running.
start_time = time.perf_counter()

nbrs = NearestNeighbors(n_neighbors=K, algorithm='brute', metric='cosine')
nbrs.fit(X_data)

end_time = time.perf_counter()
print(f"Time taken: {end_time - start_time:.6f} seconds")

Fitting Nearest Neighbors...
Time taken: 0.020998 seconds


In [None]:
# Look up the K nearest data vectors for every query vector and time the search.
print("Finding neighbors for query set...")
# time.perf_counter() is the high-resolution clock meant for measuring
# intervals. time.time() follows the wall clock, which can be adjusted during
# a long-running search and distort the reported duration.
start_time = time.perf_counter()

# kneighbors returns (distances, indices), in that order.
knn_distances, knn_indices = nbrs.kneighbors(X_query)

end_time = time.perf_counter()
print(f"Time taken: {end_time - start_time:.6f} seconds\n")

print("knn_distances shape:", knn_distances.shape)
print("knn_indices shape:", knn_indices.shape)

Finding neighbors for query set...
Time taken: 507.087246 seconds
      
knn_distances shape: (80000, 10)
knn_indices shape: (80000, 10)


In [None]:
# Persist the neighbour indices and distances as text files under OUTPUT_DIR.
print("Saving labels and distances...")
query_prefix = os.path.join(OUTPUT_DIR, "query_knn")
save_knn_results(prefix=query_prefix, indices=knn_indices, distances=knn_distances)

Saving results...
Saved: ../data/glove_6b_50d_split\query_knn_indices.txt and ../data/glove_6b_50d_split\query_knn_distances.txt


In [None]:
# Read the saved text files back in and report their shapes as a sanity check.
# NOTE: this rebinds knn_indices / knn_distances to the values read from disk.
indices_file = os.path.join(OUTPUT_DIR, "query_knn_indices.txt")
distances_file = os.path.join(OUTPUT_DIR, "query_knn_distances.txt")

knn_indices = np.loadtxt(indices_file, dtype=int)
knn_distances = np.loadtxt(distances_file, dtype=float)

print("Loaded indices shape:", knn_indices.shape)
print("Loaded distances shape:", knn_distances.shape)

Loaded indices shape: (80000, 10)
Loaded distances shape: (80000, 10)
