In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hnswlib
from sklearn.neighbors import NearestNeighbors
from heapq import heappush, heappop

In [2]:

# Path to your GloVe file (update this based on your downloaded version)
glove_path = '/Users/tanishqchaudhari/Desktop/DataScience Proj  datasets/Dataset/glove.6B.100d.txt'

# Load GloVe vectors
word_to_vec = {}
words = []
vectors = []

with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]  # First token is the word
        vector = np.array(values[1:], dtype=np.float32)  # Rest are vector values
        word_to_vec[word] = vector
        words.append(word)
        vectors.append(vector)

# Convert to numpy array
vectors = np.array(vectors, dtype=np.float32)
print(f"Loaded {len(words)} word vectors of dimension {vectors.shape[1]}")

Loaded 400000 word vectors of dimension 100


In [3]:
print(vectors.shape)

(400000, 100)


In [4]:
import numpy as np
import time
import logging
import random
import math
from heapq import heappush, heappop
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [5]:
import numpy as np

def precompute_all_query_pcas(pca_models, pca_applied_layers, query_vectors):
    """
    Precomputes PCA projections for a batch of query vectors.
    Returns a list where each index corresponds to the layer number, and contains
    either the PCA-transformed queries (np.ndarray) or None if not used.
    """
    max_layer = max(pca_applied_layers) + 1 if pca_applied_layers else 0
    pca_queries = [None] * max_layer  # indexed list for O(1) access

    query_vectors_np = np.asarray(query_vectors)

    for l_idx in pca_applied_layers:
        model = pca_models[l_idx]  # direct index, avoid .get()
        pca_queries[l_idx] = model.transform(query_vectors_np)

    return pca_queries


In [6]:
def precompute_all_query_pcas(pca_models, pca_applied_layers, query_vectors):
    """
    Precomputes PCA projections for a batch of query vectors.
    Returns a list where each index = layer number, and value is
    either the (n_queries, pca_components) array or None.
    """
    if not pca_applied_layers:
        return []

    max_layer = max(pca_applied_layers) + 1
    pca_queries = [None] * max_layer
    Q = np.asarray(query_vectors)

    for l in pca_applied_layers:
        model = pca_models[l]
        pca_queries[l] = model.transform(Q)

    return pca_queries

In [7]:
import math
import random
import logging
import numpy as np
from sklearn.decomposition import PCA
from heapq import heappush, heappop
import time # Added for potential timing within methods if needed

logger = logging.getLogger(__name__)

# --- precompute_all_query_pcas function remains the same ---
def precompute_all_query_pcas(pca_models, pca_applied_layers, query_vectors):
    """
    Precomputes PCA projections for a batch of query vectors.
    Returns a list where each index = layer number, and value is
    either the (n_queries, pca_components) array or None.
    """
    if not pca_applied_layers:
        return []

    max_layer = max(pca_applied_layers) + 1
    pca_queries = [None] * max_layer
    Q = np.asarray(query_vectors)

    for l in pca_applied_layers:
        # Check if the model exists for the layer (robustness)
        if l in pca_models:
            model = pca_models[l]
            pca_queries[l] = model.transform(Q)
        # else: # Should not happen if pca_applied_layers is consistent
        #    logger.warning(f"Model for supposedly applied PCA layer {l} not found during query precomputation.")


    return pca_queries

# --- Modified optHNSWPCA Class ---
# --- Imports and Logger Setup ---
import math
import random
import logging
import numpy as np
from sklearn.decomposition import PCA
from heapq import heappush, heappop
import time

logger = logging.getLogger(__name__)

# --- optHNSWPCA Class (Global PCA Approach) ---

class optHNSWPCA:
    """
    HNSW with optional GLOBAL PCA-based acceleration (applied >= layer 1),
    transforming queries ONCE at the start of search.
    """
    def __init__(self, dim, max_elements, M=16, ef_construction=200,
                 pca_yes=False, pca_components=50):
        self.dim = dim
        self.max_elements = max_elements
        self.M = M
        self.ef_construction = ef_construction
        self.pca_yes = pca_yes and pca_components < dim
        self.pca_components = min(pca_components, dim)

        self.vectors = [] # Stores original vectors
        self.layers = []
        self.entry_point = None
        self.entry_point_level = -1

        # --- Global PCA Structures ---
        self.global_pca_model = None      # Single PCA model for the whole dataset
        self.global_reduced_vectors = None # Numpy array (N, pca_components)
        # We still need to know if PCA is conceptually applicable (enough nodes) per layer for logic switching
        # Let's reuse pca_applied_layers for this, but it now signifies "use PCA distance metric if l >= 1"
        self.pca_logic_layers = set() # Layers (>=1) dense enough to *consider* using PCA logic

        logger.info(f"Initialized HNSW (dim={dim}, PCA={'on (GLOBAL, layers >= 1)' if self.pca_yes else 'off'}, PCA components={self.pca_components if self.pca_yes else 'N/A'})")

    # _get_layer, _distance, _search_layer_standard, insert remain the same as previous version
    def _get_layer(self):
        ml = 1 / math.log(self.M) if self.M > 1 else 1
        return max(0, int(-math.log(random.random()) * ml))

    def _distance(self, idx1, idx2):
        if not (0 <= idx1 < len(self.vectors) and 0 <= idx2 < len(self.vectors)): return float('inf')
        return np.linalg.norm(self.vectors[idx1] - self.vectors[idx2])

    def _search_layer_standard(self, query_vec, layer_idx, ep_idx, ef):
        if layer_idx >= len(self.layers): return []
        graph = self.layers[layer_idx]
        if not graph: return []
        if ep_idx is None or ep_idx not in graph:
             try: ep_idx = next(iter(graph))
             except StopIteration: return []
        if ep_idx >= len(self.vectors): return [] # Safety check

        visited = {ep_idx}
        dist0 = np.linalg.norm(query_vec - self.vectors[ep_idx])
        candidates = [(dist0, ep_idx)]
        results = [(-dist0, ep_idx)]

        while candidates:
            dist, cur = heappop(candidates)
            if cur not in graph: continue
            farthest_dist_in_results = -results[0][0]
            if dist > farthest_dist_in_results and len(results) >= ef: break
            for nb in graph.get(cur, []):
                if nb in visited: continue
                visited.add(nb)
                if nb >= len(self.vectors): continue
                d = np.linalg.norm(query_vec - self.vectors[nb])
                if d < -results[0][0] or len(results) < ef:
                    heappush(results, (-d, nb))
                    if len(results) > ef: heappop(results)
                    heappush(candidates, (d, nb))
        return sorted([(-d, idx) for d, idx in results])[:ef]

    def insert(self, vector):
        # Simplified insert assuming it's called sequentially
        vec = np.asarray(vector)
        idx = len(self.vectors)
        if idx >= self.max_elements: raise MemoryError(f"Index full. Max elements: {self.max_elements}")
        self.vectors.append(vec)
        level = self._get_layer()
        while len(self.layers) <= level: self.layers.append({})

        ep = self.entry_point
        current_ep_level = self.entry_point_level

        # Phase 1: Find entry points from top down
        for l in range(current_ep_level, level, -1):
            if ep is None: break
            if l >= len(self.layers) or not self.layers[l]: continue
            res = self._search_layer_standard(vec, l, ep, ef=1)
            if res: ep = res[0][1]

        # Phase 2: Insert node and connect
        start_conn_level = min(level, current_ep_level if self.entry_point is not None else level)
        current_ep_for_level = ep

        for l in range(start_conn_level, -1, -1):
            graph = self.layers[l]
            if current_ep_for_level is not None and current_ep_for_level not in graph:
                 if graph: current_ep_for_level = next(iter(graph))
                 else: current_ep_for_level = None

            neigh = []
            if current_ep_for_level is not None:
                neigh = self._search_layer_standard(vec, l, current_ep_for_level, self.ef_construction)
                if neigh: current_ep_for_level = neigh[0][1]

            conns = [nid for _, nid in neigh[:self.M]]
            graph[idx] = conns

            for nb in conns:
                graph.setdefault(nb, []).append(idx)
                if len(graph[nb]) > self.M:
                    dists = [(self._distance(nb, c), c) for c in graph[nb]]
                    dists.sort()
                    graph[nb] = [c for _, c in dists[:self.M]]

        if self.entry_point is None or level > self.entry_point_level:
            self.entry_point = idx
            self.entry_point_level = level


    # --- MODIFIED finalize_pca for GLOBAL PCA ---
    def finalize_pca(self):
        if not self.pca_yes:
            logger.info("PCA is disabled. Skipping PCA finalization.")
            return

        start_time = time.perf_counter()
        n_vectors = len(self.vectors)
        if n_vectors == 0:
             logger.warning("Cannot finalize PCA: No vectors inserted.")
             return
        if n_vectors <= self.pca_components:
             logger.warning(f"Not enough vectors ({n_vectors}) to fit PCA with {self.pca_components} components. PCA disabled.")
             self.pca_yes = False # Disable PCA if not possible
             return

        logger.info(f"Fitting GLOBAL PCA (components={self.pca_components}) on all {n_vectors} vectors...")
        try:
            # Stack all vectors (consider memory for very large datasets)
            all_vectors_stacked = np.vstack(self.vectors)

            # Fit the single global model
            self.global_pca_model = PCA(n_components=self.pca_components)
            self.global_pca_model.fit(all_vectors_stacked)
            logger.info("Global PCA model fitted.")

            # Transform ALL vectors and store them
            logger.info("Transforming all vectors using global PCA model...")
            self.global_reduced_vectors = self.global_pca_model.transform(all_vectors_stacked).astype(np.float32)
            logger.info(f"Stored global reduced vectors with shape {self.global_reduced_vectors.shape}.")

            # Determine which layers (>=1) have enough nodes to warrant using PCA logic
            self.pca_logic_layers.clear()
            total_layers = len(self.layers)
            for l in range(1, total_layers): # Check layers 1 and up
                 if l < len(self.layers) and self.layers[l]:
                      node_count = len([i for i in self.layers[l] if i < n_vectors])
                      if node_count > self.pca_components: # Or some other threshold? For now, just > components.
                           self.pca_logic_layers.add(l)

            end_time = time.perf_counter()
            logger.info(f"Global PCA finalization complete in {end_time - start_time:.2f}s. PCA logic active for layers: {sorted(list(self.pca_logic_layers))}")

        except MemoryError:
             logger.error("MemoryError during global PCA fitting/transformation. Disabling PCA.")
             self.pca_yes = False
             self.global_pca_model = None
             self.global_reduced_vectors = None
             self.pca_logic_layers.clear()
        except Exception as e:
            logger.error(f"Error during global PCA finalization: {e}. Disabling PCA.")
            self.pca_yes = False
            self.global_pca_model = None
            self.global_reduced_vectors = None
            self.pca_logic_layers.clear()


    # --- NEW Layer Search using GLOBAL PCA vectors ---
    def _search_layer_global_pca(self, q_red_global, layer_idx, ep_idx, ef):
        # Search using globally reduced query and globally reduced node vectors
        if layer_idx >= len(self.layers): return []
        graph = self.layers[layer_idx]
        if not graph: return []

        # Check if global reduced vectors exist
        if self.global_reduced_vectors is None:
             logger.error("Global reduced vectors not available for PCA search. Falling back.")
             # Fallback needs original query - this function doesn't have it easily.
             # This indicates a fundamental issue if called when not ready.
             # For now, return empty, but ideally, search_pca_opt checks readiness.
             return []

        # Ensure ep_idx is valid for graph and reduced vector array
        if ep_idx is None or ep_idx not in graph or ep_idx >= len(self.global_reduced_vectors):
            try: ep_idx = next(iter(graph)) # Try starting from *any* node in layer
            except StopIteration: return []
            # Check again if this arbitrary node is valid for reduced vectors
            if ep_idx >= len(self.global_reduced_vectors):
                 logger.error(f"Cannot start global PCA search in layer {layer_idx}, even fallback ep {ep_idx} is invalid.")
                 return []

        visited = {ep_idx}
        # Initial distance using global PCA vectors
        dist0 = np.linalg.norm(q_red_global - self.global_reduced_vectors[ep_idx])
        candidates = [(dist0, ep_idx)]
        results = [(-dist0, ep_idx)]

        while candidates:
            dist, cur = heappop(candidates)
            if cur not in graph: continue
            farthest_dist_in_results = -results[0][0]
            if dist > farthest_dist_in_results and len(results) >= ef: break

            for nb in graph.get(cur, []):
                if nb in visited: continue
                visited.add(nb)
                # Check if neighbor index is valid for reduced vectors
                if nb >= len(self.global_reduced_vectors): continue

                # Calculate distance using GLOBAL reduced query and GLOBAL reduced neighbor
                d = np.linalg.norm(q_red_global - self.global_reduced_vectors[nb])

                if d < -results[0][0] or len(results) < ef:
                    heappush(results, (-d, nb))
                    if len(results) > ef: heappop(results)
                    heappush(candidates, (d, nb))
        return sorted([(-d, idx) for d, idx in results])[:ef]


    # search (standard) remains the same
    def search(self, query_vec, k=10):
        ef = max(self.ef_construction, k)
        ep = self.entry_point
        if ep is None: return []
        q = np.asarray(query_vec)
        current_ep_level = self.entry_point_level
        for l in range(current_ep_level, 0, -1):
             if l >= len(self.layers) or not self.layers[l]: continue
             res = self._search_layer_standard(q, l, ep, ef=1)
             if res: ep = res[0][1]
        res = self._search_layer_standard(q, 0, ep, ef)
        return [i for _, i in res[:k]]


    # --- MODIFIED search_pca_opt for GLOBAL PCA ---
    def search_pca_opt(self, query_vec, k=10):
        ef = max(self.ef_construction, k)
        ep = self.entry_point
        if ep is None: return []

        q_orig = np.asarray(query_vec) # Keep original query
        current_ep_level = self.entry_point_level

        # Check if global PCA is ready and transform query ONCE
        q_red_global = None
        pca_active = self.pca_yes and self.global_pca_model is not None and self.global_reduced_vectors is not None
        if pca_active:
            try:
                q_red_global = self.global_pca_model.transform(q_orig.reshape(1, -1))[0].astype(np.float32)
            except Exception as e:
                 logger.error(f"Failed to transform query using global PCA model: {e}. Falling back to standard search.")
                 pca_active = False # Disable PCA for this query if transform fails

        # Phase 1: Navigate top layers (down to layer 1)
        for l in range(current_ep_level, 0, -1): # Down to 1
            if l >= len(self.layers) or not self.layers[l]: continue

            # Use PCA logic if active and layer is >= 1
            # We don't strictly need to check pca_logic_layers if we always use PCA metric for l>=1 when active
            if pca_active and q_red_global is not None:
                 # Use the NEW global PCA layer search
                 res = self._search_layer_global_pca(q_red_global, l, ep, ef=1)
            else:
                 # Use standard search
                 res = self._search_layer_standard(q_orig, l, ep, ef=1)

            if res:
                ep = res[0][1]
            # else: ep = ep # Keep same ep if layer search yields nothing

        # Phase 2: Search in base layer (layer 0) - ALWAYS standard
        res = self._search_layer_standard(q_orig, 0, ep, ef)

        return [i for (_, i) in res[:k]]

In [8]:
# --- Main Execution Script (Mostly unchanged) ---
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

# Assume 'vectors' is pre-loaded and logging is configured
# logging.basicConfig(level=logging.INFO) # Ensure configured

if __name__ == "__main__":
    # --- parameters ---
    num_vectors = vectors.shape[0]
    num_queries = 5000  # Keep reasonable for testing
    dim = vectors.shape[1]
    k = 100
    M = 26
    ef_construction = 200
    pca_components = 32 # Let's try 32 components

    print(f"Dataset: {num_vectors} vectors, {dim} dimensions")
    print(f"Parameters: M={M}, ef_construction={ef_construction}, k={k}")
    print(f"PCA Settings: GLOBAL PCA, components={pca_components} (logic active layers >= 1 if possible)")

    print("\nSelecting query data...")
    actual_num_queries = min(num_queries, num_vectors)
    query_indices = np.random.choice(num_vectors, actual_num_queries, replace=False)
    queries = vectors[query_indices]
    print(f"Using {actual_num_queries} queries.")

    print("\nComputing ground truth...")
    t_gt0 = time.perf_counter()
    nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
    nn.fit(vectors)
    gt_d, gt_i = nn.kneighbors(queries)
    t_gt1 = time.perf_counter()
    print(f"Ground truth computed in {t_gt1 - t_gt0:.2f}s")

    # --- Build standard (non-PCA) index ---
    print("\nBuilding standard HNSW index (no PCA)...")
    idx_std = optHNSWPCA(dim, num_vectors + 10, M, ef_construction, pca_yes=False, pca_components=pca_components)
    t0 = time.perf_counter()
    # Seed random for potentially more comparable builds (optional)
    # random.seed(42); np.random.seed(42)
    for v in tqdm(vectors, desc="Indexing standard HNSW"): idx_std.insert(v)
    build_std = time.perf_counter() - t0
    print(f"Standard build time: {build_std:.2f}s")
    print(f"Standard index height: {idx_std.entry_point_level}")

    # --- Build PCA-enabled index ---
    print("\nBuilding PCA-enabled HNSW index...")
    idx_pca = optHNSWPCA(dim, num_vectors + 10, M, ef_construction, pca_yes=True, pca_components=pca_components)
    t0 = time.perf_counter()
    # Seed random for potentially more comparable builds (optional)
    # random.seed(42); np.random.seed(42) # Use same seed if comparing structures
    for v in tqdm(vectors, desc="Indexing PCA-enabled HNSW"): idx_pca.insert(v)
    build_pca = time.perf_counter() - t0
    print(f"PCA-enabled index build time (before finalize): {build_pca:.2f}s")
    print(f"PCA-enabled index height: {idx_pca.entry_point_level}")

    print("\nFinalizing GLOBAL PCA model...")
    t0 = time.perf_counter()
    idx_pca.finalize_pca() # Fits global PCA and transforms all vectors
    fit_pca = time.perf_counter() - t0
    print(f"PCA finalize time: {fit_pca:.2f}s")

    # helper for recall calculation
    def recall_at_k(ground_truth_indices, predicted_indices, k_val):
        if predicted_indices is None: predicted_indices = []
        k_actual_pred = min(k_val, len(predicted_indices))
        k_actual_gt = min(k_val, len(ground_truth_indices))
        gt_set = set(ground_truth_indices[:k_actual_gt])
        pred_set = set(predicted_indices[:k_actual_pred])
        return len(gt_set & pred_set) / k_val if k_val > 0 else 1.0

    # --- Evaluate standard search ---
    print("\nEvaluating standard search...")
    results_std = [None] * actual_num_queries
    times_std = []
    total_r_std = 0.0
    tstart_query_std = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="Standard query")):
        ts = time.perf_counter()
        results_std[i] = idx_std.search(q, k)
        te = time.perf_counter()
        times_std.append(te - ts)
        total_r_std += recall_at_k(gt_i[i], results_std[i], k)
    tend_query_std = time.perf_counter()
    tot_time_std = tend_query_std - tstart_query_std
    avg_time_std = np.mean(times_std) * 1000 if times_std else 0
    recall_std = total_r_std / actual_num_queries if actual_num_queries > 0 else 0.0

    # --- Evaluate PCA-optimized search ---
    print("\nEvaluating GLOBAL PCA-optimized search...")
    results_pca = [None] * actual_num_queries
    times_pca = []
    total_r_pca = 0.0
    avg_time_pca = 0.0
    tot_time_pca = 0.0

    # Only run if PCA was successfully finalized
    if idx_pca.pca_yes and idx_pca.global_pca_model is not None:
        tstart_query_pca = time.perf_counter()
        for i, q in enumerate(tqdm(queries, desc="Global PCA-opt query")):
            ts = time.perf_counter()
            results_pca[i] = idx_pca.search_pca_opt(q, k) # Call is simpler now
            te = time.perf_counter()
            times_pca.append(te - ts)
            total_r_pca += recall_at_k(gt_i[i], results_pca[i], k)
        tend_query_pca = time.perf_counter()
        tot_time_pca = tend_query_pca - tstart_query_pca
        avg_time_pca = np.mean(times_pca) * 1000 if times_pca else 0
        recall_pca = total_r_pca / actual_num_queries if actual_num_queries > 0 else 0.0
    else:
        print("Skipping GLOBAL PCA-optimized search evaluation as PCA was not successfully finalized.")
        recall_pca = 0.0

    # --- Output results ---
    print("\n--- Results ---")
    print(f"Parameters: M={M}, ef_construction={ef_construction}, k={k}")
    if idx_pca.pca_yes and idx_pca.global_pca_model:
        print(f"PCA Parameters: GLOBAL PCA, components={pca_components}, logic active for layers: {sorted(list(idx_pca.pca_logic_layers)) if idx_pca.pca_logic_layers else 'None Dense Enough'}")
    else: print("PCA Parameters: PCA Disabled or Failed")
    print("\nBuild Times:")
    print(f"Standard Build: {build_std:.2f}s")
    print(f"PCA Build (Index): {build_pca:.2f}s")
    print(f"PCA Build (Finalize Global): {fit_pca:.2f}s") # This might take longer now
    print(f"Total PCA Build: {build_pca + fit_pca:.2f}s")
    print(f"\nStandard: recall@{k}={recall_std:.4f}, total_query_time={tot_time_std:.2f}s, avg_query_time={avg_time_std:.2f}ms")
    if idx_pca.pca_yes and idx_pca.global_pca_model:
        print(f"PCA-opt:  recall@{k}={recall_pca:.4f}, total_query_time={tot_time_pca:.2f}s, avg_query_time={avg_time_pca:.2f}ms")
    else:
        print("PCA-opt:  Not evaluated.")

    print("\n--- Comparison Summary ---")
    print(f"Method         | Recall@{k} | TotalQuery(s) | AvgQuery(ms)")
    print(f"---------------|------------|---------------|-------------")
    print(f"Standard       | {recall_std:<10.4f} | {tot_time_std:<13.2f} | {avg_time_std:<11.2f}")
    if idx_pca.pca_yes and idx_pca.global_pca_model:
        print(f"PCA-optimized  | {recall_pca:<10.4f} | {tot_time_pca:<13.2f} | {avg_time_pca:<11.2f}")
    else:
        print("PCA-optimized  | N/A        | N/A           | N/A        ")

Dataset: 400000 vectors, 100 dimensions
Parameters: M=26, ef_construction=200, k=100
PCA Settings: GLOBAL PCA, components=32 (logic active layers >= 1 if possible)

Selecting query data...
Using 5000 queries.

Computing ground truth...


INFO:__main__:Initialized HNSW (dim=100, PCA=off, PCA components=N/A)


Ground truth computed in 1.66s

Building standard HNSW index (no PCA)...


Indexing standard HNSW: 100%|██████████| 400000/400000 [40:14<00:00, 165.64it/s] 
INFO:__main__:Initialized HNSW (dim=100, PCA=on (GLOBAL, layers >= 1), PCA components=32)


Standard build time: 2414.87s
Standard index height: 4

Building PCA-enabled HNSW index...


Indexing PCA-enabled HNSW: 100%|██████████| 400000/400000 [37:51<00:00, 176.09it/s]
INFO:__main__:Fitting GLOBAL PCA (components=32) on all 400000 vectors...


PCA-enabled index build time (before finalize): 2271.61s
PCA-enabled index height: 4

Finalizing GLOBAL PCA model...


INFO:__main__:Global PCA model fitted.
INFO:__main__:Transforming all vectors using global PCA model...
INFO:__main__:Stored global reduced vectors with shape (400000, 32).
INFO:__main__:Global PCA finalization complete in 0.72s. PCA logic active for layers: [1, 2]


PCA finalize time: 0.73s

Evaluating standard search...


Standard query: 100%|██████████| 5000/5000 [00:10<00:00, 498.04it/s]



Evaluating GLOBAL PCA-optimized search...


Global PCA-opt query: 100%|██████████| 5000/5000 [00:25<00:00, 198.13it/s]


--- Results ---
Parameters: M=26, ef_construction=200, k=100
PCA Parameters: GLOBAL PCA, components=32, logic active for layers: [1, 2]

Build Times:
Standard Build: 2414.87s
PCA Build (Index): 2271.61s
PCA Build (Finalize Global): 0.73s
Total PCA Build: 2272.33s

Standard: recall@100=0.0174, total_query_time=10.04s, avg_query_time=1.98ms
PCA-opt:  recall@100=0.4835, total_query_time=25.24s, avg_query_time=5.00ms

--- Comparison Summary ---
Method         | Recall@100 | TotalQuery(s) | AvgQuery(ms)
---------------|------------|---------------|-------------
Standard       | 0.0174     | 10.04         | 1.98       
PCA-optimized  | 0.4835     | 25.24         | 5.00       





In [9]:
print("Height of HNSW graph:", idx_pca.entry_point_level)

Height of HNSW graph: 4
