In [1]:
import numpy as np
import time
import logging
import random
import math
from heapq import heappush, heappop
from sklearn.decomposition import PCA # Kept for potential future use, but not active in this comparison
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from tqdm import tqdm # For progress bars

In [3]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
glove_path = 'C:\Sem 4\Data_Science\project\glove.6B\glove.6B.100d.txt' 

# Load GloVe vectors
word_to_vec = {}
words = []
vectors = []

try:
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]  # First token is the word
            try:
                vector = np.array(values[1:], dtype=np.float32)  # Rest are vector values
                if vector.shape[0] == 100: # Ensure correct dimension
                     word_to_vec[word] = vector
                     words.append(word)
                     vectors.append(vector)
                else:
                     logger.warning(f"Skipping word '{word}': incorrect vector dimension {vector.shape[0]} (expected 100)")
            except ValueError:
                logger.warning(f"Skipping word '{word}': could not parse vector values")

    # Convert to numpy array
    vectors = np.array(vectors, dtype=np.float32)
    if vectors.shape[0] > 0:
        print(f"Loaded {len(words)} word vectors of dimension {vectors.shape[1]}")
    else:
        print("Error: No vectors loaded. Check GloVe path and file format.")
        # Handle error appropriately, maybe exit or raise exception
        raise FileNotFoundError(f"Could not load sufficient vectors from {glove_path}")

except FileNotFoundError:
    print(f"Error: GloVe file not found at {glove_path}")
    print("Please download glove.6B.100d.txt and update the 'glove_path' variable.")
    # Simulate dummy data if GloVe is not found, for testing the structure
    print("Using dummy data for demonstration purposes.")
    dim_sim = 100
    num_vectors_sim = 10000 # Use a smaller number for dummy data
    vectors = np.random.rand(num_vectors_sim, dim_sim).astype(np.float32)
    words = [f"word_{i}" for i in range(num_vectors_sim)]
    print(f"Generated {num_vectors_sim} dummy vectors of dimension {dim_sim}")


Loaded 400000 word vectors of dimension 100


# HNSW class

In [4]:
class HNSW_SCC:
    """
    HNSW implementation with optional SCC-Aware neighbor selection heuristic
    applied during construction in Layer 0.
    """
    def __init__(self, dim, max_elements, M=16, ef_construction=200,
                 scc_aware_yes=False, scc_alpha=0.01): # SCC-Aware params
        self.dim = dim
        self.max_elements = max_elements
        self.M = M
        self.ef_construction = ef_construction

        # --- Modification Flags ---
        self.scc_aware_yes = scc_aware_yes # Flag for SCC-Aware heuristic

        # --- SCC Parameters ---
        self.scc_alpha = scc_alpha # Weight for triangle bonus in pruning

        # --- Core HNSW Data Structures ---
        self.vectors = []
        self.layers = []
        self.entry_point = None
        self.element_count = 0

        mod_info = []
        if self.scc_aware_yes:
            mod_info.append(f"SCC-Aware (alpha={scc_alpha})")
        else:
            mod_info.append("Standard")

        logger.info(f"Initialized HNSW (dim={dim}, M={M}, efC={ef_construction}, Modifications: {', '.join(mod_info)})")

    def _get_layer(self):
        # Calculates the layer for a new node based on HNSW probability distribution
        ml = 1 / math.log(self.M) if self.M > 1 else 1
        level = int(-math.log(random.random()) * ml) if self.M > 1 else 0
        return max(0, level)

    def _distance(self, idx1, idx2):
        # Calculates Euclidean distance between two vectors using their indices
        if idx1 < len(self.vectors) and idx2 < len(self.vectors):
            return np.linalg.norm(self.vectors[idx1] - self.vectors[idx2])
        else:
            logger.error(f"Invalid index encountered in _distance: idx1={idx1}, idx2={idx2}, vector count={len(self.vectors)}")
            return float('inf')


    def _search_layer_standard(self, query_vec, layer_idx, ep_idx, ef):
        """Standard full-dimensional search on one layer."""
        # Standard HNSW greedy search within a single layer
        graph = self.layers[layer_idx]
        if not graph: # Empty layer
            return []
        # Ensure entry point is valid and exists in the graph for this layer
        if ep_idx is None or ep_idx >= len(self.vectors) or ep_idx not in graph:
             if graph: # Pick a random node from the layer as entry point if invalid
                ep_idx = random.choice(list(graph.keys()))
             else: return [] # Should not happen if graph is not empty

        q_vec_np = np.asarray(query_vec)
        ep_vec = self.vectors[ep_idx]
        init_dist = np.linalg.norm(q_vec_np - ep_vec)

        visited = {ep_idx}
        candidates = [(init_dist, ep_idx)] # Min-heap (distance, node_idx)
        results = [(-init_dist, ep_idx)] # Max-heap (-distance, node_idx)

        while candidates:
            dist, cur_idx = heappop(candidates)
            farthest_dist_neg, _ = results[0]

            if dist > -farthest_dist_neg and len(results) >= ef: break

            for nb_idx in graph.get(cur_idx, []): # Use .get for safety
                if nb_idx not in visited:
                    visited.add(nb_idx)
                    # Check if neighbor index is valid before accessing vector
                    if nb_idx >= len(self.vectors):
                         logger.warning(f"Neighbor index {nb_idx} out of bounds (max: {len(self.vectors)-1}). Skipping.")
                         continue
                    nb_vec = self.vectors[nb_idx]
                    d = np.linalg.norm(q_vec_np - nb_vec)

                    farthest_dist_neg, _ = results[0]
                    if d < -farthest_dist_neg or len(results) < ef:
                        heappush(results, (-d, nb_idx))
                        if len(results) > ef: heappop(results)
                        heappush(candidates, (d, nb_idx))

        return sorted([(-d, idx) for d, idx in results])[:ef]

    def _apply_scc_aware_pruning(self, nb_idx, layer_idx):
        """
        Applies SCC-aware pruning heuristic to the neighbors of nb_idx in layer_idx.
        Only applied if self.scc_aware_yes is True and layer_idx is 0.
        """
        graph = self.layers[layer_idx]
        current_neighbors = list(graph.get(nb_idx, []))
        num_neighbors = len(current_neighbors)

        if num_neighbors <= self.M: return # No pruning needed

        adjusted_scores = []
        neighbor_set = set(current_neighbors) # Faster lookups

        for c_idx in current_neighbors:
             # Ensure candidate index is valid
             if c_idx >= len(self.vectors): continue
             distance = self._distance(nb_idx, c_idx)
             triangle_count = 0
             # Check connections between c_idx and other valid neighbors of nb_idx
             c_neighbors_in_layer = set(graph.get(c_idx, []))
             for other_c_idx in current_neighbors:
                 # Ensure other candidate index is valid and different
                  if c_idx != other_c_idx and other_c_idx < len(self.vectors) and other_c_idx in c_neighbors_in_layer:
                       triangle_count += 1

             # Score: Lower is better. Subtract bonus for triangles.
             adjusted_score = distance - (self.scc_alpha * triangle_count)
             adjusted_scores.append((adjusted_score, c_idx))

        # Sort by adjusted score (ascending)
        adjusted_scores.sort()

        # Keep the top M neighbors based on the adjusted score
        graph[nb_idx] = [c for score, c in adjusted_scores[:self.M]]


    def insert(self, vector):
        """Inserts a vector into the HNSW index."""
        vec = np.asarray(vector)
        idx = self.element_count
        if idx >= self.max_elements:
            raise MemoryError(f"Index full (max_elements={self.max_elements} reached)")

        # Append vector before incrementing element_count if index is based on it
        if idx == len(self.vectors): self.vectors.append(vec)
        elif idx < len(self.vectors): self.vectors[idx] = vec # Overwrite pre-allocated? Unlikely for simple append.
        else: raise IndexError("Inconsistent vector index during insertion")

        self.element_count += 1
        level = self._get_layer()

        while len(self.layers) <= level: self.layers.append({})

        ep = self.entry_point
        current_top_layer = len(self.layers) - 1

        # Phase 1: Find entry points in upper layers
        for l in range(current_top_layer, level, -1):
             if ep is None or not self.layers[l]: continue
             res = self._search_layer_standard(vec, l, ep, ef=1)
             if res: ep = res[0][1]

        # Phase 2: Insert node layer by layer from `level` down to 0
        for l in range(min(level, current_top_layer), -1, -1):
            graph = self.layers[l]
            neighbors = []
            layer_ep = ep
            if layer_ep is None or layer_ep >= len(self.vectors) or layer_ep not in graph:
                 if graph: layer_ep = random.choice(list(graph.keys()))
                 else: layer_ep = None # Layer is empty

            if layer_ep is not None:
                 neighbors = self._search_layer_standard(vec, l, layer_ep, self.ef_construction)
                 if neighbors: ep = neighbors[0][1] # Update overall entry point

            selected_connections = [neighbor_idx for _, neighbor_idx in neighbors[:self.M]]
            graph[idx] = selected_connections # Set new node's neighbors

            for nb_idx in selected_connections:
                 # Ensure neighbor exists and is valid before adding back-link
                 if nb_idx >= len(self.vectors): continue # Skip invalid neighbor index
                 graph.setdefault(nb_idx, []).append(idx) # Add back-link

                 # --- Pruning Step (Potentially Modified) ---
                 if len(graph[nb_idx]) > self.M:
                      if self.scc_aware_yes and l == 0:
                           # Apply SCC-Aware pruning only on Layer 0
                           self._apply_scc_aware_pruning(nb_idx, l)
                      else:
                           # Standard HNSW pruning (distance-based)
                           dists = [(self._distance(nb_idx, c), c) for c in graph[nb_idx] if c < len(self.vectors)] # Ensure valid indices
                           dists.sort()
                           graph[nb_idx] = [c for _, c in dists[:self.M]]

        # Update the global entry point
        if self.entry_point is None or level > self._get_node_layer(self.entry_point):
            self.entry_point = idx

    def search(self, query_vec, k=10, ef_search=None):
        """Performs KNN search using the standard HNSW algorithm."""
        ef = max(self.ef_construction if ef_search is None else ef_search, k)
        ep = self.entry_point
        if ep is None or ep >= len(self.vectors): return [] # No elements or invalid entry point

        q = np.asarray(query_vec)
        current_top_layer = len(self.layers) - 1

        # Phase 1: Search upper layers to find entry point for Layer 0
        for l in range(current_top_layer, 0, -1):
             if not self.layers[l]: continue
             res = self._search_layer_standard(q, l, ep, ef=1)
             if res: ep = res[0][1]
             # Ensure ep remains valid after update
             if ep >= len(self.vectors):
                 logger.error(f"Entry point became invalid ({ep}) during top-down search at layer {l}.")
                 # Fallback: maybe try finding a random node in layer 0 if possible?
                 # Or simply return empty list as search cannot reliably continue.
                 return []


        # Phase 2: Perform search on Layer 0
        res = self._search_layer_standard(q, 0, ep, ef)
        return [idx for _, idx in res[:k]] # Return top K results

    def _get_node_layer(self, node_idx):
        """Finds the highest layer a node exists in."""
        if node_idx is None or node_idx >= len(self.vectors): return -1 # Handle invalid index
        for l in range(len(self.layers) - 1, -1, -1):
            if node_idx in self.layers[l]: return l
        return -1

# Testing

In [5]:
if __name__ == "__main__":
    # --- Parameters ---
    num_vectors = min(50_000, len(vectors)) # Use a smaller subset or max available
    num_queries = min(1_000, num_vectors // 10)
    if num_vectors <= 0:
         print("Cannot proceed without data. Exiting.")
         exit()

    # Randomly sample data and queries if using a subset
    if num_vectors < len(vectors):
        data_indices = np.random.choice(len(vectors), num_vectors, replace=False)
        data = vectors[data_indices]
        query_indices = np.random.choice(num_vectors, num_queries, replace=False)
        queries = data[query_indices]
        print(f"Using a subset of {num_vectors} vectors and {num_queries} queries.")
    else:
        data = vectors
        query_indices = np.random.choice(num_vectors, num_queries, replace=False)
        queries = data[query_indices]
        print(f"Using the full dataset of {num_vectors} vectors and {num_queries} queries.")


    dim = data.shape[1]
    k = 10 # Number of neighbors to retrieve
    M = 16 # Max connections per node per layer
    ef_construction = 100 # Candidate list size during construction
    ef_search = 100 # Candidate list size during search

    # SCC-Aware parameter
    scc_heuristic_alpha = 0.01 # <-- ** Role Explained Below **

    # --- Compute Ground Truth ---
    print("Computing ground truth...")
    t_gt_start = time.perf_counter()
    nn_brute = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
    nn_brute.fit(data)
    gt_d, gt_i = nn_brute.kneighbors(queries)
    t_gt_end = time.perf_counter()
    print(f"Ground truth computed in {t_gt_end - t_gt_start:.2f}s")

    # --- Build Standard Index ---
    print("\nBuilding standard HNSW index (scc_aware_yes=False)...")
    idx_std = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=False  # Standard HNSW
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing standard HNSW"):
        idx_std.insert(v)
    build_std_time = time.perf_counter() - t0
    print(f"Standard build time: {build_std_time:.2f}s")

    # --- Build SCC-Aware Index ---
    print("\nBuilding SCC-Aware HNSW index (scc_aware_yes=True)...")
    idx_scc = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=True,  # Enable SCC-Aware heuristic
        scc_alpha=scc_heuristic_alpha
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing SCC-Aware HNSW"):
        idx_scc.insert(v)
    build_scc_time = time.perf_counter() - t0
    print(f"SCC-Aware build time: {build_scc_time:.2f}s")


    # --- Evaluate Standard Search ---
    print("\nEvaluating standard search...")
    total_recall_std = 0
    times_std = []
    results_std = []
    t_eval_std_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="Standard query")):
        ts = time.perf_counter()
        out_indices = idx_std.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_std.append(te - ts)
        if k > 0: # Avoid division by zero if k=0
             recall = len(set(gt_i[i]) & set(out_indices)) / k
             total_recall_std += recall
        results_std.append(out_indices)
    t_eval_std_end = time.perf_counter()

    avg_time_std_ms = np.mean(times_std) * 1000 if times_std else 0
    recall_std = total_recall_std / num_queries if num_queries > 0 else 0
    total_query_time_std = t_eval_std_end - t_eval_std_start

    # --- Evaluate SCC-Aware Search ---
    print("\nEvaluating SCC-Aware search...")
    total_recall_scc = 0
    times_scc = []
    results_scc = []
    t_eval_scc_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="SCC-Aware query")):
        ts = time.perf_counter()
        out_indices = idx_scc.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_scc.append(te - ts)
        if k > 0: # Avoid division by zero if k=0
            recall = len(set(gt_i[i]) & set(out_indices)) / k
            total_recall_scc += recall
        results_scc.append(out_indices)
    t_eval_scc_end = time.perf_counter()

    avg_time_scc_ms = np.mean(times_scc) * 1000 if times_scc else 0
    recall_scc = total_recall_scc / num_queries if num_queries > 0 else 0
    total_query_time_scc = t_eval_scc_end - t_eval_scc_start

    # --- Output Results ---
    print("\n--- Comparison Summary ---")
    print(f"Parameters: k={k}, M={M}, efConstruction={ef_construction}, efSearch={ef_search}, SCC_Alpha={scc_heuristic_alpha if idx_scc.scc_aware_yes else 'N/A'}")
    print(f"Dataset size: {num_vectors} vectors, {num_queries} queries")
    print("-" * 70)
    print(f"{'Method':<16} | {'Build Time (s)':<15} | {'Recall@' + str(k):<10} | {'Total Query (s)':<15} | {'Avg Query (ms)':<15}")
    print("-" * 70)
    print(f"{'Standard HNSW':<16} | {build_std_time:<15.2f} | {recall_std:<10.4f} | {total_query_time_std:<15.2f} | {avg_time_std_ms:<15.2f}")
    print(f"{'SCC-Aware HNSW':<16} | {build_scc_time:<15.2f} | {recall_scc:<10.4f} | {total_query_time_scc:<15.2f} | {avg_time_scc_ms:<15.2f}")
    print("-" * 70)

    # Optional: Calculate speedup/recall difference
    if avg_time_std_ms > 0 and avg_time_scc_ms > 0:
         speedup = avg_time_std_ms / avg_time_scc_ms
         print(f"Avg Query Time Speedup (SCC vs Std): {speedup:.2f}x")
    recall_diff = recall_scc - recall_std
    print(f"Recall Difference (SCC - Std): {recall_diff:+.4f}")

# %%

Using a subset of 50000 vectors and 1000 queries.
Computing ground truth...


INFO:__main__:Initialized HNSW (dim=100, M=16, efC=100, Modifications: Standard)


Ground truth computed in 0.36s

Building standard HNSW index (scc_aware_yes=False)...


Indexing standard HNSW: 100%|██████████| 50000/50000 [05:51<00:00, 142.33it/s]
INFO:__main__:Initialized HNSW (dim=100, M=16, efC=100, Modifications: SCC-Aware (alpha=0.01))


Standard build time: 351.32s

Building SCC-Aware HNSW index (scc_aware_yes=True)...


Indexing SCC-Aware HNSW: 100%|██████████| 50000/50000 [07:55<00:00, 105.07it/s]


SCC-Aware build time: 475.88s

Evaluating standard search...


Standard query: 100%|██████████| 1000/1000 [00:05<00:00, 193.53it/s]



Evaluating SCC-Aware search...


SCC-Aware query: 100%|██████████| 1000/1000 [00:05<00:00, 188.88it/s]


--- Comparison Summary ---
Parameters: k=10, M=16, efConstruction=100, efSearch=100, SCC_Alpha=0.01
Dataset size: 50000 vectors, 1000 queries
----------------------------------------------------------------------
Method           | Build Time (s)  | Recall@10  | Total Query (s) | Avg Query (ms) 
----------------------------------------------------------------------
Standard HNSW    | 351.32          | 0.5204     | 5.16            | 5.07           
SCC-Aware HNSW   | 475.88          | 0.5224     | 5.30            | 5.21           
----------------------------------------------------------------------
Avg Query Time Speedup (SCC vs Std): 0.97x
Recall Difference (SCC - Std): +0.0020





In [6]:
scc_heuristic_alpha = 0.1

In [7]:
if __name__ == "__main__":    
    print("\nBuilding SCC-Aware HNSW index (scc_aware_yes=True)...")
    idx_scc = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=True,  # Enable SCC-Aware heuristic
        scc_alpha=scc_heuristic_alpha
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing SCC-Aware HNSW"):
        idx_scc.insert(v)
    build_scc_time = time.perf_counter() - t0
    print(f"SCC-Aware build time: {build_scc_time:.2f}s")


    # --- Evaluate Standard Search ---
    print("\nEvaluating standard search...")
    total_recall_std = 0
    times_std = []
    results_std = []
    t_eval_std_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="Standard query")):
        ts = time.perf_counter()
        out_indices = idx_std.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_std.append(te - ts)
        if k > 0: # Avoid division by zero if k=0
             recall = len(set(gt_i[i]) & set(out_indices)) / k
             total_recall_std += recall
        results_std.append(out_indices)
    t_eval_std_end = time.perf_counter()

    avg_time_std_ms = np.mean(times_std) * 1000 if times_std else 0
    recall_std = total_recall_std / num_queries if num_queries > 0 else 0
    total_query_time_std = t_eval_std_end - t_eval_std_start

    # --- Evaluate SCC-Aware Search ---
    print("\nEvaluating SCC-Aware search...")
    total_recall_scc = 0
    times_scc = []
    results_scc = []
    t_eval_scc_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="SCC-Aware query")):
        ts = time.perf_counter()
        out_indices = idx_scc.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_scc.append(te - ts)
        if k > 0: # Avoid division by zero if k=0
            recall = len(set(gt_i[i]) & set(out_indices)) / k
            total_recall_scc += recall
        results_scc.append(out_indices)
    t_eval_scc_end = time.perf_counter()

    avg_time_scc_ms = np.mean(times_scc) * 1000 if times_scc else 0
    recall_scc = total_recall_scc / num_queries if num_queries > 0 else 0
    total_query_time_scc = t_eval_scc_end - t_eval_scc_start

    # --- Output Results ---
    print("\n--- Comparison Summary ---")
    print(f"Parameters: k={k}, M={M}, efConstruction={ef_construction}, efSearch={ef_search}, SCC_Alpha={scc_heuristic_alpha if idx_scc.scc_aware_yes else 'N/A'}")
    print(f"Dataset size: {num_vectors} vectors, {num_queries} queries")
    print("-" * 70)
    print(f"{'Method':<16} | {'Build Time (s)':<15} | {'Recall@' + str(k):<10} | {'Total Query (s)':<15} | {'Avg Query (ms)':<15}")
    print("-" * 70)
    print(f"{'Standard HNSW':<16} | {build_std_time:<15.2f} | {recall_std:<10.4f} | {total_query_time_std:<15.2f} | {avg_time_std_ms:<15.2f}")
    print(f"{'SCC-Aware HNSW':<16} | {build_scc_time:<15.2f} | {recall_scc:<10.4f} | {total_query_time_scc:<15.2f} | {avg_time_scc_ms:<15.2f}")
    print("-" * 70)

    # Optional: Calculate speedup/recall difference
    if avg_time_std_ms > 0 and avg_time_scc_ms > 0:
         speedup = avg_time_std_ms / avg_time_scc_ms
         print(f"Avg Query Time Speedup (SCC vs Std): {speedup:.2f}x")
    recall_diff = recall_scc - recall_std
    print(f"Recall Difference (SCC - Std): {recall_diff:+.4f}")

INFO:__main__:Initialized HNSW (dim=100, M=16, efC=100, Modifications: SCC-Aware (alpha=0.1))



Building SCC-Aware HNSW index (scc_aware_yes=True)...


Indexing SCC-Aware HNSW: 100%|██████████| 50000/50000 [08:25<00:00, 98.83it/s] 


SCC-Aware build time: 505.92s

Evaluating standard search...


Standard query: 100%|██████████| 1000/1000 [00:05<00:00, 177.66it/s]



Evaluating SCC-Aware search...


SCC-Aware query: 100%|██████████| 1000/1000 [00:06<00:00, 157.50it/s]


--- Comparison Summary ---
Parameters: k=10, M=16, efConstruction=100, efSearch=100, SCC_Alpha=0.1
Dataset size: 50000 vectors, 1000 queries
----------------------------------------------------------------------
Method           | Build Time (s)  | Recall@10  | Total Query (s) | Avg Query (ms) 
----------------------------------------------------------------------
Standard HNSW    | 351.32          | 0.5204     | 5.63            | 5.53           
SCC-Aware HNSW   | 505.92          | 0.5889     | 6.36            | 6.25           
----------------------------------------------------------------------
Avg Query Time Speedup (SCC vs Std): 0.88x
Recall Difference (SCC - Std): +0.0685





In [8]:
scc_heuristic_alpha = 0.5

In [9]:
if __name__ == "__main__":    
    print("\nBuilding SCC-Aware HNSW index (scc_aware_yes=True)...")
    idx_scc = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=True,  # Enable SCC-Aware heuristic
        scc_alpha=scc_heuristic_alpha
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing SCC-Aware HNSW"):
        idx_scc.insert(v)
    build_scc_time = time.perf_counter() - t0
    print(f"SCC-Aware build time: {build_scc_time:.2f}s")


    # --- Evaluate Standard Search ---
    print("\nEvaluating standard search...")
    total_recall_std = 0
    times_std = []
    results_std = []
    t_eval_std_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="Standard query")):
        ts = time.perf_counter()
        out_indices = idx_std.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_std.append(te - ts)
        if k > 0: # Avoid division by zero if k=0
             recall = len(set(gt_i[i]) & set(out_indices)) / k
             total_recall_std += recall
        results_std.append(out_indices)
    t_eval_std_end = time.perf_counter()

    avg_time_std_ms = np.mean(times_std) * 1000 if times_std else 0
    recall_std = total_recall_std / num_queries if num_queries > 0 else 0
    total_query_time_std = t_eval_std_end - t_eval_std_start

    # --- Evaluate SCC-Aware Search ---
    print("\nEvaluating SCC-Aware search...")
    total_recall_scc = 0
    times_scc = []
    results_scc = []
    t_eval_scc_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="SCC-Aware query")):
        ts = time.perf_counter()
        out_indices = idx_scc.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_scc.append(te - ts)
        if k > 0: # Avoid division by zero if k=0
            recall = len(set(gt_i[i]) & set(out_indices)) / k
            total_recall_scc += recall
        results_scc.append(out_indices)
    t_eval_scc_end = time.perf_counter()

    avg_time_scc_ms = np.mean(times_scc) * 1000 if times_scc else 0
    recall_scc = total_recall_scc / num_queries if num_queries > 0 else 0
    total_query_time_scc = t_eval_scc_end - t_eval_scc_start

    # --- Output Results ---
    print("\n--- Comparison Summary ---")
    print(f"Parameters: k={k}, M={M}, efConstruction={ef_construction}, efSearch={ef_search}, SCC_Alpha={scc_heuristic_alpha if idx_scc.scc_aware_yes else 'N/A'}")
    print(f"Dataset size: {num_vectors} vectors, {num_queries} queries")
    print("-" * 70)
    print(f"{'Method':<16} | {'Build Time (s)':<15} | {'Recall@' + str(k):<10} | {'Total Query (s)':<15} | {'Avg Query (ms)':<15}")
    print("-" * 70)
    print(f"{'Standard HNSW':<16} | {build_std_time:<15.2f} | {recall_std:<10.4f} | {total_query_time_std:<15.2f} | {avg_time_std_ms:<15.2f}")
    print(f"{'SCC-Aware HNSW':<16} | {build_scc_time:<15.2f} | {recall_scc:<10.4f} | {total_query_time_scc:<15.2f} | {avg_time_scc_ms:<15.2f}")
    print("-" * 70)

    # Optional: Calculate speedup/recall difference
    if avg_time_std_ms > 0 and avg_time_scc_ms > 0:
         speedup = avg_time_std_ms / avg_time_scc_ms
         print(f"Avg Query Time Speedup (SCC vs Std): {speedup:.2f}x")
    recall_diff = recall_scc - recall_std
    print(f"Recall Difference (SCC - Std): {recall_diff:+.4f}")

INFO:__main__:Initialized HNSW (dim=100, M=16, efC=100, Modifications: SCC-Aware (alpha=0.5))



Building SCC-Aware HNSW index (scc_aware_yes=True)...


Indexing SCC-Aware HNSW: 100%|██████████| 50000/50000 [09:19<00:00, 89.42it/s] 


SCC-Aware build time: 559.15s

Evaluating standard search...


Standard query: 100%|██████████| 1000/1000 [00:05<00:00, 181.50it/s]



Evaluating SCC-Aware search...


SCC-Aware query: 100%|██████████| 1000/1000 [00:07<00:00, 128.74it/s]


--- Comparison Summary ---
Parameters: k=10, M=16, efConstruction=100, efSearch=100, SCC_Alpha=0.5
Dataset size: 50000 vectors, 1000 queries
----------------------------------------------------------------------
Method           | Build Time (s)  | Recall@10  | Total Query (s) | Avg Query (ms) 
----------------------------------------------------------------------
Standard HNSW    | 351.32          | 0.5204     | 5.52            | 5.42           
SCC-Aware HNSW   | 559.15          | 0.6832     | 7.78            | 7.66           
----------------------------------------------------------------------
Avg Query Time Speedup (SCC vs Std): 0.71x
Recall Difference (SCC - Std): +0.1628





In [12]:
if __name__ == "__main__":
    # --- Parameters ---
    num_vectors = min(50_000, len(vectors)) # Use a smaller subset or max available
    num_queries = min(1_000, num_vectors // 10)
    if num_vectors <= 0:
         print("Cannot proceed without data. Exiting.")
         exit()

    # Randomly sample data and queries if using a subset
    if num_vectors < len(vectors):
        data_indices = np.random.choice(len(vectors), num_vectors, replace=False)
        data = vectors[data_indices]
        query_indices = np.random.choice(num_vectors, num_queries, replace=False)
        queries = data[query_indices]
        print(f"Using a subset of {num_vectors} vectors and {num_queries} queries.")
    else:
        data = vectors
        query_indices = np.random.choice(num_vectors, num_queries, replace=False)
        queries = data[query_indices]
        print(f"Using the full dataset of {num_vectors} vectors and {num_queries} queries.")


    dim = data.shape[1]
    k = 10 # Number of neighbors to retrieve
    M = 16 # Max connections per node per layer
    ef_construction = 200 # Candidate list size during construction
    ef_search = 100 # Candidate list size during search

    # SCC-Aware parameter
    scc_heuristic_alpha = 0.5 # <-- ** Role Explained Below **

    # --- Compute Ground Truth ---
    print("Computing ground truth...")
    t_gt_start = time.perf_counter()
    nn_brute = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
    nn_brute.fit(data)
    gt_d, gt_i = nn_brute.kneighbors(queries)
    t_gt_end = time.perf_counter()
    print(f"Ground truth computed in {t_gt_end - t_gt_start:.2f}s")

    # --- Build Standard Index ---
    print("\nBuilding standard HNSW index (scc_aware_yes=False)...")
    idx_std = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=False  # Standard HNSW
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing standard HNSW"):
        idx_std.insert(v)
    build_std_time = time.perf_counter() - t0
    print(f"Standard build time: {build_std_time:.2f}s")

    # --- Build SCC-Aware Index ---
    print("\nBuilding SCC-Aware HNSW index (scc_aware_yes=True)...")
    idx_scc = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=True,  # Enable SCC-Aware heuristic
        scc_alpha=scc_heuristic_alpha
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing SCC-Aware HNSW"):
        idx_scc.insert(v)
    build_scc_time = time.perf_counter() - t0
    print(f"SCC-Aware build time: {build_scc_time:.2f}s")


    # --- Evaluate Standard Search ---
    print("\nEvaluating standard search...")
    total_recall_std = 0
    times_std = []
    results_std = []
    t_eval_std_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="Standard query")):
        ts = time.perf_counter()
        out_indices = idx_std.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_std.append(te - ts)
        if k > 0: # Avoid division by zero if k=0
             recall = len(set(gt_i[i]) & set(out_indices)) / k
             total_recall_std += recall
        results_std.append(out_indices)
    t_eval_std_end = time.perf_counter()

    avg_time_std_ms = np.mean(times_std) * 1000 if times_std else 0
    recall_std = total_recall_std / num_queries if num_queries > 0 else 0
    total_query_time_std = t_eval_std_end - t_eval_std_start

    # --- Evaluate SCC-Aware Search ---
    print("\nEvaluating SCC-Aware search...")
    total_recall_scc = 0
    times_scc = []
    results_scc = []
    t_eval_scc_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="SCC-Aware query")):
        ts = time.perf_counter()
        out_indices = idx_scc.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_scc.append(te - ts)
        if k > 0: # Avoid division by zero if k=0
            recall = len(set(gt_i[i]) & set(out_indices)) / k
            total_recall_scc += recall
        results_scc.append(out_indices)
    t_eval_scc_end = time.perf_counter()

    avg_time_scc_ms = np.mean(times_scc) * 1000 if times_scc else 0
    recall_scc = total_recall_scc / num_queries if num_queries > 0 else 0
    total_query_time_scc = t_eval_scc_end - t_eval_scc_start

    # --- Output Results ---
    print("\n--- Comparison Summary ---")
    print(f"Parameters: k={k}, M={M}, efConstruction={ef_construction}, efSearch={ef_search}, SCC_Alpha={scc_heuristic_alpha if idx_scc.scc_aware_yes else 'N/A'}")
    print(f"Dataset size: {num_vectors} vectors, {num_queries} queries")
    print("-" * 70)
    print(f"{'Method':<16} | {'Build Time (s)':<15} | {'Recall@' + str(k):<10} | {'Total Query (s)':<15} | {'Avg Query (ms)':<15}")
    print("-" * 70)
    print(f"{'Standard HNSW':<16} | {build_std_time:<15.2f} | {recall_std:<10.4f} | {total_query_time_std:<15.2f} | {avg_time_std_ms:<15.2f}")
    print(f"{'SCC-Aware HNSW':<16} | {build_scc_time:<15.2f} | {recall_scc:<10.4f} | {total_query_time_scc:<15.2f} | {avg_time_scc_ms:<15.2f}")
    print("-" * 70)

    # Optional: Calculate speedup/recall difference
    if avg_time_std_ms > 0 and avg_time_scc_ms > 0:
         speedup = avg_time_std_ms / avg_time_scc_ms
         print(f"Avg Query Time Speedup (SCC vs Std): {speedup:.2f}x")
    recall_diff = recall_scc - recall_std
    print(f"Recall Difference (SCC - Std): {recall_diff:+.4f}")

# %%

Using a subset of 50000 vectors and 1000 queries.
Computing ground truth...


INFO:__main__:Initialized HNSW (dim=100, M=16, efC=200, Modifications: Standard)


Ground truth computed in 0.23s

Building standard HNSW index (scc_aware_yes=False)...


Indexing standard HNSW: 100%|██████████| 50000/50000 [08:22<00:00, 99.40it/s] 
INFO:__main__:Initialized HNSW (dim=100, M=16, efC=200, Modifications: SCC-Aware (alpha=0.5))


Standard build time: 503.00s

Building SCC-Aware HNSW index (scc_aware_yes=True)...


Indexing SCC-Aware HNSW: 100%|██████████| 50000/50000 [14:00<00:00, 59.52it/s]


SCC-Aware build time: 840.12s

Evaluating standard search...


Standard query: 100%|██████████| 1000/1000 [00:05<00:00, 186.81it/s]



Evaluating SCC-Aware search...


SCC-Aware query: 100%|██████████| 1000/1000 [00:07<00:00, 137.91it/s]


--- Comparison Summary ---
Parameters: k=10, M=16, efConstruction=200, efSearch=100, SCC_Alpha=0.5
Dataset size: 50000 vectors, 1000 queries
----------------------------------------------------------------------
Method           | Build Time (s)  | Recall@10  | Total Query (s) | Avg Query (ms) 
----------------------------------------------------------------------
Standard HNSW    | 503.00          | 0.5763     | 5.36            | 5.26           
SCC-Aware HNSW   | 840.12          | 0.7361     | 7.25            | 7.14           
----------------------------------------------------------------------
Avg Query Time Speedup (SCC vs Std): 0.74x
Recall Difference (SCC - Std): +0.1598





In [13]:
if __name__ == "__main__":
    # --- Parameters ---
    num_vectors = min(50_000, len(vectors)) # Use a smaller subset or max available
    num_queries = min(1_000, num_vectors // 10)
    if num_vectors <= 0:
         print("Cannot proceed without data. Exiting.")
         exit()

    # Randomly sample data and queries if using a subset
    if num_vectors < len(vectors):
        data_indices = np.random.choice(len(vectors), num_vectors, replace=False)
        data = vectors[data_indices]
        query_indices = np.random.choice(num_vectors, num_queries, replace=False)
        queries = data[query_indices]
        print(f"Using a subset of {num_vectors} vectors and {num_queries} queries.")
    else:
        data = vectors
        query_indices = np.random.choice(num_vectors, num_queries, replace=False)
        queries = data[query_indices]
        print(f"Using the full dataset of {num_vectors} vectors and {num_queries} queries.")


    dim = data.shape[1]
    k = 10 # Number of neighbors to retrieve
    M = 16 # Max connections per node per layer
    ef_construction = 200 # Candidate list size during construction
    ef_search = 200 # Candidate list size during search

    # SCC-Aware parameter
    scc_heuristic_alpha = 0.5 # <-- ** Role Explained Below **

    # --- Compute Ground Truth ---
    print("Computing ground truth...")
    t_gt_start = time.perf_counter()
    nn_brute = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
    nn_brute.fit(data)
    gt_d, gt_i = nn_brute.kneighbors(queries)
    t_gt_end = time.perf_counter()
    print(f"Ground truth computed in {t_gt_end - t_gt_start:.2f}s")

    # --- Build Standard Index ---
    print("\nBuilding standard HNSW index (scc_aware_yes=False)...")
    idx_std = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=False  # Standard HNSW
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing standard HNSW"):
        idx_std.insert(v)
    build_std_time = time.perf_counter() - t0
    print(f"Standard build time: {build_std_time:.2f}s")

    # --- Build SCC-Aware Index ---
    print("\nBuilding SCC-Aware HNSW index (scc_aware_yes=True)...")
    idx_scc = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=True,  # Enable SCC-Aware heuristic
        scc_alpha=scc_heuristic_alpha
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing SCC-Aware HNSW"):
        idx_scc.insert(v)
    build_scc_time = time.perf_counter() - t0
    print(f"SCC-Aware build time: {build_scc_time:.2f}s")


    # --- Evaluate Standard Search ---
    print("\nEvaluating standard search...")
    total_recall_std = 0
    times_std = []
    results_std = []
    t_eval_std_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="Standard query")):
        ts = time.perf_counter()
        out_indices = idx_std.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_std.append(te - ts)
        if k > 0: # Avoid division by zero if k=0
             recall = len(set(gt_i[i]) & set(out_indices)) / k
             total_recall_std += recall
        results_std.append(out_indices)
    t_eval_std_end = time.perf_counter()

    avg_time_std_ms = np.mean(times_std) * 1000 if times_std else 0
    recall_std = total_recall_std / num_queries if num_queries > 0 else 0
    total_query_time_std = t_eval_std_end - t_eval_std_start

    # --- Evaluate SCC-Aware Search ---
    print("\nEvaluating SCC-Aware search...")
    total_recall_scc = 0
    times_scc = []
    results_scc = []
    t_eval_scc_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="SCC-Aware query")):
        ts = time.perf_counter()
        out_indices = idx_scc.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_scc.append(te - ts)
        if k > 0: # Avoid division by zero if k=0
            recall = len(set(gt_i[i]) & set(out_indices)) / k
            total_recall_scc += recall
        results_scc.append(out_indices)
    t_eval_scc_end = time.perf_counter()

    avg_time_scc_ms = np.mean(times_scc) * 1000 if times_scc else 0
    recall_scc = total_recall_scc / num_queries if num_queries > 0 else 0
    total_query_time_scc = t_eval_scc_end - t_eval_scc_start

    # --- Output Results ---
    print("\n--- Comparison Summary ---")
    print(f"Parameters: k={k}, M={M}, efConstruction={ef_construction}, efSearch={ef_search}, SCC_Alpha={scc_heuristic_alpha if idx_scc.scc_aware_yes else 'N/A'}")
    print(f"Dataset size: {num_vectors} vectors, {num_queries} queries")
    print("-" * 70)
    print(f"{'Method':<16} | {'Build Time (s)':<15} | {'Recall@' + str(k):<10} | {'Total Query (s)':<15} | {'Avg Query (ms)':<15}")
    print("-" * 70)
    print(f"{'Standard HNSW':<16} | {build_std_time:<15.2f} | {recall_std:<10.4f} | {total_query_time_std:<15.2f} | {avg_time_std_ms:<15.2f}")
    print(f"{'SCC-Aware HNSW':<16} | {build_scc_time:<15.2f} | {recall_scc:<10.4f} | {total_query_time_scc:<15.2f} | {avg_time_scc_ms:<15.2f}")
    print("-" * 70)

    # Optional: Calculate speedup/recall difference
    if avg_time_std_ms > 0 and avg_time_scc_ms > 0:
         speedup = avg_time_std_ms / avg_time_scc_ms
         print(f"Avg Query Time Speedup (SCC vs Std): {speedup:.2f}x")
    recall_diff = recall_scc - recall_std
    print(f"Recall Difference (SCC - Std): {recall_diff:+.4f}")

# %%

Using a subset of 50000 vectors and 1000 queries.
Computing ground truth...


INFO:__main__:Initialized HNSW (dim=100, M=16, efC=200, Modifications: Standard)


Ground truth computed in 0.35s

Building standard HNSW index (scc_aware_yes=False)...


Indexing standard HNSW: 100%|██████████| 50000/50000 [08:34<00:00, 97.22it/s] 
INFO:__main__:Initialized HNSW (dim=100, M=16, efC=200, Modifications: SCC-Aware (alpha=0.5))


Standard build time: 514.30s

Building SCC-Aware HNSW index (scc_aware_yes=True)...


Indexing SCC-Aware HNSW: 100%|██████████| 50000/50000 [10:10<00:00, 81.88it/s] 


SCC-Aware build time: 610.68s

Evaluating standard search...


Standard query: 100%|██████████| 1000/1000 [00:05<00:00, 193.64it/s]



Evaluating SCC-Aware search...


SCC-Aware query: 100%|██████████| 1000/1000 [00:07<00:00, 131.28it/s]


--- Comparison Summary ---
Parameters: k=10, M=16, efConstruction=200, efSearch=200, SCC_Alpha=0.5
Dataset size: 50000 vectors, 1000 queries
----------------------------------------------------------------------
Method           | Build Time (s)  | Recall@10  | Total Query (s) | Avg Query (ms) 
----------------------------------------------------------------------
Standard HNSW    | 514.30          | 0.6101     | 5.17            | 5.08           
SCC-Aware HNSW   | 610.68          | 0.8222     | 7.62            | 7.53           
----------------------------------------------------------------------
Avg Query Time Speedup (SCC vs Std): 0.68x
Recall Difference (SCC - Std): +0.2121





In [14]:
if __name__ == "__main__":
    # --- Parameters ---
    num_vectors = min(50_000, len(vectors)) # Use a smaller subset or max available
    num_queries = min(1_000, num_vectors // 10)
    if num_vectors <= 0:
        print("Cannot proceed without data. Exiting.")
        exit()

    # Randomly sample data and queries if using a subset
    if num_vectors < len(vectors):
        data_indices = np.random.choice(len(vectors), num_vectors, replace=False)
        data = vectors[data_indices]
        query_indices = np.random.choice(num_vectors, num_queries, replace=False)
        queries = data[query_indices]
        print(f"Using a subset of {num_vectors} vectors and {num_queries} queries.")
    else:
        data = vectors
        query_indices = np.random.choice(num_vectors, num_queries, replace=False)
        queries = data[query_indices]
        print(f"Using the full dataset of {num_vectors} vectors and {num_queries} queries.")


    dim = data.shape[1]
    k = 10 # Number of neighbors to retrieve
    M = 16 # Max connections per node per layer
    ef_construction = 200 # Candidate list size during construction, adjust as needed
    ef_search = 200 # Candidate list size during search, adjust as needed

    # SCC-Aware parameter (adjust this value to test different settings)
    scc_heuristic_alpha = 0.5 # Example value from your notebook

    # --- Compute Ground Truth ---
    print("Computing ground truth...")
    t_gt_start = time.perf_counter()
    nn_brute = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
    nn_brute.fit(data)
    gt_d, gt_i = nn_brute.kneighbors(queries)
    t_gt_end = time.perf_counter()
    print(f"Ground truth computed in {t_gt_end - t_gt_start:.2f}s")

    # --- Build Standard Index ---
    print("\nBuilding standard HNSW index (scc_aware_yes=False)...")
    idx_std = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=False # Standard HNSW
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing standard HNSW"):
        idx_std.insert(v)
    build_std_time = time.perf_counter() - t0
    print(f"Standard build time: {build_std_time:.2f}s")

    # --- Build SCC-Aware Index ---
    print("\nBuilding SCC-Aware HNSW index (scc_aware_yes=True)...")
    idx_scc = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=True, # Enable SCC-Aware heuristic
        scc_alpha=scc_heuristic_alpha
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing SCC-Aware HNSW"):
        idx_scc.insert(v)
    build_scc_time = time.perf_counter() - t0
    print(f"SCC-Aware build time: {build_scc_time:.2f}s")


    # --- Initialize lists for average distances ---
    all_avg_dists_std = [] # To store average distance per query for standard HNSW
    all_avg_dists_scc = [] # To store average distance per query for SCC-aware HNSW

    # --- Evaluate Standard Search ---
    print("\nEvaluating standard search...")
    total_recall_std = 0
    times_std = []
    results_std = []
    t_eval_std_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="Standard query")):
        ts = time.perf_counter()
        out_indices = idx_std.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_std.append(te - ts)

        # Recall calculation
        if k > 0: # Avoid division by zero if k=0
            recall = len(set(gt_i[i]) & set(out_indices)) / k
            total_recall_std += recall
        results_std.append(out_indices)

        # --- Calculate Average Distance for this query (Standard HNSW) ---
        query_distances_std = []
        if out_indices: # Check if any neighbors were found
            try:
                # Ensure indices are valid before accessing data
                valid_indices = [idx for idx in out_indices if idx < len(data)]
                if valid_indices:
                    neighbor_vectors = data[valid_indices] # Get vectors using valid indices
                    for neighbor_vec in neighbor_vectors:
                        dist = np.linalg.norm(q - neighbor_vec) # Calculate Euclidean distance
                        query_distances_std.append(dist)
            except IndexError as e:
                logger.error(f"Index error during std distance calculation for query {i}: {e}. Indices: {out_indices}")
                # Handle error, e.g., append NaN or skip

        # Store the average distance for *this* query
        avg_query_dist_std = np.mean(query_distances_std) if query_distances_std else np.nan
        all_avg_dists_std.append(avg_query_dist_std)
        # --- End Average Distance Calculation ---

    t_eval_std_end = time.perf_counter()

    avg_time_std_ms = np.mean(times_std) * 1000 if times_std else 0
    recall_std = total_recall_std / num_queries if num_queries > 0 else 0
    total_query_time_std = t_eval_std_end - t_eval_std_start
    # Calculate overall average distance for standard HNSW, ignoring NaNs
    overall_avg_dist_std = np.nanmean(all_avg_dists_std) if all_avg_dists_std else 0

    # --- Evaluate SCC-Aware Search ---
    print("\nEvaluating SCC-Aware search...")
    total_recall_scc = 0
    times_scc = []
    results_scc = []
    t_eval_scc_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="SCC-Aware query")):
        ts = time.perf_counter()
        out_indices = idx_scc.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_scc.append(te - ts)

        # Recall calculation
        if k > 0: # Avoid division by zero if k=0
            recall = len(set(gt_i[i]) & set(out_indices)) / k
            total_recall_scc += recall
        results_scc.append(out_indices)

        # --- Calculate Average Distance for this query (SCC-Aware HNSW) ---
        query_distances_scc = []
        if out_indices: # Check if any neighbors were found
            try:
                # Ensure indices are valid before accessing data
                valid_indices = [idx for idx in out_indices if idx < len(data)]
                if valid_indices:
                    neighbor_vectors = data[valid_indices] # Get vectors using valid indices
                    for neighbor_vec in neighbor_vectors:
                        dist = np.linalg.norm(q - neighbor_vec) # Calculate Euclidean distance
                        query_distances_scc.append(dist)
            except IndexError as e:
                logger.error(f"Index error during scc distance calculation for query {i}: {e}. Indices: {out_indices}")
                # Handle error, e.g., append NaN or skip

        # Store the average distance for *this* query
        avg_query_dist_scc = np.mean(query_distances_scc) if query_distances_scc else np.nan
        all_avg_dists_scc.append(avg_query_dist_scc)
        # --- End Average Distance Calculation ---

    t_eval_scc_end = time.perf_counter()

    avg_time_scc_ms = np.mean(times_scc) * 1000 if times_scc else 0
    recall_scc = total_recall_scc / num_queries if num_queries > 0 else 0
    total_query_time_scc = t_eval_scc_end - t_eval_scc_start
    # Calculate overall average distance for SCC-aware HNSW, ignoring NaNs
    overall_avg_dist_scc = np.nanmean(all_avg_dists_scc) if all_avg_dists_scc else 0

    # --- Output Results ---
    print("\n--- Comparison Summary ---")
    print(f"Parameters: k={k}, M={M}, efConstruction={ef_construction}, efSearch={ef_search}, SCC_Alpha={scc_heuristic_alpha if idx_scc.scc_aware_yes else 'N/A'}")
    print(f"Dataset size: {num_vectors} vectors, {num_queries} queries")
    print("-" * 85) # Adjusted width for new column
    # Updated Header
    print(f"{'Method':<16} | {'Build Time (s)':<15} | {'Recall@' + str(k):<10} | {'Avg Neighbor Dist':<18} | {'Total Query (s)':<15} | {'Avg Query (ms)':<15}")
    print("-" * 85) # Adjusted width for new column
    # Updated Data Rows
    print(f"{'Standard HNSW':<16} | {build_std_time:<15.2f} | {recall_std:<10.4f} | {overall_avg_dist_std:<18.4f} | {total_query_time_std:<15.2f} | {avg_time_std_ms:<15.2f}")
    print(f"{'SCC-Aware HNSW':<16} | {build_scc_time:<15.2f} | {recall_scc:<10.4f} | {overall_avg_dist_scc:<18.4f} | {total_query_time_scc:<15.2f} | {avg_time_scc_ms:<15.2f}")
    print("-" * 85) # Adjusted width for new column

    # Optional: Calculate speedup/recall/distance difference
    if avg_time_std_ms > 0 and avg_time_scc_ms > 0:
        speedup = avg_time_std_ms / avg_time_scc_ms
        print(f"Avg Query Time Speedup (SCC vs Std): {speedup:.2f}x")
    recall_diff = recall_scc - recall_std
    print(f"Recall Difference (SCC - Std): {recall_diff:+.4f}")
    dist_diff = overall_avg_dist_scc - overall_avg_dist_std
    print(f"Avg Distance Difference (SCC - Std): {dist_diff:+.4f}")

# %% # Keep the cell marker if it exists in the original notebook


Using a subset of 50000 vectors and 1000 queries.
Computing ground truth...


INFO:__main__:Initialized HNSW (dim=100, M=16, efC=200, Modifications: Standard)


Ground truth computed in 0.65s

Building standard HNSW index (scc_aware_yes=False)...


Indexing standard HNSW: 100%|██████████| 50000/50000 [08:30<00:00, 97.90it/s] 
INFO:__main__:Initialized HNSW (dim=100, M=16, efC=200, Modifications: SCC-Aware (alpha=0.5))


Standard build time: 510.74s

Building SCC-Aware HNSW index (scc_aware_yes=True)...


Indexing SCC-Aware HNSW: 100%|██████████| 50000/50000 [14:16<00:00, 58.40it/s]


SCC-Aware build time: 856.09s

Evaluating standard search...


Standard query: 100%|██████████| 1000/1000 [00:09<00:00, 101.13it/s]



Evaluating SCC-Aware search...


SCC-Aware query: 100%|██████████| 1000/1000 [00:13<00:00, 74.52it/s]


--- Comparison Summary ---
Parameters: k=10, M=16, efConstruction=200, efSearch=200, SCC_Alpha=0.5
Dataset size: 50000 vectors, 1000 queries
-------------------------------------------------------------------------------------
Method           | Build Time (s)  | Recall@10  | Avg Neighbor Dist  | Total Query (s) | Avg Query (ms) 
-------------------------------------------------------------------------------------
Standard HNSW    | 510.74          | 0.5909     | 3.3110             | 9.89            | 9.52           
SCC-Aware HNSW   | 856.09          | 0.8151     | 3.0909             | 13.43           | 13.04          
-------------------------------------------------------------------------------------
Avg Query Time Speedup (SCC vs Std): 0.73x
Recall Difference (SCC - Std): +0.2242
Avg Distance Difference (SCC - Std): -0.2201





In [16]:
if __name__ == "__main__":
    # --- Parameters ---
    num_vectors = min(50_000, len(vectors)) # Use a smaller subset or max available
    num_queries = min(1_000, num_vectors // 10)
    if num_vectors <= 0:
        print("Cannot proceed without data. Exiting.")
        exit()

    # Randomly sample data and queries if using a subset
    if num_vectors < len(vectors):
        data_indices = np.random.choice(len(vectors), num_vectors, replace=False)
        data = vectors[data_indices]
        query_indices = np.random.choice(num_vectors, num_queries, replace=False)
        queries = data[query_indices]
        print(f"Using a subset of {num_vectors} vectors and {num_queries} queries.")
    else:
        data = vectors
        query_indices = np.random.choice(num_vectors, num_queries, replace=False)
        queries = data[query_indices]
        print(f"Using the full dataset of {num_vectors} vectors and {num_queries} queries.")


    dim = data.shape[1]
    k = 100 # Number of neighbors to retrieve
    M = 16 # Max connections per node per layer
    ef_construction = 200 # Candidate list size during construction, adjust as needed
    ef_search = 200 # Candidate list size during search, adjust as needed

    # SCC-Aware parameter (adjust this value to test different settings)
    scc_heuristic_alpha = 0.5 # Example value from your notebook

    # --- Compute Ground Truth ---
    print("Computing ground truth...")
    t_gt_start = time.perf_counter()
    nn_brute = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
    nn_brute.fit(data)
    gt_d, gt_i = nn_brute.kneighbors(queries)
    t_gt_end = time.perf_counter()
    print(f"Ground truth computed in {t_gt_end - t_gt_start:.2f}s")

    # --- Build Standard Index ---
    print("\nBuilding standard HNSW index (scc_aware_yes=False)...")
    idx_std = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=False # Standard HNSW
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing standard HNSW"):
        idx_std.insert(v)
    build_std_time = time.perf_counter() - t0
    print(f"Standard build time: {build_std_time:.2f}s")

    # --- Build SCC-Aware Index ---
    print("\nBuilding SCC-Aware HNSW index (scc_aware_yes=True)...")
    idx_scc = HNSW_SCC(
        dim, num_vectors + 10, M, ef_construction,
        scc_aware_yes=True, # Enable SCC-Aware heuristic
        scc_alpha=scc_heuristic_alpha
    )
    t0 = time.perf_counter()
    for v in tqdm(data, desc="Indexing SCC-Aware HNSW"):
        idx_scc.insert(v)
    build_scc_time = time.perf_counter() - t0
    print(f"SCC-Aware build time: {build_scc_time:.2f}s")


    # --- Initialize lists for average distances ---
    all_avg_dists_std = [] # To store average distance per query for standard HNSW
    all_avg_dists_scc = [] # To store average distance per query for SCC-aware HNSW

    # --- Evaluate Standard Search ---
    print("\nEvaluating standard search...")
    total_recall_std = 0
    times_std = []
    results_std = []
    t_eval_std_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="Standard query")):
        ts = time.perf_counter()
        out_indices = idx_std.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_std.append(te - ts)

        # Recall calculation
        if k > 0: # Avoid division by zero if k=0
            recall = len(set(gt_i[i]) & set(out_indices)) / k
            total_recall_std += recall
        results_std.append(out_indices)

        # --- Calculate Average Distance for this query (Standard HNSW) ---
        query_distances_std = []
        if out_indices: # Check if any neighbors were found
            try:
                # Ensure indices are valid before accessing data
                valid_indices = [idx for idx in out_indices if idx < len(data)]
                if valid_indices:
                    neighbor_vectors = data[valid_indices] # Get vectors using valid indices
                    for neighbor_vec in neighbor_vectors:
                        dist = np.linalg.norm(q - neighbor_vec) # Calculate Euclidean distance
                        query_distances_std.append(dist)
            except IndexError as e:
                logger.error(f"Index error during std distance calculation for query {i}: {e}. Indices: {out_indices}")
                # Handle error, e.g., append NaN or skip

        # Store the average distance for *this* query
        avg_query_dist_std = np.mean(query_distances_std) if query_distances_std else np.nan
        all_avg_dists_std.append(avg_query_dist_std)
        # --- End Average Distance Calculation ---

    t_eval_std_end = time.perf_counter()

    avg_time_std_ms = np.mean(times_std) * 1000 if times_std else 0
    recall_std = total_recall_std / num_queries if num_queries > 0 else 0
    total_query_time_std = t_eval_std_end - t_eval_std_start
    # Calculate overall average distance for standard HNSW, ignoring NaNs
    overall_avg_dist_std = np.nanmean(all_avg_dists_std) if all_avg_dists_std else 0

    # --- Evaluate SCC-Aware Search ---
    print("\nEvaluating SCC-Aware search...")
    total_recall_scc = 0
    times_scc = []
    results_scc = []
    t_eval_scc_start = time.perf_counter()
    for i, q in enumerate(tqdm(queries, desc="SCC-Aware query")):
        ts = time.perf_counter()
        out_indices = idx_scc.search(q, k, ef_search=ef_search)
        te = time.perf_counter()
        times_scc.append(te - ts)

        # Recall calculation
        if k > 0: # Avoid division by zero if k=0
            recall = len(set(gt_i[i]) & set(out_indices)) / k
            total_recall_scc += recall
        results_scc.append(out_indices)

        # --- Calculate Average Distance for this query (SCC-Aware HNSW) ---
        query_distances_scc = []
        if out_indices: # Check if any neighbors were found
            try:
                # Ensure indices are valid before accessing data
                valid_indices = [idx for idx in out_indices if idx < len(data)]
                if valid_indices:
                    neighbor_vectors = data[valid_indices] # Get vectors using valid indices
                    for neighbor_vec in neighbor_vectors:
                        dist = np.linalg.norm(q - neighbor_vec) # Calculate Euclidean distance
                        query_distances_scc.append(dist)
            except IndexError as e:
                logger.error(f"Index error during scc distance calculation for query {i}: {e}. Indices: {out_indices}")
                # Handle error, e.g., append NaN or skip

        # Store the average distance for *this* query
        avg_query_dist_scc = np.mean(query_distances_scc) if query_distances_scc else np.nan
        all_avg_dists_scc.append(avg_query_dist_scc)
        # --- End Average Distance Calculation ---

    t_eval_scc_end = time.perf_counter()

    avg_time_scc_ms = np.mean(times_scc) * 1000 if times_scc else 0
    recall_scc = total_recall_scc / num_queries if num_queries > 0 else 0
    total_query_time_scc = t_eval_scc_end - t_eval_scc_start
    # Calculate overall average distance for SCC-aware HNSW, ignoring NaNs
    overall_avg_dist_scc = np.nanmean(all_avg_dists_scc) if all_avg_dists_scc else 0

    # --- Output Results ---
    print("\n--- Comparison Summary ---")
    print(f"Parameters: k={k}, M={M}, efConstruction={ef_construction}, efSearch={ef_search}, SCC_Alpha={scc_heuristic_alpha if idx_scc.scc_aware_yes else 'N/A'}")
    print(f"Dataset size: {num_vectors} vectors, {num_queries} queries")
    print("-" * 85) # Adjusted width for new column
    # Updated Header
    print(f"{'Method':<16} | {'Build Time (s)':<15} | {'Recall@' + str(k):<10} | {'Avg Neighbor Dist':<18} | {'Total Query (s)':<15} | {'Avg Query (ms)':<15}")
    print("-" * 85) # Adjusted width for new column
    # Updated Data Rows
    print(f"{'Standard HNSW':<16} | {build_std_time:<15.2f} | {recall_std:<10.4f} | {overall_avg_dist_std:<18.4f} | {total_query_time_std:<15.2f} | {avg_time_std_ms:<15.2f}")
    print(f"{'SCC-Aware HNSW':<16} | {build_scc_time:<15.2f} | {recall_scc:<10.4f} | {overall_avg_dist_scc:<18.4f} | {total_query_time_scc:<15.2f} | {avg_time_scc_ms:<15.2f}")
    print("-" * 85) # Adjusted width for new column

    # Optional: Calculate speedup/recall/distance difference
    if avg_time_std_ms > 0 and avg_time_scc_ms > 0:
        speedup = avg_time_std_ms / avg_time_scc_ms
        print(f"Avg Query Time Speedup (SCC vs Std): {speedup:.2f}x")
    recall_diff = recall_scc - recall_std
    print(f"Recall Difference (SCC - Std): {recall_diff:+.4f}")
    dist_diff = overall_avg_dist_scc - overall_avg_dist_std
    print(f"Avg Distance Difference (SCC - Std): {dist_diff:+.4f}")

# %% # Keep the cell marker if it exists in the original notebook


Using a subset of 50000 vectors and 1000 queries.
Computing ground truth...


INFO:__main__:Initialized HNSW (dim=100, M=16, efC=200, Modifications: Standard)


Ground truth computed in 0.35s

Building standard HNSW index (scc_aware_yes=False)...


Indexing standard HNSW: 100%|██████████| 50000/50000 [07:43<00:00, 107.80it/s]
INFO:__main__:Initialized HNSW (dim=100, M=16, efC=200, Modifications: SCC-Aware (alpha=0.5))


Standard build time: 463.83s

Building SCC-Aware HNSW index (scc_aware_yes=True)...


Indexing SCC-Aware HNSW: 100%|██████████| 50000/50000 [09:07<00:00, 91.37it/s] 


SCC-Aware build time: 547.21s

Evaluating standard search...


Standard query: 100%|██████████| 1000/1000 [00:06<00:00, 165.01it/s]



Evaluating SCC-Aware search...


SCC-Aware query: 100%|██████████| 1000/1000 [00:08<00:00, 113.45it/s]


--- Comparison Summary ---
Parameters: k=100, M=16, efConstruction=200, efSearch=200, SCC_Alpha=0.5
Dataset size: 50000 vectors, 1000 queries
-------------------------------------------------------------------------------------
Method           | Build Time (s)  | Recall@100 | Avg Neighbor Dist  | Total Query (s) | Avg Query (ms) 
-------------------------------------------------------------------------------------
Standard HNSW    | 463.83          | 0.6484     | 3.6314             | 6.06            | 5.41           
SCC-Aware HNSW   | 547.21          | 0.7700     | 3.5761             | 8.82            | 8.13           
-------------------------------------------------------------------------------------
Avg Query Time Speedup (SCC vs Std): 0.67x
Recall Difference (SCC - Std): +0.1215
Avg Distance Difference (SCC - Std): -0.0553



