In [104]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hnswlib
from sklearn.neighbors import NearestNeighbors
from heapq import heappush, heappop

In [105]:

# Path to your GloVe file (update this based on your downloaded version)
glove_path = '/Users/tanishqchaudhari/Desktop/DataScience Proj  datasets/Dataset/glove.6B.100d.txt'

# Load GloVe vectors
word_to_vec = {}
words = []
vectors = []

with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]  # First token is the word
        vector = np.array(values[1:], dtype=np.float32)  # Rest are vector values
        word_to_vec[word] = vector
        words.append(word)
        vectors.append(vector)

# Convert to numpy array
vectors = np.array(vectors, dtype=np.float32)
print(f"Loaded {len(words)} word vectors of dimension {vectors.shape[1]}")

Loaded 400000 word vectors of dimension 100


In [106]:
from tqdm import tqdm

In [107]:
# Step 1: Generate 10^ random query indices
num_queries = 40000
query_indices = np.random.randint(0, len(words), size=num_queries)

# Step 2: Get the corresponding query vectors
query_vectors = vectors[query_indices]

# Step 3: Use NearestNeighbors to find 100 nearest neighbors
k = 100
nn = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='l2')
nn.fit(vectors)

# Step 4: For each query, get the indices of the 100 nearest neighbors
distances, neighbor_indices = nn.kneighbors(query_vectors)

# Step 5: Store results in a dictionary {query_word: [neighbor_words]}
query_to_neighbors = {}



# Step 5: Store results in a dictionary {query_word: [neighbor_words]}
query_to_neighbors = {}
for i in tqdm(range(len(query_indices)), desc="Finding nearest neighbors"):
    query_idx = query_indices[i]
    query_word = words[query_idx]
    neighbor_words = [words[idx] for idx in neighbor_indices[i]]
    query_to_neighbors[query_word] = neighbor_words

print(f"Stored nearest neighbors for {len(query_to_neighbors)} queries.")


Finding nearest neighbors: 100%|██████████| 40000/40000 [00:00<00:00, 54418.28it/s]

Stored nearest neighbors for 38055 queries.





In [109]:
query_to_neighbors_vectors = {}

for i in tqdm(range(len(query_indices)), desc="Finding nearest neighbors"):
    query_idx = query_indices[i]
    neighbor_indices_list = neighbor_indices[i]
    query_to_neighbors_vectors[query_idx] = neighbor_indices_list  # All indices, not vectors


Finding nearest neighbors: 100%|██████████| 40000/40000 [00:00<00:00, 2222442.18it/s]


In [110]:
from collections import Counter
import pandas as pd

# Flatten all neighbor indices into a single list
all_neighbors = [idx for neighbors in query_to_neighbors_vectors.values() for idx in neighbors]

# Count frequency of each neighbor index
freq_counter = Counter(all_neighbors)

# Create DataFrame and sort
freq_df = pd.DataFrame(freq_counter.items(), columns=["Index", "Frequency"]).sort_values("Frequency", ascending=False)



In [111]:
#printing the average
average = freq_df["Frequency"].mean()
print(f"Average frequency of neighbors: {average}")

Average frequency of neighbors: 11.96701876420995


In [112]:
def compute_hubs(
    neighbor_indices: np.ndarray,
    distances:        np.ndarray,
    freq_df:          pd.DataFrame,
    alpha:            float = 0.5,
    method:           str   = "percentile",
    threshold:        float = 95.0,
    top_k:            int   = None
) -> (pd.Series, set):
    """
    Compute combined hub‐scores f_i and select hub items H.

    Inputs:
      - neighbor_indices: shape (5000,100) np.int array of item‑IDs
      - distances:        shape (5000,100) np.float array of distances
      - freq_df:          pd.DataFrame with columns ["Index","Frequency"]
                          where Frequency = c_i = #queries whose top‑100 includes item i
      - alpha:            weight in [0,1] between freq and distance
      - method:           "percentile", "top_k", or "stddev"
      - threshold:        percentile p (if method="percentile"), or
                          k (if method="stddev" meaning μ + k·σ)
      - top_k:            required if method="top_k"

    Returns:
      - combined:   pd.Series indexed by item‑ID, values f_i ∈ [0,1]
      - hub_set:    set of item‑IDs chosen as hubs
    """

    # --- 1) Normalize frequency via min–max over c_i ---
    # freq_df["Index"] holds item‑IDs, freq_df["Frequency"] holds c_i
    freq_series = freq_df.set_index("Index")["Frequency"].astype(float)
    c_min, c_max = freq_series.min(), freq_series.max()
    if c_max > c_min:
        f_freq = (freq_series - c_min) / (c_max - c_min)
    else:
        f_freq = pd.Series(0.0, index=freq_series.index)

    # --- 2) Compute average distance d̄_i and normalize/flip to get f_dist ---
    df = pd.DataFrame({
        "Index":    neighbor_indices.ravel(),
        "Distance": distances.ravel()
    })
    d_bar = df.groupby("Index")["Distance"].mean()
    d_min, d_max = d_bar.min(), d_bar.max()
    if d_max > d_min:
        d_tilde = (d_bar - d_min) / (d_max - d_min)
    else:
        d_tilde = pd.Series(0.0, index=d_bar.index)
    f_dist = 1.0 - d_tilde

    # --- 3) Combine scores on the intersection of items ---
    common = f_freq.index.intersection(f_dist.index)
    combined = alpha * f_freq.loc[common] + (1 - alpha) * f_dist.loc[common]
    combined.name = "f_i"
    print(f_dist.loc[common])
    # --- 4) Select hubs ---
    if method == "percentile":
        cutoff = np.percentile(combined.values, threshold)
        hub_set = set(combined[combined >= cutoff].index)

    elif method == "stddev":
        mu, sigma = combined.mean(), combined.std()
        cutoff = mu + threshold * sigma
        hub_set = set(combined[combined >= cutoff].index)

    elif method == "top_k":
        if top_k is None:
            raise ValueError("top_k must be provided when method='top_k'")
        hub_set = set(combined.nlargest(top_k).index)

    else:
        raise ValueError(f"unknown method {method!r}")
    
    return combined, hub_set



In [113]:
combined_scores, hubs = compute_hubs(
    neighbor_indices,
    distances,
    freq_df,
    alpha=0.5,
    method="percentile",
    threshold=95.0
)

Index
154957    0.732232
289249    0.738054
364978    0.739057
224673    0.732303
262208    0.744023
            ...   
103030    0.548729
85487     0.540763
71104     0.523096
271552    0.510760
49582     1.000000
Name: Distance, Length: 317999, dtype: float64


In [121]:
import numpy as np
from heapq import heappush, heappop
import random
import math

# HNSW CLASS

# Obj Of HNSW CLASS

In [125]:
def _distance(self, i, j):
    return np.linalg.norm(self.vectors[i] - self.vectors[j])


In [127]:
import numpy as np
from heapq import heappush, heappop
import random
import math

class HNSW:
    """
    Standard HNSW implementation based on the provided code [1],
    with hub-related modifications removed.
    """
    def __init__(self, dim, max_elements, M=16, ef_construction=200):
        """
        Initializes the HNSW index.

        Args:
            dim (int): Dimensionality of the vectors.
            max_elements (int): Estimated maximum number of elements (informational).
            M (int): Maximum number of connections per node per layer.
            ef_construction (int): Size of the dynamic candidate list during construction.
        """
        self.dim = dim
        self.max_elements = max_elements # Informational
        self.M = M
        self.ef_construction = ef_construction

        # Store graph layers: layers[l] is a dict {node_idx: [neighbor_indices]}
        self.layers = []
        # Store the actual vectors, index corresponds to node_idx
        self.vectors = []
        # Track the entry point (index of the node in the highest layer)
        self.entry_point = None
        # Precompute layer multiplier
        self.ml = 1 / math.log(self.M) if self.M > 1 else 1


    def _get_layer(self):
        """
        Determine the layer for a new node based on exponential decay
        using the precomputed multiplier (mL).

        Returns:
            int: The selected layer index (>= 0).
        """
        # Calculate layer using the standard HNSW formula
        layer = max(0, int(-math.log(random.random()) * self.ml))
        return layer

    # --- _distance method remains unchanged ---
    def _distance(self, idx1, idx2):
        """Calculate Euclidean distance between two vectors by index."""
        if not (0 <= idx1 < len(self.vectors) and 0 <= idx2 < len(self.vectors)):
            # Handle cases where one or both indices are out of bounds
            # This might happen during pruning if a node was considered but
            # doesn't actually exist in the current vector list state.
            return float('inf')
        try:
            return np.linalg.norm(self.vectors[idx1] - self.vectors[idx2])
        except IndexError:
             # Should ideally not happen if bounds check passes, but as extra safety
             return float('inf')


    # --- _search_layer method remains unchanged ---
    # (Includes robustness checks from original code [1])
    def _search_layer(self, query_vec, layer_idx, ep_idx, ef):
        """Search within a specific layer starting from an entry point."""
        # Basic layer validity check
        if layer_idx < 0 or layer_idx >= len(self.layers):
            return []
        layer_graph = self.layers[layer_idx]
        if not layer_graph: # Layer exists but is empty
            return []

        # Robustness: Check if entry point is valid for this layer and vector list
        if ep_idx not in layer_graph or ep_idx >= len(self.vectors):
            # If invalid, try to find a random valid node in the layer as a fallback EP
            try:
                valid_indices = [idx for idx in layer_graph if idx < len(self.vectors)]
                if not valid_indices:
                    return [] # No valid nodes in this layer
                ep_idx = random.choice(valid_indices) # Choose a random valid node
            except IndexError: # Should not happen if valid_indices check passed
                return []

        visited = set()
        candidates = [] # Min-heap: (distance, node_idx)
        results = []    # Max-heap: (-distance, node_idx)

        # Initialize search with the entry point
        try:
             # Ensure vector exists before calculating distance
             if ep_idx >= len(self.vectors):
                 # This case should be caught by the initial check, but for safety:
                 return []
             initial_dist = np.linalg.norm(query_vec - self.vectors[ep_idx])
             heappush(candidates, (initial_dist, ep_idx))
             heappush(results, (-initial_dist, ep_idx))
             visited.add(ep_idx)
        except IndexError:
             # Fallback if vector access fails unexpectedly
             # print(f"Warning: IndexError accessing vector for entry point {ep_idx}...") # Optional warning
             return []

        # Greedy search loop
        while candidates:
            try:
                dist_candidate, current_idx = heappop(candidates)
            except IndexError: # Heap is empty
                break

            # Get distance of the farthest node found so far
            farthest_dist_in_results = -results[0][0] if results else float('inf')

            # Early termination condition
            if dist_candidate > farthest_dist_in_results and len(results) >= ef:
                 break # All remaining candidates are farther than the worst result

            # Check if current node is still valid (might be needed with concurrent modifications, less so here)
            # Also check if it exists in the current layer graph
            if current_idx not in layer_graph or current_idx >= len(self.vectors):
                continue # Skip if node became invalid or is not in this layer

            # Explore neighbors
            for neighbor_idx in layer_graph.get(current_idx, []):
                 # Check if neighbor is valid and not visited
                 if neighbor_idx not in visited and neighbor_idx < len(self.vectors):
                    visited.add(neighbor_idx)
                    try:
                        dist_neighbor = np.linalg.norm(query_vec - self.vectors[neighbor_idx])
                    except IndexError:
                        continue # Skip if neighbor vector access fails

                    # Get updated farthest distance
                    farthest_dist_in_results = -results[0][0] if results else float('inf')

                    # If neighbor is closer than farthest result or results list is not full
                    if dist_neighbor < farthest_dist_in_results or len(results) < ef:
                        # Add to results (maintaining max-heap property)
                        heappush(results, (-dist_neighbor, neighbor_idx))
                        # If results exceed ef, remove the farthest
                        if len(results) > ef:
                            heappop(results)
                        # Add neighbor to candidates for further exploration
                        heappush(candidates, (dist_neighbor, neighbor_idx))

        # Convert max-heap results to sorted list (distance, idx)
        final_results = sorted([(-d, idx) for d, idx in results])
        return final_results[:ef] # Return top ef results


    def insert(self, vector):
        """Insert a vector into the HNSW index."""
        new_idx = len(self.vectors) # Index for the new vector
        self.vectors.append(np.array(vector)) # Store the vector

        # Determine the layers the new node will belong to
        node_max_layer = self._get_layer() # Use standard layer selection

        current_ep_idx = self.entry_point

        # Ensure enough layers exist in the graph structure
        while len(self.layers) <= node_max_layer:
            self.layers.append(dict()) # Add new empty layer dictionaries

        # --- Rest of insert method unchanged from original code [1] ---
        # (Includes robustness checks)
        current_insert_vec = self.vectors[new_idx] # Use the newly added vector

        # Phase 1: Find entry points in upper layers (down to node_max_layer + 1)
        for l in range(len(self.layers) - 1, node_max_layer, -1):
             if current_ep_idx is None: break # Cannot proceed without an entry point
             if not self.layers[l]: continue # Skip empty layers

             # Ensure entry point validity before search in this layer
             if current_ep_idx not in self.layers[l] or current_ep_idx >= len(self.vectors):
                 try:
                    # Fallback: Find a random valid node in the layer
                    valid_indices = [idx for idx in self.layers[l] if idx < len(self.vectors)]
                    if not valid_indices: continue # Skip layer if no valid nodes
                    current_ep_idx = random.choice(valid_indices)
                 except IndexError: continue # Should not happen

             # Search for the closest node in layer l to the new vector (ef=1)
             search_results = self._search_layer(current_insert_vec, l, current_ep_idx, ef=1)
             if search_results:
                 # Update the entry point for the next lower layer
                 found_ep_idx = search_results[0][1]
                 # Check validity before assignment
                 if found_ep_idx < len(self.vectors):
                      current_ep_idx = found_ep_idx

        # Phase 2: Insert node in layers node_max_layer down to 0
        for l in range(min(node_max_layer, len(self.layers) - 1), -1, -1):
            layer_graph = self.layers[l]

            # Ensure valid entry point for the search in this layer
            if current_ep_idx is None or current_ep_idx not in layer_graph or current_ep_idx >= len(self.vectors):
                 if layer_graph: # If the layer is not empty
                     try:
                         # Fallback: Find a random valid node
                         valid_indices = [idx for idx in layer_graph if idx < len(self.vectors)]
                         if not valid_indices: current_ep_idx = None # No valid EPs possible
                         else: current_ep_idx = random.choice(valid_indices)
                     except IndexError: current_ep_idx = None
                 else: current_ep_idx = None # Layer is empty, no EP

            # Find neighbors using search_layer with ef_construction
            if current_ep_idx is None:
                 neighbors = [] # No entry point, cannot search
            else:
                 # Search for ef_construction nearest neighbors
                 neighbors = self._search_layer(current_insert_vec, l, current_ep_idx, self.ef_construction)
                 # Update entry point for next layer (closest neighbor found)
                 if neighbors:
                      found_ep_idx = neighbors[0][1]
                      # Check validity before assignment
                      if found_ep_idx < len(self.vectors):
                           current_ep_idx = found_ep_idx

            # Select M best neighbors based on distance
            connections = [idx for dist, idx in neighbors[:self.M]]
            # Add connections for the new node in this layer
            layer_graph[new_idx] = connections

            # Add backlinks from neighbors to the new node, maintaining M limit
            for neighbor_idx in connections:
                 # Ensure neighbor exists and is valid before modifying its connections
                 if neighbor_idx in layer_graph and neighbor_idx < len(self.vectors):
                     neighbor_connections = layer_graph[neighbor_idx]
                     if new_idx not in neighbor_connections: # Avoid duplicate connections
                          neighbor_connections.append(new_idx)
                          # Prune connections if exceeding M
                          if len(neighbor_connections) > self.M:
                               # Ensure all connections are valid before distance calculation for pruning
                               valid_conns = [cidx for cidx in neighbor_connections if cidx < len(self.vectors)]
                               if len(valid_conns) < len(neighbor_connections):
                                   # If some connections became invalid, just update the list
                                   layer_graph[neighbor_idx] = valid_conns
                                   # Re-check length; might not need pruning anymore
                                   if len(valid_conns) <= self.M:
                                        continue # Skip pruning if count is now okay

                               # Calculate distances only for valid connections
                               distances = []
                               for conn_idx in valid_conns:
                                   dist = self._distance(neighbor_idx, conn_idx)
                                   if dist != float('inf'): # Only consider valid distances
                                       distances.append((dist, conn_idx))

                               # Sort by distance and keep the M closest
                               distances.sort()
                               layer_graph[neighbor_idx] = [idx for dist, idx in distances[:self.M]]

        # Update the global entry point if the new node is in a higher layer
        current_ep_layer = self._get_node_layer(self.entry_point) # Find current EP's highest layer
        if self.entry_point is None or node_max_layer > current_ep_layer:
            self.entry_point = new_idx


    # --- Helper method to find the highest layer a node exists in ---
    def _get_node_layer(self, node_idx):
        """Helper to find the highest layer index a node exists in."""
        if node_idx is None or node_idx < 0: return -1
        for l in range(len(self.layers) - 1, -1, -1):
             # Check if layer exists and node is in the layer's keys
             if l < len(self.layers) and node_idx in self.layers[l]:
                 return l
        return -1 # Node not found in any layer


    # --- search method remains unchanged ---
    # (Includes robustness checks from original code [1])
    def search(self, query_vec, k=10):
        """Search for the k nearest neighbors of query_vec."""
        # Determine ef for search (at least k)
        ef_search = max(self.ef_construction, k)
        current_ep_idx = self.entry_point

        # Check if entry point is valid at the start
        if current_ep_idx is None or current_ep_idx >= len(self.vectors):
            #print("Warning: Entry point is None or invalid at search start.") # Optional warning
            # Potentially try to find *any* valid node if layers exist?
            # For now, return empty if no valid starting point.
            return []

        query_vec = np.array(query_vec) # Ensure query is a numpy array

        # Phase 1: Navigate upper layers (down to layer 1) to find good entry point for layer 0
        for l in range(len(self.layers) - 1, 0, -1): # Stop at layer 1
            if not self.layers[l]: continue # Skip empty layers

            # Ensure current entry point is valid for this layer
            if current_ep_idx not in self.layers[l] or current_ep_idx >= len(self.vectors):
                 try:
                      # Fallback: Find a random valid node in the layer
                      valid_indices = [idx for idx in self.layers[l] if idx < len(self.vectors)]
                      if not valid_indices: continue # Skip layer if no valid nodes
                      current_ep_idx = random.choice(valid_indices)
                 except IndexError: continue # Should not happen

            # Search layer l for the closest node (ef=1)
            search_results = self._search_layer(query_vec, l, current_ep_idx, ef=1)
            if search_results:
                 # Update entry point for the next lower layer
                 found_ep_idx = search_results[0][1]
                 # Check validity before assignment
                 if found_ep_idx < len(self.vectors):
                      current_ep_idx = found_ep_idx

        # Phase 2: Search layer 0 using the refined entry point
        # Ensure the final entry point for layer 0 is valid before the final search
        if current_ep_idx >= len(self.vectors) or \
           (0 < len(self.layers) and current_ep_idx not in self.layers[0]):
             # If layer 0 exists and has nodes, try to find a fallback EP within it
             if 0 < len(self.layers) and self.layers[0]:
                  try:
                      valid_indices_l0 = [idx for idx in self.layers[0] if idx < len(self.vectors)]
                      if not valid_indices_l0: return [] # No valid nodes in layer 0
                      current_ep_idx = random.choice(valid_indices_l0)
                  except IndexError: return [] # Should not happen
             else: return [] # Layer 0 doesn't exist or is empty

        # Perform final search in layer 0 with ef_search
        neighbors = self._search_layer(query_vec, 0, current_ep_idx, ef_search)

        # Return the top k results by index
        return [idx for dist, idx in neighbors[:k]]



In [128]:
import numpy as np
from heapq import heappush, heappop
import random
import math

class HNSWfreqdistance:
    """
    HNSW implementation based on paste.txt [1], modified to boost layer
    probability for specified 'hub' nodes during insertion.
    Search performance remains identical to the base HNSW implementation.
    """
    def __init__(self, dim, max_elements, hubs, boost_const, M=16, ef_construction=200):
        """
        Initializes the HNSWfreqdistance index.

        Args:
            dim (int): Dimensionality of the vectors.
            max_elements (int): Estimated maximum number of elements (informational).
            hubs (set): A set of integer indices representing 'hub' nodes.
                        Nodes whose *future* index is in this set will have
                        their layer assignment probability boosted.
            boost_const (float): A small constant added to the layer calculation
                                 multiplier (mL) for hub nodes.
            M (int): Maximum number of connections per node per layer.
            ef_construction (int): Size of the dynamic candidate list during construction.
        """
        self.dim = dim
        self.max_elements = max_elements # Informational
        self.M = M
        self.ef_construction = ef_construction
        self.hubs = hubs if hubs is not None else set() # Ensure hubs is a set
        self.boost_const = boost_const # Constant for boosting hub layer probability

        # Store graph layers: layers[l] is a dict {node_idx: [neighbor_indices]}
        self.layers = []
        # Store the actual vectors, index corresponds to node_idx
        self.vectors = []
        # Track the entry point (index of the node in the highest layer)
        self.entry_point = None
        # Precompute layer multiplier
        self.ml = 1 / math.log(self.M) if self.M > 1 else 1


    # *** MODIFIED to accept node_idx and apply boost ***
    def _get_layer(self, node_idx):
        """
        Determine the layer for a new node based on exponential decay.
        If node_idx is in self.hubs, boost the probability of higher layers.

        Args:
            node_idx (int): The index the new node *will* have upon insertion.

        Returns:
            int: The selected layer index (>= 0).
        """
        # Use precomputed base multiplier
        current_ml = self.ml

        # Apply boost if the node index is designated as a hub
        if node_idx in self.hubs:
            current_ml += self.boost_const

        # Calculate layer using the (potentially boosted) multiplier
        layer = max(0, int(-math.log(random.random()) * current_ml))
        return layer

    # --- _distance method remains unchanged from paste.txt [1] ---
    def _distance(self, idx1, idx2):
        if not (0 <= idx1 < len(self.vectors) and 0 <= idx2 < len(self.vectors)):
            return float('inf')
        return np.linalg.norm(self.vectors[idx1] - self.vectors[idx2])

    # --- _search_layer method remains unchanged from paste.txt [1] ---
    # (Includes robustness checks from paste.txt [1])
    def _search_layer(self, query_vec, layer_idx, ep_idx, ef):
        if layer_idx < 0 or layer_idx >= len(self.layers): return []
        layer_graph = self.layers[layer_idx]
        if not layer_graph: return []

        if ep_idx not in layer_graph or ep_idx >= len(self.vectors):
            try:
                valid_indices = [idx for idx in layer_graph if idx < len(self.vectors)]
                if not valid_indices: return []
                ep_idx = random.choice(valid_indices)
            except IndexError: return []

        visited = set()
        candidates = []
        results = []

        try:
            if ep_idx >= len(self.vectors): return []
            initial_dist = np.linalg.norm(query_vec - self.vectors[ep_idx])
            heappush(candidates, (initial_dist, ep_idx))
            heappush(results, (-initial_dist, ep_idx))
            visited.add(ep_idx)
        except IndexError:
             # print(f"Warning: IndexError accessing vector for entry point {ep_idx}...") # Optional warning
             return []

        while candidates:
            try:
                dist_candidate, current_idx = heappop(candidates)
            except IndexError: break

            farthest_dist_in_results = -results[0][0] if results else float('inf')
            if dist_candidate > farthest_dist_in_results and len(results) >= ef:
                 break

            if current_idx not in layer_graph or current_idx >= len(self.vectors):
                continue

            for neighbor_idx in layer_graph.get(current_idx, []):
                if neighbor_idx not in visited and neighbor_idx < len(self.vectors):
                    visited.add(neighbor_idx)
                    dist_neighbor = np.linalg.norm(query_vec - self.vectors[neighbor_idx])
                    farthest_dist_in_results = -results[0][0] if results else float('inf')

                    if dist_neighbor < farthest_dist_in_results or len(results) < ef:
                        heappush(results, (-dist_neighbor, neighbor_idx))
                        if len(results) > ef: heappop(results)
                        heappush(candidates, (dist_neighbor, neighbor_idx))

        final_results = sorted([(-d, idx) for d, idx in results])
        return final_results[:ef]

    # *** MODIFIED to pass new_idx to _get_layer ***
    def insert(self, vector):
        """Insert a vector into the HNSW index."""
        new_idx = len(self.vectors) # Determine index before layer calculation
        self.vectors.append(np.array(vector))

        # *** MODIFIED CALL ***: Pass the node's future index to _get_layer
        node_max_layer = self._get_layer(new_idx)

        current_ep_idx = self.entry_point

        while len(self.layers) <= node_max_layer:
            self.layers.append(dict())

        # --- Rest of insert method unchanged from paste.txt [1] ---
        # (Includes robustness checks)
        current_insert_vec = self.vectors[new_idx] # Use stored vector for search

        for l in range(len(self.layers) - 1, node_max_layer, -1):
             if current_ep_idx is None: break
             if not self.layers[l]: continue

             # Ensure entry point validity before search
             if current_ep_idx not in self.layers[l] or current_ep_idx >= len(self.vectors):
                 try:
                    valid_indices = [idx for idx in self.layers[l] if idx < len(self.vectors)]
                    if not valid_indices: continue
                    current_ep_idx = random.choice(valid_indices)
                 except IndexError: continue

             search_results = self._search_layer(current_insert_vec, l, current_ep_idx, ef=1)
             if search_results:
                 found_ep_idx = search_results[0][1]
                 if found_ep_idx < len(self.vectors): # Check validity before assignment
                      current_ep_idx = found_ep_idx

        for l in range(min(node_max_layer, len(self.layers) - 1), -1, -1):
            layer_graph = self.layers[l]

            # Ensure valid entry point for search
            if current_ep_idx is None or current_ep_idx not in layer_graph or current_ep_idx >= len(self.vectors):
                 if layer_graph:
                     try:
                         valid_indices = [idx for idx in layer_graph if idx < len(self.vectors)]
                         if not valid_indices: current_ep_idx = None
                         else: current_ep_idx = random.choice(valid_indices)
                     except IndexError: current_ep_idx = None
                 else: current_ep_idx = None

            if current_ep_idx is None:
                 neighbors = []
            else:
                 neighbors = self._search_layer(current_insert_vec, l, current_ep_idx, self.ef_construction)
                 if neighbors:
                      found_ep_idx = neighbors[0][1]
                      if found_ep_idx < len(self.vectors): # Check validity
                           current_ep_idx = found_ep_idx

            connections = [idx for dist, idx in neighbors[:self.M]]
            layer_graph[new_idx] = connections

            for neighbor_idx in connections:
                 # Check validity before adding backlink
                 if neighbor_idx in layer_graph and neighbor_idx < len(self.vectors):
                     neighbor_connections = layer_graph[neighbor_idx]
                     if new_idx not in neighbor_connections:
                          neighbor_connections.append(new_idx)
                          if len(neighbor_connections) > self.M:
                               # Check connection validity before distance calc for pruning
                               valid_conns = [cidx for cidx in neighbor_connections if cidx < len(self.vectors)]
                               if len(valid_conns) < len(neighbor_connections):
                                   layer_graph[neighbor_idx] = valid_conns
                                   continue

                               distances = [(self._distance(neighbor_idx, conn_idx), conn_idx) for conn_idx in valid_conns]
                               distances.sort()
                               layer_graph[neighbor_idx] = [idx for dist, idx in distances[:self.M]]

        # Update global entry point logic (unchanged from paste.txt [1])
        current_ep_layer = self._get_node_layer(self.entry_point) # Use helper
        if self.entry_point is None or node_max_layer > current_ep_layer:
            self.entry_point = new_idx


    # --- Helper method from paste-2.txt [2] added for clarity ---
    def _get_node_layer(self, node_idx):
        # Helper to find highest layer a node exists in
        if node_idx is None or node_idx < 0: return -1
        for l in range(len(self.layers) - 1, -1, -1):
             if node_idx in self.layers[l]:
                 return l
        return -1 # Node not found

    # --- search method remains unchanged from paste.txt [1] ---
    # (Includes robustness checks)
    def search(self, query_vec, k=10):
        ef_search = max(self.ef_construction, k)
        current_ep_idx = self.entry_point

        if current_ep_idx is None or current_ep_idx >= len(self.vectors):
            #print("Warning: Entry point is None or invalid...") # Optional
            return []

        query_vec = np.array(query_vec)

        for l in range(len(self.layers) - 1, 0, -1): # Down to layer 1
            if not self.layers[l]: continue

            # Ensure entry point validity
            if current_ep_idx not in self.layers[l] or current_ep_idx >= len(self.vectors):
                 try:
                     valid_indices = [idx for idx in self.layers[l] if idx < len(self.vectors)]
                     if not valid_indices: continue
                     current_ep_idx = random.choice(valid_indices)
                 except IndexError: continue

            search_results = self._search_layer(query_vec, l, current_ep_idx, ef=1)
            if search_results:
                 found_ep_idx = search_results[0][1]
                 if found_ep_idx < len(self.vectors): # Check validity
                      current_ep_idx = found_ep_idx

        # Ensure final entry point for layer 0 is valid
        if current_ep_idx >= len(self.vectors) or (0 < len(self.layers) and current_ep_idx not in self.layers[0]):
             if 0 < len(self.layers) and self.layers[0]:
                  try:
                      valid_indices_l0 = [idx for idx in self.layers[0] if idx < len(self.vectors)]
                      if not valid_indices_l0: return []
                      current_ep_idx = random.choice(valid_indices_l0)
                  except IndexError: return []
             else: return []

        neighbors = self._search_layer(query_vec, 0, current_ep_idx, ef_search)
        return [idx for dist, idx in neighbors[:k]]



In [129]:
# import numpy as np
# import time
# from sklearn.neighbors import NearestNeighbors

# # --- Assume these are pre-loaded/defined ---
# # vectors: np.ndarray (N x Dim) - Your dataset (e.g., GloVe)
# # hubs: set - Set of integer indices designated as hubs
# # HNSWfreqdistance: class - Your custom class definition (from previous step) must be available

# # --- Parameters ---
# K = 10                   # Number of neighbors for recall
# NUM_QUERIES = 5000       # Number of queries to use
# # Ensure vectors is defined before accessing shape
# if 'vectors' not in globals() or not isinstance(vectors, np.ndarray):
#     raise NameError("The 'vectors' numpy array is not defined.")
# DIMENSION = vectors.shape[1]
# NUM_ELEMENTS = vectors.shape[0]

# # HNSW parameters for your custom class
# M = 16
# # M_HUB is not used in the provided HNSWfreqdistance class, only boost_const
# EF_CONSTRUCTION = 200
# BOOST_CONST = 0.1        # For HNSWfreqdistance

# # Select query vectors (first N)
# query_vectors = vectors[:min(NUM_QUERIES, NUM_ELEMENTS)]
# num_actual_queries = len(query_vectors)
# all_indices = np.arange(NUM_ELEMENTS)

# # --- Helper Function ---
# def calculate_recall(ground_truth, found_neighbors, k):
#     total_found = 0
#     num_queries = len(ground_truth)
#     if num_queries == 0: return 0.0
#     expected_total = num_queries * k
#     for i in range(num_queries):
#         # Ensure ground truth has k elements for comparison if possible
#         gt_set = set(ground_truth[i][:k])
#         # Handle cases where ANN search returns less than k
#         found_set = set(found_neighbors[i][:k]) if found_neighbors[i] is not None else set()
#         total_found += len(gt_set.intersection(found_set))
#     if expected_total == 0: return 1.0
#     return total_found / expected_total

# # --- Ground Truth Calculation ---
# def calculate_ground_truth(all_vectors, query_vectors, k):
#     num_actual_queries = len(query_vectors)
#     print(f"Calculating ground truth for {num_actual_queries} queries (k={k})...")
#     start_time = time.perf_counter()
#     # Ensure k is not larger than the number of elements fit
#     nn_k = min(k, all_vectors.shape[0])
#     if nn_k != k:
#         print(f"Warning: Requested k={k}, but only {all_vectors.shape[0]} elements exist. Using k={nn_k} for NearestNeighbors.")
#     bf_index = NearestNeighbors(n_neighbors=nn_k, algorithm='brute', metric='euclidean')
#     bf_index.fit(all_vectors)
#     # Request the original k, kneighbors handles k > n_samples_fit
#     _, ground_truth_indices = bf_index.kneighbors(query_vectors, n_neighbors=k)
#     time_taken = time.perf_counter() - start_time
#     print(f"Ground truth calculated in {time_taken:.4f} s.")
#     return ground_truth_indices

# # --- HNSWfreqdistance Evaluation Function ---
# def evaluate_custom_hnsw(
#     HNSWClass, # Pass the class itself
#     all_vectors,
#     query_vectors,
#     ground_truth_indices,
#     k,
#     hubs,        # Specific to HNSWfreqdistance
#     boost_const, # Specific to HNSWfreqdistance
#     M,
#     ef_construction):
#     """Builds and evaluates the custom HNSW class."""

#     print(f"\n--- Evaluating {HNSWClass.__name__} ---")
#     dim = all_vectors.shape[1]
#     num_elements = all_vectors.shape[0]

#     # Instantiate the custom class
#     custom_index = HNSWClass(dim=dim, max_elements=num_elements,
#                              hubs=hubs, boost_const=boost_const,
#                              M=M, ef_construction=ef_construction)

#     # --- Indexing ---
#     print("Indexing...")
#     start_time_idx = time.perf_counter()
#     for i in range(num_elements):
#         custom_index.insert(all_vectors[i])
#     index_time = time.perf_counter() - start_time_idx
#     print(f"Indexing time: {index_time:.4f} s")

#     # --- Querying ---
#     print("Querying...")
#     custom_results = []
#     start_time_query = time.perf_counter()
#     for i in range(len(query_vectors)):
#         neighbors = custom_index.search(query_vectors[i], k=k)
#         custom_results.append(neighbors)
#     query_time = time.perf_counter() - start_time_query
#     avg_query_time = query_time / len(query_vectors) if len(query_vectors) > 0 else 0
#     print(f"Query time: {query_time:.4f} s ({avg_query_time*1000:.4f} ms/query avg)")

#     # --- Recall ---
#     recall = calculate_recall(ground_truth_indices, custom_results, k)
#     print(f"Recall@{k}: {recall:.4f}")

#     return recall, index_time, query_time, avg_query_time

# # --- Main Execution ---

# # 1. Calculate Ground Truth
# # Ensure the custom class definition is available before calling this script
# if 'HNSWfreqdistance' not in globals():
#      raise NameError("The HNSWfreqdistance class is not defined in the current scope.")

# gt_indices = calculate_ground_truth(vectors, query_vectors, K)

# # 2. Evaluate your HNSWfreqdistance class
# recall_cust, index_time_cust, query_time_cust, avg_query_time_cust = evaluate_custom_hnsw(
#     HNSWClass=HNSWfreqdistance,
#     all_vectors=vectors,
#     query_vectors=query_vectors,
#     ground_truth_indices=gt_indices,
#     k=K,
#     hubs=hubs,
#     boost_const=BOOST_CONST,
#     M=M,
#     ef_construction=EF_CONSTRUCTION
# )

# # --- Final Results ---
# print("\n--- Custom HNSW Performance Summary ---")
# print(f"Class Name:         {HNSWfreqdistance.__name__}")
# print(f"Indexing Time:      {index_time_cust:.4f} s")
# print(f"Total Query Time:   {query_time_cust:.4f} s")
# print(f"Avg Query Time:     {avg_query_time_cust*1000:.4f} ms")
# print(f"Recall@{K}:           {recall_cust:.4f}")



In [130]:
# import numpy as np
# import time
# from sklearn.neighbors import NearestNeighbors

# # --- Ensure your HNSW and HNSWfreqdistance classes are in scope ---
# # from your_module import HNSW, HNSWfreqdistance

# # --- User parameters ---
# K = 10                   # Number of neighbors for recall\ nNUM_QUERIES = 5000       # Number of queries to use

# # --- Load your data ---
# # Replace with actual loading logic, e.g.: vectors = np.load('vectors.npy')
# vectors = np.random.random((400000, 128))  # placeholder: 400k vectors of dim 128
# hubs = set(np.random.choice(np.arange(vectors.shape[0]), size=1000, replace=False))  # placeholder hubs

# DIMENSION = vectors.shape[1]
# NUM_ELEMENTS = vectors.shape[0]

# # --- Select query vectors ---
# idxs = np.random.choice(NUM_ELEMENTS, size=min(40000, NUM_ELEMENTS), replace=False)
# query_vectors = vectors[idxs]
# num_actual_queries = len(query_vectors)

# # --- Helper functions ---
# def calculate_recall(ground_truth, found, k):
#     total_found = 0
#     num_q = len(ground_truth)
#     expected = num_q * k

#     for i in range(num_q):
#         gt_set = set(ground_truth[i][:k])
#         res = found[i]
#         found_set = set(res[:k]) if res is not None else set()
#         total_found += len(gt_set & found_set)

#     return total_found / expected if expected > 0 else 1.0

# # --- Ground Truth Calculation ---
# def compute_ground_truth(data, queries, k):
#     print(f"Computing ground truth with brute-force for {len(queries)} queries...")
#     t0 = time.perf_counter()
#     nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
#     nn.fit(data)
#     _, gt = nn.kneighbors(queries, n_neighbors=k)
#     t1 = time.perf_counter()
#     print(f"Done in {t1 - t0:.2f}s")
#     return gt

# # --- Evaluation for base HNSW ---
# def evaluate_hnsw_base(all_vecs, queries, gt, k, M=16, ef_const=200):
#     print("\nEvaluating HNSW (base)")
#     index = HNSW(dim=all_vecs.shape[1], max_elements=all_vecs.shape[0], M=M, ef_construction=ef_const)

#     # Indexing
#     t0 = time.perf_counter()
#     for v in all_vecs:
#         index.insert(v)
#     t1 = time.perf_counter()

#     # Querying
#     results = []
#     t_q0 = time.perf_counter()
#     for q in queries:
#         results.append(index.search(q, k=k))
#     t_q1 = time.perf_counter()

#     recall = calculate_recall(gt, results, k)
#     total_q = t_q1 - t_q0
#     avg_q = total_q / len(queries)

#     print(f"Index time: {t1 - t0:.2f}s")
#     print(f"Total query time: {total_q:.2f}s, Avg/query: {avg_q*1000:.2f}ms")
#     print(f"Recall@{k}: {recall:.4f}")

#     return (t1 - t0), total_q, avg_q, recall

# # --- Evaluation for HNSWfreqdistance ---



In [131]:
# def evaluate_hnsw_freq(all_vecs, queries, gt, k, hubs, boost, M=16, ef_const=200):
#     print("\nEvaluating HNSWfreqdistance")
#     index = HNSWfreqdistance(dim=all_vecs.shape[1], max_elements=all_vecs.shape[0],
#                               hubs=hubs, boost_const=boost,
#                               M=M, ef_construction=ef_const)

#     # Indexing
#     t0 = time.perf_counter()
#     for v in all_vecs:
#         index.insert(v)
#     t1 = time.perf_counter()

#     # Querying
#     results = []
#     t_q0 = time.perf_counter()
#     for q in queries:
#         results.append(index.search(q, k=k))
#     t_q1 = time.perf_counter()

#     recall = calculate_recall(gt, results, k)
#     total_q = t_q1 - t_q0
#     avg_q = total_q / len(queries)

#     print(f"Index time: {t1 - t0:.2f}s")
#     print(f"Total query time: {total_q:.2f}s, Avg/query: {avg_q*1000:.2f}ms")
#     print(f"Recall@{k}: {recall:.4f}")

#     return (t1 - t0), total_q, avg_q, recall


In [132]:
import time

In [133]:
# --- User parameters ---
K = 10                   # Number of neighbors for recall
NUM_QUERIES = 5000       # Number of queries to use


DIMENSION = vectors.shape[1]
NUM_ELEMENTS = vectors.shape[0]

# --- Select query vectors ---
idxs = np.random.choice(NUM_ELEMENTS, size=min(40000, NUM_ELEMENTS), replace=False)
query_vectors = vectors[idxs]
num_actual_queries = len(query_vectors)

# --- Helper functions ---
def calculate_recall(ground_truth, found, k):
    total_found = 0
    num_q = len(ground_truth)
    expected = num_q * k

    for i in range(num_q):
        gt_set = set(ground_truth[i][:k])
        res = found[i]
        found_set = set(res[:k]) if res is not None else set()
        total_found += len(gt_set & found_set)

    return total_found / expected if expected > 0 else 1.0

# --- Ground Truth Calculation ---
def compute_ground_truth(data, queries, k):
    print(f"Computing ground truth with brute-force for {len(queries)} queries...")
    t0 = time.perf_counter()
    nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
    nn.fit(data)
    _, gt = nn.kneighbors(queries, n_neighbors=k)
    t1 = time.perf_counter()
    print(f"Done in {t1 - t0:.2f}s")
    return gt

# --- Evaluation for base HNSW ---
def evaluate_hnsw_base(all_vecs, queries, gt, k, M=16, ef_const=200):
    print("\nEvaluating HNSW (base)")
    index = HNSW(dim=all_vecs.shape[1], max_elements=all_vecs.shape[0], M=M, ef_construction=ef_const)

    # Indexing with progress bar
    t0 = time.perf_counter()
    for v in tqdm(all_vecs, total=all_vecs.shape[0], desc="HNSW (base) indexing"):
        index.insert(v)
    t1 = time.perf_counter()

    # Querying with progress bar
    results = []
    t_q0 = time.perf_counter()
    for q in tqdm(queries, total=len(queries), desc="HNSW (base) querying"):
        results.append(index.search(q, k=k))
    t_q1 = time.perf_counter()

    recall = calculate_recall(gt, results, k)
    total_q = t_q1 - t_q0
    avg_q = total_q / len(queries)

    print(f"Index time: {t1 - t0:.2f}s")
    print(f"Total query time: {total_q:.2f}s, Avg/query: {avg_q*1000:.2f}ms")
    print(f"Recall@{k}: {recall:.4f}")

    return (t1 - t0), total_q, avg_q, recall



In [134]:
# --- Evaluation for HNSWfreqdistance ---
def evaluate_hnsw_freq(all_vecs, queries, gt, k, hubs, boost, M=16, ef_const=200):
    print("\nEvaluating HNSWfreqdistance")
    index = HNSWfreqdistance(dim=all_vecs.shape[1], max_elements=all_vecs.shape[0],
                              hubs=hubs, boost_const=boost,
                              M=M, ef_construction=ef_const)

    # Indexing with progress bar
    t0 = time.perf_counter()
    for v in tqdm(all_vecs, total=all_vecs.shape[0], desc="HNSWfreqdistance indexing"):
        index.insert(v)
    t1 = time.perf_counter()

    # Querying with progress bar
    results = []
    t_q0 = time.perf_counter()
    for q in tqdm(queries, total=len(queries), desc="HNSWfreqdistance querying"):
        results.append(index.search(q, k=k))
    t_q1 = time.perf_counter()

    recall = calculate_recall(gt, results, k)
    total_q = t_q1 - t_q0
    avg_q = total_q / len(queries)

    print(f"Index time: {t1 - t0:.2f}s")
    print(f"Total query time: {total_q:.2f}s, Avg/query: {avg_q*1000:.2f}ms")
    print(f"Recall@{k}: {recall:.4f}")

    return (t1 - t0), total_q, avg_q, recall



In [135]:
# --- Main Comparison ---
if __name__ == '__main__':
    gt_indices = compute_ground_truth(vectors, query_vectors, K)

    # Base HNSW
    idx_time_base, q_time_base, avg_q_base, recall_base = evaluate_hnsw_base(
        vectors, query_vectors, gt_indices, K,
        M=16, ef_const=200
    )

    # HNSW with hub boosting
    idx_time_freq, q_time_freq, avg_q_freq, recall_freq = evaluate_hnsw_freq(
        vectors, query_vectors, gt_indices, K,
        hubs=hubs, boost=0.1, M=16, ef_construction=200
    )

    # Summary
    print("\n--- Summary ---")
    print(f"Method            | Index(s) | Query(s) | AvgQuery(ms) | Recall@{K}")
    print(f"HNSW (base)       | {idx_time_base:.2f}    | {q_time_base:.2f}   | {avg_q_base*1000:.2f}        | {recall_base:.4f}")
    print(f"HNSWfreqdistance  | {idx_time_freq:.2f}    | {q_time_freq:.2f}   | {avg_q_freq*1000:.2f}        | {recall_freq:.4f}")


Computing ground truth with brute-force for 40000 queries...


Done in 7.38s

Evaluating HNSW (base)


HNSW (base) indexing: 100%|██████████| 400000/400000 [42:57<00:00, 155.20it/s]   
HNSW (base) querying: 100%|██████████| 40000/40000 [02:24<00:00, 276.02it/s]


Index time: 1575.04s
Total query time: 144.92s, Avg/query: 3.62ms
Recall@10: 0.4918


TypeError: evaluate_hnsw_freq() got an unexpected keyword argument 'ef_construction'

# HNSW with HUB BOOSTING

In [137]:
if __name__ == '__main__':

    # HNSW with hub boosting
    idx_time_freq, q_time_freq, avg_q_freq, recall_freq = evaluate_hnsw_freq(
        vectors, query_vectors, gt_indices, K,
        hubs=hubs, boost=0.1, M=16, ef_const=200
    )

    # Summary
    print("\n--- Summary ---")
    print(f"Method            | Index(s) | Query(s) | AvgQuery(ms) | Recall@{K}")
    print(f"HNSW (base)       | {idx_time_base:.2f}    | {q_time_base:.2f}   | {avg_q_base*1000:.2f}        | {recall_base:.4f}")
    print(f"HNSWfreqdistance  | {idx_time_freq:.2f}    | {q_time_freq:.2f}   | {avg_q_freq*1000:.2f}        | {recall_freq:.4f}")



Evaluating HNSWfreqdistance


HNSWfreqdistance indexing: 100%|██████████| 400000/400000 [1:24:41<00:00, 78.72it/s]    
HNSWfreqdistance querying: 100%|██████████| 40000/40000 [02:20<00:00, 284.04it/s]


Index time: 1544.15s
Total query time: 140.83s, Avg/query: 3.52ms
Recall@10: 0.4748

--- Summary ---
Method            | Index(s) | Query(s) | AvgQuery(ms) | Recall@10
HNSW (base)       | 1575.04    | 144.92   | 3.62        | 0.4918
HNSWfreqdistance  | 1544.15    | 140.83   | 3.52        | 0.4748
