In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hnswlib
from sklearn.neighbors import NearestNeighbors
from heapq import heappush, heappop

In [None]:

# Path to your GloVe file (update this based on your downloaded version)
# NOTE(review): this is a machine-specific absolute path; on any other machine
# the cell fails with FileNotFoundError until the path is edited.
glove_path = "/Users/tejasmacipad/Desktop/Projects/DataScience/datascience-HNSW/glove/glove.6B.100d.txt"

# Load GloVe vectors.
# Each usable line is parsed as "<word> <v1> <v2> ... <vd>" (whitespace separated).
word_to_vec = {}  # word -> 1-D float32 embedding
words = []        # vocabulary in file order; words[i] belongs to row i of `vectors`
rows = []         # per-word embeddings, stacked into the `vectors` matrix below

with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if len(values) < 2:
            # Blank line or a token without any vector components: skip it
            # instead of crashing on values[0] / producing an empty vector.
            continue
        word = values[0]  # First token is the word
        vector = np.array(values[1:], dtype=np.float32)  # Rest are vector values
        word_to_vec[word] = vector
        words.append(word)
        rows.append(vector)

# Fail with a clear message instead of an IndexError on vectors.shape[1] below
if not rows:
    raise ValueError(f"No word vectors could be parsed from {glove_path}")

# Stack into a single (num_words, dim) float32 matrix
vectors = np.array(rows, dtype=np.float32)
print(f"Loaded {len(words)} word vectors of dimension {vectors.shape[1]}")

In [None]:
# Index sizing is taken directly from the loaded vocabulary and embedding matrix
num_elements = len(words)  # one index entry per word
dim = vectors.shape[1]     # embedding dimension

# Create an empty HNSW index over Euclidean ("l2") space
hnsw_index = hnswlib.Index(space="l2", dim=dim)

# Reserve capacity for every word and set the graph build parameters
hnsw_index.init_index(
    max_elements=num_elements,
    ef_construction=200,
    M=16,
)

# Insert all word vectors in one call. No explicit ids are passed; later cells
# index `words` with the labels returned by queries, so labels are assumed to
# follow row order -- TODO confirm against the hnswlib docs.
hnsw_index.add_items(vectors)

print("HNSW index built successfully!")

In [None]:
# This cell used to be a verbatim copy of the previous one and rebuilt the same
# L2 index a second time. Meanwhile `hnsw_index_cosine`, which
# `find_similar_words_cosine` queries, was never created, so that function
# raised NameError on a fresh kernel. Build the cosine-space index here instead;
# `hnsw_index` from the previous cell is left untouched.
dim = vectors.shape[1]  # Embedding dimension
num_elements = len(words)  # Number of words

# Initialize HNSW index over cosine distance
# NOTE(review): "cosine" is inferred from the variable name used by
# find_similar_words_cosine -- confirm this is the intended space.
hnsw_index_cosine = hnswlib.Index(space="cosine", dim=dim)

# Set up the index with the same build parameters as the L2 index
hnsw_index_cosine.init_index(max_elements=num_elements, ef_construction=200, M=16)

# Add word vectors to the index
hnsw_index_cosine.add_items(vectors)

print("HNSW index built successfully!")

In [None]:
def find_similar_words(query_word, k=6):
    """Print the `k` nearest neighbours of `query_word` from the L2 HNSW index.

    Counterpart of `find_similar_words_cosine` that queries `hnsw_index`
    (built with space="l2") instead of the cosine index. Without this
    definition the example call below raised NameError on a fresh kernel.

    Args:
        query_word: Word to look up in `word_to_vec`.
        k: Number of neighbours to request from the index.

    Returns:
        A "not found" message string when `query_word` is missing from
        `word_to_vec`; otherwise None (the results are printed).
    """
    if query_word not in word_to_vec:
        return f"'{query_word}' not found in vocabulary!"

    # knn_query is given a 2-D batch, so reshape the single vector to (1, dim)
    query_vector = word_to_vec[query_word].reshape(1, -1)
    labels, distances = hnsw_index.knn_query(query_vector, k=k)

    print(f"\nTop {k} words similar to '{query_word}':")
    # Row 0 holds the results for the single query in the batch
    for rank, (label, distance) in enumerate(zip(labels[0], distances[0]), start=1):
        print(f"{rank}. {words[label]} (Distance: {distance:.4f})")


# Example Query
find_similar_words("king", k=5)

In [None]:
# Example Query: call find_similar_words for "king" with k=5
# (same call as the preceding cell)
find_similar_words("king", k=5)

In [None]:
def find_similar_words_cosine(query_word, k=6):
    """Print the `k` nearest neighbours of `query_word` using `hnsw_index_cosine`.

    Args:
        query_word: Word to look up in `word_to_vec`.
        k: Number of neighbours to request from the index.

    Returns:
        A "not found" message string when `query_word` is not a key of
        `word_to_vec`; otherwise None (the ranked results are printed).

    NOTE(review): `hnsw_index_cosine` must already exist in the kernel; its
    construction is not part of this function.
    """
    if query_word not in word_to_vec:
        return f"'{query_word}' not found in vocabulary!"

    # The index is queried with a batch of one vector, shaped (1, dim)
    embedding = word_to_vec[query_word]
    neighbour_ids, neighbour_dists = hnsw_index_cosine.knn_query(
        embedding.reshape(1, -1), k=k
    )

    print(f"\nTop {k} words similar to '{query_word}':")
    # Row 0 of each result array belongs to the single query in the batch
    ranked_hits = zip(neighbour_ids[0], neighbour_dists[0])
    for rank, (word_id, dist) in enumerate(ranked_hits, start=1):
        print(f"{rank}. {words[word_id]} (Distance: {dist:.4f})")

In [None]:
# Cosine-index example query for "king" (top 10).
# Called directly rather than inside a one-iteration loop: the loop added
# nothing and discarded the function's return value, so a "not found" message
# was silently dropped. As the last expression of the cell it is now displayed.
find_similar_words_cosine("king", k=10)

In [None]:
from tqdm import tqdm

# Step 1: Generate 10^5 random query indices.
# A seeded generator makes the query set (and everything derived from it)
# reproducible across kernel restarts. Sampling is with replacement, so the same
# index can be drawn more than once.
num_queries = 10**5
query_seed = 42
rng = np.random.default_rng(query_seed)
query_indices = rng.integers(0, len(words), size=num_queries)

# Step 2: Get the corresponding query vectors
query_vectors = vectors[query_indices]

# Step 3: Use NearestNeighbors to find 100 nearest neighbors
k = 100
nn = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='l2')
nn.fit(vectors)

# Step 4: For each query, get the indices of the 100 nearest neighbors.
# Row i of `neighbor_indices` belongs to query_indices[i].
distances, neighbor_indices = nn.kneighbors(query_vectors)

# Step 5: Store results in a dictionary {query_word: [neighbor_words]}.
# Repeated query indices map to the same word, so the dictionary can hold fewer
# than `num_queries` entries.
query_to_neighbors = {}
for query_idx, neighbor_row in tqdm(
    zip(query_indices, neighbor_indices),
    total=len(query_indices),
    desc="Finding nearest neighbors",
):
    query_to_neighbors[words[query_idx]] = [words[idx] for idx in neighbor_row]

print(f"Stored nearest neighbors for {len(query_to_neighbors)} queries.")


In [None]:
import json

# Persist the {query_word: [neighbor_words]} mapping as JSON in the working directory
with open("query_neighbors.json", "w") as out_file:
    out_file.write(json.dumps(query_to_neighbors))



In [None]:
import json

# Read the query -> neighbor-words mapping back from disk
with open("query_neighbors.json", "r") as in_file:
    query_to_neighbors = json.loads(in_file.read())

print(f"Loaded {len(query_to_neighbors)} queries from file.")

# Step 6: Pair every query word (the keys of the mapping) with its embedding
query_word_vectors = dict(
    (query_name, word_to_vec[query_name]) for query_name in query_to_neighbors
)

print(f"Stored vectors for {len(query_word_vectors)} unique query words.")



In [None]:
# Map each query index to the row of neighbor indices found for it.
# Despite the name, the values are index arrays, not vectors. A query index
# that occurs more than once keeps the row from its last occurrence.
query_to_neighbors_vectors = {
    query_id: neighbor_row
    for query_id, neighbor_row in tqdm(
        zip(query_indices, neighbor_indices),
        total=len(query_indices),
        desc="Finding nearest neighbors",
    )
}


In [None]:
from collections import Counter
import pandas as pd

# Gather the neighbor indices of every query into one flat list
all_neighbors = []
for neighbors in query_to_neighbors_vectors.values():
    all_neighbors.extend(neighbors)

# Tally how many times each index occurs as a neighbor
freq_counter = Counter(all_neighbors)

# Tabulate the counts, most frequent index first
freq_df = pd.DataFrame(freq_counter.items(), columns=["Index", "Frequency"])
freq_df = freq_df.sort_values(by="Frequency", ascending=False)



In [None]:
# Length of the flattened neighbor list (one entry per query/neighbor pair)
print(f"{len(all_neighbors)}")


In [None]:
# Preview both ends of the frequency table: most frequent rows, then least frequent
print(freq_df.head(), freq_df.tail(), sep="\n")

In [None]:
# Mean of the "Frequency" column, i.e. the average count per distinct neighbor index
average = freq_df.loc[:, "Frequency"].mean()
print(f"Average frequency of neighbors: {average}")

In [None]:
# Row count of the frequency table = number of distinct neighbor indices
print(freq_df.shape[0])

In [None]:
# Number of distinct neighbor indices (one row per index in freq_df)
print(len(freq_df.index))

In [None]:
import matplotlib.pyplot as plt
import squarify  # pip install squarify if not already installed
import numpy as np

# Keep only the N most frequent neighbor indices for the bar chart
top_n = 30
freq_top = freq_df.head(top_n)

# 1. Horizontal bar chart of the top-N frequencies
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
bar_ax.barh(freq_top["Index"].astype(str), freq_top["Frequency"], color='skyblue')
bar_ax.set_xlabel("Frequency")
bar_ax.set_ylabel("Index")
bar_ax.set_title(f"Top {top_n} Most Frequent Neighbor Indices")
bar_ax.invert_yaxis()  # Put the highest frequency at the top
bar_fig.tight_layout()
plt.show()

# 2. Cumulative frequency line plot over all indices, ranked by frequency
freq_df_sorted = freq_df.sort_values("Frequency", ascending=False)
cum_freq = np.cumsum(freq_df_sorted["Frequency"])
cum_freq = cum_freq / cum_freq.max()  # Scale so the curve ends at 1

line_fig, line_ax = plt.subplots(figsize=(10, 4))
line_ax.plot(range(len(cum_freq)), cum_freq, marker='o', linestyle='-', color='green')
line_ax.set_title("Cumulative Frequency of Neighbor Indices")
line_ax.set_xlabel("Index Rank")
line_ax.set_ylabel("Cumulative Frequency (Normalized)")
line_ax.grid(True)
line_fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import squarify  # pip install squarify if not already installed
import numpy as np

# Take the N rows with the highest frequency for the bar chart
top_n = 30
freq_top = freq_df.head(top_n)

# 1. Horizontal bar chart: one bar per index, labelled by the index as a string
fig_bars, ax_bars = plt.subplots(figsize=(10, 6))
ax_bars.barh(freq_top["Index"].astype(str), freq_top["Frequency"], color='skyblue')
ax_bars.set(
    xlabel="Frequency",
    ylabel="Index",
    title=f"Top {top_n} Most Frequent Neighbor Indices",
)
ax_bars.invert_yaxis()  # Largest frequency shown first
fig_bars.tight_layout()
plt.show()

# 2. Cumulative share of neighbor occurrences by frequency rank
freq_df_sorted = freq_df.sort_values("Frequency", ascending=False)
cum_freq = np.cumsum(freq_df_sorted["Frequency"])
cum_freq = cum_freq / cum_freq.max()  # Normalize to the 0-1 range

fig_cum, ax_cum = plt.subplots(figsize=(10, 4))
ax_cum.plot(range(len(cum_freq)), cum_freq, marker='o', linestyle='-', color='green')
ax_cum.set(
    title="Cumulative Frequency of Neighbor Indices",
    xlabel="Index Rank",
    ylabel="Cumulative Frequency (Normalized)",
)
ax_cum.grid(True)
fig_cum.tight_layout()
plt.show()
