# Basic Vector Search from Scratch

For this project we will implement basic vector search
from scratch using only NumPy.<br/>
This will give us a feel
for what's happening under the hood in vector databases.

In [20]:
import numpy as np
import pandas as pd
from typing import Union
from IPython.display import display

# ----------------------------------------
# 1. Euclidean Distance
# ----------------------------------------
def euclidean_distance(v1: np.ndarray, v2: np.ndarray) -> Union[float, np.ndarray]:
    """Return the Euclidean (L2) distance between ``v1`` and ``v2``.

    The inputs are subtracted using NumPy broadcasting and the norm is taken
    over the last axis of the difference. Two 1-D vectors therefore yield a
    single value, while a 2-D matrix paired with a 1-D vector yields one
    distance per matrix row.
    """
    delta = v1 - v2
    # Vector components live on the trailing axis of the difference.
    last_axis = delta.ndim - 1
    return np.linalg.norm(delta, axis=last_axis)

# Test Euclidean Distance
v1 = np.array([1, 2, 3])
v2 = np.array([4, 5, 6])
expected = np.sqrt(np.sum((v2 - v1) ** 2))
# Compare with a tolerance rather than `==`: two floating-point computations
# of the same quantity are not guaranteed to agree bit-for-bit.
print("✅ Euclidean Distance Test Passed:", np.isclose(euclidean_distance(v1, v2), expected))

# ----------------------------------------
# 2. Cosine Distance
# ----------------------------------------
def cosine_distance(v1: np.ndarray, v2: np.ndarray) -> Union[float, np.ndarray]:
    """Return the cosine distance ``1 - cos(angle)`` between ``v1`` and ``v2``.

    Vector components are taken from the last axis, and the two inputs are
    combined with NumPy broadcasting:

    * two 1-D vectors give a single value;
    * a 2-D matrix and a 1-D vector (in either order) give one distance per
      matrix row;
    * two 2-D matrices of the same shape give one distance per pair of
      corresponding rows.

    No special handling is done for zero-length vectors: the denominator is
    then zero and the division yields NaN.
    """
    # Row-wise dot product over the last axis. ``np.dot`` is not used here
    # because for two 2-D inputs it performs a matrix product instead of
    # pairing corresponding rows.
    dots = np.sum(v1 * v2, axis=-1)
    norms = np.linalg.norm(v1, axis=-1) * np.linalg.norm(v2, axis=-1)
    return 1 - dots / norms

# Test Cosine Distance
# Cross-check the helper against 1 - (v1 . v2) / (|v1| * |v2|) worked out by hand.
v1, v2 = np.array([1, 2, 3]), np.array([4, 5, 6])
norms = np.linalg.norm(v1) * np.linalg.norm(v2)
dot = v1 @ v2
expected = 1 - dot / norms
print(
    "✅ Cosine Distance Test Passed:",
    np.isclose(cosine_distance(v1, v2), expected),
)

# ----------------------------------------
# 3. KNN Search
# ----------------------------------------
def find_nearest_neighbors(query: np.ndarray, vectors: np.ndarray, k: int = 1, distance_metric="euclidean") -> np.ndarray:
    """Return the ``k`` rows of ``vectors`` that are closest to ``query``.

    This is an exhaustive (brute-force) search: the distance from ``query``
    to every row of ``vectors`` is computed and the rows are ranked by it.

    Args:
        query: The vector to search for.
        vectors: 2-D array whose rows are the candidate vectors.
        k: Number of neighbours to return. ``0`` gives an empty result; a
            value larger than the number of rows returns every row.
        distance_metric: ``"euclidean"`` or ``"cosine"``.

    Returns:
        The selected rows of ``vectors``, ordered from nearest to farthest.

    Raises:
        ValueError: If ``k`` is negative or ``distance_metric`` is not one
            of the supported names.
    """
    if k < 0:
        # A negative k would turn the ``[:k]`` slice below into "everything
        # except the last |k| entries", silently returning the wrong number
        # of neighbours.
        raise ValueError(f"k must be non-negative, got {k}")

    if distance_metric == "euclidean":
        distances = euclidean_distance(query, vectors)
    elif distance_metric == "cosine":
        distances = cosine_distance(query, vectors)
    else:
        raise ValueError(f"Unsupported metric: {distance_metric}")

    # argsort ranks every row by ascending distance; keep the k best.
    indices = np.argsort(distances)[:k]
    return vectors[indices, :]

# Test KNN Search
np.random.seed(0)
mat = np.random.randn(1000, 32)
query = np.random.randn(32)
k = 5
neighbors = find_nearest_neighbors(query, mat, k=k, distance_metric="euclidean")
# Verify the result before reporting success: the shape must be (k, dim) and
# the returned rows must be exactly as close as the k smallest distances
# found by sorting every distance.
assert neighbors.shape == (k, mat.shape[1])
assert np.allclose(
    euclidean_distance(query, neighbors),
    np.sort(euclidean_distance(query, mat))[:k],
)
print(f"\n✅ KNN Search Test Passed - Returned shape: {neighbors.shape}")

# Generate random matrix and query
np.random.seed(42)
mat = np.random.randn(1000, 32)
query = np.random.randn(32)
k = 5

# Find nearest neighbors
neighbors = find_nearest_neighbors(query, mat, k=k, distance_metric="euclidean")

# Print query vector
print(f"🔎 Query Vector (shape={query.shape}):\n", query)

# Print the k nearest neighbors
print(f"\n🏃‍♂️ k Nearest Neighbors (shape={neighbors.shape}):")
for i, neighbor in enumerate(neighbors):
    print(f"\nNeighbor {i+1}:\n", neighbor)

# Optional: Print Euclidean distances for clarity
print("\n📏 Euclidean Distances to Neighbors:")
# One vectorized call scores every neighbor row against the query.
distances = euclidean_distance(query, neighbors)
for i, dist in enumerate(distances):
    print(f"Distance to Neighbor {i+1}: {dist:.6f}")


# ----------------------------------------
# 4. Generate Vectors (Normalized/Unnormalized)
# ----------------------------------------
def generate_vectors(num_vectors: int, num_dim: int, normalize: bool = True) -> np.ndarray:
    """Draw a ``(num_vectors, num_dim)`` array of random vectors.

    Components are sampled with ``np.random.rand`` (uniform on ``[0, 1)``)
    from NumPy's global random state. When ``normalize`` is true, each row
    is divided by its own L2 norm so that it has unit length.
    """
    samples = np.random.rand(num_vectors, num_dim)
    if not normalize:
        return samples
    # keepdims=True keeps the norms as a column so they broadcast per row.
    row_lengths = np.linalg.norm(samples, axis=1, keepdims=True)
    samples /= row_lengths
    return samples

# ----------------------------------------
# 5. Exploration: Normalized vs Unnormalized
# ----------------------------------------
query3d = np.array([0.5, 0.5, 0.5])
query3d_norm = query3d / np.linalg.norm(query3d)

unnormalized = generate_vectors(5, 3, normalize=False)
normalized = generate_vectors(5, 3, normalize=True)

# Use the distance helpers defined earlier in this cell; each call scores
# all rows of the matrix against the query at once, so no Python loop over
# rows is needed.
euclidean_raw = euclidean_distance(unnormalized, query3d)
cosine_raw = cosine_distance(query3d, unnormalized)

# NOTE: for unit-length vectors, squared Euclidean distance equals
# 2 * cosine distance, so the two "Normalized" columns rank rows identically.
euclidean_norm = euclidean_distance(normalized, query3d_norm)
cosine_norm = cosine_distance(query3d_norm, normalized)

# ----------------------------------------
# 6. Show Comparison Table
# ----------------------------------------
df = pd.DataFrame({
    "Unnormalized Euclidean": euclidean_raw,
    "Unnormalized Cosine": cosine_raw,
    "Normalized Euclidean": euclidean_norm,
    "Normalized Cosine": cosine_norm
})

print("\n🔍 Comparison: Normalized vs Unnormalized Distances")
display(df)


✅ Euclidean Distance Test Passed: True
✅ Cosine Distance Test Passed: True

✅ KNN Search Test Passed - Returned shape: (5, 32)
🔎 Query Vector (shape=(32,)):
 [ 0.73201427 -1.61156562 -0.67634599 -1.36804796  0.18930106 -1.32334433
 -1.98104718 -0.23747734 -2.20907905  0.25873819 -1.25762952 -0.84057975
 -2.28041597  1.65010221 -1.46291489 -0.36400123 -1.79173859 -0.83566781
  1.75182178  0.29747674 -0.723631   -0.03417348 -0.56055906 -0.09242058
  0.74386872 -0.33109799  0.21999147  0.2364955  -0.32614565 -0.73592599
 -0.52518336  0.69918525]

🏃‍♂️ k Nearest Neighbors (shape=(5, 32)):

Neighbor 1:
 [ 0.12625587 -0.61285799 -1.37762566 -1.55037687 -1.53706799  0.66904474
 -2.49028171 -0.66647993 -0.76565329 -0.28215954 -1.85643155  0.40388809
 -1.27316139  1.13240554 -0.62176569 -0.09561302  0.00377386  0.24424105
  0.03701438 -0.42711055  0.34304853  0.46614713 -0.76513467  0.42636782
  1.1917203   0.7058942   0.93237524  0.17192085 -0.93615045 -0.80964446
  0.31988797  0.94276373]

Ne

Unnamed: 0,Unnormalized Euclidean,Unnormalized Cosine,Normalized Euclidean,Normalized Cosine
0,0.508752,0.190273,0.581355,0.168987
1,0.426603,0.102581,0.51162,0.130878
2,0.399099,0.109357,0.414335,0.085837
3,0.508209,0.066403,0.227314,0.025836
4,0.454771,0.047693,0.36344,0.066044
