From 553f8ef7cc2d99019cf572b997325ef60f266aaa Mon Sep 17 00:00:00 2001
From: Aaditya <146899562+Aaditya-Chunekar@users.noreply.github.com>
Date: Tue, 7 Oct 2025 15:08:09 +0530
Subject: [PATCH 1/2] Create approx_nearest_neighbours.py

---
 machine_learning/approx_nearest_neighbours.py | 118 ++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 machine_learning/approx_nearest_neighbours.py

diff --git a/machine_learning/approx_nearest_neighbours.py b/machine_learning/approx_nearest_neighbours.py
new file mode 100644
index 000000000000..ee347340f9d1
--- /dev/null
+++ b/machine_learning/approx_nearest_neighbours.py
@@ -0,0 +1,118 @@
+"""
+Approximate Nearest Neighbor (ANN) Search
+https://en.wikipedia.org/wiki/Nearest_neighbor_search#Approximate_nearest_neighbor
+
+ANN search finds "close enough" vectors instead of the exact nearest neighbor,
+which makes it much faster for large datasets.
+
+This implementation uses a simple **random projection hashing** method.
+Steps:
+1. Generate random hyperplanes to hash vectors into buckets.
+2. Place dataset vectors into buckets.
+3. For a query vector, look into its bucket (or the whole dataset if it is empty).
+4. Return the approximate nearest neighbor from those candidates.
+
+Each result contains:
+    1. The nearest (approximate) vector.
+    2. Its distance from the query vector.
+"""
+from __future__ import annotations
+
+import math
+from collections import defaultdict
+
+import numpy as np
+def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
+    """
+    Calculates Euclidean distance between two vectors.
+    >>> euclidean(np.array([1, 2]), np.array([1, 5]))
+    3.0
+    """
+    # zip() pairs coordinates positionally and stops at the shorter input.
+    squared_gaps = ((a - b) ** 2 for a, b in zip(input_a, input_b))
+    return math.sqrt(sum(squared_gaps))
+
+
+class ANN:
+    """
+    Approximate Nearest Neighbor using random projection hashing.
+    """
+
+    def __init__(self, dataset: np.ndarray, n_planes: int = 5, seed: int = 42) -> None:
+        """
+        :param dataset: ndarray of shape (n_samples, n_features)
+        :param n_planes: number of random hyperplanes for hashing
+        :param seed: random seed for reproducibility
+        """
+        self.dataset = dataset
+        self.n_planes = n_planes
+        rng = np.random.default_rng(seed)
+        self.planes = rng.standard_normal((n_planes, dataset.shape[1]))
+        self.buckets: dict[str, list[np.ndarray]] = defaultdict(list)
+        self._build_index()
+
+    def _hash_vector(self, vec: np.ndarray) -> str:
+        """
+        Hash a vector based on which side of each hyperplane it falls on.
+        Returns one bit per plane: "1" if the projection is >= 0, else "0".
+
+        >>> dataset = np.array([[1, 2]])
+        >>> ann = ANN(dataset, n_planes=2, seed=0)
+        >>> h = ann._hash_vector(np.array([1, 2]))
+        >>> isinstance(h, str)
+        True
+        >>> len(h) == ann.n_planes
+        True
+        """
+        signs = (vec @ self.planes.T) >= 0
+        return "".join(["1" if s else "0" for s in signs])
+
+    def _build_index(self) -> None:
+        """
+        Build hash buckets for all dataset vectors.
+
+        >>> dataset = np.array([[0, 0], [1, 1]])
+        >>> ann = ANN(dataset, n_planes=2, seed=0)
+        >>> all(isinstance(k, str) for k in ann.buckets.keys())
+        True
+        >>> sum(len(v) for v in ann.buckets.values()) == len(dataset)
+        True
+        """
+        for vec in self.dataset:
+            self.buckets[self._hash_vector(vec)].append(vec)
+
+    def query(self, query_vectors: np.ndarray) -> list[list[list[float] | float]]:
+        """
+        Find approximate nearest neighbor for query vector(s).
+        :param query_vectors: ndarray of shape (m, n_features)
+        :return: list of [nearest_vector, distance]
+        :raises IndexError: if the dataset has no rows
+
+        >>> dataset = np.array([[0, 0], [1, 1], [2, 2], [10, 10]])
+        >>> ann = ANN(dataset, n_planes=4, seed=0)
+        >>> ann.query(np.array([[0, 1]]))  # doctest: +NORMALIZE_WHITESPACE
+        [[[0, 0], 1.0]]
+        >>> n_buckets = len(ann.buckets)
+        >>> _ = ann.query(np.array([[-3, 7], [5, -1]]))
+        >>> len(ann.buckets) == n_buckets  # querying never adds buckets
+        True
+        """
+        results = []
+        for vec in query_vectors:
+            # .get(), not []: a miss must not insert an empty defaultdict bucket.
+            candidates = self.buckets.get(self._hash_vector(vec))
+            if not candidates:  # fallback: search entire dataset
+                candidates = self.dataset
+            best_vec, best_dist = candidates[0], euclidean(vec, candidates[0])
+            for cand in candidates[1:]:
+                d = euclidean(vec, cand)
+                if d < best_dist:  # strict "<": earliest of tied candidates wins
+                    best_vec, best_dist = cand, d
+            results.append([best_vec.tolist(), best_dist])
+        return results
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
+

From 524dfec145e74201ce07b1539cf68d747892cae1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 7 Oct 2025 09:38:59 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/approx_nearest_neighbours.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/machine_learning/approx_nearest_neighbours.py b/machine_learning/approx_nearest_neighbours.py
index ee347340f9d1..e165fc2a81a8 100644
--- a/machine_learning/approx_nearest_neighbours.py
+++ b/machine_learning/approx_nearest_neighbours.py
@@ -16,12 +16,15 @@
     1. The nearest (approximate) vector.
     2. Its distance from the query vector.
 """
+
 from __future__ import annotations
 
 import math
 from collections import defaultdict
 
 import numpy as np
+
+
 def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
     """
     Calculates Euclidean distance between two vectors.
@@ -114,5 +117,5 @@ def query(self, query_vectors: np.ndarray) -> list[list[list[float] | float]]:
 
 if __name__ == "__main__":
     import doctest
-    doctest.testmod()
 
+    doctest.testmod()