In [1]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Buat dataset 1 juta data 5D
# ===============================
n_data = 1_000_000   # bisa coba 100_000 dulu jika RAM terbatas
dim = 5
X = np.random.random((n_data, dim)).astype(np.float32)

# Query point
query = np.random.random((1, dim)).astype(np.float32)
k = 10

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

start = time.time()
for i in range(n_data):
    ann_index.add_item(i, X[i])
ann_index.build(10)  # 10 trees
build_time = time.time() - start

start = time.time()
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

start = time.time()
faiss_index.add(X)
build_time = time.time() - start

start = time.time()
distances, indices = faiss_index.search(query, k)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

start = time.time()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.time() - start

hnsw_index.set_ef(50)

start = time.time()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")


=== Annoy ===
Build time: 17.817012786865234 detik
Query time: 0.0009992122650146484 detik
Neighbors: [619199, 800683, 696865, 831336, 994773] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.008003473281860352 detik
Query time: 0.007004976272583008 detik
Neighbors: [619199 800683 696865 831336 994773] ...

=== HNSW (hnswlib) ===
Build time: 110.47869825363159 detik
Query time: 0.0 detik
Neighbors: [619199 800683 696865 831336 994773] ...


## Percobaan Metrics Berbeda

In [4]:
import pandas as pd

results = []

sizes = [100_000, 1_000_000]
dimension = [2, 5]

for n_data in sizes:
    for dim in dimension:
        # ===============================
        # 1. Buat dataset 1 juta data 5D
        # ===============================
    
        X = np.random.random((n_data, dim)).astype(np.float32)
        
        # Query point
        query = np.random.random((1, dim)).astype(np.float32)
        k = 10
        
        # ===============================
        # 2. Annoy
        # ===============================
        ann_index = AnnoyIndex(dim, 'euclidean')
        
        start = time.time()
        for i in range(n_data):
            ann_index.add_item(i, X[i])
        ann_index.build(10)  # 10 trees
        build_time = time.time() - start
        
        start = time.time()
        neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
        query_time = time.time() - start
    
        results.append({
            "Metric": 'Annoy',
            "Jumlah data": f"{n_data:,}",
            "Dimensi": dim,
            "Build time": f"{build_time} detik",
            "Query time": f"{query_time} detik",
            "Neighbors": f"{neighbors[0][:5]} ...",
        })
        
        # ===============================
        # 3. FAISS (Flat Index)
        # ===============================
        faiss_index = faiss.IndexFlatL2(dim)
        
        start = time.time()
        faiss_index.add(X)
        build_time = time.time() - start
        
        start = time.time()
        distances, indices = faiss_index.search(query, k)
        query_time = time.time() - start
        
        results.append({
            "Metric": 'FAISS',
            "Jumlah data": f"{n_data:,}",
            "Dimensi": dim,
            "Build time": f"{build_time} detik",
            "Query time": f"{query_time} detik",
            "Neighbors": f"{indices[0][:5]} ...",
        })
        
        # ===============================
        # 4. HNSW (hnswlib)
        # ===============================
        hnsw_index = hnswlib.Index(space='l2', dim=dim)
        
        start = time.time()
        hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
        hnsw_index.add_items(X)
        build_time = time.time() - start
        
        hnsw_index.set_ef(50)
        
        start = time.time()
        labels, distances = hnsw_index.knn_query(query, k=k)
        query_time = time.time() - start
        
        results.append({
            "Metric": 'HNSW',
            "Jumlah data": f"{n_data:,}",
            "Dimensi": dim,
            "Build time": f"{build_time} detik",
            "Query time": f"{query_time} detik",
            "Neighbors": f"{labels[0][:5]} ...",
        })

# --- Convert ke DataFrame ---
df = pd.DataFrame(results)

In [5]:
# --- Tampilkan tabel rapi ---
print(df.to_string(index=False))

Metric Jumlah data  Dimensi                  Build time                  Query time                                    Neighbors
 Annoy     100,000        2     2.046398162841797 detik                   0.0 detik      [59728, 37753, 29815, 20959, 92129] ...
 FAISS     100,000        2                   0.0 detik 0.0015110969543457031 detik          [59728 37753 29815 20959 92129] ...
  HNSW     100,000        2      6.58749794960022 detik                   0.0 detik          [59728 37753 29815 20959 92129] ...
 Annoy     100,000        5    2.2665460109710693 detik                   0.0 detik       [59885, 16225, 75248, 1447, 44078] ...
 FAISS     100,000        5  0.001004934310913086 detik 0.0009958744049072266 detik          [59885 16225 75248  1447 44078] ...
  HNSW     100,000        5     7.610845565795898 detik                   0.0 detik          [59885 16225 75248  1447 44078] ...
 Annoy   1,000,000        2    20.684722661972046 detik                   0.0 detik [293335, 1319