In [2]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Buat dataset 1 juta data 5D
# ===============================
n_data = 1_000_000   # bisa coba 100_000 dulu jika RAM terbatas
dim = 5
X = np.random.random((n_data, dim)).astype(np.float32)

# Query point
query = np.random.random((1, dim)).astype(np.float32)
k = 10

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

start = time.time()
for i in range(n_data):
    ann_index.add_item(i, X[i])
ann_index.build(10)  # 10 trees
build_time = time.time() - start

start = time.time()
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

start = time.time()
faiss_index.add(X)
build_time = time.time() - start

start = time.time()
distances, indices = faiss_index.search(query, k)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

start = time.time()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.time() - start

hnsw_index.set_ef(50)

start = time.time()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")

=== Annoy ===
Build time: 2.5652265548706055 detik
Query time: 0.0 detik
Neighbors: [957551, 587556, 363767, 955526, 102184] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.0030002593994140625 detik
Query time: 0.003116130828857422 detik
Neighbors: [957551 587556 363767 955526 102184] ...

=== HNSW (hnswlib) ===
Build time: 13.674740076065063 detik
Query time: 0.0010013580322265625 detik
Neighbors: [957551 587556 363767 955526 102184] ...


In [3]:
import numpy as np
import time
import pandas as pd
from annoy import AnnoyIndex
import faiss
import hnswlib

def run_all_experiments(n_data, dim, k):
    """
    Fungsi untuk menjalankan perbandingan Annoy, FAISS (Flat), dan HNSWlib
    untuk metrik L2 dan Inner Product.
    """
    print(f"Menyiapkan dataset: {n_data} data, {dim}D...")
    
    # Buat dataset
    np.random.seed(42)
    X = np.random.random((n_data, dim)).astype(np.float32)
    query = np.random.random((1, dim)).astype(np.float32)

    results = []

    # ==========================
    # Skenario 1: L2 (Euclidean)
    # ==========================
    print("\n--- Menjalankan Eksperimen Metrik L2 (Euclidean) ---")
    
    # --- Annoy ---
    ann_index = AnnoyIndex(dim, 'euclidean')
    start = time.time()
    for i in range(n_data):
        ann_index.add_item(i, X[i])
    ann_index.build(10) # 10 trees
    build_time = time.time() - start
    start = time.time()
    ann_index.get_nns_by_vector(query[0], k)
    query_time = time.time() - start
    results.append({"Library": "Annoy", "Metrik Jarak": "L2", "Waktu Build (s)": build_time, "Waktu Query (s)": query_time})

    # --- FAISS (IndexFlatL2) ---
    faiss_index = faiss.IndexFlatL2(dim)
    start = time.time()
    faiss_index.add(X)
    build_time = time.time() - start
    start = time.time()
    faiss_index.search(query, k)
    query_time = time.time() - start
    results.append({"Library": "FAISS (Flat)", "Metrik Jarak": "L2", "Waktu Build (s)": build_time, "Waktu Query (s)": query_time})

    # --- HNSW (hnswlib) ---
    hnsw_index = hnswlib.Index(space='l2', dim=dim)
    start = time.time()
    hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
    hnsw_index.add_items(X)
    build_time = time.time() - start
    hnsw_index.set_ef(50)
    start = time.time()
    hnsw_index.knn_query(query, k=k)
    query_time = time.time() - start
    results.append({"Library": "HNSWlib", "Metrik Jarak": "L2", "Waktu Build (s)": build_time, "Waktu Query (s)": query_time})

    # ==============================
    # Skenario 2: Inner Product (IP)
    # ==============================
    print("\n--- Menjalankan Eksperimen Metrik Inner Product ---")
    # PENTING: Normalisasi data untuk perbandingan IP yang adil (setara Cosine Similarity)
    X_norm = X.copy()
    query_norm = query.copy()
    faiss.normalize_L2(X_norm)
    faiss.normalize_L2(query_norm)
    
    # --- Annoy (menggunakan 'angular' untuk Cosine/IP) ---
    ann_index_ip = AnnoyIndex(dim, 'angular')
    start = time.time()
    for i in range(n_data):
        ann_index_ip.add_item(i, X_norm[i])
    ann_index_ip.build(10)
    build_time = time.time() - start
    start = time.time()
    ann_index_ip.get_nns_by_vector(query_norm[0], k)
    query_time = time.time() - start
    results.append({"Library": "Annoy", "Metrik Jarak": "IP", "Waktu Build (s)": build_time, "Waktu Query (s)": query_time})

    # --- FAISS (IndexFlatIP) ---
    faiss_index_ip = faiss.IndexFlatIP(dim)
    start = time.time()
    faiss_index_ip.add(X_norm)
    build_time = time.time() - start
    start = time.time()
    faiss_index_ip.search(query_norm, k)
    query_time = time.time() - start
    results.append({"Library": "FAISS (Flat)", "Metrik Jarak": "IP", "Waktu Build (s)": build_time, "Waktu Query (s)": query_time})

    # --- HNSW (hnswlib) ---
    hnsw_index_ip = hnswlib.Index(space='ip', dim=dim)
    start = time.time()
    hnsw_index_ip.init_index(max_elements=n_data, ef_construction=200, M=16)
    hnsw_index_ip.add_items(X_norm)
    build_time = time.time() - start
    hnsw_index_ip.set_ef(50)
    start = time.time()
    hnsw_index_ip.knn_query(query_norm, k=k)
    query_time = time.time() - start
    results.append({"Library": "HNSWlib", "Metrik Jarak": "IP", "Waktu Build (s)": build_time, "Waktu Query (s)": query_time})

    # Tampilkan hasil
    df = pd.DataFrame(results)
    print("\n--- Hasil Eksperimen ---")
    print(df.round(6).to_markdown(index=False))


# --- Jalankan Eksperimen ---
run_all_experiments(n_data=1_000_000, dim=5, k=10)

Menyiapkan dataset: 1000000 data, 5D...

--- Menjalankan Eksperimen Metrik L2 (Euclidean) ---

--- Menjalankan Eksperimen Metrik Inner Product ---

--- Hasil Eksperimen ---
| Library      | Metrik Jarak   |   Waktu Build (s) |   Waktu Query (s) |
|:-------------|:---------------|------------------:|------------------:|
| Annoy        | L2             |          2.43682  |             0     |
| FAISS (Flat) | L2             |          0.003    |             0.003 |
| HNSWlib      | L2             |         13.5658   |             0     |
| Annoy        | IP             |          3.07041  |             0     |
| FAISS (Flat) | IP             |          0.003511 |             0.003 |
| HNSWlib      | IP             |         11.9891   |             0     |
