## Pengantar

Pada percobaan kali ini, kita akan melihat perbedaan ketiga model yang telah kita bahas (Annoy, FAISS, dan HNSW) dan membandingkan hasilnya.

In [2]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Build a dataset of 1 million 5-D points
# ===============================
# A fixed seed makes the data (and therefore the reported neighbours)
# reproducible from run to run, so the libraries are compared on equal terms.
SEED = 42
rng = np.random.default_rng(SEED)

n_data = 1_000_000   # try 100_000 first if RAM is limited
dim = 5
# Sample float32 directly instead of generating float64 and casting it down.
X = rng.random((n_data, dim), dtype=np.float32)

# Query point, kept 2-D with shape (1, dim)
query = rng.random((1, dim), dtype=np.float32)
k = 10  # number of nearest neighbours requested from each index

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

# time.perf_counter() is the monotonic, high-resolution clock intended for
# measuring short intervals; time.time() can be too coarse for a query that
# takes well under a millisecond.
start = time.perf_counter()
# Items are added one row at a time; the item id is the row index in X.
for i, vector in enumerate(X):
    ann_index.add_item(i, vector)
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# include_distances=True -> result is indexed below as neighbors[0] for the ids
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
# NOTE(review): per the FAISS documentation a flat index performs exhaustive
# search and IndexFlatL2 reports squared L2 distances -- confirm before
# comparing raw distance values against the other libraries.
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

# time.perf_counter() is the monotonic, high-resolution clock intended for
# measuring short intervals; time.time() can be too coarse at this scale.
start = time.perf_counter()
faiss_index.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
distances, indices = faiss_index.search(query, k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
# indices has one row per query; row 0 belongs to the single query point
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# time.perf_counter() is the monotonic, high-resolution clock intended for
# measuring short intervals; time.time() can be too coarse at this scale.
# The build time covers both index initialisation and item insertion.
start = time.perf_counter()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# Query-time parameter; set outside the timed sections so it is not measured.
hnsw_index.set_ef(50)

start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
# labels has one row per query; row 0 belongs to the single query point
print("Neighbors:", labels[0][:5], "...")

=== Annoy ===
Build time: 2.9317100048065186 detik
Query time: 0.00013184547424316406 detik
Neighbors: [853841, 606119, 864349, 398449, 693580] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.005504131317138672 detik
Query time: 0.00292205810546875 detik
Neighbors: [853841 606119 864349 398449 693580] ...

=== HNSW (hnswlib) ===
Build time: 14.925785779953003 detik
Query time: 9.775161743164062e-05 detik
Neighbors: [853841 606119 864349 398449 693580] ...


## Tugas

Lakukan percobaan dengan metrik jarak (distance metric) yang berbeda. Catat hasilnya pada tabel yang Anda buat sendiri, seperti pada praktikum 1.

In [3]:
import pandas as pd

# Use a smaller dataset so the metric experiment runs quickly.
# A fixed seed makes the data reproducible, so timings from different runs
# are measured on identical input.
SEED = 42
rng = np.random.default_rng(SEED)

n_data = 10_000
dim = 5
# Sample float32 directly instead of generating float64 and casting it down.
X = rng.random((n_data, dim), dtype=np.float32)
query = rng.random((1, dim), dtype=np.float32)  # single query, shape (1, dim)
k = 10  # number of nearest neighbours requested from each index

# One dict per (method, metric) run is appended here and tabulated at the end.
results = []

# Annoy metrics: build and query one index per metric, recording both timings.
annoy_metrics = ['euclidean', 'angular', 'manhattan']
for metric in annoy_metrics:
    ann_index = AnnoyIndex(dim, metric)
    # time.perf_counter() is the monotonic, high-resolution clock intended
    # for short intervals; time.time() can be too coarse for these timings.
    start = time.perf_counter()
    # Items are added one row at a time; the item id is the row index in X.
    for i, vector in enumerate(X):
        ann_index.add_item(i, vector)
    ann_index.build(10)  # 10 trees
    build_time = time.perf_counter() - start

    start = time.perf_counter()
    neighbors = ann_index.get_nns_by_vector(query[0], k)
    query_time = time.perf_counter() - start

    # Timings are stored pre-formatted so the final table prints fixed decimals.
    results.append({
        'Method': 'Annoy',
        'Metric': metric,
        'Build Time (s)': f"{build_time:.3f}",
        'Query Time (s)': f"{query_time:.6f}"
    })

# FAISS metrics: one flat index per metric, keyed by the label shown in the table.
faiss_metrics = {
    'L2': faiss.IndexFlatL2(dim),
    'L1': faiss.IndexFlat(dim, faiss.METRIC_L1),
    'IP': faiss.IndexFlatIP(dim)
}
for name, index in faiss_metrics.items():
    # time.perf_counter() is the monotonic, high-resolution clock intended
    # for short intervals; time.time() can be too coarse for these timings.
    start = time.perf_counter()
    index.add(X)
    build_time = time.perf_counter() - start

    start = time.perf_counter()
    distances, indices = index.search(query, k)
    query_time = time.perf_counter() - start

    # Timings are stored pre-formatted so the final table prints fixed decimals.
    results.append({
        'Method': 'FAISS',
        'Metric': name,
        'Build Time (s)': f"{build_time:.3f}",
        'Query Time (s)': f"{query_time:.6f}"
    })

# HNSW metrics: build and query one hnswlib index per distance space.
hnsw_metrics = ['l2', 'cosine', 'ip']
for space in hnsw_metrics:
    hnsw_index = hnswlib.Index(space=space, dim=dim)
    # time.perf_counter() is the monotonic, high-resolution clock intended
    # for short intervals; time.time() can be too coarse for these timings.
    # The build time covers both index initialisation and item insertion.
    start = time.perf_counter()
    hnsw_index.init_index(max_elements=n_data, ef_construction=100, M=16)
    hnsw_index.add_items(X)
    build_time = time.perf_counter() - start

    # Query-time parameter; set before the timer starts so it is not measured.
    hnsw_index.set_ef(50)
    start = time.perf_counter()
    labels, distances = hnsw_index.knn_query(query, k=k)
    query_time = time.perf_counter() - start

    # Timings are stored pre-formatted so the final table prints fixed decimals.
    results.append({
        'Method': 'HNSW',
        'Metric': space.upper(),
        'Build Time (s)': f"{build_time:.3f}",
        'Query Time (s)': f"{query_time:.6f}"
    })

# Build the results table: every dict collected in `results` becomes one row,
# and the dict keys become the column headers.
df = pd.DataFrame(data=results)
table_text = df.to_string(index=False)  # render without the row-index column
print(table_text)

Method    Metric Build Time (s) Query Time (s)
 Annoy euclidean          0.033       0.000045
 Annoy   angular          0.034       0.000039
 Annoy manhattan          0.029       0.000055
 FAISS        L2          0.000       0.000058
 FAISS        L1          0.000       0.001231
 FAISS        IP          0.000       0.000094
  HNSW        L2          0.036       0.000067
  HNSW    COSINE          0.037       0.000040
  HNSW        IP          0.138       0.000035
