<a href="https://colab.research.google.com/github/Putra1688/MachineLearning-2025-22/blob/main/TG6_2341720248_Rangga_Dwi_Saputra_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **JS06 - ANN (Approximate Nearest Neighbors)**

# **PRAKTIKUM 6**

In [2]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file inside the Kaggle dataset that should be loaded.
file_path = "songs_with_attributes_and_lyrics.csv"

# Load the latest dataset version as a pandas DataFrame.
# dataset_load() is used instead of load_dataset(): kagglehub has deprecated
# the latter and warns on every call to it.
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "bwandowando/spotify-songs-with-attributes-and-lyrics",
  file_path,
  # Handed to the pandas reader: only the first 100,000 rows are parsed.
  pandas_kwargs={"nrows": 100000}
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/bwandowando/spotify-songs-with-attributes-and-lyrics?dataset_version_number=19&file_name=songs_with_attributes_and_lyrics.csv...


100%|██████████| 1.44G/1.44G [00:44<00:00, 35.1MB/s]


First 5 records:                        id             name  \
0  0Prct5TDjAnEgIqbxcldY9                !   
1  2ASl4wirkeYm3OWZxXKYuq               !!   
2  69lcggVPmOr9cvPx9kLiiN  !!! - Interlude   
3  4U7dlZjg1s9pjdppqZy0fm   !!De Repente!!   
4  4v1IBp3Y3rpkWmWzIlkYju   !!De Repente!!   

                               album_name       artists  danceability  energy  \
0                              UNDEN!ABLE  ['HELLYEAH']         0.415  0.6050   
1                                     NaN       Yxngxr1         0.788  0.6480   
2                       Where I Belong EP    ['Glowie']         0.000  0.0354   
3  Un Palo Al Agua (20 Grandes Canciones)   ['Rosendo']         0.657  0.8820   
4                          Fuera De Lugar   ['Rosendo']         0.659  0.8930   

  key  loudness mode  speechiness  acousticness  instrumentalness  liveness  \
0   7   -11.157    1       0.0575       0.00116          0.838000    0.4710   
1   7    -9.135    0       0.3150       0.90000          0.00

In [4]:
import pandas as pd
import numpy as np
import time
import faiss
from annoy import AnnoyIndex
import hnswlib
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from kagglehub import dataset_load, KaggleDatasetAdapter

# Only the first 100,000 rows are read as a sample to keep loading time down.
N_SAMPLES = 100_000
# Only 1,000 query vectors are searched so the benchmark stays quick.
N_QUERIES = 1_000

print(f"Memuat {N_SAMPLES} baris data dari Kaggle...")

try:
    # Read the CSV through kagglehub's pandas adapter, capped at N_SAMPLES rows.
    df = dataset_load(
        KaggleDatasetAdapter.PANDAS,
        "bwandowando/spotify-songs-with-attributes-and-lyrics",
        "songs_with_attributes_and_lyrics.csv",
        pandas_kwargs={"nrows": N_SAMPLES},
    )
except Exception as e:
    # Best-effort fallback: report the failure, then continue with seeded
    # uniform random data so the algorithm comparison below can still run.
    print(f"Error saat memuat data: {e}")
    print("\n⚠️ Menggunakan data acak (dummy) untuk melanjutkan pengujian algoritma.")
    np.random.seed(42)
    D = 9  # number of dimensions (features) in the synthetic frame
    dummy_columns = [f"f{col_no}" for col_no in range(1, D + 1)]
    dummy_values = np.random.random((N_SAMPLES, D))
    df = pd.DataFrame(dummy_values, columns=dummy_columns)


# Numeric audio attributes that make up each song's vector.
features = [
    'danceability', 'energy', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo',
]

# A frame with more than 9 columns is treated as the real Kaggle data and is
# narrowed to the feature columns; otherwise (the 9-column dummy frame) every
# column is used as-is.
has_extra_columns = len(df.columns) > 9
X = df[features].values if has_extra_columns else df.values

# Standardise the features, then cast to float32 (FAISS needs float32 input).
scaler = StandardScaler()
X_standardised = scaler.fit_transform(X)
X_scaled = X_standardised.astype('float32')

# The whole scaled sample is the database that gets indexed ...
X_database = X_scaled
# ... and its first N_QUERIES rows double as the query set.
X_query = X_scaled[:N_QUERIES]
# Number of nearest neighbours requested from every method.
k = 10

print(f"Database Size: {X_database.shape[0]} vektor, Query Size: {X_query.shape[0]} vektor.")
print("--- Memulai Uji Waktu ---")

# ----------------------------------------------------
# 1. Exact NN (brute force) - ground-truth baseline
# ----------------------------------------------------
# time.perf_counter() is used for the interval instead of time.time():
# it is the monotonic, high-resolution clock meant for measuring elapsed
# time, whereas time.time() follows the (adjustable) system clock.
# The measured span covers fitting on the database plus querying X_query.
start = time.perf_counter()
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn.fit(X_database)
# Only the query subset is searched, not the whole database.
dist_exact, idx_exact = nn.kneighbors(X_query)
time_exact = time.perf_counter() - start
print(f"Exact NN done in {time_exact:.3f} s")

# ----------------------------------------------------
# 2. Annoy - items are added and queried one at a time
# ----------------------------------------------------
# time.perf_counter() is used for the interval instead of time.time():
# it is the monotonic, high-resolution clock meant for measuring elapsed
# time. The measured span covers index construction plus all queries.
start = time.perf_counter()
f = X_database.shape[1]  # vector dimensionality
index_annoy = AnnoyIndex(f, 'euclidean')
# Vectors are inserted individually, keyed by their row position.
for i, v in enumerate(X_database):
    index_annoy.add_item(i, v)
index_annoy.build(10) # 10 trees
# One lookup per query vector; each result is a list of k item ids.
idx_annoy = [index_annoy.get_nns_by_vector(v, k) for v in X_query]
time_annoy = time.perf_counter() - start
print(f"Annoy done in {time_annoy:.3f} s")

# ----------------------------------------------------
# 3. HNSW (hnswlib) - graph index, batch insert and batch query
# ----------------------------------------------------
# time.perf_counter() is used for the interval instead of time.time():
# it is the monotonic, high-resolution clock meant for measuring elapsed
# time. NOTE: the measured span includes building the graph
# (init_index + add_items), not just the search, so time_hnsw is a
# build-plus-query figure.
start = time.perf_counter()
D = X_database.shape[1]  # vector dimensionality
p_hnsw = hnswlib.Index(space='l2', dim=D)
p_hnsw.init_index(max_elements=X_database.shape[0], ef_construction=200, M=16)
p_hnsw.add_items(X_database)
p_hnsw.set_ef(200) # search-time ef; set before querying
# Search all query vectors in a single call; the second return value is unused here.
idx_hnsw, _ = p_hnsw.knn_query(X_query, k=k)
time_hnsw = time.perf_counter() - start
print(f"HNSW done in {time_hnsw:.3f} s")

# ----------------------------------------------------
# 4. FAISS IVF (Inverted File Index)
# ----------------------------------------------------
# time.perf_counter() is used for the interval instead of time.time():
# it is the monotonic, high-resolution clock meant for measuring elapsed
# time. The measured span covers training, adding and searching.
start = time.perf_counter()
D = X_database.shape[1]  # vector dimensionality, recomputed from the database
quantizer = faiss.IndexFlatL2(D)
# Arguments are passed positionally (no 'nlist=' / 'metric=' keywords):
# IndexIVFFlat(quantizer, dim, nlist, metric)
index_faiss = faiss.IndexIVFFlat(quantizer, D, 100, faiss.METRIC_L2)
index_faiss.train(X_database) # train on the full database (X_database)
index_faiss.add(X_database)
index_faiss.nprobe = 10
dist_faiss, idx_faiss = index_faiss.search(X_query, k) # search the query subset only
time_faiss = time.perf_counter() - start
print(f"FAISS IVF done in {time_faiss:.3f} s")

# -------------------------------
# Report: timing summary and a sample of the results
# -------------------------------
# Elapsed seconds recorded for each method, one line per method.
timing_report = [
    "\n=== Ringkasan Waktu (detik) ===",
    f"Exact NN : {time_exact:.3f}",
    f"Annoy    : {time_annoy:.3f}",
    f"HNSW     : {time_hnsw:.3f}",
    f"FAISS    : {time_faiss:.3f}",
]
print("\n".join(timing_report))

# First five neighbour ids each method returned for the first query vector;
# the Exact NN row is the brute-force reference to compare the others against.
neighbour_report = [
    "\n=== Perbandingan Hasil (Top-5 Neighbors untuk Query Pertama) ===",
    f"Exact NN: {idx_exact[0][:5]}",
    f"Annoy:    {idx_annoy[0][:5]}",
    f"HNSW:     {idx_hnsw[0][:5]}",
    f"FAISS:    {idx_faiss[0][:5]}",
]
print("\n".join(neighbour_report))

Memuat 100000 baris data dari Kaggle...
Using Colab cache for faster access to the 'spotify-songs-with-attributes-and-lyrics' dataset.
Database Size: 100000 vektor, Query Size: 1000 vektor.
--- Memulai Uji Waktu ---
Exact NN done in 0.834 s
Annoy done in 2.013 s
HNSW done in 13.648 s
FAISS IVF done in 0.140 s

=== Ringkasan Waktu (detik) ===
Exact NN : 0.834
Annoy    : 2.013
HNSW     : 13.648
FAISS    : 0.140

=== Perbandingan Hasil (Top-5 Neighbors untuk Query Pertama) ===
Exact NN: [    0 61511 85956  3836 35205]
Annoy:    [0, 61511, 3836, 35205, 41311]
HNSW:     [    0 61511 85956  3836 35205]
FAISS:    [    0 61511 85956  3836 35205]
