# Spotify RAG Index (Colab)
Notebook ini membangun index FAISS dan file metadata dari dataset Spotify (Kaggle) untuk digunakan sebagai RAG di FastAPI.

## Langkah-langkah:
1. Install dependencies.
2. Upload dataset Spotify (CSV).
3. Normalisasi kolom dan pembersihan data.
4. Buat embeddings dengan sentence-transformers.
5. Bangun index FAISS dan simpan ke file.
6. Export output (parquet, index, embeddings, metadata).
7. Download hasil dan copy ke project `data/spotify_index/`.

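Catatan: Dataset harus memiliki kolom wajib berikut (nama alternatif dipisahkan dengan `/`): `track_name/name/title`, `artists/artist_name/artist`, `track_genre/genre/genres`, `valence`, `energy`, `tempo`, `liveness`, `acousticness`, `instrumentalness`, `danceability`, dan `duration_ms/duration`. Kolom `speechiness`, `explicit`, `year/release_year`, `id/track_id/spotify_id`, dan `uri/track_uri` bersifat opsional.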

In [None]:
# 1) Install dependencies
!pip -q install pandas pyarrow numpy sentence-transformers faiss-cpu fastparquet

import os, json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from datetime import datetime

# Model name handed to SentenceTransformer when the embeddings are built.
EMBED_MODEL = 'sentence-transformers/all-mpnet-base-v2'

# Directory that collects every artifact this notebook writes.
OUTPUT_DIR = 'spotify_index_output'
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no error if it already exists

print('Deps OK, output dir:', OUTPUT_DIR)

## Upload dataset CSV
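Upload file CSV dari Kaggle (contoh: `spotify_tracks.csv`). Variabel `CSV_PATH` otomatis diisi dengan nama file pertama yang diunggah; ubah secara manual hanya jika Anda mengunggah lebih dari satu file atau ingin memakai file lain.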

In [None]:
from google.colab import files

# Ask the user to upload the dataset; the keys of the returned mapping are
# used below as the path of the uploaded file.
uploaded = files.upload()

# A cancelled or empty upload leaves nothing to index. Fail with an explicit
# message instead of an opaque IndexError from indexing an empty list.
if not uploaded:
    raise ValueError('Tidak ada file yang diunggah. Jalankan ulang sel ini dan pilih file CSV.')

# The first uploaded file is treated as the dataset.
CSV_PATH = next(iter(uploaded))
print('CSV_PATH:', CSV_PATH)

## Load dan normalisasi kolom
Menyesuaikan berbagai kemungkinan nama kolom dari dataset Spotify.

In [None]:
# Load the uploaded CSV into memory.
df = pd.read_csv(CSV_PATH, low_memory=False)
print('Rows:', len(df))

# Map normalized header (stripped, lower-cased) -> original header so that the
# lookups below tolerate differences in case and stray whitespace.
cols = {str(c).strip().lower(): c for c in df.columns}


def pick(*names):
    """Return the original header of the first candidate found in the CSV.

    Candidates are tried in the order given; returns None when none of them
    is present.
    """
    for n in names:
        if n in cols:
            return cols[n]
    return None


def clean_text(series):
    """Return *series* as stripped strings while keeping missing values missing.

    A plain ``astype(str)`` turns NaN into the literal string 'nan', which
    makes a later ``dropna``/``fillna`` blind to the missing entries. The
    original null mask is re-applied so those entries stay NaN.
    """
    return series.astype(str).str.strip().where(series.notna())


col_title = pick('track_name', 'name', 'title')
col_artist = pick('artists', 'artist_name', 'artist')
col_genre = pick('track_genre', 'genre', 'genres')
col_valence = pick('valence')
col_energy = pick('energy')
col_tempo = pick('tempo')
col_liveness = pick('liveness')
col_acousticness = pick('acousticness')
col_instrumentalness = pick('instrumentalness')
col_danceability = pick('danceability')
col_speechiness = pick('speechiness')
col_duration = pick('duration_ms', 'duration')
col_explicit = pick('explicit')
col_year = pick('year', 'release_year')
col_id = pick('id', 'track_id', 'spotify_id')
col_uri = pick('uri', 'track_uri')

# Required fields keyed by their output name, so the error below can say
# which ones are absent rather than printing a list of None values.
required_by_field = {
    'title': col_title,
    'artist': col_artist,
    'genres': col_genre,
    'valence': col_valence,
    'energy': col_energy,
    'tempo': col_tempo,
    'liveness': col_liveness,
    'acousticness': col_acousticness,
    'instrumentalness': col_instrumentalness,
    'danceability': col_danceability,
    'duration_ms': col_duration,
}
required = list(required_by_field.values())
missing = [field for field, col in required_by_field.items() if col is None]
if missing:
    raise ValueError(f'Kolom penting hilang: {missing}')

# Build the working table. Optional columns that are absent from the CSV are
# filled with a scalar placeholder that pandas broadcasts to every row.
work = pd.DataFrame({
    'title': clean_text(df[col_title]),
    'artist': clean_text(df[col_artist]),
    # Genres: lower-case, unify ';' and '|' separators to ','. The separators
    # are replaced literally (regex=False) so '|' is never read as a regex
    # alternation. Missing genres become an empty string.
    'genres': (
        clean_text(df[col_genre])
        .str.lower()
        .str.replace(';', ',', regex=False)
        .str.replace('|', ',', regex=False)
        .str.strip()
        .fillna('')
    ),
    # Non-numeric values in the numeric columns are coerced to NaN.
    'valence': pd.to_numeric(df[col_valence], errors='coerce'),
    'energy': pd.to_numeric(df[col_energy], errors='coerce'),
    'tempo': pd.to_numeric(df[col_tempo], errors='coerce'),
    'liveness': pd.to_numeric(df[col_liveness], errors='coerce'),
    'acousticness': pd.to_numeric(df[col_acousticness], errors='coerce'),
    'instrumentalness': pd.to_numeric(df[col_instrumentalness], errors='coerce'),
    'danceability': pd.to_numeric(df[col_danceability], errors='coerce'),
    'speechiness': pd.to_numeric(df[col_speechiness], errors='coerce') if col_speechiness else np.nan,
    'duration_ms': pd.to_numeric(df[col_duration], errors='coerce'),
    'explicit': df[col_explicit] if col_explicit else False,
    'year': pd.to_numeric(df[col_year], errors='coerce') if col_year else np.nan,
    'track_id': df[col_id] if col_id else None,
    'uri': df[col_uri] if col_uri else None,
})

# Drop rows that lack any of the essential values. Because missing titles and
# artists are kept as NaN above, they are actually removed here.
work = work.dropna(subset=['title', 'artist', 'duration_ms', 'valence', 'energy', 'tempo', 'liveness', 'acousticness', 'instrumentalness', 'danceability'])

# Remove duplicates, comparing title + artist case-insensitively; the first
# occurrence is kept.
work['key_unique'] = (work['title'].str.lower() + '|' + work['artist'].str.lower())
work = work.drop_duplicates(subset=['key_unique'])

# Text that gets embedded for search: "<title> - <artist> [<genres>]".
work['search_text'] = work['title'] + ' - ' + work['artist'] + ' [' + work['genres'] + ']'

# Renumber rows 0..n-1 after the drops above.
work = work.reset_index(drop=True)
print('Setelah bersih:', len(work))
work.head(3)

## Embeddings dengan Sentence-Transformers

In [None]:
# Load the sentence-transformers model selected by EMBED_MODEL.
model = SentenceTransformer(EMBED_MODEL)

# Encode the search text of every track in batches of 256, requesting
# normalized embeddings from the encoder.
search_texts = work['search_text'].tolist()
emb = np.asarray(
    model.encode(
        search_texts,
        batch_size=256,
        show_progress_bar=True,
        normalize_embeddings=True,
    ),
    dtype='float32',
)

# Persist the raw embedding matrix alongside the other artifacts.
embeddings_path = os.path.join(OUTPUT_DIR, 'embeddings.npy')
np.save(embeddings_path, emb)
print('Embeddings shape:', emb.shape)

## Bangun FAISS index (cosine via inner-product)

In [None]:
# Vector dimensionality is the second axis of the embedding matrix.
d = emb.shape[1]

# Flat inner-product index holding every embedding.
index = faiss.IndexFlatIP(d)
index.add(emb)

# Write the index next to the other artifacts and report how many vectors
# it contains.
index_path = os.path.join(OUTPUT_DIR, 'faiss.index')
faiss.write_index(index, index_path)
print('FAISS index ditulis:', index.ntotal)

## Simpan metadata (parquet + json info)

In [None]:
from datetime import timezone

# Columns exported for each track; helper columns such as the de-duplication
# key and the search text are left out.
meta_cols = ['title','artist','genres','duration_ms','valence','energy','tempo','liveness','acousticness','instrumentalness','danceability','speechiness','explicit','year','track_id','uri']
meta = work[meta_cols].copy()
meta.to_parquet(os.path.join(OUTPUT_DIR, 'tracks.parquet'), index=False)

# datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC "now"
# and drop the tzinfo so the stored string keeps the same naive ISO format
# that utcnow().isoformat() produced.
created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

# Write a small JSON summary of the export. The encoding is pinned so the
# file does not depend on the platform default.
with open(os.path.join(OUTPUT_DIR, 'metadata.json'), 'w', encoding='utf-8') as f:
    json.dump({
        'created_at': created_at,
        'rows': int(len(meta)),
        'embedding_model': EMBED_MODEL,
        'fields': meta_cols
    }, f, indent=2)
print('Metadata tersimpan:', len(meta))
meta.head(3)

## Zip dan download output

In [None]:
import shutil

# Name of the archive offered for download.
zip_path = 'spotify_rag_index.zip'

# make_archive takes the base name and the format separately, so split the
# file name into 'spotify_rag_index' and 'zip'.
archive_base, archive_ext = os.path.splitext(zip_path)
shutil.make_archive(archive_base, archive_ext.lstrip('.'), OUTPUT_DIR)

# Hand the archive to the browser via the Colab files helper.
files.download(zip_path)
print('Siap diunduh:', zip_path)