In [None]:
# 02_faiss_index_benchmark.ipynb
import faiss
import numpy as np
import time
import pandas as pd
from sentence_transformers import SentenceTransformer

# Encode a synthetic corpus once; every index below is filled from the same vectors.
model = SentenceTransformer("all-MiniLM-L6-v2")
texts = [f"Câu văn thử nghiệm số {i}" for i in range(10000)]
embeddings = np.array(model.encode(texts)).astype('float32')
dim = embeddings.shape[1]

# Index types to compare. The IVF index comes from the factory and is trained in
# the loop below, so the object stored in this dict is the one that is benchmarked.
indexes = {
    "FlatL2": faiss.IndexFlatL2(dim),
    "IVFFlat": faiss.index_factory(dim, "IVF100,Flat"),
    "HNSW": faiss.IndexHNSWFlat(dim, 32)
}

results = []
query = model.encode(["Câu văn thử nghiệm số 50"]).astype("float32")

for name, index in indexes.items():
    # Train whichever index reports that it needs training, instead of
    # matching on the display name and rebuilding a second index.
    if not index.is_trained:
        index.train(embeddings)
    index.add(embeddings)

    # perf_counter is monotonic and high-resolution, which matters for a
    # single search that may take well under a millisecond.
    start = time.perf_counter()
    distances, indices = index.search(query, 5)
    latency = (time.perf_counter() - start) * 1000

    results.append({"Index": name, "Latency (ms)": latency, "Top Result ID": indices[0][0]})

pd.DataFrame(results)


In [None]:
"""
Benchmark script: Nomic-embed-text vs SentenceTransformer(all-MiniLM-L6-v2) with FAISS

Instructions:
1. Prepare a comma-separated dataset file with two columns, "id" and "text", and a second file "queries.csv" with columns "id", "query", "relevant_id" (exactly one relevant id per query; relevance lists are not supported by this script).
2. Create a virtualenv and install dependencies (internet required):
   pip install sentence-transformers nomic faiss-cpu tqdm numpy pandas scikit-learn
   - If you have GPU and faiss-gpu, install faiss-gpu instead.
3. Run:
   python benchmark_nomic_vs_minilm_faiss.py --docs docs.csv --queries queries.csv --out results.json

What the script does:
- Loads documents and queries
- Encodes documents and queries with both models
- Builds FAISS index for each embedding type
- Runs k-NN search and computes Recall@k, MRR, Mean latency per query
- Saves results to JSON and prints a summary

Notes:
- nomic-embed-text API usage: imports from nomic
- This script is written to be explicit and easy to adapt for custom datasets
- If the nomic package is not available, the script prints a message and skips that model; the remaining models are still benchmarked.

"""

import argparse
import time
import json
import os
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
from tqdm import tqdm

# Try imports; the user should install these packages in their environment
try:
    from sentence_transformers import SentenceTransformer
except Exception as e:
    SentenceTransformer = None

try:
    # Preferred entry point: the `nomic` client exposes text embeddings as
    # `nomic.embed.text`, which returns a dict holding the vectors under "embeddings".
    from nomic import embed

    def nomic_encode(texts: List[str], model_name: str = "nomic-embed-text") -> np.ndarray:
        """Embed ``texts`` with the nomic client and return a float32 array.

        NOTE(review): the hosted model names are versioned (for example
        "nomic-embed-text-v1.5"); confirm that the name passed by the caller
        is accepted by the installed nomic version.
        """
        response = embed.text(texts, model=model_name)
        return np.array(response["embeddings"], dtype=np.float32)
    nomic_available = True
except Exception:
    try:
        # Fallback loader.
        # NOTE(review): verify that `nomic.embedding.load_model` exists in the
        # nomic version you target; if it does not, this branch simply marks
        # nomic as unavailable.
        import nomic
        from nomic.embedding import load_model

        def nomic_encode(texts: List[str], model_name: str = "nomic-embed-text-v1") -> np.ndarray:
            """Embed ``texts`` with a locally loaded nomic model; returns float32."""
            m = load_model(model_name)
            vs = m.encode(texts)
            return np.array(vs, dtype=np.float32)
        nomic_available = True
    except Exception:
        # Neither entry point could be imported; encode_texts() reports this to the caller.
        nomic_available = False

# FAISS import
try:
    import faiss
except Exception:
    faiss = None

from sklearn.metrics import ndcg_score


def load_docs(path: str) -> pd.DataFrame:
    """Read the documents CSV at ``path`` and check it has 'id' and 'text' columns.

    Returns the loaded DataFrame unchanged. An AssertionError is raised when
    either required column is absent.
    """
    frame = pd.read_csv(path)
    required = ('id', 'text')
    assert all(col in frame.columns for col in required), "docs csv must have columns 'id' and 'text'"
    return frame


def load_queries(path: str) -> pd.DataFrame:
    """Read the queries CSV at ``path`` and check its required columns.

    The file must provide 'id', 'query' and 'relevant_id'; otherwise an
    AssertionError is raised. Returns the loaded DataFrame unchanged.
    """
    frame = pd.read_csv(path)
    required = ('id', 'query', 'relevant_id')
    assert all(col in frame.columns for col in required), "queries csv must have 'id','query','relevant_id'"
    return frame


# Loaded SentenceTransformer instances keyed by model name, so repeated
# encode_texts() calls reuse the weights instead of reloading them each time.
_ST_MODEL_CACHE = {}


def encode_texts(model_name: str, texts: List[str], model_type: str = 'minilm') -> np.ndarray:
    """Encode ``texts`` with the chosen embedding backend.

    Args:
        model_name: Model identifier handed to the backend.
        texts: Strings to embed.
        model_type: 'minilm' (sentence-transformers) or 'nomic'.

    Returns:
        A float32 numpy array of embeddings.

    Raises:
        RuntimeError: The requested backend is not installed/available.
        ValueError: ``model_type`` is neither 'minilm' nor 'nomic'.
    """
    if model_type == 'minilm':
        if SentenceTransformer is None:
            raise RuntimeError("sentence-transformers not installed")
        # Load each model once: the benchmark encodes documents and queries in
        # separate calls, and a second load would be counted as encoding time.
        model = _ST_MODEL_CACHE.get(model_name)
        if model is None:
            model = SentenceTransformer(model_name)
            _ST_MODEL_CACHE[model_name] = model
        vectors = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
        return vectors.astype(np.float32)
    elif model_type == 'nomic':
        if not nomic_available:
            raise RuntimeError("nomic-embed-text not installed or not available")
        return nomic_encode(texts, model_name)
    else:
        raise ValueError('model_type must be minilm or nomic')


def build_faiss_index(vectors: np.ndarray, index_type: str = 'Flat') -> "faiss.Index":
    """Create an empty FAISS index sized for ``vectors``.

    The vectors are only used to read the embedding dimension; the caller is
    responsible for adding them to the returned index.

    Args:
        vectors: 2-D array whose second axis is the embedding dimension.
        index_type: 'Flat' (exact inner-product search) or 'HNSW'.

    Returns:
        A new, empty FAISS index.

    Raises:
        ValueError: ``index_type`` is not one of the supported names.
        RuntimeError: faiss could not be imported.
    """
    # The return annotation is a string so this function can still be defined
    # when the guarded import at the top of the file has set `faiss = None`.
    dim = vectors.shape[1]
    if index_type not in ('Flat', 'HNSW'):
        raise ValueError('unknown index type')
    if faiss is None:
        raise RuntimeError('faiss library not available. Install faiss-cpu or faiss-gpu')
    if index_type == 'Flat':
        return faiss.IndexFlatIP(dim)  # use Inner Product on normalized vectors
    # Built without an explicit metric argument, unlike the inner-product Flat index.
    # NOTE(review): confirm the default metric is the one you want for HNSW.
    return faiss.IndexHNSWFlat(dim, 32)


def normalize_vectors(v: np.ndarray) -> np.ndarray:
    """Scale every row of ``v`` by its Euclidean length.

    Rows whose length is exactly zero are divided by 1e-6 instead, which
    avoids a division by zero and leaves those rows as zeros.
    """
    lengths = np.linalg.norm(v, axis=1, keepdims=True)
    np.putmask(lengths, lengths == 0, 1e-6)
    return v / lengths


def run_search(index: "faiss.Index", query_vecs: np.ndarray, k: int = 10) -> Tuple[np.ndarray, np.ndarray]:
    """Run a k-nearest-neighbour search and return ``(indices, distances)``.

    Note the order: ``index.search`` yields (distances, indices), while this
    helper returns the indices first.

    Args:
        index: Object exposing ``search(query_vecs, k)``.
        query_vecs: Query vectors; for an inner-product index the caller is
            expected to have normalized them the same way as the documents.
        k: Number of neighbours requested per query.
    """
    # The `index` annotation is a string so the function can still be defined
    # when the guarded import at the top of the file has set `faiss = None`.
    D, I = index.search(query_vecs, k)
    return I, D


def _first_hit_rank(ranked_idx, doc_ids: List, relevant):
    """Return the 1-based rank of ``relevant`` within one result row, or None."""
    for rank_pos, idx in enumerate(ranked_idx, start=1):
        # FAISS fills missing neighbours with -1; indexing doc_ids with it
        # would silently resolve to the LAST document and fake a hit.
        if idx < 0:
            continue
        if doc_ids[idx] == relevant:
            return rank_pos
    return None


def compute_metrics(results_idx: np.ndarray, doc_ids: List, query_df: pd.DataFrame, topk: List[int] = [1,5,10]) -> Dict:
    """Compute Recall@k and MRR for single-relevant-document retrieval.

    Args:
        results_idx: (nq, k) array of 0-based positions into ``doc_ids``,
            one row per query, best match first. Negative entries are
            treated as "no result".
        doc_ids: Maps a result position to its document id.
        query_df: DataFrame whose 'relevant_id' column holds the relevant
            document id of each query, in the same row order as ``results_idx``.
        topk: Cut-offs for which recall is reported (read only, never mutated).

    Returns:
        Dict with one 'recall@{k}' entry per cut-off and an 'MRR' entry.
        All values are 0.0 when there are no queries.
    """
    nq = results_idx.shape[0]
    if nq == 0:
        # Nothing to average over; report zeros instead of dividing by zero.
        empty = {f'recall@{k}': 0.0 for k in topk}
        empty['MRR'] = 0.0
        return empty

    relevant_ids = query_df['relevant_id'].tolist()
    # Rank of the relevant document per query (None when it was not retrieved);
    # both metrics are derived from this single pass.
    hit_ranks = [
        _first_hit_rank(results_idx[qi], doc_ids, relevant_ids[qi])
        for qi in range(nq)
    ]

    metrics = {}
    for k in topk:
        hits = sum(1 for rank in hit_ranks if rank is not None and rank <= k)
        metrics[f'recall@{k}'] = hits / nq
    metrics['MRR'] = sum(1.0 / rank for rank in hit_ranks if rank is not None) / nq
    return metrics


def benchmark(docs_path: str, queries_path: str, out_path: str, use_gpu: bool = False):
    """Benchmark each embedding model with a flat FAISS index and save the metrics.

    For every model: encode documents and queries, L2-normalize both, build an
    inner-product index, search, then record Recall@k, MRR and timings.
    Models whose encoding fails are reported and skipped.

    Args:
        docs_path: CSV with columns id,text.
        queries_path: CSV with columns id,query,relevant_id.
        out_path: Destination JSON file for the per-model metrics.
        use_gpu: Move the index to GPU 0 when the installed faiss build
            supports it; otherwise a message is printed and the CPU is used.

    Raises:
        RuntimeError: faiss is not installed.
        ValueError: The documents or queries file has no rows.
    """
    # Fail before any expensive encoding if the search backend is missing.
    if faiss is None:
        raise RuntimeError('faiss library not available. Install faiss-cpu or faiss-gpu')

    docs = load_docs(docs_path)
    queries = load_queries(queries_path)

    doc_texts = docs['text'].tolist()
    doc_ids = docs['id'].tolist()
    query_texts = queries['query'].tolist()
    if not doc_texts or not query_texts:
        raise ValueError('docs and queries must each contain at least one row')

    # Resolve the GPU request once; the resources object must outlive the indexes.
    gpu_resources = None
    if use_gpu:
        if hasattr(faiss, 'StandardGpuResources') and faiss.get_num_gpus() > 0:
            gpu_resources = faiss.StandardGpuResources()
        else:
            print("GPU requested but this faiss build has no usable GPU; using CPU index")

    results = {}

    # Models to evaluate
    models = [
        {'name': 'all-MiniLM-L6-v2', 'type': 'minilm'},
        {'name': 'nomic-embed-text', 'type': 'nomic'}
    ]

    for m in models:
        print(f"\n=== Encoding with {m['name']} ({m['type']}) ===")
        start = time.perf_counter()
        try:
            doc_vecs = encode_texts(m['name'], doc_texts, model_type=m['type'])
            query_vecs = encode_texts(m['name'], query_texts, model_type=m['type'])
        except Exception as e:
            # Best effort: one unavailable model must not abort the others.
            print(f"Skipping model {m['name']} due to error: {e}")
            continue
        encode_time = time.perf_counter() - start
        print(f"Encoding time: {encode_time:.2f}s")

        # Normalize so that inner product equals cosine similarity
        doc_vecs = normalize_vectors(doc_vecs)
        query_vecs = normalize_vectors(query_vecs)

        # Build index
        index = build_faiss_index(doc_vecs, index_type='Flat')
        if gpu_resources is not None:
            index = faiss.index_cpu_to_gpu(gpu_resources, 0, index)
        index.add(doc_vecs)

        # Search; never ask for more neighbours than there are documents,
        # because FAISS would pad the result rows with -1.
        k = min(10, len(doc_texts))
        t0 = time.perf_counter()
        I, D = run_search(index, query_vecs, k=k)
        search_time = time.perf_counter() - t0
        avg_latency = search_time / len(query_texts)

        metrics = compute_metrics(I, doc_ids, queries, topk=[1,5,10])
        metrics.update({'encode_time_s': encode_time, 'search_time_s': search_time, 'avg_query_latency_s': avg_latency})

        results[m['name']] = metrics
        print(f"Results for {m['name']}: {metrics}")

    # Save
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Saved results to {out_path}")


if __name__ == '__main__':
    # Command-line entry point: collect file paths and options, then run the benchmark.
    cli = argparse.ArgumentParser()
    cli.add_argument('--docs', required=True, help='CSV with columns id,text')
    cli.add_argument('--queries', required=True, help='CSV with columns id,query,relevant_id')
    cli.add_argument('--out', default='benchmark_results.json')
    cli.add_argument('--gpu', action='store_true')
    opts = cli.parse_args()

    benchmark(
        docs_path=opts.docs,
        queries_path=opts.queries,
        out_path=opts.out,
        use_gpu=opts.gpu,
    )


In [2]:
import pandas as pd

# Use this cell if you already have a source file (e.g. tiki_raw.csv) with the
# columns name, short_description, id, ...
src = "/srcc/example/faiss/data/books_data.csv"  # change to your source file
docs_out = "docs.csv"
queries_out = "queries.csv"

# Read the source file (if the file is the excerpt you were sent, you can paste it into tiki_raw.csv)
df = pd.read_csv(src)

# Build text = name + ". " + short_description. Missing descriptions become "".
# A source file without the column at all gets an empty one; the previous
# `df.get(..., "")` fallback returned a plain str and crashed on `.fillna`.
if 'short_description' in df.columns:
    df['short_description'] = df['short_description'].fillna("")
else:
    df['short_description'] = ""
df['text'] = df['name'].astype(str) + ". " + df['short_description'].astype(str)

# Keep only id and text
docs = df[['id', 'text']]
docs.to_csv(docs_out, index=False, encoding='utf-8')

# Sample queries (feel free to write better ones; these are derived from the titles)
sample_queries = [
    ("Sách Walden sống một mình trong rừng", 275861063),
    ("Tập Du ký Nam Phong tạp chí", 274468056),
    ("Sách Vạn Dặm Đường Từ Một Bước Chân của Mavis", 273842947),
    ("Sách Du hành cùng Herodotus", 210277405),
    ("Sách Gỗ mun của Ryszard Kapuściński", 210277378),
    ("Nhật ký sáu vạn dặm trên yên xe", 204317934),
    ("Cơm nhà xứ Quảng sách", 196902142),
    ("Sách Con Đường Tơ Lụa từ Pakistan tới Tây An", 193209826),
    ("Du Ký Phan Quang Tiếc Nuối Hoa Hồng", 192333128),
    ("Tác phẩm về Nam Phong tạp chí lịch sử", 274468056),
]

queries_df = pd.DataFrame([
    {"id": i+1, "query": q, "relevant_id": rid} for i, (q, rid) in enumerate(sample_queries)
])
queries_df.to_csv(queries_out, index=False, encoding='utf-8')

print("Saved", docs_out, "and", queries_out)


Saved docs.csv and queries.csv
