In [1]:
import os
import json
import pandas as pd
import torch
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple, Set

from opensearchpy import OpenSearch, helpers 
from sentence_transformers import SentenceTransformer

from product_search.ingestion import create_bm25_index, create_hnsw_index, bulk_ingest_bm25, bulk_ingest_hnsw

# The project root is taken to be the parent of the current working directory;
# raw and processed data folders hang off <root>/Data.
project_dir = Path.cwd().parent
data_dir = project_dir.joinpath("Data", "RAW")
processed_dir = project_dir.joinpath("Data", "PROCESSED")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- OpenSearch index names: one for BM25, one for HNSW ---
bm25_index_name, hnsw_index_name = "products_bm25", "products_hnsw"

# --- Embedding settings ---
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
vector_field = "embedding"  # vector field name handed to the HNSW helpers

# --- Ingestion ---
batch_size = 1_000  # used for bulk ingestion and, for HNSW, as the encode batch size too

In [3]:
# Client for the OpenSearch node at localhost:9200, reached without TLS
# (certificate verification and the related warning are switched off as well).
client = OpenSearch(
    hosts=[dict(host="localhost", port=9200)],
    timeout=60,
    use_ssl=False,
    verify_certs=False,
    ssl_show_warn=False,
)

In [4]:
# Load the preprocessed inputs from `processed_dir`: the product store plus the
# train and test qrels, each stored as a UTF-8 JSON file.
product_store = json.loads(
    (processed_dir / "product_store.json").read_text(encoding="utf-8")
)
train_qrels_dict = json.loads(
    (processed_dir / "train_qrels.json").read_text(encoding="utf-8")
)
test_qrels_dict = json.loads(
    (processed_dir / "test_qrels.json").read_text(encoding="utf-8")
)

In [5]:
def load_embedder(model_name: str) -> tuple[SentenceTransformer, str]:
    """
    Load a SentenceTransformer on the best available device.

    Device preference is MPS (Apple Silicon GPU), then CUDA, then CPU.

    Args:
        model_name: Model identifier passed straight to ``SentenceTransformer``.

    Returns:
        ``(embedder, device_str)`` where ``device_str`` is one of
        ``"mps"``, ``"cuda"`` or ``"cpu"``.
    """
    # `torch.backends.mps` does not exist on older PyTorch builds; look it up
    # defensively so those builds fall through to CUDA/CPU instead of raising
    # AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if (
        mps_backend is not None
        and mps_backend.is_available()
        and mps_backend.is_built()
    ):
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    embedder = SentenceTransformer(model_name, device=device)
    return embedder, device

In [6]:
# Instantiate the embedding model and report which device load_embedder picked.
embedder, device = load_embedder(embedding_model_name)
print(f"[INFO] SentenceTransformer device = {device}")

# Embedding width reported by the model; passed later as `dim` to the HNSW
# index creation and ingestion helpers.
dim = int(embedder.get_sentence_embedding_dimension())

[INFO] SentenceTransformer device = cuda


In [7]:
# Fix a deterministic ingestion order: the product store's keys, sorted.
product_ids = sorted(product_store)

In [8]:
# Start from a clean slate: drop each target index if it already exists,
# BM25 first and then HNSW.
for _existing_index in (bm25_index_name, hnsw_index_name):
    if client.indices.exists(index=_existing_index):
        client.indices.delete(index=_existing_index)

In [9]:
# Create the BM25 index; the index definition itself is delegated to the
# project helper in product_search.ingestion.
create_bm25_index(client, bm25_index_name)
print(f"[OK] BM25 index ready: {bm25_index_name}")

[OK] BM25 index ready: products_bm25


In [10]:
# Bulk-load the products into the BM25 index in `product_ids` order,
# `batch_size` documents per batch.
bulk_ingest_bm25(
    client,
    index_name=bm25_index_name,
    product_store=product_store,
    product_ids=product_ids,
    batch_size=batch_size,
)
# Sanity check: report the document count the cluster sees for the index.
# NOTE(review): the count API only reflects refreshed documents; presumably
# bulk_ingest_bm25 refreshes the index before returning — confirm.
print(f"[OK] BM25 ingested: {bm25_index_name} (docs={client.count(index=bm25_index_name)['count']})")

BM25 ingest → products_bm25: 100%|██████████████████████████████| 456702/456702 [00:58<00:00, 7850.08docs/s]


[OK] BM25 ingested: products_bm25 (docs=456702)


In [11]:
# Create the HNSW (vector) index. `dim` is the embedding width reported by the
# loaded model; the index is configured for cosine similarity on the faiss engine.
create_hnsw_index(
    client,
    hnsw_index_name,
    dim=dim,
    vector_field=vector_field,
    space_type="cosinesimil",
    engine="faiss",
)
print(f"[OK] HNSW index ready: {hnsw_index_name} (dim={dim})")

[OK] HNSW index ready: products_hnsw (dim=384)


In [12]:
# Embed the products with `embedder` and bulk-load them into the HNSW index.
# The same `batch_size` is used for the bulk batches and for encoding, and
# embeddings are requested normalized (the index was created with cosine similarity).
bulk_ingest_hnsw(
    client,
    index_name=hnsw_index_name,
    product_store=product_store,
    product_ids=product_ids,
    embedder=embedder,
    dim=dim,
    vector_field=vector_field,
    batch_size=batch_size,
    encode_batch_size=batch_size,
    normalize_embeddings=True,
)
# Sanity check: report the document count the cluster sees for the index.
# NOTE(review): the count API only reflects refreshed documents; presumably
# bulk_ingest_hnsw refreshes the index before returning — confirm.
print(f"[OK] HNSW ingested: {hnsw_index_name} (docs={client.count(index=hnsw_index_name)['count']})")

HNSW ingest → products_hnsw: 100%|███████████████████████████████| 456702/456702 [08:51<00:00, 859.24docs/s]


[OK] HNSW ingested: products_hnsw (docs=456702)
