In [1]:
import os
import json
import pandas as pd
import torch
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple, Set

from opensearchpy import OpenSearch, helpers 
from sentence_transformers import SentenceTransformer

from product_search.ingestion import create_bm25_index, create_hnsw_index, bulk_ingest_bm25, bulk_ingest_hnsw

# Data folders are resolved relative to the parent of the current working directory.
project_dir = Path.cwd().parent
data_dir = project_dir.joinpath("Data", "RAW")
processed_dir = project_dir.joinpath("Data", "PROCESSED")

# Probe CUDA once, report the result, and name device 0 when it is usable.
cuda_ok = torch.cuda.is_available()
print(f'CUDA available: {cuda_ok}')
if cuda_ok:
    print("GPU Name:", torch.cuda.get_device_name(0))

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: True
GPU Name: NVIDIA GeForce RTX 5070


In [2]:
# Names of the two OpenSearch indexes this notebook creates and fills.
bm25_index_name = "products_bm25"
hnsw_index_name = "products_hnsw"

# Shared batch size: passed to both bulk-ingest calls and reused as the encoder batch size.
batch_size = 1000

# Model identifier handed to load_embedder, and the vector field name passed to the
# HNSW index creation and ingest calls.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
vector_field = "embedding"

In [3]:
# OpenSearch client for a node at localhost:9200, with SSL and certificate
# verification switched off.
# NOTE(review): no TLS or credentials are configured — presumably a local dev
# cluster; confirm before pointing this at anything else.
client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    use_ssl=False,
    verify_certs=False,
    ssl_show_warn=False,
    timeout=60,  # presumably seconds per request — confirm against opensearch-py docs
)

In [4]:
# load datasets
def _read_json(file_name: str):
    """Parse one UTF-8 encoded JSON file located in ``processed_dir``."""
    with open(processed_dir / file_name, "r", encoding="utf-8") as handle:
        return json.load(handle)


product_store = _read_json("product_store.json")
train_qrels_dict = _read_json("train_qrels.json")
test_qrels_dict = _read_json("test_qrels.json")

In [5]:
def load_embedder(model_name: str) -> tuple[SentenceTransformer, str]:
    """
    Load a SentenceTransformer on the best available torch device.

    Device preference order: Apple Silicon GPU via MPS ("mps"), then CUDA
    ("cuda"), then "cpu".

    Args:
        model_name: Model identifier passed straight to ``SentenceTransformer``.

    Returns:
        (embedder, device_str), where ``device_str`` is "mps", "cuda" or "cpu".
    """
    # torch builds without the MPS backend do not expose ``torch.backends.mps``;
    # look it up defensively so those builds fall through to CUDA/CPU instead
    # of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available() and mps_backend.is_built():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    embedder = SentenceTransformer(model_name, device=device)
    return embedder, device

In [6]:
# Instantiate the embedding model once and report which device it was placed on.
embedder, device = load_embedder(embedding_model_name)
print(f"SentenceTransformer device = {device}")

# Embedding width reported by the model; passed to the HNSW index creation and ingest calls.
dim = int(embedder.get_sentence_embedding_dimension())

SentenceTransformer device = cuda


In [7]:
# Sorted list of every product ID; the same list is passed to both ingest calls.
product_ids = sorted(product_store)

In [14]:
# Start from a clean slate: remove either index if it already exists on the cluster.
for stale_index in (bm25_index_name, hnsw_index_name):
    if client.indices.exists(index=stale_index):
        client.indices.delete(index=stale_index)

In [15]:
# Create the BM25 index using the helper imported from product_search.ingestion.
create_bm25_index(client, bm25_index_name)
print(f"[OK] BM25 index ready: {bm25_index_name}")

[OK] BM25 index ready: products_bm25


In [16]:
# Ingest every product in product_ids into the BM25 index, batch_size documents per batch.
bulk_ingest_bm25(
    client,
    index_name=bm25_index_name,
    product_store=product_store,
    product_ids=product_ids,
    batch_size=batch_size,
)
# Ask the cluster for the resulting document count as a quick completeness check.
print(f"[OK] BM25 ingested: {bm25_index_name} (docs={client.count(index=bm25_index_name)['count']})")

BM25 ingest → products_bm25: 100%|█████████████████████████████| 456702/456702 [00:39<00:00, 11486.45docs/s]


[OK] BM25 ingested: products_bm25 (docs=456702)


In [17]:
# Create the vector index; dim is the embedding width reported by the loaded model.
create_hnsw_index(
    client,
    hnsw_index_name,
    dim=dim,
    vector_field=vector_field,
    space_type="cosinesimil",
    engine="faiss",
)
print(f"[OK] HNSW index ready: {hnsw_index_name} (dim={dim})")

[OK] HNSW index ready: products_hnsw (dim=384)


In [18]:
# Embed and ingest every product in product_ids into the vector index.
bulk_ingest_hnsw(
    client,
    index_name=hnsw_index_name,
    product_store=product_store,
    product_ids=product_ids,
    embedder=embedder,
    dim=dim,
    vector_field=vector_field,
    batch_size=batch_size,
    # NOTE(review): the encoder batch size reuses the bulk batch size (1000) —
    # confirm this fits device memory for the chosen model.
    encode_batch_size=batch_size,
    # Presumably paired with the index's "cosinesimil" space type — confirm in the helper.
    normalize_embeddings=True,
)
# Ask the cluster for the resulting document count as a quick completeness check.
print(f"[OK] HNSW ingested: {hnsw_index_name} (docs={client.count(index=hnsw_index_name)['count']})")

HNSW ingest → products_hnsw: 100%|███████████████████████████████| 456702/456702 [08:33<00:00, 889.48docs/s]


[OK] HNSW ingested: products_hnsw (docs=456702)


In [8]:
from pprint import pprint

In [9]:
# Sanity check: number of entries loaded from product_store.json.
print(len(product_store))

456702


In [12]:
# Collect the product_store keys carrying the "wands_" prefix and preview the first ten.
matching_keys = [product_id for product_id in product_store if product_id.startswith("wands_")]

print(matching_keys[:10])

['wands_25434', 'wands_12088', 'wands_42931', 'wands_2636', 'wands_42923', 'wands_41156', 'wands_5938', 'wands_5937', 'wands_37072', 'wands_37071']


In [13]:
# Pretty-print one product record to inspect which fields it carries.
pprint(product_store['wands_2636'])

{'average_rating': 5.0,
 'category hierarchy': 'Furniture / Living Room Furniture / Chairs & Seating / '
                       'Recliners',
 'product_class': 'Recliners',
 'product_description': 'this is a chair designed for your barbershop . it is '
                        'suitable for the haircuts and tattoos of men and '
                        'women . it is also for shaving . exquisite design , '
                        'durable structure . this barber chair definitely is '
                        'your best partner in the salon .',
 'product_features': 'design : standard recliner|warrantylength:60 '
                     'days|backheight-seattotopofback:20|upholsterycolor : '
                     'black|backfillmaterial : foam|waterrepellant : water '
                     'resistant|requiredbackclearancetorecline:15|fullyreclineddepth-fronttoback:24|weightcapacity:330|seatfillmaterial '
                     ': foam|recliningtype : '
                     'manual|overallproductwei

In [39]:
# Collect the train_qrels_dict keys carrying the "wands_" prefix and preview the first ten.
# NOTE(review): these are presumably query IDs rather than product IDs — confirm.
matching_keys = [qrels_key for qrels_key in train_qrels_dict if qrels_key.startswith("wands_")]

print(matching_keys[:10])

['wands_0', 'wands_10', 'wands_101', 'wands_103', 'wands_104', 'wands_107', 'wands_108', 'wands_109', 'wands_11', 'wands_110']


In [40]:
# Display the judgement mapping stored under one train_qrels_dict key.
train_qrels_dict['wands_107']

{'wands_11070': 2.0,
 'wands_11242': 2.0,
 'wands_11251': 2.0,
 'wands_11252': 2.0,
 'wands_12102': 2.0,
 'wands_15259': 2.0,
 'wands_15270': 2.0,
 'wands_15431': 2.0,
 'wands_15433': 2.0,
 'wands_15551': 2.0,
 'wands_17891': 2.0,
 'wands_19264': 2.0,
 'wands_19550': 2.0,
 'wands_20484': 2.0,
 'wands_20928': 2.0,
 'wands_22311': 2.0,
 'wands_22561': 2.0,
 'wands_24851': 2.0,
 'wands_2807': 2.0,
 'wands_2817': 2.0,
 'wands_2818': 2.0,
 'wands_28844': 2.0,
 'wands_28845': 2.0,
 'wands_28875': 2.0,
 'wands_28876': 2.0,
 'wands_30669': 2.0,
 'wands_31370': 2.0,
 'wands_32938': 2.0,
 'wands_33044': 2.0,
 'wands_35029': 2.0,
 'wands_3509': 2.0,
 'wands_35562': 2.0,
 'wands_36357': 2.0,
 'wands_37070': 2.0,
 'wands_3732': 2.0,
 'wands_40268': 2.0,
 'wands_42059': 2.0,
 'wands_6286': 2.0,
 'wands_7585': 2.0,
 'wands_8463': 2.0}

In [None]:
def clean_product_store()