In [None]:
#Step 1: Install Required Libraries
!pip install faiss-cpu scann datasketch sentence-transformers --quiet

In [None]:
#Step 2: Import Libraries
import random
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
import faiss
from datasketch import MinHash, MinHashLSH
import scann


In [None]:
#Step 3: Generate Mock Amazon Product Dataset
SEED = 42           # fixed seed so Restart & Run All reproduces the same catalogue
NUM_PRODUCTS = 500  # number of mock product descriptions to generate

categories = ["Smartphone", "Laptop", "Headphones", "Smartwatch", "Camera", "Tablet", "Speaker"]
features = [
    "with 4K display", "with long battery life", "with noise cancellation", "with stylus support",
    "with AI features", "with water resistance", "with fast charging", "with dual SIM",
    "with 5G connectivity", "with touchscreen", "with fingerprint sensor", "with triple camera"
]

def generate_product(rng=random):
    """Return one mock product description of the form "<category> <feature>".

    Args:
        rng: Object exposing ``choice(seq)``. Defaults to the global ``random``
            module (the original behaviour); pass a seeded ``random.Random``
            for reproducible output.

    Returns:
        str: A category from ``categories`` and a feature from ``features``,
        joined by a single space.
    """
    return f"{rng.choice(categories)} {rng.choice(features)}"

# A dedicated generator keeps the dataset reproducible without reseeding (and
# thereby disturbing) the global random state used elsewhere.
_product_rng = random.Random(SEED)
product_texts = [generate_product(_product_rng) for _ in range(NUM_PRODUCTS)]
df = pd.DataFrame({'product_description': product_texts})
df.head()

Unnamed: 0,product_description
0,Camera with noise cancellation
1,Smartwatch with water resistance
2,Laptop with stylus support
3,Headphones with stylus support
4,Laptop with noise cancellation


In [None]:
#Step 4: Generate Embeddings
# Sentence-embedding model shared by the corpus and, later, the query.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every product description in one batch call (with a progress bar)
# and hold the result as a NumPy array for the index builders below.
embeddings = np.array(
    model.encode(list(df['product_description']), show_progress_bar=True)
)

In [None]:
#Step 5: Define a Query and Generate Its Embedding

# The free-text query every search backend is evaluated against.
query_text = "Smartphone with fast charging"

# Encode a one-element batch and unpack its only row as the query vector.
(query_embedding,) = model.encode([query_text])


In [None]:
#Step 6: Build Search Indexes for FAISS, LSH, and ScaNN
#6.1 — FAISS (Flat L2 Index)
# FAISS works on C-contiguous float32 matrices, so coerce explicitly instead of
# relying on whatever dtype/layout the embedding step happened to produce.
# (No copy is made when the array already satisfies both requirements.)
faiss_vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
dimension = faiss_vectors.shape[1]

# IndexFlatL2 performs exact brute-force L2 search; it needs no training step.
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(faiss_vectors)
assert faiss_index.ntotal == len(faiss_vectors), "FAISS index is missing vectors"

#6.2 — MinHash LSH (Jaccard Similarity)
# Descriptions are represented by their lower-cased, whitespace-separated words
# (word tokens, not character shingles).
def get_minhash(text, num_perm=128):
    """Build a MinHash signature from the words of ``text``.

    Args:
        text: String to fingerprint; it is lower-cased and split on whitespace.
        num_perm: Number of permutations passed to ``MinHash``.

    Returns:
        MinHash: Signature updated once per word (UTF-8 encoded).
    """
    signature = MinHash(num_perm=num_perm)
    tokens = text.lower().split()
    for token in tokens:
        signature.update(token.encode('utf8'))
    return signature

# Build the LSH index: one signature per product, keyed "item_<row position>"
lsh_index = MinHashLSH(threshold=0.3, num_perm=128)

# Signatures are kept in row order so a key's numeric suffix indexes this list.
minhashes = [get_minhash(desc) for desc in df['product_description']]
for i, m in enumerate(minhashes):
    lsh_index.insert(f"item_{i}", m)

# Signature of the query, built with the same tokenisation as the corpus
query_minhash = get_minhash(query_text)

#6.3 — ScaNN (Cosine Similarity)
# The searcher is built with the "dot_product" measure. A dot product equals
# cosine similarity only for unit-length vectors, so L2-normalise the dataset
# first. The query does not need normalising for ranking purposes: scaling the
# query scales every score by the same positive factor.
embedding_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embedding_norms[embedding_norms == 0] = 1.0  # keep all-zero rows as-is instead of dividing by zero
scann_dataset = (embeddings / embedding_norms).astype(np.float32)

# Create ScaNN searcher: asymmetric-hashing scoring, then re-ranking of the
# top 10 candidates; 10 is also the number of neighbours passed to the builder.
scann_searcher = scann.scann_ops_pybind.builder(scann_dataset, 10, "dot_product")\
    .score_ah(2, anisotropic_quantization_threshold=0.2)\
    .reorder(10)\
    .build()



In [None]:
#Step 7: Perform Search + Benchmark Results
# Every backend returns the same number of hits so the comparison is like-for-like.
TOP_K = 5

# Store results and timing. Only the search itself (plus ranking, where the
# backend needs it) is timed; DataFrame lookups happen outside the timed region.
# time.perf_counter() is used because these intervals are sub-millisecond.
results = {}

# FAISS Search (exact L2). FAISS expects a 2-D float32 batch of queries.
faiss_query = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
start = time.perf_counter()
faiss_distances, faiss_indices = faiss_index.search(faiss_query, k=TOP_K)
faiss_time = time.perf_counter() - start
results['faiss'] = [df['product_description'].iloc[i] for i in faiss_indices[0]]

#  MinHash LSH Search
# lsh_index.query() returns an unordered, unbounded candidate set, so rank the
# candidates by estimated Jaccard similarity to the query and keep the best
# TOP_K. Ties are broken by row position to keep the output deterministic.
start = time.perf_counter()
lsh_result_keys = lsh_index.query(query_minhash)
lsh_candidates = [int(key.rsplit("_", 1)[1]) for key in lsh_result_keys]
lsh_candidates.sort(key=lambda idx: (-query_minhash.jaccard(minhashes[idx]), idx))
lsh_indices = lsh_candidates[:TOP_K]
lsh_time = time.perf_counter() - start
results['lsh'] = [df['product_description'].iloc[i] for i in lsh_indices]

# ScaNN Search
# Pass the 1-D query vector directly; final_num_neighbors overrides the
# neighbour count the searcher was built with so it matches TOP_K.
start = time.perf_counter()
scann_indices, scann_scores = scann_searcher.search(query_embedding, final_num_neighbors=TOP_K)
scann_time = time.perf_counter() - start
results['scann'] = [df['product_description'].iloc[i] for i in scann_indices]


In [None]:
# Timing summary: one line per backend, four decimal places
print(
    "Search Time (seconds):",
    f"FAISS:  {faiss_time:.4f}s",
    f"LSH:    {lsh_time:.4f}s",
    f"ScaNN:  {scann_time:.4f}s",
    sep="\n",
)

# Retrieved descriptions, grouped under an upper-cased backend heading
for method in results:
    print(f"\n{method.upper()} Results:")
    for description in results[method]:
        print(" -", description)


Search Time (seconds):
FAISS:  0.0009s
LSH:    0.0017s
ScaNN:  0.0067s

FAISS Results:
 - Smartphone with fast charging
 - Smartphone with fast charging
 - Smartphone with fast charging
 - Smartphone with fast charging
 - Smartphone with fast charging

LSH Results:
 - Camera with dual SIM
 - Smartphone with triple camera
 - Tablet with fingerprint sensor
 - Speaker with triple camera
 - Smartwatch with fast charging
 - Camera with long battery life
 - Speaker with 4K display
 - Smartphone with touchscreen
 - Camera with dual SIM
 - Headphones with noise cancellation
 - Tablet with dual SIM
 - Speaker with triple camera
 - Camera with dual SIM
 - Camera with long battery life
 - Speaker with long battery life
 - Speaker with dual SIM
 - Tablet with triple camera
 - Speaker with dual SIM
 - Smartphone with touchscreen
 - Camera with 4K display
 - Headphones with fingerprint sensor
 - Speaker with long battery life
 - Camera with triple camera
 - Tablet with long battery life
 - Headphone