In [1]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.2.0-py3-none-any.whl (493 kB)
   ---------------------------------------- 0.0/493.7 kB ? eta -:--:--
   -- ------------------------------------ 30.7/493.7 kB 640.0 kB/s eta 0:00:01
   ------------------- -------------------- 235.5/493.7 kB 2.9 MB/s eta 0:00:01
   ---------------------------------------  491.5/493.7 kB 4.4 MB/s eta 0:00:01
   ---------------------------------------- 493.7/493.7 kB 3.4 MB/s eta 0:00:00
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-5.2.0


In [13]:
import os
import pickle

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer


In [17]:
# Load the preprocessed reviews and build the list of documents to embed.
#
# Missing reviews are dropped BEFORE the cast to str: astype(str) would turn
# NaN into the literal text "nan", which passes the non-empty filter below and
# would be embedded as if it were a real review.
df = (
    pd.read_csv("processed_reviews.csv")
    .dropna(subset=["review_text"])
    .assign(review_text=lambda frame: frame["review_text"].astype(str))
)

# Keep only rows with non-whitespace text; reset the index so that row labels
# in `df` coincide with positions in `corpus`.
df = df[df["review_text"].str.strip().str.len() > 0].reset_index(drop=True)

corpus = df["review_text"].tolist()

print("Documents:", len(corpus))

# Stop here if nothing usable was loaded; every later cell needs documents.
if len(corpus) == 0:
    raise RuntimeError(
        "Corpus is empty. The HTML dataset does not contain usable review text. "
        "MiniLM retrieval cannot be executed."
    )

# Show the first 300 characters of the first document as a sanity check.
print("Sample review:\n", corpus[0][:300])


Documents: 61
Sample review:
 this dataset is an updated version of the


In [18]:
# Free-text queries that are ranked against the review corpus.
queries = [
    "battery life", "screen quality", "sound quality",
    "build quality", "camera performance",
]


In [19]:
# Set TOKENIZERS_PARALLELISM to "false" in the process environment before the
# model is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Instantiate the all-MiniLM-L6-v2 sentence encoder on the CPU.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")

print("MiniLM model loaded")


MiniLM model loaded


In [20]:
# Embed every document in the corpus, 32 texts per batch, with no progress bar.
corpus_embeddings = model.encode(corpus, batch_size=32, show_progress_bar=False)

print("Corpus embeddings shape:", corpus_embeddings.shape)

# Fail loudly if the encoder returned zero rows.
if not corpus_embeddings.shape[0]:
    raise RuntimeError("Embedding failed: no document embeddings created")


Corpus embeddings shape: (61, 384)


In [21]:
# Write the embedding matrix to disk with np.save.
np.save(file="corpus_embeddings.npy", arr=corpus_embeddings)
print("Saved corpus_embeddings.npy")


Saved corpus_embeddings.npy


In [22]:
def minilm_rank(embeddings, query, top_k=10, encoder=None):
    """Rank documents against a text query by dot-product score.

    Args:
        embeddings: Array with one row per document embedding.
        query: Query text; it is embedded with ``encoder.encode([query])``.
        top_k: Maximum number of document indices to return. Must be >= 0.
        encoder: Object exposing ``encode(list_of_texts)``. When omitted, the
            notebook-level ``model`` is used, which matches the previous
            behavior.

    Returns:
        Array of row indices into ``embeddings``, highest score first,
        containing at most ``top_k`` entries.

    Raises:
        ValueError: If ``embeddings`` has no rows or ``top_k`` is negative.
    """
    if embeddings.shape[0] == 0:
        raise ValueError("No embeddings available for ranking")

    # A negative top_k would make the final slice drop the *last* |top_k|
    # results instead of limiting the output, so reject it explicitly.
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Resolve the default lazily so the global is only needed when no encoder
    # is passed in.
    if encoder is None:
        encoder = model

    query_emb = encoder.encode([query])[0]

    # NOTE(review): this is a raw dot product; it equals cosine similarity only
    # if both document and query embeddings are unit-normalized — confirm for
    # the model in use.
    scores = embeddings @ query_emb

    # argsort is ascending, so reverse it to put the best match first.
    return np.argsort(scores)[::-1][:top_k]


In [23]:
# Rank the corpus once per query; maps query text -> array of document indices.
minilm_results = {q: minilm_rank(corpus_embeddings, q) for q in queries}

minilm_results


{'battery life': array([24, 40, 26,  2, 57, 43,  3, 56, 25, 33], dtype=int64),
 'screen quality': array([47, 21, 24, 26,  6,  2, 60, 41, 57, 36], dtype=int64),
 'sound quality': array([24, 26, 21, 38,  2, 57, 56, 60, 47, 40], dtype=int64),
 'build quality': array([24, 45, 50, 51, 20,  2, 26, 40, 21,  9], dtype=int64),
 'camera performance': array([24, 21,  6, 26, 47, 40, 41,  2, 31, 56], dtype=int64)}

In [25]:
# Preview the three best-ranked documents for the first query.
query = queries[0]
top_docs = minilm_results[query]

print("Query:", query)
print("\nTop retrieved reviews:\n")

for idx in top_docs[:3]:
    # Only the first 200 characters of each document are shown.
    snippet = corpus[idx][:200]
    print("-", snippet, "\n")


Query: battery life

Top retrieved reviews:

- (34gb) - all 233.1 million reviews 

- - time of the review (unix time) 

- (14.3gb) - subset of the data in which all users and items have at least 5 reviews (75.26 million reviews) 



In [26]:
# Serialize the per-query rankings to disk with pickle.
with open("minilm_results.pkl", mode="wb") as results_file:
    pickle.dump(minilm_results, results_file)

print("Saved minilm_results.pkl")


Saved minilm_results.pkl
