In [None]:
!pip install notebook pandas numpy matplotlib scikit-learn sentence-transformers faiss-cpu pillow

In [None]:
# 01 — Data exploration & embedding index
#Load the dataset, run EDA, compute text embeddings using `all-MiniLM-L6-v2`, and save the embeddings + index.


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import joblib


In [None]:
# Locate the product catalogue. The first candidate is the original default;
# the second covers running the notebook from inside the backend folder, so
# the path no longer has to be edited by hand.
_DATA_CANDIDATES = [
    os.path.join("..", "backend", "data", "products.csv"),
    os.path.join("data", "products.csv"),
]
# Fall back to the default path when neither file exists so that read_csv
# raises its usual FileNotFoundError naming that path.
DATA_PATH = next((p for p in _DATA_CANDIDATES if os.path.isfile(p)), _DATA_CANDIDATES[0])
df = pd.read_csv(DATA_PATH)

# Print the shape separately and end on the frame itself so the notebook
# renders it as a table rather than as the text repr of a tuple.
print("Shape:", df.shape)
df.head()


In [None]:
# Fill missing values and create a combined text field for embedding.
# `title` and `description` are required columns; `categories` is optional.
df['title'] = df['title'].fillna("").astype(str)
df['description'] = df['description'].fillna("").astype(str)
if 'categories' not in df.columns:
    df['categories'] = ""
df['categories'] = df['categories'].fillna("").astype(str)

# Join only the non-empty parts so rows with a missing description or category
# do not end up with dangling separators such as "Title. ." in the embedded text.
df['text'] = [
    ". ".join(part for part in (title.strip(), desc.strip(), cats.strip()) if part)
    for title, desc, cats in zip(df['title'], df['description'], df['categories'])
]

# Reset the index first so that generated ids match the positional row numbers
# that the embedding matrix (and any index built on it) will use.
df = df.reset_index(drop=True)
# add a string id column
if 'uniq_id' not in df.columns:
    df['uniq_id'] = df.index.astype(str)

print("Rows:", len(df))
df[['uniq_id','title','text']].head(10)


In [None]:
# Category counts
print("Category counts:")
print(df['categories'].value_counts().head(10))

# Price stats (if price exists)
has_price = 'price' in df.columns
if has_price:
    print("\nPrice stats:")
    print(df['price'].describe())

# Show up to 5 random examples. `price` is only selected when the column
# exists (matching the guard above), the sample size is capped at the number
# of rows so small datasets do not raise, and the seed makes re-runs repeatable.
SAMPLE_SEED = 42
preview_cols = ['uniq_id', 'title', 'categories'] + (['price'] if has_price else [])
df.sample(n=min(5, len(df)), random_state=SAMPLE_SEED)[preview_cols].transpose()


In [None]:
# Bar chart of the ten most frequent values in the `categories` column.
top_categories = df['categories'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(8, 4))
top_categories.plot.bar(ax=ax)
ax.set_title("Top 10 categories")
ax.set_xlabel("Category")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


In [None]:
# Sentence-embedding model used for every text in this notebook (small & fast).
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Compute one embedding per product text.
texts = df['text'].tolist()
if not texts:
    # Fail early with a clear message instead of an obscure stacking/validation error.
    raise ValueError("No texts to embed: the product dataframe is empty.")

# `encode` batches internally; passing `batch_size` makes BATCH the size the
# model actually sees, which the previous outer chunking loop did not control.
BATCH = 64
embeddings = model.encode(
    texts,
    batch_size=BATCH,
    show_progress_bar=False,
    convert_to_numpy=True,
)
print("Embeddings shape:", embeddings.shape)

# L2-normalize each row so that inner product equals cosine similarity.
embeddings = normalize(embeddings, norm='l2', axis=1)


In [None]:
# Persist the embedding matrix and the dataframe it was computed from.
OUT_DIR = os.path.join("..", "backend", "models")
os.makedirs(OUT_DIR, exist_ok=True)

embeddings_path = os.path.join(OUT_DIR, "embeddings.npy")
products_path = os.path.join(OUT_DIR, "products_with_text.csv")

np.save(embeddings_path, embeddings)
df.to_csv(products_path, index=False)
print("Saved embeddings to", embeddings_path)


In [None]:
# Build and persist a FAISS inner-product index over the normalized embeddings.
# `index` is bound to the FAISS index only after it has been built AND written
# to disk; otherwise it is None, so later cells can test it explicitly and a
# stale or half-built index from an earlier run is never reused.
index = None
try:
    import faiss
    dim = embeddings.shape[1]
    faiss_index = faiss.IndexFlatIP(dim)  # inner-product on normalized vectors = cosine
    faiss_index.add(np.ascontiguousarray(embeddings, dtype='float32'))
    # save index
    faiss.write_index(faiss_index, os.path.join(OUT_DIR, "faiss_index.idx"))
    index = faiss_index
    print("Built and saved FAISS index.")
except Exception as e:
    # Best-effort: report the problem and write the scikit-learn fallback store instead.
    print("FAISS not available or error:", e)
    # fallback: save embeddings + use sklearn.NearestNeighbors later
    joblib.dump({"embeddings": embeddings, "ids": df['uniq_id'].tolist()}, os.path.join(OUT_DIR, "nn_store.pkl"))
    print("Saved embeddings for sklearn fallback.")


In [None]:
# Query example: use the first row's text as the query and list its nearest neighbours.
if df.empty:
    raise ValueError("Cannot run the query example on an empty dataframe.")
query_text = df.loc[0, 'text']
q_emb = model.encode([query_text], convert_to_numpy=True)
# Normalize the query the same way as the corpus embeddings (also safe for a zero vector).
q_emb = normalize(q_emb, norm='l2', axis=1).astype('float32')
k = min(6, len(df))

# Use the FAISS index when one was built; `index` may be undefined or None otherwise.
faiss_index = globals().get("index")
neighbor_pos = None
if faiss_index is not None:
    try:
        scores, positions = faiss_index.search(q_emb, k)
        # FAISS pads missing results with -1; keep only real row positions.
        neighbor_pos = positions[0][positions[0] >= 0]
    except Exception as err:
        # Best-effort: report the failure and fall through to the scikit-learn search.
        print("FAISS search failed, falling back to sklearn:", err)

if neighbor_pos is None:
    # Fallback: exact cosine search on the in-memory embeddings. This avoids
    # reading nn_store.pkl, which only exists when the FAISS build step failed.
    from sklearn.neighbors import NearestNeighbors
    neigh = NearestNeighbors(n_neighbors=k, metric='cosine').fit(embeddings)
    dist, idx = neigh.kneighbors(q_emb)
    neighbor_pos = idx[0]

# Both back-ends return positional row numbers, so index with iloc.
print("Top ids:", df.iloc[neighbor_pos][['uniq_id','title']])


In [None]:
## Summary
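- Embeddings saved: `backend/models/embeddings.npy` (rows aligned with `backend/models/products_with_text.csv`)
- Index saved: `backend/models/faiss_index.idx` (FAISS), or `backend/models/nn_store.pkl` (scikit-learn fallback, written only when FAISS is unavailable or fails)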
- Next: open `02_model_training_and_evaluation.ipynb` to evaluate retrieval and run small ML experiments.
