In [5]:
# 02 — Model training & evaluation
# Evaluate semantic retrieval using embeddings: compute recall@k on a synthetic holdout, fit a small clustering baseline, and leave an optional CV placeholder.


In [6]:
import os, numpy as np, pandas as pd
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
import joblib

# Directory holding the product table and its precomputed embedding matrix.
OUT_DIR = os.path.join("..","backend","models")
df = pd.read_csv(os.path.join(OUT_DIR, "products_with_text.csv"))
emb_path = os.path.join(OUT_DIR, "embeddings.npy")
embeddings = np.load(emb_path)
# Later cells slice `df` and `embeddings` with the same positional indices, so a
# row-count mismatch would silently pair products with the wrong vectors.
if embeddings.shape[0] != len(df):
    raise ValueError(
        f"Row mismatch: {embeddings.shape[0]} embeddings vs {len(df)} product rows; "
        "regenerate embeddings.npy from the current products_with_text.csv"
    )
print("Loaded", embeddings.shape, "embeddings for", len(df), "rows")


Loaded (312, 384) embeddings for 312 rows


In [7]:
# Split row positions rather than rows, so the same index arrays can slice
# both `df` and `embeddings` later on.
idx = np.arange(df.shape[0])
train_idx, test_idx = train_test_split(
    idx,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)
print("Train:", len(train_idx), "Test:", len(test_idx))


Train: 249 Test: 63


In [8]:
# Slice the embedding matrix by split, then build a cosine-distance index over
# the training vectors only; test vectors are used purely as queries.
train_emb, test_emb = embeddings[train_idx], embeddings[test_idx]
neigh = NearestNeighbors(metric='cosine', n_neighbors=20)
neigh = neigh.fit(train_emb)


In [13]:
def recall_at_k(neigh, test_emb, test_idx, train_idx, k=5, data=None):
    """Share of test items whose top-k neighbours include a same-category item.

    "Ground truth" here is a proxy: a retrieved item counts as relevant when its
    `categories` value equals the query item's. Better labels would be user
    clicks or curated similar pairs. Note that each query scores 1 if *any* of
    its k neighbours matches, so this is a hit rate; the name is kept for
    compatibility with existing callers.

    Parameters
    ----------
    neigh : fitted neighbour index exposing `kneighbors(X, n_neighbors=k)` that
        returns `(distances, indices)`, with indices relative to the fitted rows.
    test_emb : query vectors, one row per entry of `test_idx`.
    test_idx : positional row numbers of the query items in the product table.
    train_idx : NumPy array of positional row numbers of the fitted items; it is
        indexed with the neighbour indices to map them back to table rows.
    k : number of neighbours to retrieve per query.
    data : DataFrame with a `categories` column. Defaults to the notebook-level
        `df`, which preserves the original call signature.

    Returns
    -------
    Mean of the per-query 0/1 hits, as returned by `np.mean`.
    """
    frame = df if data is None else data
    # All indices are positions (np.arange / kneighbors output), so look
    # categories up by position instead of by index label; label-based `.loc`
    # only works while the frame keeps its default RangeIndex.
    categories = frame['categories'].to_numpy()
    _, neighbor_pos = neigh.kneighbors(test_emb, n_neighbors=k)
    hits = []
    for i, row in enumerate(neighbor_pos):
        test_cat = categories[test_idx[i]]
        retrieved_cats = categories[train_idx[row]]  # map to original row positions
        # 1 if any retrieved category equals the query's category, else 0
        hits.append(int(test_cat in retrieved_cats))
    return np.mean(hits)


In [14]:
# Report the same-category hit rate at a few neighbourhood sizes.
for k in (1, 3, 5):
    r = recall_at_k(neigh, test_emb, test_idx, train_idx, k=k)
    print(f"Recall@{k}: {r:.3f}")


Recall@1: 0.651
Recall@3: 0.746
Recall@5: 0.762


In [15]:
# Roughly one cluster per 10 products, capped at 10 clusters. Clamp to at least 1:
# len(df)//10 is 0 for fewer than 10 rows, and KMeans rejects n_clusters=0.
k = max(1, min(10, len(df)//10))
km = KMeans(n_clusters=k, random_state=42).fit(embeddings)
# Attach each product's cluster id; the sampling cell below reads this column.
df['cluster'] = km.labels_
# show cluster sizes
print(df['cluster'].value_counts().head())


cluster
2    65
4    49
0    38
9    36
5    34
Name: count, dtype: int64


In [16]:
# Eyeball cluster coherence: print up to three products from each of the
# first six cluster ids, in ascending order.
for c in sorted(df['cluster'].unique())[:6]:
    sample = df.loc[df['cluster'] == c, ['uniq_id', 'title', 'categories']].head(3)
    print("Cluster", c)
    print(sample.to_string(index=False))
    print("---")


Cluster 0
                             uniq_id                                                                                                                                                                                          title                                                                                                               categories
02593e81-5c09-5069-8516-b0b29f439ded                                                                     GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway ['Home & Kitchen', 'Storage & Organization', 'Clothing & Closet Storage', 'Shoe Organizers', 'Free Standing Shoe Racks']
aba4138e-6401-52ca-a099-02e30b638db4 Folews Bathroom Organizer Over The Toilet Storage, 4-Tier Bathroom Shelves Over Toilet Shelf Above Toilet Storage Rack Freestanding Bathroom Space Saver Adjustable Shelves and Baskets, Black                                         ['Home & Kitchen', 'Furn

In [17]:
# Placeholder only: nothing is trained in this cell.
# If product images are saved locally with filenames matching `uniq_id`,
# a transfer-learning step could be added here; the message below lists the steps.
print("CV placeholder: if images are available, use torchvision/resnet18, replace classifier and finetune.")


CV placeholder: if images are available, use torchvision/resnet18, replace classifier and finetune.


In [18]:
# Collect the headline metrics (same keys and order as before) and persist them
# next to the other model artifacts.
res = {
    key: recall_at_k(neigh, test_emb, test_idx, train_idx, k=cutoff)
    for key, cutoff in (("recall@1", 1), ("recall@3", 3), ("recall@5", 5))
}
res["n_products"] = len(df)
joblib.dump(res, os.path.join(OUT_DIR, "evaluation_results.pkl"))
print("Saved evaluation:", res)


Saved evaluation: {'recall@1': np.float64(0.6507936507936508), 'recall@3': np.float64(0.746031746031746), 'recall@5': np.float64(0.7619047619047619), 'n_products': 312}


In [20]:
## Summary
# - Retrieval was evaluated using an exact category match as a proxy relevance label.
# - Recall@k saved to `backend/models/evaluation_results.pkl`.
# - Next: integrate evaluation findings into README and mention limitations (small dataset, category proxy).
