In [2]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import os

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the sample catalogue and discard the "Unnamed: 0" column
# (presumably the row index written out when the CSV was saved - confirm).
df = (
    pd.read_csv("../sample_data.csv")
    .drop(columns=["Unnamed: 0"])
)
# Last expression of the cell: show which columns remain.
df.columns

Index(['item_number', 'description', 'price', 'quantity'], dtype='object')

In [16]:
# Preview only the first rows instead of dumping the whole frame into the output.
df.head(10)

Unnamed: 0,item_number,description,price,quantity
0,1,"Apple iPhone 14, 128GB, Midnight Black",799.99,15
1,2,"Samsung Galaxy S22, 256GB, Phantom White",749.5,10
2,3,"Google Pixel 7, 128GB, Obsidian",699.0,8
3,4,"OnePlus 11, 16GB RAM, 256GB, Titan Black",649.99,12
4,5,"Dell XPS 13, Intel i7, 16GB RAM, 512GB SSD",1199.0,5
5,6,"Apple MacBook Air M2, 8GB RAM, 256GB SSD",1049.99,6
6,7,"HP Spectre x360, 13.5 inch OLED, 16GB RAM",1249.0,4
7,8,"Lenovo ThinkPad X1 Carbon Gen 10, 14 inch",1399.0,3
8,9,"Bananas, 1kg pack, organically grown",1.5,40
9,10,"Whole Wheat Bread, 400g, freshly baked",2.49,25


In [19]:


def _l2_normalize(vectors):
    """Return `vectors` as a C-contiguous float32 array whose rows have unit L2 norm.

    With unit-length rows, the inner product computed by `faiss.IndexFlatIP`
    equals cosine similarity. The norm is floored at the smallest positive
    float32 so an all-zero row yields zeros instead of NaN.
    """
    vectors = np.ascontiguousarray(vectors, dtype=np.float32)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / np.maximum(norms, np.finfo(np.float32).tiny)


# Step 2: Convert Descriptions to Vectors using a Pre-trained Model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast & effective

# Generate embeddings. Missing descriptions are encoded as empty strings so the
# encoder only ever receives text and row positions stay aligned with `df`.
description_texts = df['description'].fillna("").astype(str).tolist()
description_embeddings = model.encode(description_texts, convert_to_numpy=True)

# Step 3: Normalize so that inner product == cosine similarity
description_embeddings = _l2_normalize(description_embeddings)

# Step 4: Create FAISS Index
dimension = description_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity

# Step 5: Add Vectors to Index (position i in the index == row i of df)
index.add(description_embeddings)

# Step 6: Perform a Search
query = "vegetables"
query_vector = _l2_normalize(model.encode([query], convert_to_numpy=True))

k = 5  # number of nearest neighbors
distances, indices = index.search(query_vector, k)

# Step 7: Print Results
print("Query:", query)
for i, idx in enumerate(indices[0]):
    # FAISS pads the result with -1 when fewer than k vectors are indexed;
    # df.iloc[-1] would otherwise silently report the last row as a match.
    if idx < 0:
        continue
    row = df.iloc[idx]  # single positional lookup per hit
    print(f"Rank {i+1}: ID={row['item_number']}, Description='{row['description']}', Similarity={distances[0][i]:.4f}")


Query: vegetables
Rank 1: ID=14, Description='Broccoli, 500g, fresh and organic', Similarity=0.6112
Rank 2: ID=18, Description='Potatoes, 2kg, locally sourced', Similarity=0.5423
Rank 3: ID=9, Description='Bananas, 1kg pack, organically grown', Similarity=0.5047
Rank 4: ID=11, Description='Organic Milk, 1L, low-fat', Similarity=0.3445
Rank 5: ID=15, Description='Red Apples, 1kg, imported from Washington', Similarity=0.3234
