# Guided Buying Recommendation System

In [1]:
import warnings
# Silence every warning category for the rest of the session to keep cell output tidy.
# NOTE(review): this also hides pandas and library deprecation warnings raised by
# later cells — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## Environment Setup

In [2]:
# Load env variables

import os
from dotenv import load_dotenv

load_dotenv()

True

## Pinecone DB Setup

In [3]:
#Pinecone client setup
from pinecone import Pinecone, ServerlessSpec

# Read the Pinecone API key from the process environment and build the client
Pinecone_API_Key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=Pinecone_API_Key)

In [4]:
# Create the "amazon-data" index only when it is missing, so the cell can be re-run.
# dimension=768 is the vector width the index is created with.
if "amazon-data" in pc.list_indexes().names():
    print("Index already exists")
else:
    pc.create_index(
        name="amazon-data",
        dimension=768,
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print("Index created")

Index already exists


In [5]:
idx = pc.Index("amazon-data")

## NER Engine

In [6]:
from transformers import pipeline

# Token-classification pipeline used throughout the notebook to pull entity
# mentions out of text. aggregation_strategy="simple" is requested so results come
# back as grouped entities ('entity_group' records); word-piece fragments that
# start with '##' can still show up in the extracted words.
ner_engine = pipeline(
    task="ner",
    model="Babelscape/wikineural-multilingual-ner",
    aggregation_strategy="simple",
)



Device set to use cpu


In [7]:
# Smoke-test the NER pipeline on two sample sentences. Passing a list of texts
# returns one list of entity records per sentence.
entities = ner_engine(["I need a buy a multicolor York Bag", "My name is Pratik"])
entities

[[{'entity_group': 'ORG',
   'score': np.float32(0.6403332),
   'word': 'York',
   'start': 26,
   'end': 30},
  {'entity_group': 'MISC',
   'score': np.float32(0.4894772),
   'word': 'Bag',
   'start': 31,
   'end': 34}],
 [{'entity_group': 'PER',
   'score': np.float32(0.7661619),
   'word': 'Pratik',
   'start': 11,
   'end': 17}]]

## Retriever - Embeddings

In [8]:
from sentence_transformers import SentenceTransformer

retriever = SentenceTransformer("all-mpnet-base-v2")

In [9]:
retriever

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False, 'architecture': 'MPNetModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

## Data Prep

In [10]:
import pandas as pd

In [11]:
df = pd.read_csv("./data/Amazon/amazon_products.csv")

In [12]:
# Discard the columns that are not used by the rest of the notebook
df = df.drop(columns=['asin', 'imgUrl', 'listPrice'])
df.head()

Unnamed: 0,title,productURL,stars,reviews,price,category_id,isBestSeller,boughtInLastMonth
0,"Sion Softside Expandable Roller Luggage, Black...",https://www.amazon.com/dp/B014TMV5YE,4.5,0,139.99,104,False,2000
1,Luggage Sets Expandable PC+ABS Durable Suitcas...,https://www.amazon.com/dp/B07GDLCQXV,4.5,0,169.99,104,False,1000
2,Platinum Elite Softside Expandable Checked Lug...,https://www.amazon.com/dp/B07XSCCZYG,4.6,0,365.49,104,False,300
3,Freeform Hardside Expandable with Double Spinn...,https://www.amazon.com/dp/B08MVFKGJM,4.6,0,291.59,104,False,400
4,Winfield 2 Hardside Expandable Luggage with Sp...,https://www.amazon.com/dp/B01DJLKZBA,4.5,0,174.99,104,False,400


In [13]:
# Category ids to keep (the office-related subset of the catalogue)
cat_ids = [55, 56, 57, 60, 63, 64, 65, 66, 68, 69, 71, 72, 73, 75, 76, 81]

# Keep only rows whose category is in the list. The explicit .copy() makes the
# result an independent frame, so later column assignments act on it directly
# instead of on a slice of the original (the situation pandas reports with
# SettingWithCopyWarning).
df = df[df['category_id'].isin(cat_ids)].copy()

In [14]:
# Cast both text columns to str in a single pass
df = df.astype({'title': str, 'productURL': str})

In [15]:
# Print the distinct Python types found in each text column
for text_column in ('title', 'productURL'):
    print(df[text_column].apply(type).unique())

[<class 'str'>]
[<class 'str'>]


In [16]:
# Renumber the rows from 0 after filtering and discard the old index
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,title,productURL,stars,reviews,price,category_id,isBestSeller,boughtInLastMonth
0,Zmstlamp VLT-HC910LP VLT-HC100LP VLT-EX100LP R...,https://www.amazon.com/dp/B08RYJCM8S,3.8,25,47.99,69,False,0
1,EWO'S ELP50 Replacement Projector Lamp for Eps...,https://www.amazon.com/dp/B084ZMGTH6,4.6,13,59.99,69,False,0
2,VEVOR Movie Screen with Stand 180inch Portable...,https://www.amazon.com/dp/B094VBTRRT,4.2,126,199.99,69,False,0
3,Perlegear UL Listed Full Motion TV Wall Mount ...,https://www.amazon.com/dp/B0B8YZ5TR4,4.8,1864,45.99,69,False,0
4,"LG Remote Magic Remote Control, Compatible wit...",https://www.amazon.com/dp/B08D1VF18M,4.4,8201,31.5,69,False,0


## Extract named entities from product titles

In [17]:
def extract_entities(list_of_text):
    """Run the NER pipeline over each text and collect the entity words.

    Args:
        list_of_text: Iterable of strings. Each string is passed on its own to
            the module-level ``ner_engine``.

    Returns:
        A list with one entry per input text. Each entry is the list of
        ``'word'`` values from the records ``ner_engine`` returned for that
        text, in the order they were returned.
    """
    return [
        [entity['word'] for entity in ner_engine(text)]
        for text in list_of_text
    ]

In [18]:
extract_entities(df['title'].iloc[0:2])

[['Replacement Projector Lamp Bulb', 'Mitsubishi'],
 ['Projector Lamp', 'E', '##pson ELP', '##lite', 'Lamp Bulb']]

## Check if Retriever is working fine

In [19]:
df_batch = df['title'].iloc[0:10]
df_batch

0    Zmstlamp VLT-HC910LP VLT-HC100LP VLT-EX100LP R...
1    EWO'S ELP50 Replacement Projector Lamp for Eps...
2    VEVOR Movie Screen with Stand 180inch Portable...
3    Perlegear UL Listed Full Motion TV Wall Mount ...
4    LG Remote Magic Remote Control, Compatible wit...
5    Mdbebbron 120 inch Projection Screen 16:9 Fold...
6    Google Chromecast (3rd Generation) Media Strea...
7    ELIVED UL Listed Full Motion TV Monitor Wall M...
8    Universal Remote Control Compatible for Samsun...
9    Amazon Basics 16-Gauge Speaker Wire Cable, 100...
Name: title, dtype: object

In [20]:
len(retriever.encode(df_batch).tolist())

10

## Upsert Data into DB

In [21]:
from tqdm import tqdm
def upsert_products(products, index, batch_size=64, max_rows=50000, seed=None):
    """Embed product titles, tag them with entities and upsert them into ``index``.

    This replaces the loop that was previously parked inside a string literal.
    Compared with that loop it never iterates past the available rows, sends at
    most ``max_rows`` rows, works on a copy of each batch, and leaves the
    caller's frame untouched.

    Uses the module-level ``retriever`` (for ``encode``), ``extract_entities``
    and ``tqdm`` defined/imported elsewhere in this notebook.

    Args:
        products: DataFrame with a ``title`` column. Every column of a row,
            plus a ``ner`` list, is stored as that vector's metadata.
        index: Object exposing ``upsert(vectors)``, e.g. the ``idx`` handle.
        batch_size: Number of rows encoded and upserted per request.
        max_rows: Upper bound on the number of rows sent to the index.
        seed: Optional ``random_state`` for the shuffle. ``None`` keeps the
            shuffle unseeded, as in the original loop.

    Returns:
        The number of rows that were upserted.
    """
    total = min(max_rows, len(products))
    if total <= 0:
        return 0

    # Shuffle once so the first `total` rows are a random sample of the frame
    shuffled = products.sample(frac=1, random_state=seed).reset_index(drop=True)

    for start in tqdm(range(0, total, batch_size), desc="Processing batches"):
        stop = min(start + batch_size, total)

        # Independent copy: adding the 'ner' column must not touch `shuffled`
        batch = shuffled.iloc[start:stop].copy()
        titles = batch['title'].tolist()

        # Embeddings
        embeddings = retriever.encode(titles).tolist()

        # NER extraction, de-duplicated per title
        batch['ner'] = [list(set(found)) for found in extract_entities(titles)]

        # Metadata: one dict per row
        metadata = batch.to_dict(orient='records')

        # Vector ids are the row positions in the shuffled frame
        ids = [str(row_id) for row_id in range(start, stop)]

        index.upsert(list(zip(ids, embeddings, metadata)))

    return total


# The call is left disabled, as the original loop was. Uncomment to (re)build
# the index from `df`:
# upsert_products(df, idx)

## Querying

In [22]:
query = "I want to buy a HP laptop"

# Dense vector for the query text, converted to a plain list
emb_qx = retriever.encode(query).tolist()

# Entities mentioned in the query: one input text, so exactly one result list
[ne] = extract_entities([query])
ne

['HP']

In [23]:
# Restrict matches to products that share at least one entity with the query.
# When the query yields no entities, omit the filter instead of sending an
# empty "$in" list, which would match nothing (or be rejected by the service).
xc = idx.query(
    vector=emb_qx,
    top_k=5,
    include_metadata=True,
    filter={"ner": {"$in": ne}} if ne else None,
)

In [24]:
# Show the similarity score, stored entities and title of every match
for match in xc['matches']:
    details = match['metadata']
    print(match['score'], " ", details['ner'], details['title'])

0.746730924   ['JVQ MP', 'Newest Pavilion Laptop', 'Type - C', 'Fingerprint Reader', 'Silver', '##MI', 'Bluetooth', 'Windows 11 Home', 'Intel Core', '##D', 'HP'] HP Newest Pavilion Laptop, 15.6" FHD Screen, Intel Core i5-1135G7 Processor (up to 4.2 GHz), 32GB Memory, 1TB SSD, Fingerprint Reader, Type-C, HDMI, Bluetooth, Windows 11 Home, Silver, JVQ MP
0.743216753   ['HP', 'Webcam', 'PCIe SSD', '##est Laptop', 'Fingerprint Reader', 'Type - C', 'Bluetooth', 'Silver', 'WiFi', 'Windows 11 Home', 'Intel Core', 'HDMI', 'Backlit Keyboard'] HP Newest Laptop, 15.6" Full HD Touchscreen, Intel Core i7-1165G7, 16GB RAM, 1TB PCIe SSD, Backlit Keyboard, Fingerprint Reader, Webcam, WiFi 5, HDMI, Type-C, Bluetooth, Windows 11 Home, Silver
0.733544409   ['Thin', 'Light Laptop', 'Webcam', 'Pink', 'Long Battery Life', 'Windows 11 + 1 Year Microsoft 365', 'Bluetooth', 'Wi - Fi', 'Intel Dual - Core Processor', 'HDMI', 'HP'] HP Premium 14-inch HD Thin and Light Laptop, Intel Dual-Core Processor, 16GB RAM, 6

# GBR Modular

## Search Agent

In [25]:
from src.search_agent import SearchAgent
from src.logging.logger import logging

In [26]:
# Hand the shared index handle, encoder and NER pipeline to the modular agent
searchAgent = SearchAgent(
    pinecone_index=idx,
    retriever_model=retriever,
    ner_engine=ner_engine,
)
logging.info("SearchAgent Initialized!")

In [31]:
import json
from config.constants import user
# Load user profiles. The file is opened as UTF-8 explicitly so that decoding
# does not depend on the platform's default encoding.
with open('config/user_profiles.json', 'r', encoding='utf-8') as f:
    profiles = json.load(f)

# Use specific profile (the key comes from config.constants.user)
user_profile = profiles[user]
user_profile

{'role': 'software_developer',
 'budget': 2500,
 'categories': ['electronics', 'computers']}

In [None]:
# Run the modular search for the current query and user profile
results = searchAgent.search(query=query, user_profile=user_profile)
# Plain string literal: the previous f-prefix had no placeholders to fill
logging.info("Received results")

Batches: 100%|██████████| 1/1 [00:00<00:00, 35.24it/s]


In [33]:
results

[SearchResult(id='33548', title='HP Newest Pavilion Laptop, 15.6" FHD Screen, Intel Core i5-1135G7 Processor (up to 4.2 GHz), 32GB Memory, 1TB SSD, Fingerprint Reader, Type-C, HDMI, Bluetooth, Windows 11 Home, Silver, JVQ MP', price=619.0, stars=4.6, reviews=0, isBestSeller=False, boughtinLastMonth=0, productURL='https://www.amazon.com/dp/B0BNFBXRZX'),
 SearchResult(id='5928', title='HP Newest Laptop, 15.6" Full HD Touchscreen, Intel Core i7-1165G7, 16GB RAM, 1TB PCIe SSD, Backlit Keyboard, Fingerprint Reader, Webcam, WiFi 5, HDMI, Type-C, Bluetooth, Windows 11 Home, Silver', price=774.99, stars=4.0, reviews=0, isBestSeller=False, boughtinLastMonth=0, productURL='https://www.amazon.com/dp/B092W6HR6R'),
 SearchResult(id='24450', title='HP Premium 14-inch HD Thin and Light Laptop, Intel Dual-Core Processor, 16GB RAM, 64GB Storage, Long Battery Life, Webcam, Bluetooth, HDMI, Wi-Fi, Pink, Windows 11 + 1 Year Microsoft 365', price=288.99, stars=4.0, reviews=0, isBestSeller=False, boughtinLa