In [None]:
import random
import numpy as np
import pandas as pd
import time
from redis import Redis
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField
from redis.commands.search.field import TagField
from redis.commands.search.query import Query
from redis.commands.search.result import Result
import collections

class color:
    """ANSI escape sequences used to colour and emphasise printed output.

    Wrap text as ``color.<STYLE> + text + color.END``; ``END`` resets the
    terminal back to its default rendering.
    """

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset sequence
    END = '\033[0m'


# Load Amazon Product Data

Truncate selected text fields on load.

In [None]:
NUMBER_PRODUCTS=100000

# Load the product data and build a per-row key from item_id and domain_name
all_prods_df = pd.read_csv("data/product_data.csv")
all_prods_df['primary_key'] = all_prods_df['item_id'] + '-' + all_prods_df['domain_name']

# Treat empty keyword strings as missing, then drop those rows.
# The result is assigned back rather than using replace(..., inplace=True) on the
# selected column: that chained in-place call acts on a temporary object and does
# not update the DataFrame when pandas Copy-on-Write is active.
all_prods_df['item_keywords'] = all_prods_df['item_keywords'].replace('', np.nan)
all_prods_df = all_prods_df.dropna(subset=['item_keywords']).reset_index(drop=True)

# Keep the first NUMBER_PRODUCTS products with non-empty item keywords,
# keyed by their (reset) row position
product_metadata = all_prods_df.head(NUMBER_PRODUCTS).to_dict(orient='index')
product_metadata = collections.OrderedDict(product_metadata)

In [None]:
# Number of products retained for indexing
len(product_metadata)

# Connect to Redis

In [None]:
host = 'vecsim'
port = 6379
redis_conn = Redis(host = host, port = port)

# Creating the client does not open a connection by itself, so send a PING:
# an unreachable server then fails here instead of after the success message.
redis_conn.ping()
print ('Connected to redis')

# Load Previously Created Embeddings

Let's load the vector embeddings for the first 100k products in the dataset.

These embeddings were generated using a pre-trained sentence embedding model:
https://huggingface.co/sentence-transformers/all-distilroberta-v1

In [None]:
%%time
# Read the pre-computed embeddings through the handle opened by `with`,
# so the file is opened once and closed when the block exits.
with open('100k-item-keyword-vectors.npy', 'rb') as f:
    item_keywords_vectors = np.load(f)

# Get Sentence Transformer model

It will be needed later to generate vectors for end user queries

In [None]:
from sentence_transformers import SentenceTransformer
# Pre-trained sentence embedding model used to vectorize end user queries
MODEL_NAME = 'sentence-transformers/all-distilroberta-v1'
model = SentenceTransformer(MODEL_NAME)

# Utility Functions to Load Product Data
Each product will be stored in a redis hash
* **Hash Key** = **'product:{index}:{primary_key}'**


In [None]:
import itertools

def chunk(it, size):
    """Yield successive dicts of at most ``size`` entries built from ``it``.

    ``it`` must produce ``(key, value)`` pairs (for example ``dict.items()``).
    Iteration stops as soon as a slice comes back empty, so the final dict may
    hold fewer than ``size`` entries and an empty input yields nothing.
    """
    pairs = iter(it)
    # iter(callable, sentinel) keeps calling until the callable returns an
    # empty dict, which happens once the pair iterator is exhausted.
    for batch in iter(lambda: dict(itertools.islice(pairs, size)), {}):
        yield batch


def load_vectors(client: "Redis", product_metadata, vector_dict, vector_field_name, batch_size=10000):
    """Store every product as a Redis hash, together with its embedding.

    Products are written in batches; each batch is sent through one
    non-transactional pipeline so that a batch costs a single round trip.

    Args:
        client: Redis client used to create the pipelines.
        product_metadata: mapping of product index -> dict of hash fields.
            Every entry must contain a ``'primary_key'`` field.
        vector_dict: container indexed by the same product index, returning a
            numpy array for each product (converted to float32 bytes here).
        vector_field_name: name of the hash field that receives the vector bytes.
        batch_size: number of products written per pipeline (default 10000).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.

    The entries of ``product_metadata`` are not modified: the vector bytes are
    added to a fresh mapping rather than to the caller's dicts.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    for batch_number, batch in enumerate(chunk(product_metadata.items(), batch_size)):
        print (f'processing batch {batch_number}')
        pipe = client.pipeline(transaction=False)
        for key, item_metadata in batch.items():
            # hash key: product:<index>:<primary_key>
            hashkey = 'product:' + str(key) + ':' + item_metadata['primary_key']

            # hash values: the metadata fields plus the serialized float32 vector,
            # built as a new dict so the caller's metadata stays untouched
            vector_bytes = vector_dict[key].astype(np.float32).tobytes()
            hash_fields = {**item_metadata, vector_field_name: vector_bytes}

            # HSET
            pipe.hset(hashkey, mapping=hash_fields)

        pipe.execute()

# Utility Functions to Create Indexes on Vector field

In [None]:
def create_flat_index (redis_conn,vector_field_name,number_of_vectors, vector_dimensions=512, distance_metric='L2'):
    """Create a search index whose vector field uses the FLAT algorithm.

    Args:
        redis_conn: Redis client; the index is created through ``redis_conn.ft()``.
        vector_field_name: name of the hash field holding the vector bytes.
        number_of_vectors: used for both INITIAL_CAP and BLOCK_SIZE.
        vector_dimensions: length of each vector (DIM), 512 by default.
        distance_metric: DISTANCE_METRIC value, 'L2' by default.

    Besides the vector field, the schema indexes ``product_type`` and
    ``country`` as tags and ``item_name`` and ``item_keywords`` as text.
    """
    flat_attributes = {
        "TYPE": "FLOAT32",
        "DIM": vector_dimensions,
        "DISTANCE_METRIC": distance_metric,
        "INITIAL_CAP": number_of_vectors,
        "BLOCK_SIZE": number_of_vectors,
    }
    schema = [
        VectorField(vector_field_name, "FLAT", flat_attributes),
        TagField("product_type"),
        TextField("item_name"),
        TextField("item_keywords"),
        TagField("country"),
    ]
    redis_conn.ft().create_index(schema)

def create_hnsw_index (redis_conn,vector_field_name,number_of_vectors, vector_dimensions=512, distance_metric='L2',M=40,EF=200):
    """Create a search index whose vector field uses the HNSW algorithm.

    Args:
        redis_conn: Redis client; the index is created through ``redis_conn.ft()``.
        vector_field_name: name of the hash field holding the vector bytes.
        number_of_vectors: value passed as INITIAL_CAP.
        vector_dimensions: length of each vector (DIM), 512 by default.
        distance_metric: DISTANCE_METRIC value, 'L2' by default.
        M: value passed as the HNSW ``M`` attribute (default 40).
        EF: value passed as ``EF_CONSTRUCTION`` (default 200).

    Besides the vector field, the schema indexes ``product_type`` and
    ``country`` as tags and ``item_keywords`` and ``item_name`` as text.
    """
    hnsw_attributes = {
        "TYPE": "FLOAT32",
        "DIM": vector_dimensions,
        "DISTANCE_METRIC": distance_metric,
        "INITIAL_CAP": number_of_vectors,
        "M": M,
        "EF_CONSTRUCTION": EF,
    }
    schema = [
        VectorField(vector_field_name, "HNSW", hnsw_attributes),
        TagField("product_type"),
        TextField("item_keywords"),
        TextField("item_name"),
        TagField("country"),
    ]
    redis_conn.ft().create_index(schema)

# FLAT - Load and Index Product Data
Let's create an index for the item keyword vectors and load information for 100,000 products

**This might take 1-2 minutes**

A FLAT index is used to perform an exact nearest neighbors search. 

A query vector will be compared against all the item keyword vectors in the database

In [None]:
%%time

# Settings shared by the index definition and the loader
NUMBER_PRODUCTS=100000
TEXT_EMBEDDING_DIMENSION=768
ITEM_KEYWORD_EMBEDDING_FIELD='item_keyword_vector'

# Start from an empty database (FLUSHALL removes every key on the server)
redis_conn.flushall()

print (f'Loading and Indexing {NUMBER_PRODUCTS} products...')

# Define the FLAT index first, then write the product hashes
create_flat_index(
    redis_conn,
    vector_field_name=ITEM_KEYWORD_EMBEDDING_FIELD,
    number_of_vectors=NUMBER_PRODUCTS,
    vector_dimensions=TEXT_EMBEDDING_DIMENSION,
    distance_metric='COSINE',
)
load_vectors(
    redis_conn,
    product_metadata=product_metadata,
    vector_dict=item_keywords_vectors,
    vector_field_name=ITEM_KEYWORD_EMBEDDING_FIELD,
)

# FLAT index - FIND The Top K MOST SEMANTICALLY Similar Products
Let's use the brute-force index to find the exact top k nearest neighbors of a given text query

Check the output for 2 very different queries:
* **Query 1** = 'beautifully crafted gems for her. a special occasion'
* **Query 2** = 'Ultra modern cool way to pimp up my phone'

Feel free to experiment with other text queries to match against the item keyword data.









In [None]:
%%time
topK=5
product_query='beautifully crafted gems for her. a special occasion'
#product_query='cool way to pimp up my cell'

# Embed the query text and serialize it as float32 bytes for the KNN parameter
query_vector = model.encode(product_query).astype(np.float32).tobytes()
params_dict = {"vec_param": query_vector}

# KNN over all documents ('*'); the distance is exposed as 'vector_score'
knn_expression = f'*=>[KNN {topK} @{ITEM_KEYWORD_EMBEDDING_FIELD} $vec_param AS vector_score]'
q = (
    Query(knn_expression)
    .sort_by('vector_score')
    .paging(0,topK)
    .return_fields('vector_score','item_name','item_id','item_keywords','country')
    .dialect(2)
)

# Run the search
results = redis_conn.ft().search(q, query_params = params_dict)

# Label / document attribute pairs, in the order they are printed
printed_fields = (
    ('Item Name = ', 'item_name'),
    ('Item Id = ', 'item_id'),
    ('Item keywords = ', 'item_keywords'),
    ('Country = ', 'country'),
    ('Score = ', 'vector_score'),
)

# Print the similar products found
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    for label, attribute in printed_fields:
        print (color.YELLOW + label +  color.END  + getattr(product, attribute))


# HNSW - Load and Index Product Data
Let's try indexing with an HNSW index.
This index is used to calculate Top K Approximate Nearest Neighbors of a given vector

**This might take 1-3 minutes**

In [None]:
%%time
# Settings are defined before they are used, so this cell does not depend on
# NUMBER_PRODUCTS having been set by an earlier cell
ITEM_KEYWORD_EMBEDDING_FIELD='item_keyword_vector'
TEXT_EMBEDDING_DIMENSION=768
NUMBER_PRODUCTS=100000

print ('Loading and Indexing ' +  str(NUMBER_PRODUCTS) + ' products...')

#flush all data
redis_conn.flushall()

#create HNSW index & load vectors
create_hnsw_index(redis_conn, ITEM_KEYWORD_EMBEDDING_FIELD,NUMBER_PRODUCTS,TEXT_EMBEDDING_DIMENSION,'COSINE',M=40,EF=200)
load_vectors(redis_conn,product_metadata,item_keywords_vectors,ITEM_KEYWORD_EMBEDDING_FIELD)


# HNSW - Query The Top 5 semantically Similar Products
Let's repeat the similarity search but this time using the HNSW index

Check the output for 2 very different queries:
* **Query 1** = 'beautifully crafted gem for her. a special occasion'
* **Query 2** = 'Ultra modern cool way to pimp up my cell'



In [None]:
%%time
topK=5
product_query='beautifully crafted gem for her. a special occasion'
#product_query='cool way to pimp up my cell'

# Embed the query text and serialize it as float32 bytes for the KNN parameter
query_vector = model.encode(product_query).astype(np.float32).tobytes()
params_dict = {"vec_param": query_vector}

# KNN over all documents ('*'); the distance is exposed as 'vector_score'
knn_expression = f'*=>[KNN {topK} @{ITEM_KEYWORD_EMBEDDING_FIELD} $vec_param AS vector_score]'
q = (
    Query(knn_expression)
    .sort_by('vector_score')
    .paging(0,topK)
    .return_fields('vector_score','item_name','item_id','item_keywords','country')
    .dialect(2)
)

# Run the search
results = redis_conn.ft().search(q, query_params = params_dict)

# Label / document attribute pairs, in the order they are printed
printed_fields = (
    ('Item Name = ', 'item_name'),
    ('Item Id = ', 'item_id'),
    ('Item keywords = ', 'item_keywords'),
    ('Country = ', 'country'),
    ('Score = ', 'vector_score'),
)

# Print the similar products found
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    for label, attribute in printed_fields:
        print (color.YELLOW + label +  color.END  + getattr(product, attribute))


# HNSW - Hybrid Query: the top 5 most semantically similar products available ONLY in selected markets

Let's repeat our Top 5 search but this time limit to products that meet the following criteria:
* **Listed on** Amazon India (IN) 


This RediSearch query has this form:

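**(@country:{IN})=>[KNN 5 @vector_field_name $query_vector AS vector_score]**

Several markets can be combined in the tag filter, e.g. `@country:{DE|IN|IT}`, and an HNSW runtime parameter such as `EF_RUNTIME 10` can be added inside the KNN clause.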



In [None]:
%%time
topK=5
product_query='beautifully crafted gem for her. a special occasion'
#product_query='cool way to pimp up my cell'

# Embed the query text and serialize it as float32 bytes for the KNN parameter
query_vector = model.encode(product_query).astype(np.float32).tobytes()
params_dict = {"vec_param": query_vector}

# Tag filter on country 'IN' combined with the KNN clause;
# the distance is exposed as 'vector_score'
hybrid_expression = f'(@country:{{IN}})=>[KNN {topK} @{ITEM_KEYWORD_EMBEDDING_FIELD} $vec_param AS vector_score]'
q = (
    Query(hybrid_expression)
    .sort_by('vector_score')
    .paging(0,topK)
    .return_fields('vector_score','item_name','item_id','item_keywords','country')
    .dialect(2)
)

# Run the search
results = redis_conn.ft().search(q, query_params = params_dict)

# Label / document attribute pairs, in the order they are printed
printed_fields = (
    ('Item Name = ', 'item_name'),
    ('Item Id = ', 'item_id'),
    ('Item keywords = ', 'item_keywords'),
    ('Score = ', 'vector_score'),
    ('Country = ', 'country'),
)

# Print the similar products found
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    for label, attribute in printed_fields:
        print (color.YELLOW + label +  color.END  + getattr(product, attribute))
