In [None]:
import random
import numpy as np
import pandas as pd
import time
from redis import Redis
import redisearch

class color:
    """ANSI escape sequences used to colour and style text printed by the notebook."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set above
    END = '\033[0m'


# Load Amazon Product Data

Truncate selected text fields on load.  

The Max Length supported by the pre-trained sentence embedding generator is 512

In [None]:
# Upper bound (in characters) applied to the free-text columns at load time
MAX_TEXT_LENGTH=512

def auto_truncate(val):
    """Return at most the first MAX_TEXT_LENGTH items of ``val`` (used as a read_csv converter)."""
    truncated = val[0:MAX_TEXT_LENGTH]
    return truncated

# Read the product catalogue, truncating the long text columns while they are parsed
all_prods_df = (
    pd.read_csv(
        "data/product_data.csv",
        converters={
            'bullet_point': auto_truncate,
            'item_keywords': auto_truncate,
            'item_name': auto_truncate,
        },
    )
    # The key is built before the fill below, so a row missing item_id or
    # domain_name gets a NaN key that the fill then turns into ''.
    .assign(primary_key=lambda df: df['item_id'] + '-' + df['domain_name'])
    # Replace every remaining missing value with an empty string
    .fillna('')
)



In [None]:
# Preview the first five rows to sanity-check the load
all_prods_df.head(5)

# Connect to Redis

In [None]:
# Connection settings for the Redis server that hosts the vector indexes
host = 'vecsim'
port = 6379
redis_conn = Redis(host = host, port = port)
# Redis() only builds a client object and does not contact the server; ping()
# forces a round trip so the message below is printed only for a live connection.
redis_conn.ping()
print ('Connected to redis')

# Generate Embeddings

We will use a pre-trained sentence embedding generator from

https://huggingface.co/sentence-transformers/all-distilroberta-v1

In [None]:
# Load the pre-trained sentence-embedding model by its Hugging Face identifier;
# `model` is reused below both to embed the products and to embed search queries.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')


Generate vector representations of the "item_keywords" field for the first 1000 products

The all-distilroberta-v1 model generates a 768-dimensional float vector for a given sentence of up to 512 characters

In [None]:
%%time
# Number of products (taken from the top of the catalogue) to embed and index
NUMBER_PRODUCTS=1000

subset_df = all_prods_df.head(NUMBER_PRODUCTS)
# Encode all keyword strings in one batched call instead of one model call per
# row; list() splits the resulting 2-D array back into one 1-D vector per
# product, so downstream code still sees a list of vectors.
item_keywords_vectors = list(model.encode(subset_df['item_keywords'].tolist()))


### Check the dimensions of one of the vectors generated

In [None]:
# Shape of the first generated embedding
item_keywords_vectors[0].shape

# Utility Functions to Load Product Data
Each product will be stored in a redis hash
* **Hash Key** = **'product:'** + **primary_key** (item id + '-' + domain name)
* **Hash Fields:** 
    * Item Id
    * Item Name
    * Item Keywords (text)
    * Item Keywords vector - 768-float vector
 

In [None]:
def load_vectors(client:Redis, product_df, vector_data,vector_field_name):
    """Store each product row and its embedding in Redis as one hash per product.

    Args:
        client: Redis connection; only ``pipeline()`` is used.
        product_df: DataFrame with ``primary_key``, ``item_id``,
            ``item_keywords`` and ``item_name`` columns.
        vector_data: Sequence of numpy vectors aligned *positionally* with the
            rows of ``product_df`` (first vector belongs to the first row).
        vector_field_name: Hash field that receives the vector bytes.

    Each hash is written under the key ``'product:' + primary_key``.
    """
    # Queue all writes on a non-transactional pipeline and send them together
    p = client.pipeline(transaction=False)
    # Pair rows with vectors by position, not by index label, so a frame with a
    # non-default index (filtered, sliced, re-indexed) still lines up correctly.
    for position, (_, row) in enumerate(product_df.iterrows()):
        #hash key
        key = 'product:' + row['primary_key']
        #hash fields
        product_data_values = {
            'item_id': row['item_id'],
            'item_keywords': row['item_keywords'],
            'item_name': row['item_name'],
            # the vector is stored as the raw bytes of a float32 array
            vector_field_name: vector_data[position].astype(np.float32).tobytes(),
        }

        p.hset(key, mapping=product_data_values)
    p.execute()
    
        
def delete_data(client: Redis):
    """Issue FLUSHALL on ``client``.

    WARNING: this is not limited to the product hashes or to one index.
    """
    client.flushall()

# Utility Functions to Create Indexes on Vector field

In [None]:
def create_bf_index (redis_conn,index_name,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2'):
    """Create a brute-force ("BF") vector index and return its redisearch client.

    Args:
        redis_conn: Redis connection handed to ``redisearch.Client``.
        index_name: Name of the index to create.
        vector_field_name: Hash field holding the vector to index.
        number_of_vectors: Value sent as the index's INITIAL_CAP.
        vector_dimensions: Vector length sent to FT.CREATE.
        distance_metric: Distance metric name sent to FT.CREATE.

    Returns:
        The ``redisearch.Client`` bound to ``index_name``.
    """
    index_client = redisearch.Client(index_name, conn=redis_conn)
    # Raw FT.CREATE, spelled out argument by argument
    create_args = [
        "FT.CREATE", index_name,
        "SCHEMA", vector_field_name,
        "VECTOR", "FLOAT32", vector_dimensions, distance_metric,
        "BF", "INITIAL_CAP", number_of_vectors,
    ]
    index_client.redis.execute_command(*create_args)
    return index_client

def create_hnsw_index (redis_conn,index_name,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2',M=40,EF=200):
    """Create an HNSW vector index and return its redisearch client.

    Args:
        redis_conn: Redis connection handed to ``redisearch.Client``.
        index_name: Name of the index to create.
        vector_field_name: Hash field holding the vector to index.
        number_of_vectors: Value sent as the index's INITIAL_CAP.
        vector_dimensions: Vector length sent to FT.CREATE.
        distance_metric: Distance metric name sent to FT.CREATE.
        M: Value sent as the HNSW "M" option.
        EF: Value sent as the HNSW "EF" option.

    Returns:
        The ``redisearch.Client`` bound to ``index_name``.
    """
    index_client = redisearch.Client(index_name, conn=redis_conn)
    # Raw FT.CREATE, spelled out argument by argument
    create_args = [
        "FT.CREATE", index_name,
        "SCHEMA", vector_field_name,
        "VECTOR", "FLOAT32", vector_dimensions, distance_metric,
        "HNSW", "INITIAL_CAP", number_of_vectors,
        "M", M,
        "EF", EF,
    ]
    index_client.redis.execute_command(*create_args)
    return index_client

def delete_index(vector_index):
    """Clean up after an index by passing its Redis connection to ``delete_data``.

    Args:
        vector_index: Index client exposing the underlying connection as ``.redis``.
    """
    connection = vector_index.redis
    delete_data(connection)
    

# Utility Functions to Perform Similarity Search 
Using different indexing methods

In [None]:
def find_similar_products_bf(product_query, query_encoder, vector_index,vector_field_name, topK=5):
    """Search ``vector_index`` for the ``topK`` products nearest to a text query.

    Args:
        product_query: Free-text query.
        query_encoder: Object whose ``encode(text)`` returns a numpy vector.
        vector_index: Index client whose ``search`` runs the query.
        vector_field_name: Name of the indexed vector field.
        topK: Number of neighbours requested (also used as the page size).

    Returns:
        Whatever ``vector_index.search`` returns; each hit is asked to carry
        ``<vector_field_name>_score``, ``item_name`` and ``item_keywords``.
    """
    # Embed the query text and serialise it as float32 bytes
    embedding = query_encoder.encode(product_query)
    query_vector = embedding.astype(np.float32).tobytes()

    # Build the TOPK query step by step; results are ordered by the score field
    score_field = f'{vector_field_name}_score'
    q = redisearch.Query(f'@{vector_field_name}:[$vec_param TOPK {topK}]')
    q = q.sort_by(score_field)
    q = q.paging(0, topK)
    q = q.return_fields(score_field, 'item_name', 'item_keywords')

    # The vector bytes travel as the $vec_param query parameter
    return vector_index.search(q, query_params = {'vec_param': query_vector})

def find_similar_products_hnsw(product_query, query_encoder, vector_index,vector_field_name, topK=5,EF=5):
    """Search ``vector_index`` for the ``topK`` products nearest to a text query, passing an EFRUNTIME value.

    Args:
        product_query: Free-text query.
        query_encoder: Object whose ``encode(text)`` returns a numpy vector.
        vector_index: Index client whose ``search`` runs the query.
        vector_field_name: Name of the indexed vector field.
        topK: Number of neighbours requested (also used as the page size).
        EF: Value embedded in the query as ``$EFRUNTIME``.

    Returns:
        Whatever ``vector_index.search`` returns; each hit is asked to carry
        ``<vector_field_name>_score``, ``item_name`` and ``item_keywords``.
    """
    # Embed the query text and serialise it as float32 bytes
    embedding = query_encoder.encode(product_query)
    query_vector = embedding.astype(np.float32).tobytes()

    # Build the TOPK query step by step; results are ordered by the score field
    score_field = f'{vector_field_name}_score'
    q = redisearch.Query(f'@{vector_field_name}:[$vec_param TOPK {topK}]  => {{$EFRUNTIME : {EF}}}')
    q = q.sort_by(score_field)
    q = q.paging(0, topK)
    q = q.return_fields(score_field, 'item_name', 'item_keywords')

    # The vector bytes travel as the $vec_param query parameter
    return vector_index.search(q, query_params = {'vec_param': query_vector})



# Brute-Force - Load and Index Product Data
Load and index product data using a brute-force Index on the 'item_keywords_vector' field.
This index is used to calculate Top K Exact Nearest Neighbors of a given vector

In [None]:
%%time
# Progress message (the original text carried a stray literal '+')
print ('Loading and Indexing ' +  str(NUMBER_PRODUCTS) + ' products')
# Create the brute-force index first, then write the product hashes through the
# same connection the index client wraps.
my_bf_index = create_bf_index(redis_conn,'my_bf_index','item_keywords_vector',NUMBER_PRODUCTS,768,'L2')
load_vectors(my_bf_index.redis,subset_df,item_keywords_vectors,'item_keywords_vector')

# Brute-Force - Query The Top 5 Similar Products
Let's use the brute-force index to find the exact top k nearest neighbors of a given text query

Check the output for 2 very different queries:
* Query 1 = 'Fantastic piece of handmade jewllery for a special occasion'
* Query 2 = 'Ultra modern cool way to pimp up my cell'



In [None]:
%%time
# Swap the comment between the two lines below to try the other example query
#product_query='Fantastic piece of handmade jewllery for a special occasion'
product_query='cool way to pimp up my cell'

results = find_similar_products_bf (product_query,model,my_bf_index,'item_keywords_vector',5)
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    print (color.YELLOW + 'Item Name = ' +  color.END  + product.item_name)
    # The keywords and the score are separate returned fields; the original
    # printed the score under the "Item Keywords" label.
    print (color.YELLOW + 'Item Keywords = ' +  color.END  + product.item_keywords)
    print (color.YELLOW + 'Score = ' +  color.END  + str(product.item_keywords_vector_score))

## Check one of the Search Results

In [None]:
# Inspect the first document returned by the most recent search
results.docs[0]

In [None]:
# Delete the index and its underlying data.
# NOTE: delete_index() calls delete_data(), which issues FLUSHALL, so everything
# on the Redis server is wiped, not only this index and the product hashes.
delete_index(my_bf_index)



# HNSW - Load and Index Product Data
Load and index product data using an HNSW Index on the 'item_keywords_vector' field.
This index is used to calculate Top K Approximate Nearest Neighbors of a given vector

In [None]:
%%time
# Progress message (the original text carried a stray literal '+')
print ('Loading and Indexing ' +  str(NUMBER_PRODUCTS) + ' products')
# Create the HNSW index first, then write the product hashes through the same
# connection the index client wraps.
my_hnsw_index = create_hnsw_index(redis_conn,'my_hnsw_index','item_keywords_vector',NUMBER_PRODUCTS,768,'L2',M=40,EF=200)
load_vectors(my_hnsw_index.redis,subset_df,item_keywords_vectors,'item_keywords_vector')

# HNSW - Query The Top 5 Similar Products
Let's repeat the similarity search but this time using the HNSW index

Check the output for 2 very different queries:
* Query 1 = 'Fantastic piece of handmade jewllery for a special occasion'
* Query 2 = 'Ultra modern cool way to pimp up my cell'



In [None]:
%%time
# Swap the comment between the two lines below to try the other example query
product_query='Fantastic piece of handmade jewllery for a special occasion'
#product_query='Ultra modern cool way to pimp up my cell'

results = find_similar_products_hnsw (product_query,model,my_hnsw_index,'item_keywords_vector',5,EF=5)
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    print (color.YELLOW + 'Item Name = ' +  color.END  + product.item_name)
    # The keywords and the score are separate returned fields; the original
    # printed the score under the "Item Keywords" label.
    print (color.YELLOW + 'Item Keywords = ' +  color.END  + product.item_keywords)
    print (color.YELLOW + 'Score = ' +  color.END  + str(product.item_keywords_vector_score))

In [None]:
# Cleanup: remove the HNSW index and its data.
# NOTE: delete_index() calls delete_data(), which issues FLUSHALL, so everything
# on the Redis server is wiped, not only this index and the product hashes.
delete_index(my_hnsw_index)