In [None]:
import random
import numpy as np
import pandas as pd
import time
from redis import Redis
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField
from redis.commands.search.field import TagField
from redis.commands.search.query import Query
from PIL import Image
from img2vec_pytorch import Img2Vec

class color:
    """ANSI escape sequences used to colour and emphasise text printed by the notebook."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set by the codes above
    END = '\033[0m'


# Load Amazon Product and Image metadata



In [None]:
# Development aid: IPython help lookup for TagField; its output is not used by any later cell
?TagField

In [None]:
# Load the product/image metadata and derive a 'primary_key' column
# ("<item_id>-<domain_name>") that is later used to build the Redis hash keys
all_prods_df = pd.read_csv('data/product_image_data.csv').assign(
    primary_key=lambda frame: frame['item_id'] + '-' + frame['domain_name']
)
all_prods_df.shape


In [None]:
# Preview the first five rows of the product metadata
all_prods_df.head(5)

# Connect to Redis

In [None]:
# Connection settings for the Redis server that holds the vectors
host, port = 'vecsim', 6379

redis_conn = Redis(host=host, port=port)

# Round-trip to the server before going further; the message below is only
# reached if ping() did not raise
redis_conn.ping()
print ('Connected to redis')

# Generate Embeddings

We will use 'Img2Vec' to generate embeddings (vectors) for 1K product images

https://github.com/christiansafka/img2vec

In [None]:
# Build the image encoder used by every embedding cell below (cuda=False: do not use a GPU)
img2vec = Img2Vec(cuda=False)


By default, Img2Vec uses **'resnet-18'** as the neural network architecture to generate embeddings. Each image is run through this network and the output of the 'avgpool' layer is returned.

The output of the 'avgpool' layer in **'resnet-18' has 512 dimensions** so a single 512-float vector will be generated for every image converted

In [None]:
# How many products (rows) are embedded and indexed in this walkthrough
NUMBER_PRODUCTS = 1000

# Keep only the first NUMBER_PRODUCTS rows of the full catalogue
subset_df = all_prods_df.iloc[:NUMBER_PRODUCTS]


In [None]:
# Preview the first rows of the subset that will be embedded
subset_df.head()

In [None]:
# (rows, columns) of the subset; rows is at most NUMBER_PRODUCTS
subset_df.shape

In [None]:
# Convert the subset into a dict keyed by row index; each value is a
# {column name: value} dict holding that row's metadata
product_metadata = subset_df.to_dict(orient='index')

In [None]:
# Check one of the products: the metadata dict stored under row index 0
product_metadata[0]

# Some Utility Functions to Generate Vectors from Images

Product images are stored under the 'data/images/small' folder

Every product has a `path` metadata field giving the location of its main product image relative to that folder


The 'generate_img2vec_dict' function below simply takes:
* A dataframe with product metadata
* The folder where images are stored
* A batch size to generate image vectors for a batch of products in one call

The output will be a dictionary mapping each product's image `path` to the vector generated for that image (images that cannot be opened are skipped)

In [None]:
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    Args:
        seq: Any object supporting ``len()`` and slicing (list, string, DataFrame, ...).
        size: Maximum length of each slice; also the step between slice starts.

    Returns:
        A generator yielding ``seq[start:start + size]`` for each start offset.
        The last slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    # The start offsets are computed up front, so an invalid step fails at call time
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

def generate_img2vec_dict(df,image_path, batch_size=100):
    """Generate an embedding for every readable product image listed in ``df``.

    Images are processed in batches so that the model is called once per batch
    rather than once per image.

    Args:
        df: DataFrame with a 'path' column holding each image's path relative
            to ``image_path``.
        image_path: Folder prefix that is concatenated with each 'path' value
            (so it must end with a path separator).
        batch_size: Number of DataFrame rows handed to the model per call.

    Returns:
        dict mapping each 'path' value whose image could be opened to the
        vector returned by ``img2vec.get_vec`` for that image. Images that
        cannot be opened are silently left out (best effort).
    """
    output_dict = {}

    for batch in chunker(df, batch_size):
        images = []
        converted = []

        for img_fn in batch['path'].values.tolist():
            try:
                # The context manager releases the file handle as soon as the
                # pixel data has been copied; convert() returns an independent
                # in-memory image, normalised to the 3-channel RGB input the
                # model expects.
                with Image.open(image_path + img_fn) as img:
                    images.append(img.convert('RGB'))
            except Exception:
                # Unable to read/convert this image (missing file, corrupt data,
                # non-string path, ...) -> skip to the next one. Unlike a bare
                # `except:`, this still lets KeyboardInterrupt stop a long run.
                continue
            converted.append(img_fn)

        # Nothing readable in this batch: do not call the model with an empty list
        if not images:
            continue

        # Generate vectors for all images in this batch (one vector per image,
        # in the same order as `converted`)
        vec_list = img2vec.get_vec(images)

        # Update the dictionary to be returned
        output_dict.update(zip(converted, vec_list))

    return output_dict


### Time to generate the vectors!

This may take 30-60 seconds depending on your setup

In [None]:
%%time
IMAGE_PATH= './data/images/small/'

img2vec_dict = generate_img2vec_dict(subset_df,IMAGE_PATH,batch_size=250)


In [None]:
# Optional spot check: uncomment to inspect the vector generated for a single image
#img2vec_dict['30/3079540e.jpg']

# Utility Functions to Load Product metadata and image data
Each product will be stored in a redis hash
* **Hash Key** = **product:primary_key**
* **Hash Fields** (every product metadata column is stored; the most relevant ones are):
    * Item Id
    * Item Name
    * Product Image vector = 512-float vector
 

In [None]:
def load_vectors(client: "Redis", product_metadata, vector_dict, vector_field_name):
    """Store every product that has an image vector as a Redis hash.

    One HSET per product is queued on a non-transactional pipeline and all of
    them are sent together at the end.

    Args:
        client: Redis connection used to create the pipeline.
        product_metadata: dict of per-product metadata dicts; each must contain
            'primary_key' and 'path'.
        vector_dict: dict mapping a product's 'path' value to its image vector
            (an array exposing ``astype``/``tobytes``). Products whose path is
            missing from this dict are not loaded.
        vector_field_name: Name of the hash field that receives the vector bytes.

    The caller's ``product_metadata`` dicts are left untouched: the vector is
    added to a per-product copy, so the raw bytes do not leak into the metadata
    that later cells display.
    """
    pipe = client.pipeline(transaction=False)

    for item_metadata in product_metadata.values():
        item_path = item_metadata['path']

        # No vector was generated for this image (e.g. it could not be opened)
        if item_path not in vector_dict:
            continue

        # Hash key
        key = 'product:' + item_metadata['primary_key']

        # The vector is stored as raw float32 bytes
        product_image_vector = vector_dict[item_path].astype(np.float32).tobytes()

        # Hash values: all metadata fields plus the vector field
        hash_fields = {**item_metadata, vector_field_name: product_image_vector}
        pipe.hset(key, mapping=hash_fields)

    pipe.execute()

# Utility Functions to Create Indexes on Vector field

In [None]:
def create_flat_index(redis_conn, vector_field_name, number_of_vectors, vector_dimensions=512, distance_metric='L2'):
    """Create a search index whose vector field uses the FLAT algorithm.

    Args:
        redis_conn: Redis connection; the index is created through ``redis_conn.ft()``.
        vector_field_name: Name of the hash field holding the FLOAT32 vector.
        number_of_vectors: Used for both INITIAL_CAP and BLOCK_SIZE.
        vector_dimensions: Number of components per vector (DIM).
        distance_metric: Value passed as DISTANCE_METRIC.

    Besides the vector field, the schema indexes 'product_type' and 'country'
    as tags and 'item_name' as text.
    """
    flat_attributes = {
        "TYPE": "FLOAT32",
        "DIM": vector_dimensions,
        "DISTANCE_METRIC": distance_metric,
        "INITIAL_CAP": number_of_vectors,
        "BLOCK_SIZE": number_of_vectors,
    }
    schema = [
        VectorField(vector_field_name, "FLAT", flat_attributes),
        TagField("product_type"),
        TextField("item_name"),
        TagField("country"),
    ]
    redis_conn.ft().create_index(schema)

def create_hnsw_index(redis_conn, vector_field_name, number_of_vectors, vector_dimensions=512, distance_metric='L2', M=40, EF=200):
    """Create a search index whose vector field uses the HNSW algorithm.

    Args:
        redis_conn: Redis connection; the index is created through ``redis_conn.ft()``.
        vector_field_name: Name of the hash field holding the FLOAT32 vector.
        number_of_vectors: Value passed as INITIAL_CAP.
        vector_dimensions: Number of components per vector (DIM).
        distance_metric: Value passed as DISTANCE_METRIC.
        M: Value passed as the HNSW ``M`` attribute.
        EF: Value passed as the HNSW ``EF_CONSTRUCTION`` attribute.

    Besides the vector field, the schema indexes 'product_type' and 'country'
    as tags and 'item_name' as text.
    """
    hnsw_attributes = {
        "TYPE": "FLOAT32",
        "DIM": vector_dimensions,
        "DISTANCE_METRIC": distance_metric,
        "INITIAL_CAP": number_of_vectors,
        "M": M,
        "EF_CONSTRUCTION": EF,
    }
    schema = [
        VectorField(vector_field_name, "HNSW", hnsw_attributes),
        TagField("product_type"),
        TextField("item_name"),
        TagField("country"),
    ]
    redis_conn.ft().create_index(schema)


# FLAT - Load and Index 1000 Products

Let's create a FLAT index for the image vectors and load 1000 hashes

A FLAT index is used to perform an exact nearest neighbors search. 

The query vector will be compared against all other image vectors in the database

In [None]:
%%time
PRODUCT_IMAGE_VECTOR_FIELD='product_image_vector'
IMAGE_VECTOR_DIMENSION=512
print ('Loading and Indexing + ' +  str(NUMBER_PRODUCTS) + ' products')

#flush all data
redis_conn.flushall()

#create flat index & load vectors
create_flat_index(redis_conn, PRODUCT_IMAGE_VECTOR_FIELD,NUMBER_PRODUCTS,IMAGE_VECTOR_DIMENSION,'COSINE')
load_vectors(redis_conn,product_metadata,img2vec_dict,PRODUCT_IMAGE_VECTOR_FIELD)

# FLAT index - FIND The Top K MOST VISUALLY Similar Products
Let's use the FLAT index to find the exact top K nearest neighbors of a mobile phone cover available in the catalogue 



In [None]:
# Row index of the product whose image is used as the query
pos = 0

# Show the product's name and image path, one per line
print(product_metadata[pos]['item_name'], product_metadata[pos]['path'], sep='\n')

# Open the query image and display it as the cell output
queryImage = Image.open(IMAGE_PATH + product_metadata[pos]['path'])
queryImage


In [None]:
%%time
topK=5
query_vector = img2vec.get_vec(queryImage).astype(np.float32).tobytes()

#prepare the query
q = Query(f'*=>[KNN {topK} @{PRODUCT_IMAGE_VECTOR_FIELD} $vec_param AS vector_score]').sort_by('vector_score').paging(0,topK).return_fields('vector_score','item_name','item_id','path').dialect(2)
params_dict = {"vec_param": query_vector}

#Execute the query
results = redis_conn.ft().search(q, query_params = params_dict)
docs = redis_conn.ft().search(q,params_dict).docs

#Print similar products found
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    print (color.YELLOW + 'Item Name = ' +  color.END  + product.item_name)
    print (color.YELLOW + 'Item Id = ' +  color.END  + product.item_id)
    print (color.YELLOW + 'Score = ' +  color.END  + product.vector_score)
    result_img= Image.open(IMAGE_PATH + product.path)
    display(result_img)

## Examine Search Results

You can see the Redis hash fields projected in the query (e.g. item_name, item_id, path).

The score field (`vector_score`) returns the distance between the query vector and each of the vectors in the result

In [None]:
# Show the raw documents returned by the FLAT query
results.docs

# HNSW - Load and Index Product Data

Let's repeat the exercise of loading and indexing 1000 products but this time using an HNSW index

This HNSW index is used to calculate the Approximate Nearest Neighbors (ANN) of a given image vector.

It speeds up query times but requires more memory to store the vector index

In [None]:
%%time
print ('Loading and Indexing + ' +  str(NUMBER_PRODUCTS) + ' products')
#flush all data
redis_conn.flushall()
#create HNSW index & load vectors
create_hnsw_index(redis_conn,PRODUCT_IMAGE_VECTOR_FIELD,NUMBER_PRODUCTS,IMAGE_VECTOR_DIMENSION,'COSINE',M=40,EF=200)
load_vectors(redis_conn,product_metadata,img2vec_dict,PRODUCT_IMAGE_VECTOR_FIELD)

# HNSW - Query the top 5 most visually similar products
Let's repeat the similarity search but this time using the HNSW index.

Let's see the image we're sending in for visual similarity




In [None]:
# Display the query image again (same image as in the FLAT search)
queryImage

In [None]:
%%time

topK=5
query_vector = img2vec.get_vec(queryImage).astype(np.float32).tobytes()
EF_RUNTIME=10

#prepare the query
q = Query(f'*=>[KNN {topK} @{PRODUCT_IMAGE_VECTOR_FIELD} $vec_param EF_RUNTIME {EF_RUNTIME} AS vector_score]').sort_by('vector_score').paging(0,topK).return_fields('vector_score','item_name','item_id','path').dialect(2)
params_dict = {"vec_param": query_vector}

#Execute the query
results = redis_conn.ft().search(q, query_params = params_dict)
docs = redis_conn.ft().search(q,params_dict).docs

#Print similar products found
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    print (color.YELLOW + 'Item Name = ' +  color.END  + product.item_name)
    print (color.YELLOW + 'Item Id = ' +  color.END  + product.item_id)
    print (color.YELLOW + 'Score = ' +  color.END  + product.vector_score)
    result_img= Image.open(IMAGE_PATH + product.path)
    display(result_img)

In [None]:
# Show the raw documents returned by the HNSW query
results.docs

# HNSW - Hybrid Query the top 5 most visually similar products ONLY in selected markets

Let's repeat our Top 5 search but this time limit to products that meet the following criteria:
* **Listed on** Amazon Germany (DE), United States (US) or Italy (IT) **AND**
* **Product type** = CELLULAR_PHONE_CASE  


This RediSearch query has this form:

**`(@country:{DE|US|IT} @product_type:{CELLULAR_PHONE_CASE})=>[KNN 5 @vector_field_name $query_vector EF_RUNTIME 10 AS vector_score]`**


Note that there is only 1 product matching these criteria

In [None]:
%%time

topK=5
query_vector = img2vec.get_vec(queryImage).astype(np.float32).tobytes()
EF_RUNTIME=10

#prepare the query
q = Query(f'(@country:{{DE|US|IT}} @product_type:{{CELLULAR_PHONE_CASE}})=>[KNN {topK} @{PRODUCT_IMAGE_VECTOR_FIELD} $vec_param EF_RUNTIME {EF_RUNTIME} AS vector_score]').sort_by('vector_score').paging(0,topK).return_fields('vector_score','item_name','item_id','path','country').dialect(2)
params_dict = {"vec_param": query_vector}

#Execute the query
results = redis_conn.ft().search(q, query_params = params_dict)
docs = redis_conn.ft().search(q,params_dict).docs

#Print similar products found
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    print (color.YELLOW + 'Item Name = ' +  color.END  + product.item_name)
    print (color.YELLOW + 'Item Id = ' +  color.END  + product.item_id)
    print (color.YELLOW + 'Score = ' +  color.END  + product.vector_score)
    print (color.YELLOW + 'Country = ' +  color.END  + product.country)
    result_img= Image.open(IMAGE_PATH + product.path)

    display(result_img)

In [None]:
# Show the raw documents returned by the hybrid query
docs