In [None]:
import random
import numpy as np
import pandas as pd
import time
from redis import Redis
from redis.commands.search.field import VectorField
from redis.commands.search.query import Query
from PIL import Image
from img2vec_pytorch import Img2Vec
import pickle

class color:
    """ANSI escape sequences used to colorize or emphasize text printed by the notebook.

    Concatenate one of the codes before the text and ``END`` after it to
    reset the terminal formatting.
    """

    # Foreground colors
    PURPLE = "\x1b[95m"
    CYAN = "\x1b[96m"
    DARKCYAN = "\x1b[36m"
    BLUE = "\x1b[94m"
    GREEN = "\x1b[92m"
    YELLOW = "\x1b[93m"
    RED = "\x1b[91m"

    # Text attributes
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"

    # Resets every attribute set by the codes above
    END = "\x1b[0m"


# Load Amazon Product and Image metadata



In [None]:
# Load the product/image metadata and derive a unique key per product
# by joining the item id with the domain name it is listed under.
all_prods_df = pd.read_csv('data/product_image_data.csv').assign(
    primary_key=lambda prods: prods['item_id'] + '-' + prods['domain_name']
)
all_prods_df.shape


In [None]:
# Preview the first five rows to sanity-check the loaded columns
all_prods_df.head(5)

# Connect to Redis

In [None]:
# Connection settings for the Redis server used throughout this notebook
host = 'vecsim'
port = 6379
redis_conn = Redis(host = host, port = port)
# Round-trip to the server before reporting success
# (presumably raises if the server is unreachable — confirm against redis-py)
redis_conn.ping()
print ('Connected to redis')

# Generate Embeddings

We will use 'Img2Vec' to generate embeddings (vectors) for 100K product images

https://github.com/christiansafka/img2vec

In [None]:
# Image encoder shared by the rest of the notebook; cuda=False is passed to Img2Vec
img2vec = Img2Vec(cuda=False)


By default, Img2Vec uses **'resnet-18'** as the neural network architecture to generate embeddings. In particular, each image is run through this network and the output at the 'avgpool' layer is returned.

The output of the 'avgpool' layer in **'resnet-18' has 512 dimensions** so a single 512-float vector will be generated for every image converted

In [None]:
# Number of products (rows from the top of the catalogue) to load and index
NUMBER_PRODUCTS=100000
# Folder prefix prepended to each product's 'path' value when opening its image
IMAGE_PATH = './data/images/small/'
# Name of the hash field / index field holding the image embedding
PRODUCT_IMAGE_VECTOR_FIELD='product_image_vector'
# Dimensionality of each embedding (must match the encoder output)
IMAGE_VECTOR_DIMENSION=512

# Working subset: the first NUMBER_PRODUCTS rows of the full catalogue
subset_df = all_prods_df.head(NUMBER_PRODUCTS)


In [None]:
# Preview the first rows of the working subset
subset_df.head()

In [None]:
# (rows, columns) of the working subset
subset_df.shape

# Some Utility Functions to Generate Vectors from Images

Product images are stored under the 'data/images/small' folder

Every product has metadata indicating the full path to the main product image


The 'generate_img2vec_dict' function below simply takes:
* A dataframe with product metadata
* The folder where images are stored
* A batch size to generate image vectors for a batch of products in one call

The output will be a dictionary mapping each image's path (as stored in the `path` column of the product metadata) to its corresponding generated vector

In [None]:
def chunker(seq, size):
    """Lazily yield consecutive slices of ``seq``, each at most ``size`` items long.

    The last slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    Works with anything that supports ``len()`` and slicing.
    """
    offsets = range(0, len(seq), size)
    return (seq[offset:offset + size] for offset in offsets)

def generate_img2vec_dict(df,image_path, batch_size=500):
    """Generate an embedding for every product image that can be opened.

    Args:
        df: DataFrame of product metadata; its 'path' column holds each
            image's path relative to ``image_path``.
        image_path: Prefix prepended to every 'path' value to locate the file.
        batch_size: Number of rows whose images are vectorized per encoder call.

    Returns:
        dict mapping each 'path' value to the vector produced by the
        module-level ``img2vec`` encoder. Images that cannot be opened are
        skipped and therefore absent from the result.
    """
    output_dict = {}

    for batch in chunker(df, batch_size):
        images = []
        converted = []

        try:
            for img_fn in batch['path'].values.tolist():
                try:
                    img = Image.open(image_path + img_fn)
                except Exception:
                    # Best effort: skip anything that cannot be opened (missing
                    # file, unreadable image, non-string path). Unlike a bare
                    # `except:`, this still lets KeyboardInterrupt/SystemExit
                    # stop a long-running job.
                    continue
                images.append(img)
                converted.append(img_fn)

            if not images:
                # Nothing could be opened in this batch; the encoder cannot
                # process an empty list, so move on to the next batch.
                continue

            # Generate vectors for all images in this batch
            vec_list = img2vec.get_vec(images)

            # Record path -> vector for every image that was converted
            output_dict.update(zip(converted, vec_list))
        finally:
            # Release the file handles opened for this batch so they do not
            # accumulate over the whole catalogue.
            for img in images:
                img.close()

    return output_dict


### Time to Load the vectors!

Let's load the previously generated vectors for the first 100k products

In [None]:
%%time

# NOTE: despite its .npy extension, this file is read with pickle (not np.load).
# SECURITY: pickle.load can execute arbitrary code while deserializing —
# only load this file if it comes from a trusted source.
with open('100k-image-vectors.npy', 'rb') as handle:
    img2vec_dict = pickle.load(handle)


In [None]:
# Report how many image vectors were loaded
print(f'{len(img2vec_dict)} keys')

### Check the Dimensions of one of the vectors generated

In [None]:
# Pick the first entry in the dictionary's iteration order as a sample
first_key = next(iter(img2vec_dict))
first_vector = img2vec_dict[first_key]
# Size of the sample vector's first axis
# (expected to equal IMAGE_VECTOR_DIMENSION — TODO confirm against the output)
first_vector.shape[0]

# Utility Functions to Load Product metadata and image data
Each product will be stored in a redis hash
* **Hash Key** = **product:primary_key**
* **Hash Fields:** 
    * Item Id
    * Item Name
    * Item Path = path of the product image
    * Product Image vector = 512-float vector
 

In [None]:
def load_vectors(client: "Redis", product_df, vector_dict, vector_field_name, batch_size=1000):
    """Store every product that has an image vector as a Redis hash.

    Each hash is written under the key ``product:<primary_key>`` with the
    fields ``item_id``, ``item_name``, ``item_path`` and ``vector_field_name``
    (the vector serialized as float32 bytes). Products whose 'path' is not a
    key of ``vector_dict`` are skipped.

    Args:
        client: Redis client used to open a non-transactional pipeline.
        product_df: DataFrame with 'primary_key', 'item_id', 'path' and
            'item_name' columns. Rows are processed in index order.
        vector_dict: Mapping of image path -> numpy vector.
        vector_field_name: Name of the hash field that receives the vector bytes.
        batch_size: Number of queued HSET commands after which the pipeline is
            flushed, so the whole dataset is never buffered client-side at once.
    """
    p = client.pipeline(transaction=False)
    pending = 0

    # Read values from the row yielded by iterrows(). The index value is a
    # *label*, so using it with positional .iloc picks the wrong row (or raises)
    # whenever the frame does not have a default 0..n-1 index.
    for _, row in product_df.sort_index().iterrows():
        item_path = row['path']
        if item_path not in vector_dict:
            continue

        # retrieve vector for product image
        product_image_vector = vector_dict[item_path].astype(np.float32).tobytes()

        # prepare data for the hash
        product_data_values = {'item_id': row['item_id'],
                               'item_name': row['item_name'],
                               'item_path': item_path,
                               vector_field_name: product_image_vector}
        # HSET
        p.hset('product:' + row['primary_key'], mapping=product_data_values)
        pending += 1

        if pending >= batch_size:
            p.execute()
            pending = 0

    # Flush whatever is left from the final, partially filled batch
    if pending:
        p.execute()

# Utility Functions to Create Indexes on Vector field

In [None]:
def create_bf_index (redis_conn,vector_field_name,number_of_vectors, vector_dimensions=IMAGE_VECTOR_DIMENSION, distance_metric='L2'):
    """Create a FLAT (brute-force) vector index on ``vector_field_name``.

    Args:
        redis_conn: Redis client whose search interface creates the index.
        vector_field_name: Name of the hash field holding the vector bytes.
        number_of_vectors: Value passed as the index's INITIAL_CAP.
        vector_dimensions: Value passed as DIM.
        distance_metric: Value passed as DISTANCE_METRIC.
    """
    flat_attributes = {
        "TYPE": "FLOAT32",
        "DIM": vector_dimensions,
        "DISTANCE_METRIC": distance_metric,
        "INITIAL_CAP": number_of_vectors,
    }
    flat_field = VectorField(vector_field_name, "FLAT", flat_attributes)
    redis_conn.ft().create_index(flat_field)

def create_hnsw_index (redis_conn,vector_field_name,number_of_vectors, vector_dimensions=IMAGE_VECTOR_DIMENSION, distance_metric='L2',M=40,EF=200):
    """Create an HNSW vector index on ``vector_field_name``.

    Args:
        redis_conn: Redis client whose search interface creates the index.
        vector_field_name: Name of the hash field holding the vector bytes.
        number_of_vectors: Value passed as the index's INITIAL_CAP.
        vector_dimensions: Value passed as DIM.
        distance_metric: Value passed as DISTANCE_METRIC.
        M: Value passed as the HNSW ``M`` attribute.
        EF: Value passed as the HNSW ``EF_CONSTRUCTION`` attribute.
    """
    hnsw_attributes = {
        "TYPE": "FLOAT32",
        "DIM": vector_dimensions,
        "DISTANCE_METRIC": distance_metric,
        "INITIAL_CAP": number_of_vectors,
        "M": M,
        "EF_CONSTRUCTION": EF,
    }
    hnsw_field = VectorField(vector_field_name, "HNSW", hnsw_attributes)
    redis_conn.ft().create_index(hnsw_field)

def delete_index(client: Redis):
    """Reset the Redis server by calling ``flushall()`` on ``client``.

    WARNING: this is not limited to the search index or the product hashes —
    FLUSHALL removes every key on the server. Only use it against a
    disposable/demo Redis instance.
    """
    client.flushall()


# Utility Functions to Perform Similarity Search 
Using different indexing methods

In [None]:
def find_similar_products_bf(product_image, image_encoder, redis_conn, vector_field_name, topK=5):
    """Run a KNN query for the ``topK`` products closest to ``product_image``.

    Args:
        product_image: Image passed to ``image_encoder.get_vec``.
        image_encoder: Encoder exposing ``get_vec``; its output is sent as float32 bytes.
        redis_conn: Redis client used to run the search.
        vector_field_name: Indexed vector field to search.
        topK: Number of neighbours requested and returned.

    Returns:
        The search result; each document carries 'vector_score', 'item_name',
        'item_id' and 'item_path', sorted by 'vector_score'.
    """
    # Encode the query image into the byte layout stored in the index
    embedding = image_encoder.get_vec(product_image)
    vec_bytes = embedding.astype(np.float32).tobytes()

    # Build the KNN query step by step
    knn_query = Query(f'*=>[KNN {topK} @{vector_field_name} $vec_param AS vector_score]')
    knn_query = knn_query.sort_by('vector_score')
    knn_query = knn_query.paging(0, topK)
    knn_query = knn_query.return_fields('vector_score', 'item_name', 'item_id', 'item_path')
    knn_query = knn_query.dialect(2)

    # Run it, binding the vector bytes to $vec_param
    return redis_conn.ft().search(knn_query, query_params={'vec_param': vec_bytes})

def find_similar_products_hnsw(product_image, image_encoder, redis_conn, vector_field_name, topK=5,EF=5):
    """Run a KNN query with an explicit EF_RUNTIME for the ``topK`` closest products.

    Args:
        product_image: Image passed to ``image_encoder.get_vec``.
        image_encoder: Encoder exposing ``get_vec``; its output is sent as float32 bytes.
        redis_conn: Redis client used to run the search.
        vector_field_name: Indexed vector field to search.
        topK: Number of neighbours requested and returned.
        EF: Value placed in the query's ``EF_RUNTIME`` clause.

    Returns:
        The search result; each document carries 'vector_score', 'item_name',
        'item_id' and 'item_path', sorted by 'vector_score'.
    """
    # Encode the query image into the byte layout stored in the index
    embedding = image_encoder.get_vec(product_image)
    vec_bytes = embedding.astype(np.float32).tobytes()

    # Build the KNN query step by step
    knn_query = Query(f'*=>[KNN {topK} @{vector_field_name} $vec_param EF_RUNTIME {EF} AS vector_score]')
    knn_query = knn_query.sort_by('vector_score')
    knn_query = knn_query.paging(0, topK)
    knn_query = knn_query.return_fields('vector_score', 'item_name', 'item_id', 'item_path')
    knn_query = knn_query.dialect(2)

    # Run it, binding the vector bytes to $vec_param
    return redis_conn.ft().search(knn_query, query_params={'vec_param': vec_bytes})


# Brute-Force - Load and Index Product Data
Let's create an index for the image vectors and load information for 100,000 products

This might take a couple of minutes depending on your setup

A brute-force index is used to perform an exhaustive search. The query image will be compared against all other vectors in the database

In [None]:
%%time

print ('Loading and Indexing + ' +  str(NUMBER_PRODUCTS) + ' products')

# Create the FLAT (brute-force) index first, then write the product hashes
create_bf_index(redis_conn,
                PRODUCT_IMAGE_VECTOR_FIELD,
                NUMBER_PRODUCTS,
                IMAGE_VECTOR_DIMENSION,
                'L2')


# Only products whose image path has a vector in img2vec_dict are stored
load_vectors(redis_conn,subset_df,img2vec_dict,PRODUCT_IMAGE_VECTOR_FIELD)

# Brute-Force - FIND The Top 5 MOST VISUALLY Similar Products
Let's use the brute-force index to find the exact top 5 nearest neighbors of one of the mobile phone cover products available in the catalogue 

The mobile phone product is the first product in the dataset
(pos = 0)

Feel free to set **pos** to another index in the cell below to change the product!







In [None]:
# Position (0-based) within subset_df of the product used as the query
pos = 0
query_product = subset_df.iloc[pos]
print(query_product['item_name'])
print(query_product['path'])

# Open the product's image; leaving it as the last expression displays it
queryImage = Image.open(IMAGE_PATH + query_product['path'])
queryImage


In [None]:
%%time

results = find_similar_products_bf (queryImage,img2vec,redis_conn,
                                    PRODUCT_IMAGE_VECTOR_FIELD,5)

for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    print (color.YELLOW + 'Item Name = ' +  color.END  + product.item_name)
    print (color.YELLOW + 'Item Id = ' +  color.END  + product.item_id)
    result_img= Image.open(IMAGE_PATH + product.item_path)
    display(result_img)

## Examine Search Results

You can see the redis hash fields projected in the query (e.g item_name, item_path,item_id). 

The `vector_score` field returns the distance between the query vector and each of the vectors in the result

In [None]:
# Raw documents returned by the brute-force search
results.docs

In [None]:
# Delete index and underlying data
# (delete_index calls flushall() on the connection, so everything on the server is removed)
delete_index(redis_conn)


# HNSW - Load and Index Product Data

Let's create an HNSW index for the image vectors and load 100K hashes containing product information (including the image vector data)

This HNSW index is used to calculate Approximate Nearest Neighbors (ANN) of a given vector image. It speeds up query times but requires more memory to store the vector index

In [None]:
%%time
print ('Loading and Indexing + ' +  str(NUMBER_PRODUCTS) + ' products')
create_hnsw_index(redis_conn,PRODUCT_IMAGE_VECTOR_FIELD,NUMBER_PRODUCTS,IMAGE_VECTOR_DIMENSION,'L2',M=40,EF=200)
load_vectors(redis_conn,subset_df,img2vec_dict,PRODUCT_IMAGE_VECTOR_FIELD)

# HNSW - Query The Top 5 Similar Products
Let's repeat the similarity search but this time using the HNSW index.

Let's see the image we're sending in for visual similarity




In [None]:
# Display the query image selected earlier in the notebook
queryImage

In [None]:
%%time

results = find_similar_products_hnsw (queryImage,img2vec,redis_conn,PRODUCT_IMAGE_VECTOR_FIELD,5,EF=5)
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    print (color.YELLOW + 'Item Name = ' +  color.END  + product.item_name)
    print (color.YELLOW + 'Item Id = ' +  color.END  + product.item_id)
    result_img= Image.open(IMAGE_PATH + product.item_path)
    display(result_img)

In [None]:
# Raw documents returned by the HNSW search
results.docs

In [None]:
# Cleanup: delete_index calls flushall() on the connection, removing everything on the server
delete_index(redis_conn)