In [3]:
import random
import numpy as np
import pandas as pd
import time
from redis import Redis
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField
from redis.commands.search.field import TagField
from redis.commands.search.query import Query
from redis.commands.search.result import Result

class color:
    """ANSI escape sequences used to style text printed to the terminal."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every attribute set by the codes above
    END = '\033[0m'


# 加载 Amazon 产品数据集

在加载时截断选定的文本字段。

预训练句子嵌入生成器支持的最大长度为512。

In [6]:
# Upper bound on the length of each truncated text field.
MAX_TEXT_LENGTH = 512
# How many products are kept for the demo.
NUMBER_PRODUCTS = 1000


def auto_truncate(val):
    """Return ``val`` cut down to at most ``MAX_TEXT_LENGTH`` leading elements.

    For a string this keeps the first ``MAX_TEXT_LENGTH`` characters; shorter
    values are returned unchanged in content.
    """
    limit = MAX_TEXT_LENGTH
    return val[0:limit]

# Load the product data and truncate the long text fields while parsing:
# each converter keeps only the first MAX_TEXT_LENGTH characters of its column.
all_prods_df = pd.read_csv(
    "data/product_data.csv",
    converters={
        'bullet_point': auto_truncate,
        'item_keywords': auto_truncate,
        'item_name': auto_truncate,
    },
)

# Combine two columns into the primary key of each product.
all_prods_df['primary_key'] = all_prods_df['item_id'] + '-' + all_prods_df['domain_name']

# Turn empty keyword strings into NaN so they can be dropped below.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the selected column: that form is chained
# assignment and does not update the DataFrame under pandas Copy-on-Write.
all_prods_df['item_keywords'] = all_prods_df['item_keywords'].replace('', np.nan)

# Drop the products without keywords and renumber the rows from 0, so the row
# index can be used as a position into the list of embedding vectors.
all_prods_df = (
    all_prods_df
    .dropna(subset=['item_keywords'])
    .reset_index(drop=True)
)

# Keep the first NUMBER_PRODUCTS products with non-empty item keywords,
# as a dict of {row_index: {column_name: value}}.
product_metadata = all_prods_df.head(NUMBER_PRODUCTS).to_dict(orient='index')

In [None]:
# Display the product metadata dict (one entry per selected product, keyed by row index).
product_metadata

In [7]:
# Preview the first rows of the cleaned product DataFrame.
all_prods_df.head()

Unnamed: 0,item_id,marketplace,country,main_image_id,domain_name,bullet_point,item_keywords,material,brand,color,item_name,model_name,model_number,product_type,primary_key
0,B07T6RZ2CM,Amazon,IN,71dZhpsferL,amazon.in,3D Printed Hard Back Case Mobile Cover for Len...,mobile cover back cover mobile case phone case...,,Amazon Brand - Solimo,Others,Amazon Brand - Solimo Designer Couples Sitting...,Lenovo K4 Note,gz8115-SL40423,CELLULAR_PHONE_CASE,B07T6RZ2CM-amazon.in
1,B07T2JY31Y,Amazon,IN,71vX7qIEAIL,amazon.in,3D Printed Hard Back Case Mobile Cover for Son...,mobile cover back cover mobile case phone case...,Wood,Amazon Brand - Solimo,others,Amazon Brand - Solimo Designer Leaf on Wood 3D...,Sony Xperia Z1 L39H,gz8056-SL40528,CELLULAR_PHONE_CASE,B07T2JY31Y-amazon.in
2,B0849YGSCZ,Amazon,AE,A1EZF-2mB5L,amazon.ae,,small de fur rooms navidad woven girls shag pa...,,Stone & Beam,,Stone & Beam Contemporary Doily Wool Farmhouse...,,I59I8044IVYGRYC00-Parent,HOME_FURNITURE_AND_DECOR,B0849YGSCZ-amazon.ae
3,B081K6TCML,Amazon,IN,81o9EyZ-fAL,amazon.in,Solimo Plastic Multipurpose Modular Drawer; sm...,drawer modular drawer 3 rack modular drawer ki...,Plastic,Amazon Brand - Solimo,Multicolor,Amazon Brand - Solimo Plastic Multipurpose Mod...,,sol_cujo_13,HOME,B081K6TCML-amazon.in
4,B0854774X5,Amazon,IN,81xaJCVnl3L,amazon.in,"Snug fit for Nokia 8.1, with perfect cut-outs ...",Back Cover Designer Case Designer Take It Easy...,Silicon,Amazon Brand - Solimo,Multicolor,Amazon Brand - Solimo Designer Take It Easy UV...,Nokia 8.1,UV10714-SL40617,CELLULAR_PHONE_CASE,B0854774X5-amazon.in


# 连接到 Redis

In [8]:
# Connection settings for the Redis server.
host = 'vecsim'
port = 6379
redis_conn = Redis(host = host, port = port)

# Creating the client object does not contact the server, so send a PING
# before reporting success; ping() raises a redis ConnectionError when the
# server cannot be reached.
redis_conn.ping()
print ('Connected to redis')

Connected to redis


In [16]:
# Show the client repr (connection pool with host, port and db).
redis_conn

Redis<ConnectionPool<Connection<host=vecsim,port=6379,db=0>>>

# 嵌入生成器安装

我们会使用一个[HuggingFace](https://huggingface.co/sentence-transformers/all-distilroberta-v1)上预训练的句子嵌入生成器。



In [9]:
# Load the pretrained sentence-embedding model by its HuggingFace model id.
# NOTE(review): the recorded cell output shows the model files being downloaded
# on this run — presumably later runs reuse a local cache; confirm.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

Downloading (…)87e68/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)5afc487e68/README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading (…)fc487e68/config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e68/data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading (…)afc487e68/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)87e68/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)7e68/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)afc487e68/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)c487e68/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

# 生成嵌入

我们将使用 Sentence Transformer 模型 (all-distilroberta-v1) 为产品的项目关键字生成嵌入（向量）。

all-distilroberta-v1 会为给定句子（此处已截断到最多 512 个字符）生成一个 768 维的浮点向量。

In [10]:
%%time

# Collect the keyword text of every selected product, in dict order, so that
# the i-th vector below belongs to the i-th product.
item_keywords = [metadata['item_keywords'] for metadata in product_metadata.values()]

# Encode all sentences with a single call so the model can process them in
# batches instead of running one forward pass per sentence. For a list input
# encode() returns a 2-D array; list() splits it back into one 1-D vector per
# product, which is the shape the rest of the notebook indexes into.
item_keywords_vectors = list(model.encode(item_keywords))


CPU times: user 3min 8s, sys: 0 ns, total: 3min 8s
Wall time: 40.1 s


### 检查生成向量的维度

In [12]:
# Number of embedding vectors that were generated.
len(item_keywords_vectors)

1000

In [13]:
# Number of product records; it should equal the number of embedding vectors.
len(product_metadata)

1000

In [14]:
# Inspect the metadata stored for the first product (row index 0).
product_metadata[0]

{'item_id': 'B07T6RZ2CM',
 'marketplace': 'Amazon',
 'country': 'IN',
 'main_image_id': '71dZhpsferL',
 'domain_name': 'amazon.in',
 'bullet_point': '3D Printed Hard Back Case Mobile Cover for Lenovo K4 Note Easy to put & take off with perfect cutouts for volume buttons, audio & charging ports. Stylish design and appearance, express your unique personality. Extreme precision design allows easy access to all buttons and ports while featuring raised bezel to life screen and camera off flat surface. Slim Hard Back Cover No Warranty None',
 'item_keywords': 'mobile cover back cover mobile case phone case mobile panel phone panel Lenovo mobile case Lenovo phone cover Lenovo back case hard case 3D printed mobile cover mobile cover back cover mobile case phone case mobile panel phone panel Lenovo mobile case Lenovo phone cover Lenovo back case hard case 3D printed mobile cover mobile cover back cover mobile case phone case mobile panel phone panel Lenovo mobile case Lenovo phone cover Lenovo 

In [15]:
# Inspect the embedding vector generated for the first product.
item_keywords_vectors[0]

array([ 1.63512714e-02,  5.15371673e-02,  2.05468750e-05, -3.01682167e-02,
       -5.74108101e-02,  2.66802944e-02, -3.55679612e-03,  3.73508334e-02,
        4.68826927e-02, -1.97955035e-02, -4.86263558e-02, -2.56960150e-02,
        3.70321795e-03,  7.14352448e-03, -2.64414772e-03,  1.59897190e-02,
        1.78457499e-02,  6.22210093e-02, -4.18959372e-02, -3.56804356e-02,
        1.47165852e-02, -3.82758528e-02,  2.37587094e-02, -1.40176909e-02,
       -6.99739233e-02, -5.59106618e-02, -1.23031344e-02, -3.78090814e-02,
       -6.60971599e-03, -3.96509096e-02, -8.95805284e-03, -3.27235758e-02,
        1.08916201e-02, -1.56319793e-02,  7.95790702e-02,  4.41595465e-02,
       -1.08439215e-02,  6.86705112e-02, -2.29273178e-02, -3.44910249e-02,
        5.20440377e-02, -1.25887962e-02,  4.94504049e-02,  2.91608099e-04,
        2.40929276e-02,  3.91421579e-02, -1.96286067e-02, -2.90713105e-02,
       -1.06236478e-02, -2.32878663e-02, -1.66118629e-02, -4.22629416e-02,
       -7.55494228e-03,  

# 加载产品数据的实用函数

每个产品都会存储在一个redis hash中
* **Hash Key** = **'product:' + index + ':' + primary_key**（例如 `product:0:B07T6RZ2CM-amazon.in`）

 

In [17]:
def load_vectors(client:Redis, product_metadata, vector_dict, vector_field_name):
    """Store every product as a Redis hash that also carries its embedding.

    Each hash is written under the key ``'product:<index>:<primary_key>'`` and
    holds all metadata fields of the product plus ``vector_field_name``, whose
    value is the embedding converted to float32 and serialised to raw bytes.
    All HSET commands are queued on a non-transactional pipeline and sent with
    a single ``execute()`` call.

    Args:
        client: Redis client used to create the pipeline.
        product_metadata: Mapping of index -> dict of product fields. Every
            dict must contain a ``'primary_key'`` entry. The dicts are not
            modified.
        vector_dict: Container indexable by the same indices as
            ``product_metadata``; each item must be a NumPy array.
        vector_field_name: Name of the hash field that receives the vector bytes.
    """
    pipe = client.pipeline(transaction=False)
    for index, metadata in product_metadata.items():
        # hash key
        key = 'product:' + str(index) + ':' + metadata['primary_key']

        # hash values: build a fresh mapping instead of adding the vector to
        # ``metadata`` itself, so the caller's product dicts are left untouched.
        vector_bytes = vector_dict[index].astype(np.float32).tobytes()
        fields = {**metadata, vector_field_name: vector_bytes}

        # HSET
        pipe.hset(key, mapping=fields)

    pipe.execute()

# 在向量字段上创建索引的实用函数

In [19]:
def create_flat_index (redis_conn,vector_field_name,number_of_vectors, vector_dimensions=512, distance_metric='L2'):
    """Create a search index whose vector field uses the FLAT algorithm.

    The index is created through ``redis_conn.ft()`` and contains the vector
    field plus the ``product_type`` and ``country`` tag fields and the
    ``item_name`` and ``item_keywords`` text fields.

    Args:
        redis_conn: Redis client on which the index is created.
        vector_field_name: Name of the hash field holding the vector.
        number_of_vectors: Used for both ``INITIAL_CAP`` and ``BLOCK_SIZE``.
        vector_dimensions: Value passed as ``DIM``.
        distance_metric: Value passed as ``DISTANCE_METRIC``.
    """
    flat_attributes = {
        "TYPE": "FLOAT32",
        "DIM": vector_dimensions,
        "DISTANCE_METRIC": distance_metric,
        "INITIAL_CAP": number_of_vectors,
        "BLOCK_SIZE": number_of_vectors,
    }
    schema = [
        VectorField(vector_field_name, "FLAT", flat_attributes),
        TagField("product_type"),
        TextField("item_name"),
        TextField("item_keywords"),
        TagField("country"),
    ]
    redis_conn.ft().create_index(schema)

def create_hnsw_index (redis_conn,vector_field_name,number_of_vectors, vector_dimensions=512, distance_metric='L2',M=40,EF=200):
    """Create a search index whose vector field uses the HNSW algorithm.

    The index is created through ``redis_conn.ft()`` and contains the vector
    field plus the ``product_type`` and ``country`` tag fields and the
    ``item_keywords`` and ``item_name`` text fields.

    Args:
        redis_conn: Redis client on which the index is created.
        vector_field_name: Name of the hash field holding the vector.
        number_of_vectors: Value passed as ``INITIAL_CAP``.
        vector_dimensions: Value passed as ``DIM``.
        distance_metric: Value passed as ``DISTANCE_METRIC``.
        M: Value passed as the HNSW ``M`` attribute.
        EF: Value passed as the HNSW ``EF_CONSTRUCTION`` attribute.
    """
    hnsw_attributes = {
        "TYPE": "FLOAT32",
        "DIM": vector_dimensions,
        "DISTANCE_METRIC": distance_metric,
        "INITIAL_CAP": number_of_vectors,
        "M": M,
        "EF_CONSTRUCTION": EF,
    }
    schema = [
        VectorField(vector_field_name, "HNSW", hnsw_attributes),
        TagField("product_type"),
        TextField("item_keywords"),
        TextField("item_name"),
        TagField("country"),
    ]
    redis_conn.ft().create_index(schema)


# FLAT - 加载和索引产品数据

让我们为 1000 个产品的关键字向量创建一个索引，并加载这些产品的信息

**这可能需要 1-2 分钟**

FLAT 索引用于执行精确最近邻搜索。

查询向量将与数据库中的所有其他向量进行比较

In [20]:
%%time

# Settings shared by the index definition and the loader.
ITEM_KEYWORD_EMBEDDING_FIELD = 'item_keyword_vector'
TEXT_EMBEDDING_DIMENSION = 768
NUMBER_PRODUCTS = 1000

print(f'Loading and Indexing + {NUMBER_PRODUCTS} products')

# Flush all existing data so the demo starts from an empty server.
redis_conn.flushall()

# Create the FLAT index, then load the product hashes with their vectors.
create_flat_index(
    redis_conn,
    ITEM_KEYWORD_EMBEDDING_FIELD,
    NUMBER_PRODUCTS,
    vector_dimensions=TEXT_EMBEDDING_DIMENSION,
    distance_metric='COSINE',
)
load_vectors(
    redis_conn,
    product_metadata,
    item_keywords_vectors,
    ITEM_KEYWORD_EMBEDDING_FIELD,
)

Loading and Indexing + 1000 products
CPU times: user 152 ms, sys: 0 ns, total: 152 ms
Wall time: 730 ms


# FLAT index - FIND The Top K MOST SEMANTICALLY Similar Products
Let's use the brute-force index to find the exact top k nearest neighbors of a given text query

Check the output for 2 very different queries:
* Query 1 = 'beautifully crafted present for her. a special occasion'
* Query 2 = 'Ultra modern cool way to pimp up my phone'

Feel free to experiment with other text queries to match against the item keyword data.


# FLAT 索引 - 找到前 K 个最相似的产品
让我们使用强力索引来找到给定文本查询的前 k 个最近邻居

检查 2 个非常不同的查询的输出：
* 查询 1 = '为她精心制作的礼物。 一个特殊的场合'
* 查询 2 = '超现代酷炫的方式来装扮我的手机'

随意尝试其他文本查询以匹配项目关键字数据。






In [21]:
%%time
topK = 5
product_query = 'beautifully crafted present for her. a special occasion'
#product_query='cool way to pimp up my cell'

# Embed the query text and serialise it as raw float32 bytes.
query_vector = (
    model.encode(product_query)
    .astype(np.float32)
    .tobytes()
)

# KNN search over all documents ('*'), ordered by the score alias and limited
# to topK results; the vector itself is passed as the $vec_param parameter.
q = (
    Query(f'*=>[KNN {topK} @{ITEM_KEYWORD_EMBEDDING_FIELD} $vec_param AS vector_score]')
    .sort_by('vector_score')
    .paging(0, topK)
    .return_fields('vector_score', 'item_name', 'item_id', 'item_keywords')
    .dialect(2)
)
params_dict = {"vec_param": query_vector}

# Run the query.
results = redis_conn.ft().search(q, query_params=params_dict)

# Print every product that was found.
for product in results.docs:
    print('***************Product  found ************')
    print(f'{color.BOLD}hash key = {color.END}{product.id}')
    print(f'{color.YELLOW}Item Name = {color.END}{product.item_name}')
    print(f'{color.YELLOW}Item Id = {color.END}{product.item_id}')
    print(f'{color.YELLOW}Item keywords = {color.END}{product.item_keywords}')
    print(f'{color.YELLOW}Score = {color.END}{product.vector_score}')


***************Product  found ************
[1mhash key = [0mproduct:0:B07T6RZ2CM-amazon.in
[93mItem Name = [0mAmazon Brand - Solimo Designer Couples Sitting at Dark 3D Printed Hard Back Case Mobile Cover for Lenovo K4 Note
[93mItem Id = [0mB07T6RZ2CM
[93mItem keywords = [0mmobile cover back cover mobile case phone case mobile panel phone panel Lenovo mobile case Lenovo phone cover Lenovo back case hard case 3D printed mobile cover mobile cover back cover mobile case phone case mobile panel phone panel Lenovo mobile case Lenovo phone cover Lenovo back case hard case 3D printed mobile cover mobile cover back cover mobile case phone case mobile panel phone panel Lenovo mobile case Lenovo phone cover Lenovo back case hard case 3D printed mobile cover mobile cover back cover mobil
[93mScore = [0m0
***************Product  found ************
[1mhash key = [0mproduct:1:B07T2JY31Y-amazon.in
[93mItem Name = [0mAmazon Brand - Solimo Designer Leaf on Wood 3D Printed Hard Back Case Mo

# HNSW - 加载和索引产品数据

让我们尝试使用 HNSW 索引进行索引。
该索引用于计算给定向量的前 K 个近似最近邻

In [None]:
%%time
# Define the settings before they are used, so this cell does not rely on
# values left over from an earlier cell.
ITEM_KEYWORD_EMBEDDING_FIELD='item_keyword_vector'
NUMBER_PRODUCTS=1000
TEXT_EMBEDDING_DIMENSION=768

print ('Loading and Indexing + ' +  str(NUMBER_PRODUCTS) + ' products')

# Flush all existing data (this also removes the FLAT index created earlier).
redis_conn.flushall()

# Create the HNSW index & load vectors
create_hnsw_index(redis_conn, ITEM_KEYWORD_EMBEDDING_FIELD,NUMBER_PRODUCTS,TEXT_EMBEDDING_DIMENSION,'COSINE',M=40,EF=200)
load_vectors(redis_conn,product_metadata,item_keywords_vectors,ITEM_KEYWORD_EMBEDDING_FIELD)


Loading and Indexing + 1000 products


# HNSW - Query The Top 5 semantically Similar Products
Let's repeat the similarity search but this time using the HNSW index

Check the output for 2 very different queries:
* Query 1 = 'beautifully crafted present for her. a special occasion'
* Query 2 = 'Ultra modern cool way to pimp up my cell'

# HNSW - 查询前 5 个语义相似的产品
让我们重复相似性搜索，但这次使用 HNSW 索引

检查 2 个非常不同的查询的输出：
* 查询 1 = '为她精心制作的礼物。 一个特殊的场合'
* 查询 2 = '超现代酷炫的方式来装扮我的手机'

In [None]:
%%time
topK = 5
product_query = 'beautifully crafted present for her. a special occasion'
#product_query='cool way to pimp up my cell'

# Embed the query text and serialise it as raw float32 bytes.
query_vector = (
    model.encode(product_query)
    .astype(np.float32)
    .tobytes()
)

# KNN search over all documents ('*'), ordered by the score alias and limited
# to topK results; the vector itself is passed as the $vec_param parameter.
q = (
    Query(f'*=>[KNN {topK} @{ITEM_KEYWORD_EMBEDDING_FIELD} $vec_param AS vector_score]')
    .sort_by('vector_score')
    .paging(0, topK)
    .return_fields('vector_score', 'item_name', 'item_id', 'item_keywords', 'country')
    .dialect(2)
)
params_dict = {"vec_param": query_vector}

# Run the query.
results = redis_conn.ft().search(q, query_params=params_dict)

# Print every product that was found.
for product in results.docs:
    print('***************Product  found ************')
    print(f'{color.BOLD}hash key = {color.END}{product.id}')
    print(f'{color.YELLOW}Item Name = {color.END}{product.item_name}')
    print(f'{color.YELLOW}Item Id = {color.END}{product.item_id}')
    print(f'{color.YELLOW}Item keywords = {color.END}{product.item_keywords}')
    print(f'{color.YELLOW}Country = {color.END}{product.country}')
    print(f'{color.YELLOW}Score = {color.END}{product.vector_score}')


# HNSW - Hybrid Query: the top 5 most semantically similar products available ONLY in selected markets

Let's repeat our Top 5 search but this time limit to products that meet the following criteria:
* **Listed on** Amazon Germany (DE), India (IN) or Italy (IT)


This RediSearch query has this form:

**(@country:{DE|IN|IT})=>[KNN 5 @vector_field_name $query_vector EF_RUNTIME 10 AS vector_score]**


# HNSW - 混合查询：仅在特定市场提供的、语义最相似的前 5 个产品

让我们重复我们的前 5 名搜索，但这次限制为满足以下条件的产品：
* **列于**亚马逊德国（DE）、印度（IN）或意大利（IT）


此 RediSearch 查询具有以下形式：

**(@country:{DE|IN|IT})=>[KNN 5 @vector_field_name $query_vector EF_RUNTIME 10 AS vector_score]**

In [None]:
%%time
topK=5
# Alternative query text. It is commented out, as in the other query cells,
# because assigning it here had no effect: the next line overwrote it.
#product_query='beautifully crafted carpets for a special occasion'
product_query='cool way to pimp up my cell'

# Embed the query text and serialise it as raw float32 bytes.
query_vector = model.encode(product_query).astype(np.float32).tobytes()

# Hybrid query: the tag filter on @country restricts the candidates to the
# listed markets, then KNN picks the topK nearest vectors among them. The
# doubled braces are f-string escapes and produce a single '{' / '}'.
q = (
    Query(f'(@country:{{DE|IN|IT}})=>[KNN {topK} @{ITEM_KEYWORD_EMBEDDING_FIELD} $vec_param AS vector_score]')
    .sort_by('vector_score')
    .paging(0,topK)
    .return_fields('vector_score','item_name','item_id','item_keywords','country')
    .dialect(2)
)
params_dict = {"vec_param": query_vector}


# Execute the query
results = redis_conn.ft().search(q, query_params = params_dict)

# Print similar products found
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    print (color.YELLOW + 'Item Name = ' +  color.END  + product.item_name)
    print (color.YELLOW + 'Item Id = ' +  color.END  + product.item_id)
    print (color.YELLOW + 'Item keywords = ' +  color.END  + product.item_keywords)
    print (color.YELLOW + 'Score = ' +  color.END  + product.vector_score)
    print (color.YELLOW + 'Country = ' +  color.END  + product.country)
