# Vector Similarity for RediSearch - Hybrid queries


## Python examples

### Packages

In [12]:
!pip install git+https://github.com/RediSearch/redisearch-py.git@params
!pip install numpy

Collecting git+https://github.com/RediSearch/redisearch-py.git@params
  Cloning https://github.com/RediSearch/redisearch-py.git (to revision params) to /tmp/pip-req-build-xmid8x85
  Running command git clone --filter=blob:none --quiet https://github.com/RediSearch/redisearch-py.git /tmp/pip-req-build-xmid8x85
  Running command git checkout -b params --track origin/params
  Switched to a new branch 'params'
  Branch 'params' set up to track remote branch 'params' from 'origin'.
  Resolved https://github.com/RediSearch/redisearch-py.git to commit 1c938c33cf314c0403e487473f9022f645de35c8
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting rmtest@ git+https://github.com/RedisLabs/rmtest@master
  Cloning https://github.com/RedisLabs/rmtest (to revision master) to /tmp/pip-install-3ygg7kn_/rmtest_44fbd4d60671454a8c306a47dd884045
  Running command git clone --filte

In [18]:
import numpy as np
from redis import Redis
import redisearch

### Create Redis client

In [19]:
# Address of the Redis server this notebook talks to.
host, port = "localhost", 6379

# Client shared by the index-management and data-loading cells.
redis_conn = Redis(host=host, port=port)

In [20]:
# Index fields and configurations

# Dataset: number of documents to load and dimension of every vector.
n_vec = 10_000
dim = 128

# HNSW tuning values. EF is passed to the index as EF_RUNTIME.
# NOTE(review): M is not referenced by the visible index-creation code - confirm intent.
M = 40
EF = 200

# Intended number of nearest neighbours (K) per query.
k = 10

# Names of the hash fields that are indexed.
vector_field_name = "vector"
title_field_name = "title"
genre_field_name = "genre"
rating_field_name = "rating"

In [21]:
def load_vectors(client : Redis, n, d):
    """Store ``n`` hashes in Redis, each with a random vector and metadata fields.

    Hashes are keyed by the integers 1..n. Each one holds:
      * a random float32 vector of dimension ``d``, serialized to raw bytes;
      * a rating equal to ``10 * key / n``, so ratings grow with the key;
      * genre/title: "action, drama"/"spiderman" when the key is a multiple
        of 5, "action"/"matrix" otherwise.

    Args:
        client: Redis client used to issue one HSET per hash.
        n: number of hashes to create.
        d: dimension of each random vector.
    """
    for doc_id in range(1, n + 1):
        vector_blob = np.random.rand(1, d).astype(np.float32).tobytes()

        # Every fifth document is the "spiderman" one; the rest are "matrix".
        if doc_id % 5 == 0:
            genre, title = "action, drama", "spiderman"
        else:
            genre, title = "action", "matrix"

        fields = {
            vector_field_name: vector_blob,
            rating_field_name: 10 * (doc_id / n),
            genre_field_name: genre,
            title_field_name: title,
        }
        client.hset(doc_id, mapping=fields)
        
def delete_data(client: Redis):
    """Wipe the Redis server reachable through ``client`` by calling ``flushall()``.

    Args:
        client: Redis client connected to the server to clear.
    """
    # CAUTION: FLUSHALL is not scoped to this notebook's keys or index; only
    # run this against a disposable/test server.
    client.flushall()
        

### Create HNSW index with metadata

In [29]:
# Build the index: wrap the connection, clear the server, then declare the schema.
hnsw_index = redisearch.Client("my_hnsw_index", conn=redis_conn)
delete_data(redis_conn)

# HNSW attributes as a flat name/value list. FT.CREATE expects the number of
# attribute tokens right after the algorithm name, so it is derived from the list.
# NOTE(review): M from the configuration cell is not passed here - confirm intent.
hnsw_attributes = ['TYPE', 'FLOAT32', 'DIM', dim, 'DISTANCE_METRIC', 'L2', 'EF_RUNTIME', EF]
schema_args = [
    vector_field_name, 'VECTOR', 'HNSW', str(len(hnsw_attributes)), *hnsw_attributes,
    rating_field_name, 'NUMERIC',
    title_field_name, 'TEXT',
    genre_field_name, 'TAG',
]
hnsw_index.redis.execute_command('FT.CREATE', "my_hnsw_index", 'SCHEMA', *schema_args)

# Load the documents; the seed is fixed so the random vectors are reproducible.
np.random.seed(42)
load_vectors(hnsw_index.redis, n_vec, dim)

print("index size: ", hnsw_index.info()['num_docs'])

# Random query vector, drawn from the same seeded generator after the data vectors.
query_vector = np.random.rand(1, dim).astype(np.float32)

index size:  10000


## Hybrid queries examples

In [32]:
# Give me the top k reviews on action movies similar to mine

# Name of the field holding each hit's vector distance. It is derived from the
# vector field name so that sorting and reading the score always agree.
score_field = f'__{vector_field_name}_score'

q = (redisearch.Query(f'(@{genre_field_name}:{{action}})=>[KNN {k} @{vector_field_name} $vec_param]')
     .sort_by(score_field)
     .paging(0, k))  # request k rows explicitly; the default page holds only 10
res = hnsw_index.search(q, query_params = {'vec_param': query_vector.tobytes()})

docs = [int(doc.id) for doc in res.docs]
rs_dists = [float(getattr(doc, score_field)) for doc in res.docs]
print(docs)
print(rs_dists)

[8770, 9386, 83, 5126, 9572, 3492, 6268, 3949, 4437, 1057]
[13.3469600677, 14.5484476089, 14.7082948685, 14.7560749054, 14.837141037, 14.9124708176, 15.2173652649, 15.3307313919, 15.3791847229, 15.4887781143]


In [33]:
# Give me the top k reviews on action movies similar to mine that got ratings between 5-7.
# (ids 5000-7000)

# Name of the field holding each hit's vector distance. It is derived from the
# vector field name so that sorting and reading the score always agree.
score_field = f'__{vector_field_name}_score'

q = (redisearch.Query(f'(@{genre_field_name}:{{action}} @{rating_field_name}:[5 7])=>[KNN {k} @{vector_field_name} $vec_param]')
     .sort_by(score_field)
     .paging(0, k))  # request k rows explicitly; the default page holds only 10
res = hnsw_index.search(q, query_params = {'vec_param': query_vector.tobytes()})

docs = [int(doc.id) for doc in res.docs]
rs_dists = [float(getattr(doc, score_field)) for doc in res.docs]
print(docs)
print(rs_dists)

[5126, 6268, 5390, 5085, 6741, 6251, 5239, 5487, 5194, 5595]
[14.7560749054, 15.2173652649, 15.793586731, 15.8196582794, 15.8495740891, 15.8533287048, 16.0165462494, 16.0417747498, 16.0750141144, 16.2356395721]


In [34]:
# Give me the top k reviews on the spiderman movie that are similar to mine and got ratings between 5-7.
# (ids in 5000-7000 that are divisible by 5)

# Name of the field holding each hit's vector distance. It is derived from the
# vector field name so that sorting and reading the score always agree.
score_field = f'__{vector_field_name}_score'

q = (redisearch.Query(f'(@{title_field_name}:spiderman @{rating_field_name}:[5 7])=>[KNN {k} @{vector_field_name} $vec_param]')
     .sort_by(score_field)
     .paging(0, k))  # request k rows explicitly; the default page holds only 10
res = hnsw_index.search(q, query_params = {'vec_param': query_vector.tobytes()})

docs = [int(doc.id) for doc in res.docs]
rs_dists = [float(getattr(doc, score_field)) for doc in res.docs]
print(docs)
print(rs_dists)

[5390, 5085, 5595, 6695, 6285, 5765, 6595, 5795, 5790, 5550]
[15.793586731, 15.8196582794, 16.2356395721, 16.4198703766, 16.419916153, 16.4874362946, 16.5903587341, 16.657459259, 16.6816978455, 16.7862262726]


In [35]:
# Give me the top k reviews on action movies which aren't spiderman that are similar to mine.
# (all ids which are not divisible by 5)

# Name of the field holding each hit's vector distance. It is derived from the
# vector field name so that sorting and reading the score always agree.
score_field = f'__{vector_field_name}_score'

q = (redisearch.Query(f'(@{genre_field_name}:{{action}} -@{title_field_name}:spider*)=>[KNN {k} @{vector_field_name} $vec_param]')
     .sort_by(score_field)
     .paging(0, k))  # request k rows explicitly; the default page holds only 10
res = hnsw_index.search(q, query_params = {'vec_param': query_vector.tobytes()})

docs = [int(doc.id) for doc in res.docs]
rs_dists = [float(getattr(doc, score_field)) for doc in res.docs]
print(docs)
print(rs_dists)

[9386, 83, 5126, 9572, 3492, 6268, 3949, 4437, 1057, 557]
[14.5484476089, 14.7082948685, 14.7560749054, 14.837141037, 14.9124708176, 15.2173652649, 15.3307313919, 15.3791847229, 15.4887781143, 15.4977865219]


In [37]:
# Give me the top k reviews which are on the "spiderman" movie, or on movies with a rating of at least 9.
# (ids which are divisible by 5, or are 9000 and above)

# Name of the field holding each hit's vector distance. It is derived from the
# vector field name so that sorting and reading the score always agree.
score_field = f'__{vector_field_name}_score'

q = (redisearch.Query(f'((@{title_field_name}:spiderman) | (@{rating_field_name}:[9 inf]))=>[KNN {k} @{vector_field_name} $vec_param]')
     .sort_by(score_field)
     .paging(0, k))  # request k rows explicitly; the default page holds only 10
res = hnsw_index.search(q, query_params = {'vec_param': query_vector.tobytes()})

docs = [int(doc.id) for doc in res.docs]
rs_dists = [float(getattr(doc, score_field)) for doc in res.docs]
print(docs)
print(rs_dists)

[8770, 9386, 9572, 8400, 9396, 3655, 9526, 9353, 5390, 5085]
[13.3469600677, 14.5484476089, 14.837141037, 15.4953155518, 15.6169643402, 15.6970911026, 15.7229309082, 15.7777109146, 15.793586731, 15.8196582794]
