jupyter kernelgateway --KernelGatewayApp.api=kernel_gateway.notebook_http --KernelGatewayApp.seed_uri=./hybrid_search_api.ipynb --port=10100

In [57]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
from sentence_transformers import SentenceTransformer

2023-06-10 20:17:24.749802: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-10 20:17:24.898055: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-10 20:17:25.406298: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-06-10 20:17:25.406356: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [1]:
from typesense import Client

In [2]:
# Connection settings are read from the environment when present so that real
# credentials never have to be committed with the notebook. Each fallback is
# the value that was previously hard-coded, so local development is unchanged.
client = Client({
    'nodes': [{
        'host': os.environ.get('TYPESENSE_HOST', 'localhost'),    # For Typesense Cloud use xxx.a1.typesense.net
        'port': os.environ.get('TYPESENSE_PORT', '8108'),         # For Typesense Cloud use 443
        'protocol': os.environ.get('TYPESENSE_PROTOCOL', 'http')  # For Typesense Cloud use https
    }],
    'api_key': os.environ.get('TYPESENSE_API_KEY', 'xyz'),
    'connection_timeout_seconds': 10
})

In [6]:
# Ask the Typesense client for its health status before any query is issued
# (this returned True when the cell was last executed).
client.operations.is_healthy()

True

In [6]:
# Shared embedding model: dense_query and densify_sparse both call encoder.encode().
encoder = SentenceTransformer('all-MiniLM-L6-v2')

In [26]:
def wrrf(subj, obj, alpha=0.5, C=1, k=1, s_e=1, o_e=1, c_e=1):
    """Combine two rank values into a single fusion score.

    The score is the sum of three terms:

    * ``alpha * (1 / (k + subj)) ** s_e``
    * ``(1 - alpha) * (1 / (k + obj)) ** o_e``
    * ``C * ((1 / (sqrt(k) + subj)) * (1 / (sqrt(k) + obj))) ** c_e``

    Smaller rank values therefore produce larger scores.

    Args:
        subj: First rank value (callers in this notebook pass the sparse rank).
        obj: Second rank value (callers in this notebook pass the dense rank).
        alpha: Weight of the ``subj`` term; the ``obj`` term gets ``1 - alpha``.
        C: Weight of the joint (product) term.
        k: Offset added to each rank; its square root offsets the joint term.
        s_e, o_e, c_e: Exponents applied to the three terms respectively.

    Returns:
        The fused score, as a number or as an array-like when given array-likes.
    """
    root_k = np.sqrt(k)
    subj_term = (1/(k+subj))**s_e
    obj_term = (1/(k+obj))**o_e
    joint_term = ((1/(root_k+subj))*(1/(root_k+obj)))**c_e
    return alpha*subj_term + (1-alpha)*obj_term + C*joint_term

In [114]:
def dense_query(query, include_fields='publication,title,content,id', index='news', top_k=50):
    """Run a vector search for ``query`` and return the hits as a DataFrame.

    The query text is embedded with the module-level ``encoder`` and sent to
    Typesense as a single-entry multi-search against the ``vec`` field.

    Args:
        query: Query text to embed.
        include_fields: Comma-separated document fields to return.
        index: Name of the collection to search.
        top_k: Used both as the vector query's ``k`` and as ``per_page``.

    Returns:
        A DataFrame with one row per hit: the returned document fields plus a
        ``dense_score`` column holding the hit's ``vector_distance``.
        NOTE(review): as a distance, smaller presumably means closer - confirm.
    """
    query_embedding = encoder.encode(query).tolist()
    vector_search = {
        'collection': index,
        'q' : '*',
        'vector_query': f'vec:({query_embedding}, k:{top_k})',
        'include_fields' : include_fields,
        'per_page': top_k
    }

    # A single search is submitted, so only the first result set is read back.
    response = client.multi_search.perform({'searches': [vector_search]}, {})
    hits = response['results'][0]['hits']

    rows = []
    for hit in hits:
        row = hit['document']
        row['dense_score'] = hit['vector_distance']
        rows.append(row)
    return pd.DataFrame(rows)

def sparse_query(query, include_fields='publication,title,content', index='news', top_k=50):
    """Run a keyword search for ``query`` and return the hits as a DataFrame.

    The search is performed over the ``content`` and ``title`` fields of the
    given collection, with the ``vec`` field excluded from the response.

    Args:
        query: Query text.
        include_fields: Comma-separated document fields to return.
        index: Name of the collection to search.
        top_k: Number of hits requested (sent as ``per_page``).

    Returns:
        A DataFrame with one row per hit: the returned document fields plus a
        ``sparse_score`` column holding the hit's ``text_match`` value.
    """
    response = client.collections[index].documents.search({
        'q'         : query,
        'query_by'  : 'content,title',
        'exclude_fields': 'vec',
        'include_fields': include_fields,
        'per_page' : top_k
    })
    hits = response['hits']

    rows = []
    for hit in hits:
        row = hit['document']
        row['sparse_score'] = hit['text_match']
        rows.append(row)

    return pd.DataFrame(rows)

def hybrid_query(query, include_fields='publication,title,content,id', index='news', top_k=250):
    """Fuse keyword and vector search results for ``query``.

    Both retrievers are queried with the same arguments, their hits are
    inner-joined on ``id`` (so only documents found by both survive), and the
    two per-retriever ranks are combined with ``wrrf``.

    Args:
        query: Query text.
        include_fields: Comma-separated document fields to return; must
            include ``id`` because the join is performed on it.
        index: Name of the collection to search.
        top_k: Number of hits requested from each retriever.

    Returns:
        A DataFrame holding the sparse hit columns plus ``dense_score``,
        ``sparse_rank``, ``dense_rank``, ``wrrf_score`` and ``wrrf_rank``
        (rank 1 is best). Rows keep the order of the sparse hits. The frame is
        empty when either retriever returns nothing.
    """
    sparse_results = sparse_query(query, include_fields=include_fields, index=index, top_k=top_k)
    dense_results = dense_query(query, include_fields=include_fields, index=index, top_k=top_k)

    # With no hits a retriever yields a column-less frame, so selecting 'id'
    # below would raise KeyError; an empty intersection is the right answer.
    if sparse_results.empty or dense_results.empty:
        return pd.DataFrame(columns=['sparse_score', 'dense_score', 'sparse_rank',
                                     'dense_rank', 'wrrf_score', 'wrrf_rank'])

    hybrid_results = pd.merge(sparse_results, dense_results[['id', 'dense_score']],
                              how='inner',
                              on='id')

    # text_match is a relevance score: higher is better, so rank it descending.
    hybrid_results['sparse_rank'] = (-1*hybrid_results['sparse_score']).rank()
    # dense_score is a vector *distance*: lower is better, so rank it ascending.
    # (Negating it, as done for similarities, would put the farthest hit first.)
    hybrid_results['dense_rank'] = hybrid_results['dense_score'].rank()
    # wrrf uses only element-wise arithmetic, so it can take whole columns;
    # this also stays well-defined when the join result has zero rows.
    hybrid_results['wrrf_score'] = wrrf(hybrid_results['sparse_rank'], hybrid_results['dense_rank'])
    hybrid_results['wrrf_rank'] = (-1*hybrid_results['wrrf_score']).rank()

    return hybrid_results

def densify_sparse(query, sparse_results):
    """Add embedding-similarity scores and fused ranks to keyword hits.

    The query and every row's ``content`` are embedded with the module-level
    ``encoder``; the cosine similarity between them becomes ``dense_score``.
    Both scores are then ranked (higher score = rank 1) and fused with ``wrrf``.

    Note: ``sparse_results`` is modified in place and also returned.

    Args:
        query: Query text.
        sparse_results: DataFrame with at least ``content`` and
            ``sparse_score`` columns.

    Returns:
        The same DataFrame with ``dense_score``, ``sparse_rank``,
        ``dense_rank``, ``wrrf_score`` and ``wrrf_rank`` columns added.
    """
    query_embedding = encoder.encode(query)
    doc_embeddings = encoder.encode(sparse_results['content'].values)

    # cosine_similarity returns a 1 x n_docs matrix; its only row is the scores.
    sparse_results['dense_score'] = cosine_similarity([query_embedding], doc_embeddings)[0]

    score_to_rank = (('sparse_score', 'sparse_rank'), ('dense_score', 'dense_rank'))
    for score_col, rank_col in score_to_rank:
        sparse_results[rank_col] = (-1*sparse_results[score_col]).rank()

    def _row_wrrf(row):
        return wrrf(row['sparse_rank'], row['dense_rank'])

    sparse_results['wrrf_score'] = sparse_results.apply(_row_wrrf, axis=1)
    sparse_results['wrrf_rank'] = (-1*sparse_results['wrrf_score']).rank()
    return sparse_results

def hybrid_reranking(query, include_fields='publication,title,content,id', index='news', top_k=250, n_results=50):
    """Retrieve keyword hits, rerank them with embeddings, and return the best.

    Args:
        query: Query text.
        include_fields: Comma-separated document fields to return; must
            include ``content`` because that field is what gets embedded.
        index: Name of the collection to search.
        top_k: Number of keyword candidates to retrieve and rerank.
        n_results: Maximum number of reranked rows to return.

    Returns:
        A DataFrame of at most ``n_results`` rows sorted by ``wrrf_rank``
        (best first), or an empty DataFrame when the keyword search has no hits.
    """
    sparse_results = sparse_query(query, include_fields=include_fields, index=index, top_k=top_k)

    # A hit-less search gives a column-less frame; densify_sparse would then
    # fail looking up 'content', so return the empty frame directly.
    if sparse_results.empty:
        return sparse_results

    reranked = densify_sparse(query, sparse_results)

    # Sort first, then truncate. Truncating first would keep only the leading
    # keyword hits and discard every other candidate regardless of how well
    # it scored after reranking.
    return reranked.sort_values(by=['wrrf_rank']).head(n_results)


def id_query(id, include_fields='publication,title,content', index='news', top_k=50):
    """Fetch a single document by id and return it as a Series.

    Args:
        id: Document id; converted to ``str`` before the lookup.
        include_fields: Unused; accepted so the signature matches the other
            query helpers.
        index: Name of the collection to read from.
        top_k: Unused; accepted so the signature matches the other query
            helpers.

    Returns:
        A ``pd.Series`` of the document's fields with the ``vec`` field removed.
    """
    # Honour the requested collection instead of always reading from 'news'.
    document = client.collections[index].documents[str(id)].retrieve()
    # Drop the embedding if present; documents without one must not raise.
    document.pop('vec', None)
    return pd.Series(document)

# Run the full hybrid pipeline once when this cell executes; the result is discarded.
# NOTE(review): presumably a warm-up / smoke test for the endpoints below - confirm.
_ = hybrid_query('trump inauguration')

# Fetch one document by a hard-coded id from the 'news' collection and discard it.
# NOTE(review): depends on document 204532 existing in the target collection - confirm.
_  =client.collections['news'].documents[str(204532)].retrieve()

In [None]:
# GET /query/by_id

# REQUEST is not defined in this notebook; it is expected to be supplied by the
# kernel gateway as a JSON string, which is decoded here.
req = json.loads(REQUEST)
# First value of the "query" argument; this endpoint uses it as the document id.
query = req['args']['query'][0]

results = id_query(query)

# id_query returns a Series, so this prints a single JSON object.
print(results.to_json())

In [None]:
# ResponseInfo GET /query/by_id
# Print the response metadata as JSON: HTTP status 200 and a JSON content type.
print(json.dumps({
    "status" : 200,
    "headers" : {
        "Content-Type" : "application/json"
    }
}))

In [None]:
# GET /query/sparse

# REQUEST is not defined in this notebook; it is expected to be supplied by the
# kernel gateway as a JSON string, which is decoded here.
req = json.loads(REQUEST)
# First value of the "query" argument is the search text.
query = req['args']['query'][0]

results = sparse_query(query)

# One JSON object per hit, printed as a JSON array.
print(results.to_json(orient='records'))

In [None]:
# ResponseInfo GET /query/sparse
# Print the response metadata as JSON: HTTP status 200 and a JSON content type.
print(json.dumps({
    "status" : 200,
    "headers" : {
        "Content-Type" : "application/json"
    }
}))

In [None]:
# GET /query/dense

# REQUEST is not defined in this notebook; it is expected to be supplied by the
# kernel gateway as a JSON string, which is decoded here.
req = json.loads(REQUEST)
# First value of the "query" argument is the search text.
query = req['args']['query'][0]

results = dense_query(query)

# One JSON object per hit, printed as a JSON array.
print(results.to_json(orient='records'))

In [None]:
# ResponseInfo GET /query/dense
# Print the response metadata as JSON: HTTP status 200 and a JSON content type.
print(json.dumps({
    "status" : 200,
    "headers" : {
        "Content-Type" : "application/json"
    }
}))

In [None]:
# GET /query/hybrid

# REQUEST is not defined in this notebook; it is expected to be supplied by the
# kernel gateway as a JSON string, which is decoded here.
req = json.loads(REQUEST)
# First value of the "query" argument is the search text.
query = req['args']['query'][0]

results = hybrid_query(query)
# One JSON object per fused hit, printed as a JSON array.
print(results.to_json(orient='records'))

In [None]:
# ResponseInfo GET /query/hybrid
# Print the response metadata as JSON: HTTP status 200 and a JSON content type.
print(json.dumps({
    "status" : 200,
    "headers" : {
        "Content-Type" : "application/json"
    }
}))

In [None]:
# POST /query/hybrid_rerank

# REQUEST is not defined in this notebook; it is expected to be supplied by the
# kernel gateway as a JSON string, which is decoded here.
req = json.loads(REQUEST)
# First value of the "query" argument is the search text.
# NOTE(review): this cell is annotated as POST yet reads the query from
# req['args'] exactly like the GET cells - confirm whether the request body
# was meant to be used instead.
query = req['args']['query'][0]

results = hybrid_reranking(query)
# One JSON object per reranked hit, printed as a JSON array.
print(results.to_json(orient='records'))

In [None]:
# ResponseInfo POST /query/hybrid_rerank
# Print the response metadata as JSON: HTTP status 200 and a JSON content type.
print(json.dumps({
    "status" : 200,
    "headers" : {
        "Content-Type" : "application/json"
    }
}))