# Hybrid search

Now that all the documents are indexed, we can finally start running searches.

In [1]:
from qdrant_client import QdrantClient, models

# Connect to the Qdrant instance at localhost:6333. The client timeout is raised to 600
# (presumably seconds — confirm against the qdrant-client documentation).
client = QdrantClient("http://localhost:6333", timeout=600)
# Last expression of the cell: displays how many points the "scifact" collection holds.
client.count("scifact")

  client = QdrantClient("http://localhost:6333", timeout=600)


CountResult(count=5183)

In [2]:
from fastembed import TextEmbedding, SparseTextEmbedding

# FastEmbed models used to embed the queries: one dense model and one sparse BM25 model.
# NOTE(review): these presumably must match the models used when the collection was indexed — confirm.
dense_embedding_model = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
bm25_embedding_model = SparseTextEmbedding("Qdrant/bm25")

In [3]:
# Sample free-text query; it is embedded with the dense model in the query_points call below.
query_text = "What is the impact of COVID-19 on the environment?"

The new Query API, available in Qdrant 1.10+, unifies all the search operations that can be performed on a collection.

In [4]:
# Please notice that we started using a brand new .query_points method.
# Before Qdrant 1.10 we used .query method instead.
#
# query_embed() is consumed with next(), so only its first yielded embedding is used.
# `using` selects the named vector to search, and with_payload=False means the returned
# points carry only ids and scores (payload=None in the output).
client.query_points(
    "scifact",
    query=next(dense_embedding_model.query_embed(query_text)),
    using="all-MiniLM-L6-v2",
    limit=10,
    with_payload=False,
)

QueryResponse(points=[ScoredPoint(id=13770184, version=448, score=0.37506568, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=2097256, version=78, score=0.36063462, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=13882658, version=453, score=0.35937387, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=1215116, version=42, score=0.35598302, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=25897733, version=765, score=0.35456836, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=22401061, version=661, score=0.34815577, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=27866735, version=803, score=0.3396923, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=11936877, version=397, score=0.33812556, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=581832, version=21, scor

## Benchmarking

BeIR SciFact isn't designed for demo purposes, but we can use it as a reference to benchmark how different methods deal with the same task. Let's load the ground truth and evaluate the performance of various search pipelines.

In [5]:
from datasets import load_dataset

# Load the query split of the BeIR SciFact benchmark.
queries_dataset = load_dataset("BeIR/scifact", "queries", split="queries")
# Preview the first ten queries (columns: _id, title, text). A bare `len(queries_dataset)`
# used to sit before this line, but a notebook cell only displays its last expression,
# so that value was never shown; the dead statement has been dropped.
queries_dataset[0:10]

{'_id': ['0', '2', '4', '6', '9', '10', '11', '12', '14', '15'],
 'title': ['', '', '', '', '', '', '', '', '', ''],
 'text': ['0-dimensional biomaterials lack inductive properties.',
  '1 in 5 million in UK have abnormal PrP positivity.',
  '1-1% of colorectal cancer patients are diagnosed with regional or distant metastases.',
  '10% of sudden infant death syndrome (SIDS) deaths happen in newborns aged less than 6 months.',
  '32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.',
  '4-PBA treatment decreases endoplasmic reticulum stress in response to general endoplasmic reticulum stress markers.',
  '4-PBA treatment raises endoplasmic reticulum stress in response to general endoplasmic reticulum stress markers.',
  '40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.',
  "5'-nucleotidase metabolizes 6MP.",
  '50% of patients exposed to radiation have activated marker

In [6]:
# Relevance judgements (qrels) for SciFact: one row per (query, document) pair.
query_qrels = load_dataset("BeIR/scifact-qrels", split="train")
# Displayed as the cell output: the number of judgement rows.
len(query_qrels)

919

In [7]:
# Slicing returns a column-oriented dict: 'query-id', 'corpus-id' and 'score' lists.
query_qrels[0:3]

{'query-id': [0, 2, 4],
 'corpus-id': [31715818, 13734012, 22942787],
 'score': [1, 1, 1]}

### Building the ground truth dataset

The ground truth is a dataset of the queries with their best matches. Each of the matches needs a relevance measure. In the simplest case, that might be just binary information: a document is either relevant to the query or not.

In [8]:
# Peek at a single judgement row: iterating yields one dict per row, and the
# break stops after the first one has been printed.
for entry in query_qrels:
    print(entry)
    break

{'query-id': 0, 'corpus-id': 31715818, 'score': 1}


In [9]:
from ranx import Qrels
from collections import defaultdict

# Nested mapping handed to ranx: query id -> {document id -> relevance score}.
# Both id levels are converted to strings, the same way the run dictionaries
# further down stringify their query and point ids.
qrels_dict = defaultdict(dict)
for entry in query_qrels:
    query_id, doc_id = (str(entry[key]) for key in ("query-id", "corpus-id"))
    qrels_dict[query_id][doc_id] = entry["score"]

qrels = Qrels(qrels_dict, name="scifact")
qrels

DictType[unicode_type,DictType[[unichr x 9],int64]<iv=None>]<iv=None>({0: {31715818: 1}, 10: {32587939: 1}, 1000: {16472469: 1}, 1001: {5702790: 1}, 1002: {13639330: 1}, 1003: {14332945: 1, 4319844: 1, 4899981: 1}, 1004: {301838: 1, 2734421: 1, 3952288: 1}, 1005: {301838: 1, 2734421: 1, 3952288: 1}, 1006: {4926049: 1}, 1008: {2547636: 1}, 1009: {1982286: 1}, 1011: {9745001: 1}, 1015: {6277638: 1}, 1016: {6277638: 1}, 1018: {11603066: 1}, 1023: {16927286: 1}, 1025: {32408470: 1}, 1026: {3113630: 1}, 1027: {3113630: 1}, 1028: {13923140: 1, 11899391: 1}, 1030: {6441369: 1}, 1031: {12486491: 1}, 1032: {6836086: 1}, 1033: {6836086: 1}, 1034: {4547102: 1}, 1035: {4547102: 1}, 1036: {4547102: 1}, 1037: {16287725: 1}, 1038: {16287725: 1}, 104: {40164383: 1}, 1040: {25254425: 1, 16626264: 1}, 1042: {17421851: 1}, 1043: {17671145: 1}, 1044: {22500262: 1}, 1045: {22500262: 1}, 1046: {418246: 1, 4324278: 1, 16712164: 1}, 1047: {14706752: 1}, 1048: {12486491: 1}, 105: {36606083: 1}, 1050: {19878070

## Precalculating the embeddings

We'll test various hybrid pipelines, so it is a good idea to precompute all the query vectors beforehand.

In [10]:
import tqdm

# Embed every query once, so each pipeline below can look its vectors up by position.
# NOTE(review): late_vectors is created here but never filled in the visible cells.
dense_vectors, sparse_vectors, late_vectors = [], [], []
for query in tqdm.tqdm(queries_dataset):
    # Dense embedding first, then sparse; each query_embed() call yields one vector per text.
    dense_query_vector, sparse_query_vector = (
        next(embedding_model.query_embed(query["text"]))
        for embedding_model in (dense_embedding_model, bm25_embedding_model)
    )

    dense_vectors.append(dense_query_vector)
    sparse_vectors.append(sparse_query_vector)


100%|██████████| 1109/1109 [00:03<00:00, 278.42it/s]


## Testing various search pipelines

The ground truth dataset is ready, so we can start with calculating the effectiveness of each of our search methods separately.

### Dense embeddings

In [20]:
from ranx import Run

# Dense-only baseline: top-5 neighbours per query on the "all-MiniLM-L6-v2" named vector.
# run_dict maps query id -> {point id -> similarity score}, with both ids as strings.
run_dict = {}
for query_idx, query in enumerate(queries_dataset):
    query_id, query_vector = str(query["_id"]), dense_vectors[query_idx]

    results = client.query_points(
        collection_name="scifact",
        query=query_vector,
        using="all-MiniLM-L6-v2",
        limit=5,
        with_payload=False,
    )

    run_dict[query_id] = dict(
        (str(point.id), point.score) for point in results.points
    )

dense_run = Run(run_dict, name="all-MiniLM-L6-v2")

In [21]:
from ranx import evaluate

# Score the dense run against the ground truth at cutoff 5.
# NOTE(review): make_comparable=True presumably aligns the run's query set with the
# qrels (the run covers more queries than have judgements) — confirm in the ranx docs.
evaluate(qrels, dense_run, metrics=["precision@5", "mrr@5"], make_comparable=True)

{'precision@5': 0.1517923362175525, 'mrr@5': 0.5762875978574372}

### Sparse embeddings

In [24]:
# Sparse-only baseline: top-5 matches per query on the "bm25" named vector.
run_dict = {}
for query_idx, query in enumerate(queries_dataset):
    query_id, query_vector = str(query["_id"]), sparse_vectors[query_idx]

    # The precomputed sparse embedding is unpacked into Qdrant's SparseVector model.
    results = client.query_points(
        collection_name="scifact",
        query=models.SparseVector(**query_vector.as_object()),
        using="bm25",
        limit=5,
        with_payload=False,
    )

    run_dict[query_id] = dict(
        (str(point.id), point.score) for point in results.points
    )

bm25_run = Run(run_dict, name="bm25")
evaluate(qrels, bm25_run, metrics=["precision@5", "mrr@5"], make_comparable=True)

{'precision@5': 0.16093943139678615, 'mrr@5': 0.6474660074165636}

### Reciprocal Rank Fusion

### Hybrid Search - Dense & sparse

In [25]:
# Hybrid pipeline: prefetch 10 candidates from the dense vector and 10 from the sparse
# one, fuse both candidate lists with Reciprocal Rank Fusion and keep the top 5.
hybrid_search_run_dict = {}
hybrid_search_result = {}  # placeholder; rebound to the query response on every iteration
for query_idx, query in enumerate(queries_dataset):
    query_id = str(query["_id"])
    dense_query_vector, sparse_query_vector = (
        dense_vectors[query_idx],
        sparse_vectors[query_idx],
    )

    dense_prefetch = models.Prefetch(
        query=dense_query_vector,
        using="all-MiniLM-L6-v2",
        limit=10,
    )
    sparse_prefetch = models.Prefetch(
        query=models.SparseVector(**sparse_query_vector.as_object()),
        using="bm25",
        limit=10,
    )
    prefetch = [dense_prefetch, sparse_prefetch]

    hybrid_search_result = client.query_points(
        collection_name="scifact",
        prefetch=prefetch,
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=5,
        with_payload=False,
    )

    hybrid_search_run_dict[query_id] = dict(
        (str(point.id), point.score) for point in hybrid_search_result.points
    )

rrf_run = Run(hybrid_search_run_dict, name="rrf")
evaluate(qrels, rrf_run, metrics=["precision@5", "mrr@5"], make_comparable=True)

{'precision@5': 0.17107540173053148, 'mrr@5': 0.660877626699629}

## Reranking


In [26]:
from datasets import load_dataset
import concurrent.futures
from tqdm import tqdm
# Load the datasets
documents_dataset = load_dataset("BeIR/scifact", "corpus", split="corpus")

import threading

# Loaded cross-encoders, cached per thread and keyed by model name. The previous
# implementation built a new CrossEncoder on every call, i.e. once per query.
# A per-thread cache keeps the old property that no model instance is shared
# between threads while loading each model at most once per worker thread.
_cross_encoder_cache = threading.local()


def _get_cross_encoder(model_name):
    """Return the calling thread's CrossEncoder for ``model_name``, loading it on first use."""
    # Imported lazily so that defining these functions does not require the package.
    from sentence_transformers import CrossEncoder

    by_name = getattr(_cross_encoder_cache, "by_name", None)
    if by_name is None:
        by_name = _cross_encoder_cache.by_name = {}
    if model_name not in by_name:
        by_name[model_name] = CrossEncoder(model_name)
    return by_name[model_name]


def rerank(pairs, model_name='cross-encoder/ms-marco-TinyBERT-L-4'):
    """Score (query, document) text pairs with a cross-encoder.

    Args:
        pairs: Sequence of two-item (query text, document text) pairs.
        model_name: Name of the cross-encoder to use. Defaults to the model the
            notebook used originally, so existing calls behave as before.

    Returns:
        Whatever ``CrossEncoder.predict`` returns for ``pairs``: one score per pair,
        in the same order.
    """
    return _get_cross_encoder(model_name).predict(pairs)



def process_query(query_id, doc_scores, query_texts, document_texts):
    """Re-score the candidate documents of one query with the reranker.

    Args:
        query_id: Id of the query; used to look up its text and returned unchanged.
        doc_scores: Mapping whose keys are the candidate document ids. The existing
            values are not read.
        query_texts: Mapping of query id -> query text; a missing id falls back to "".
        document_texts: Mapping of document id -> document text; a missing id falls back to "".

    Returns:
        A ``(query_id, scores)`` tuple, where ``scores`` maps each candidate document
        id to the score ``rerank`` produced for it, in the order of ``doc_scores``.
    """
    query_text = query_texts.get(query_id, "")
    candidate_ids = list(doc_scores.keys())

    pairs = []
    for doc_id in candidate_ids:
        pairs.append((query_text, document_texts.get(doc_id, "")))

    new_scores = rerank(pairs)
    return query_id, dict(zip(candidate_ids, new_scores))

def reranked_data(data):
    """Rerank every query's candidate documents and return the new scores.

    Query and document texts are read from the notebook-level ``queries_dataset``
    and ``documents_dataset``; each query is handed to ``process_query`` on a
    thread pool.

    Args:
        data: Mapping of query id -> {document id -> score}. It is only read;
            earlier versions overwrote its values in place, which silently turned
            the caller's retrieval results into reranked ones.

    Returns:
        A new dict with the same keys, in the same order as ``data``, mapping each
        query id to the {document id -> reranked score} dict from ``process_query``.
    """
    query_texts = {str(query["_id"]): query["text"] for query in queries_dataset}
    document_texts = {str(doc["_id"]): doc["text"] for doc in documents_dataset}

    reranked_by_query = {}
    # max_workers=None lets ThreadPoolExecutor pick its own default pool size.
    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        futures = [
            executor.submit(process_query, query_id, doc_scores, query_texts, document_texts)
            for query_id, doc_scores in data.items()
        ]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Reranking"):
            # future.result() re-raises any exception raised inside the worker.
            query_id, updated_scores = future.result()
            reranked_by_query[query_id] = updated_scores

    # Futures complete in arbitrary order; rebuild the result in the input's key order.
    return {query_id: reranked_by_query[query_id] for query_id in data}

# Dictionary to convert
# test_hybrid_search_run_dict = dict(list(hybrid_search_run_dict.items())[:10])
# 
# final_data = reranked_data(hybrid_search_run_dict)

In [27]:
# Same hybrid retrieval as the RRF run, but with a wider net: 20 candidates per
# prefetch and the 10 best fused points kept per query, which are then reranked.
reranker_dict = {}
for query_idx, query in enumerate(queries_dataset):
    query_id = str(query["_id"])
    dense_query_vector, sparse_query_vector = (
        dense_vectors[query_idx],
        sparse_vectors[query_idx],
    )

    dense_prefetch = models.Prefetch(
        query=dense_query_vector,
        using="all-MiniLM-L6-v2",
        limit=20,
    )
    sparse_prefetch = models.Prefetch(
        query=models.SparseVector(**sparse_query_vector.as_object()),
        using="bm25",
        limit=20,
    )
    prefetch = [dense_prefetch, sparse_prefetch]

    hybrid_search_result = client.query_points(
        collection_name="scifact",
        prefetch=prefetch,
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=10,
        with_payload=False,
    )

    reranker_dict[query_id] = dict(
        (str(point.id), point.score) for point in hybrid_search_result.points
    )

# Replace the fusion scores with cross-encoder scores for every query.
final_data = reranked_data(reranker_dict)



Reranking: 100%|██████████| 1109/1109 [34:33<00:00,  1.87s/it]   


In [None]:

# Example usage
# data = {'378': {'45154987': 1, '10534299': 1, '11886686': 1, '25007443': 1, '17150648': 1}}

# final_data = reranked_data(final_data)

In [28]:
# Display the reranked scores: query id -> {document id -> cross-encoder score}.
# NOTE(review): this prints the whole dictionary (one entry per query); consider
# previewing only a few entries to keep the saved output small.
final_data

{'0': {'29638116': 0.0002508934,
  '26071782': 0.0017919432,
  '4346436': 0.001100046,
  '10608397': 0.0009521333,
  '17388232': 0.00049831654,
  '40212412': 0.02117403,
  '21257564': 0.00043697277,
  '3874000': 0.00029141858,
  '31715818': 0.0002977159,
  '34386619': 0.00039305777},
 '2': {'13734012': 0.33996016,
  '3413083': 0.00041565835,
  '22194407': 0.00096568436,
  '18617259': 0.11450823,
  '25806385': 0.00024510064,
  '841371': 0.7177285,
  '12438901': 0.00024263997,
  '21495419': 0.0002465573,
  '4828631': 0.0003360185,
  '21186109': 0.00030266328},
 '4': {'10958594': 0.73338246,
  '5641851': 0.033387672,
  '1387104': 0.074440576,
  '2058909': 0.15061732,
  '9285396': 0.00278608,
  '24980622': 0.09537921,
  '3899896': 0.0010424663,
  '52188256': 0.7640498,
  '39580129': 0.15457045,
  '9558539': 0.00077895634},
 '6': {'23117378': 0.87953436,
  '2613775': 0.980786,
  '35521287': 0.5926121,
  '20240998': 0.8876992,
  '21050357': 0.3153721,
  '4791384': 0.001892254,
  '27099731': 

In [30]:
# Wrap the reranked scores in a ranx Run and score it with the same metrics
# (precision@5, mrr@5) as the dense, bm25 and rrf runs.
post_rerank_run = Run(final_data, name="post-rerank")
evaluate(qrels, post_rerank_run, metrics=["precision@5", "mrr@5"], make_comparable=True)

{'precision@5': 0.169097651421508, 'mrr@5': 0.6171817058096415}

In [31]:
from ranx import compare

# Side-by-side report of all four runs against the same ground truth, at cutoff 5.
compare(
    qrels=qrels,
    runs=[
        dense_run,
        bm25_run,
        rrf_run,
        post_rerank_run,
    ],
    metrics=["precision@5", "recall@5", "mrr@5", "dcg@5", "ndcg@5"],
)

#    Model             P@5      Recall@5    MRR@5    DCG@5     NDCG@5
---  ----------------  -------  ----------  -------  --------  --------
a    all-MiniLM-L6-v2  0.152    0.682       0.576    0.634     0.592
b    bm25              0.161    0.736ᵃ      0.647ᵃᵈ  0.701ᵃ    0.665ᵃ
c    rrf               0.171ᵃᵇ  0.774ᵃᵇ     0.661ᵃᵈ  0.725ᵃᵇᵈ  0.682ᵃᵈ
d    post-rerank       0.169ᵃᵇ  0.764ᵃᵇ     0.617ᵃ   0.689ᵃ    0.648ᵃ

In [None]:
from sentence_transformers import CrossEncoder

import threading

# Loaded cross-encoders, cached per thread and keyed by model name. The previous
# implementation built a new CrossEncoder on every call. A per-thread cache means
# no model instance is shared between threads while each model is loaded at most
# once per thread.
_cross_encoder_cache = threading.local()


def _get_cross_encoder(model_name):
    """Return the calling thread's CrossEncoder for ``model_name``, loading it on first use."""
    # Imported lazily so that defining these functions does not require the package.
    from sentence_transformers import CrossEncoder

    by_name = getattr(_cross_encoder_cache, "by_name", None)
    if by_name is None:
        by_name = _cross_encoder_cache.by_name = {}
    if model_name not in by_name:
        by_name[model_name] = CrossEncoder(model_name)
    return by_name[model_name]


def rerank(pairs, model_name='cross-encoder/ms-marco-TinyBERT-L-4'):
    """Score (query, document) text pairs with a cross-encoder.

    Args:
        pairs: Sequence of two-item (query text, document text) pairs.
        model_name: Name of the cross-encoder to use. Defaults to the model the
            notebook used originally, so existing calls behave as before.

    Returns:
        Whatever ``CrossEncoder.predict`` returns for ``pairs``: one score per pair,
        in the same order.
    """
    return _get_cross_encoder(model_name).predict(pairs)


# Smoke test: score one question against an unrelated reply ('hi') and against a passage about pandas.
scores = rerank(pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']])

In [None]:
# Display the two scores returned by the rerank() call above, in pair order.
scores

In [None]:
# Scratch cell: runs the hybrid (dense + bm25, RRF-fused) query for the FIRST query
# only, to inspect the raw response. The loop prints the response and then breaks,
# so rerank_run_dict ends up with a single entry.
rerank_run_dict = {}
# NOTE(review): hs_results is never used in this cell.
hs_results = {}
for query_idx, query in enumerate(queries_dataset):
    query_id = str(query["_id"])

    dense_query_vector = dense_vectors[query_idx]
    sparse_query_vector = sparse_vectors[query_idx]

    # Up to 20 candidates from each vector are fetched before fusion.
    prefetch = [
        models.Prefetch(
            query=dense_query_vector,
            using="all-MiniLM-L6-v2",
            limit=20,
        ),
        models.Prefetch(
            query=models.SparseVector(**sparse_query_vector.as_object()),
            using="bm25",
            limit=20,
        ),
    ]
    # Only the top 2 fused points are requested here.
    results = client.query_points(
        "scifact",
        prefetch=prefetch,
        query=models.FusionQuery(
            fusion=models.Fusion.RRF,
        ),
        with_payload=False,
        limit=2,
    )
    print(results)
    rerank_run_dict[query_id] = {
        str(point.id): point.score
        for point in results.points
    }
    # Stop after the first query: this cell is for inspection, not a full run.
    break

rerank_run_dict
# rerank_run = Run(rerank_run_dict, name="post-rerank")
# evaluate(qrels, rerank_run, metrics=["precision@10", "mrr@10"], make_comparable=True)