In [1]:
# Example: Combining sparse and dense retrieval results
# Install required libraries if needed
# !pip install rank_bm25 sentence-transformers

from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
import numpy as np

# --- Configuration -----------------------------------------------------------
# Knobs for the score fusion step. The defaults (equal weights, no rescaling)
# give the plain sum `dense + sparse`.
DENSE_WEIGHT = 1.0
SPARSE_WEIGHT = 1.0
# Cosine similarity lies in [-1, 1], whereas BM25 scores are not confined to a
# fixed range, so a raw sum lets whichever retriever has the larger numeric
# scale dominate. Set this to True to min-max rescale both score vectors to
# [0, 1] before they are combined.
NORMALIZE_SCORES = False


def min_max_normalize(scores):
    """Linearly rescale ``scores`` to the range [0, 1].

    Args:
        scores: 1-D array-like of numeric scores.

    Returns:
        A float64 NumPy array of the same shape. An empty input is returned
        as an empty array; an input whose values are all equal maps to all
        zeros (there is no spread to rescale).
    """
    values = np.asarray(scores, dtype=np.float64)
    if values.size == 0:
        return values
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return np.zeros_like(values)
    return (values - lowest) / spread


def combine_scores(dense, sparse, dense_weight=1.0, sparse_weight=1.0,
                   normalize=False):
    """Fuse dense and sparse retrieval scores with a weighted sum.

    Args:
        dense: Array-like of dense (embedding similarity) scores, one per doc.
        sparse: Array-like of sparse (BM25) scores, one per doc.
        dense_weight: Multiplier applied to the dense scores.
        sparse_weight: Multiplier applied to the sparse scores.
        normalize: If True, min-max rescale each score vector to [0, 1]
            before weighting so neither retriever dominates purely because
            of its numeric scale.

    Returns:
        NumPy array ``dense_weight * dense + sparse_weight * sparse``.

    Raises:
        ValueError: If the two score vectors do not have the same shape.
    """
    dense = np.asarray(dense)
    sparse = np.asarray(sparse)
    if dense.shape != sparse.shape:
        raise ValueError(
            f"Score shape mismatch: dense {dense.shape} vs sparse {sparse.shape}"
        )
    if normalize:
        dense = min_max_normalize(dense)
        sparse = min_max_normalize(sparse)
    return dense_weight * dense + sparse_weight * sparse


# --- Data ---------------------------------------------------------------------
corpus = [
    "annual revenue growth in europe",
    "europe sales stable last quarter",
    "quarterly report on growth and revenue"
]

query = "european quarterly revenue"

# --- Sparse retrieval with BM25 -------------------------------------------------
# Whitespace tokenisation on lower-cased text; the query is tokenised the same
# way so that its terms can match the indexed tokens exactly.
tokenized_corpus = [doc.lower().split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)
sparse_scores = bm25.get_scores(query.lower().split())

# --- Dense retrieval with sentence-transformers -------------------------------
model = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = model.encode(corpus, convert_to_tensor=True)
query_embedding = model.encode(query, convert_to_tensor=True)

# cos_sim returns a (1, n_docs) matrix for a single query; take row 0 and move
# it to a NumPy array so it can be combined with the BM25 scores.
dense_scores = util.cos_sim(query_embedding, doc_embeddings)[0].cpu().numpy()

# --- Fusion and ranking ---------------------------------------------------------
combined_scores = combine_scores(
    dense_scores,
    sparse_scores,
    dense_weight=DENSE_WEIGHT,
    sparse_weight=SPARSE_WEIGHT,
    normalize=NORMALIZE_SCORES,
)

# Negate so that argsort (ascending) yields highest-score-first order.
ranked_indices = np.argsort(-combined_scores)  # descending order

print("Query:", query)
print("Sparse Scores:", sparse_scores)
print("Dense Scores:", dense_scores)
print("Combined Scores:", combined_scores)
print("\nRanked Documents by combined score:")
for idx in ranked_indices:
    print(f"Doc {idx}: {corpus[idx]}")


Query: european quarterly revenue
Sparse Scores: [0.07075497 0.         0.54872484]
Dense Scores: [0.7896244 0.6044986 0.6067592]
Combined Scores: [0.86037936 0.60449862 1.15548403]

Ranked Documents by combined score:
Doc 2: quarterly report on growth and revenue
Doc 0: annual revenue growth in europe
Doc 1: europe sales stable last quarter
