In [None]:
!pip install rank_bm25

# BM-25 Keyword scoring

In [2]:
from rank_bm25 import BM25Okapi

corpus = [
    "The cat, commonly referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal.",
    "The dog is a domesticated descendant of the wolf.",
    "Humans are the most common and widespread species of primate, and the last surviving species of the genus Homo.",
    "The scientific name Felis catus was proposed by Carl Linnaeus in 1758"
]
tokenized_corpus = [doc.split(" ") for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

query = "The cat"
tokenized_query = query.split(" ")

doc_scores = bm25.get_scores(tokenized_query)

print(doc_scores)

[0.92061135 0.20898199 0.         0.18788848]


In [None]:
!pip install sentence-transformers

# Semantic Scoring with Cosine Similarity

In [7]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
corpus = [
    "The cat, commonly referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal.",
    "The dog is a domesticated descendant of the wolf.",
    "Humans are the most common and widespread species of primate, and the last surviving species of the genus Homo.",
    "The scientific name Felis catus was proposed by Carl Linnaeus in 1758"
]

# Calculate embeddings by calling model.encode()
document_embeddings = model.encode(corpus)

# Sanity check
print(document_embeddings.shape) #(4,384)



(4, 384)


In [29]:
query = "The cat"
query_embedding = model.encode(query)


from sentence_transformers.util import cos_sim

# Compute cosine_similarity between documents and query
scores = cos_sim(document_embeddings, query_embedding)

print(scores)

# query_embedding = model.encode("feline")

# scores = cos_sim(document_embeddings, query_embedding)
# print(scores)

tensor([[0.5716],
        [0.2904],
        [0.0942],
        [0.3157]])


In [30]:
scores.flatten().tolist()

[0.5715550780296326,
 0.29044675827026367,
 0.09418575465679169,
 0.31574833393096924]

In [31]:
np.argsort(scores.flatten().tolist())# from smalles to largest

array([2, 1, 3, 0])

In [32]:
np.argsort(scores.flatten().tolist())[::-1]

array([0, 3, 1, 2])

In [28]:
np.argsort(scores.flatten().tolist())[::-1] + 1

array([1, 4, 2, 3])

# Hybrid Approach

In [33]:
import numpy as np

def scores_to_ranking(scores: list[float]) -> list[int]:
    """Convert float scores into int rankings (rank 1 is the best)"""
    return np.argsort(scores)[::-1] + 1


def rrf(keyword_rank: int, semantic_rank: int) -> float:
    """Combine keyword rank and semantic rank into a hybrid score."""
    k = 60
    rrf_score = 1 / (k + keyword_rank) + 1 / (k + semantic_rank)
    return rrf_score

In [34]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def hybrid_search(
    query: str, corpus: list[str], encoder_model: SentenceTransformer
) -> list[int]:
    # bm25
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    tokenized_query = query.split(" ")
    bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_ranking = scores_to_ranking(bm25_scores)

    # embeddings
    document_embeddings = model.encode(corpus)
    query_embedding = model.encode(query)
    cos_sim_scores = cos_sim(document_embeddings, query_embedding).flatten().tolist()
    cos_sim_ranking = scores_to_ranking(cos_sim_scores)

    # combine rankings into RRF scores
    hybrid_scores = []
    for i, doc in enumerate(corpus):
        document_ranking = rrf(bm25_ranking[i], cos_sim_ranking[i])
        print(f"Document {i} has the rrf score {document_ranking}")
        hybrid_scores.append(document_ranking)

    # convert RRF scores into final rankings
    hybrid_ranking = scores_to_ranking(hybrid_scores)
    return hybrid_ranking

In [35]:
hybrid_ranking = hybrid_search(
    query="What is the scientifc name for cats?", corpus=corpus, encoder_model=model
)

Document 0 has the rrf score 0.03125
Document 1 has the rrf score 0.032266458495966696
Document 2 has the rrf score 0.03225806451612903
Document 3 has the rrf score 0.032266458495966696


# Scoring a sentence against corpus with cross-encoder (Ranking)

In [36]:
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder(
    "cross-encoder/ms-marco-TinyBERT-L-2-v2", max_length=512, device="cpu"
)

cross_encoder.rank(
    query="Do I like cats?",
    documents=["I like cats", "I like horses", "I like dogs"],
    return_documents=True,
)

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[{'corpus_id': 0, 'score': 9.002287, 'text': 'I like cats'},
 {'corpus_id': 2, 'score': -6.4110594, 'text': 'I like dogs'},
 {'corpus_id': 1, 'score': -9.557425, 'text': 'I like horses'}]