In [52]:
import json
import numpy as np
from fastembed import SparseTextEmbedding, TextEmbedding, SparseEmbedding, LateInteractionTextEmbedding
from typing import List
from tokenizers import Tokenizer
from sentence_transformers import CrossEncoder
SparseTextEmbedding.list_supported_models()

[{'model': 'prithivida/Splade_PP_en_v1',
  'vocab_size': 30522,
  'description': 'Independent Implementation of SPLADE++ Model for English',
  'size_in_GB': 0.532,
  'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'},
  'model_file': 'model.onnx'},
 {'model': 'prithvida/Splade_PP_en_v1',
  'vocab_size': 30522,
  'description': 'Independent Implementation of SPLADE++ Model for English',
  'size_in_GB': 0.532,
  'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'},
  'model_file': 'model.onnx'},
 {'model': 'Qdrant/bm42-all-minilm-l6-v2-attentions',
  'vocab_size': 30522,
  'description': 'Light sparse embedding model, which assigns an importance score to each token in the text',
  'size_in_GB': 0.09,
  'sources': {'hf': 'Qdrant/all_miniLM_L6_v2_with_attentions'},
  'model_file': 'model.onnx',
  'additional_files': ['stopwords.txt'],
  'requires_idf': True},
 {'model': 'Qdrant/bm25',
  'description': 'BM25 as sparse embeddings meant to be used with Qdrant',
  'size_in_GB': 0.01,
  'sources': {'hf': 'Qdra

In [53]:
documents = [
    "Hell yeah",
    "I am not sure about that",
    "I am good",
    "Hmm, no",
    "I wasn't going to do that",
]

queries = ["Yes", "No", "Maybe"]

In [65]:
model_name = "prithivida/Splade_PP_en_v1"
# This triggers the model download
model = SparseTextEmbedding(model_name=model_name)
dense_embedding_model = TextEmbedding()
late_interaction_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")
cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2', max_length=512)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [55]:
colbert_document_embeddings_list: List = list(
    late_interaction_model.embed(documents, batch_size=6)
)
colbert_query_embeddings_list: List = list(
    late_interaction_model.embed(queries, batch_size=6)
)

In [56]:
colbert_document_embeddings_list[0].shape, colbert_query_embeddings_list[0].shape

((11, 128), (4, 128))

In [57]:
dense_document_embeddings_list: List = list(
    dense_embedding_model.embed(documents, batch_size=6)
)
dense_query_embeddings_list: List = list(
    dense_embedding_model.embed(queries, batch_size=6)
)

In [58]:
dense_document_embeddings_list[0].shape, dense_query_embeddings_list[0].shape

((384,), (384,))

In [59]:
sparse_document_embeddings_list: List[SparseEmbedding] = list(
    model.embed(documents, batch_size=6)
)
sparse_query_embeddings_list: List[SparseEmbedding] = list(
    model.embed(queries, batch_size=6)
)

In [60]:
len(sparse_document_embeddings_list[0].indices), len(sparse_query_embeddings_list[0].indices)

(16, 3)

In [61]:
index = 0
tokenizer = Tokenizer.from_pretrained(
    SparseTextEmbedding.list_supported_models()[0]["sources"]["hf"]
)

In [62]:
def get_tokens_and_weights(sparse_embedding, tokenizer):
    token_weight_dict = {}
    for i in range(len(sparse_embedding.indices)):
        token = tokenizer.decode([sparse_embedding.indices[i]])
        weight = sparse_embedding.values[i]
        token_weight_dict[token] = weight

    # Sort the dictionary by weights
    token_weight_dict = dict(
        sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True)
    )
    return token_weight_dict

def get_colbert_scores(query_embedding: np.array, document_embeddings: np.array):
    scores = np.matmul(query_embedding, document_embeddings.transpose(0, 2, 1))
    return scores



# Test the function with the first SparseEmbedding
print(f"Document: {documents[index]}")

print(
    "Representation:",
    json.dumps(
        get_tokens_and_weights(sparse_document_embeddings_list[index], tokenizer),
        indent=2,
    ),
)

print(f"Dense Document Representation: Array of length {len(dense_document_embeddings_list[index])}")
print(f"Colbert Document Representation: Tensor of shape {colbert_document_embeddings_list[index].shape}")
print(f"Query: {queries[index]}")

print(
    "Representation:",
    json.dumps(
        get_tokens_and_weights(sparse_query_embeddings_list[index], tokenizer), indent=2
    ),
)

print(f"Dense Query Representation: Array of length {len(dense_query_embeddings_list[index])}")
print(f"Colbert Query Representation: Tensor of shape {colbert_query_embeddings_list[index].shape}")

Document: Hell yeah
Representation: {
  "hell": 2.937955617904663,
  "yeah": 2.629483938217163,
  "yes": 2.186044931411743,
  "fuck": 1.5486094951629639,
  "heck": 1.4398324489593506,
  "god": 1.3642619848251343,
  "no": 1.317369818687439,
  "shit": 1.1750346422195435,
  "heaven": 1.0905754566192627,
  "sure": 0.9624558687210083,
  "damn": 0.7844801545143127,
  "what": 0.7418169379234314,
  "yep": 0.5640624165534973,
  "religion": 0.25662943720817566,
  "good": 0.06601270288228989,
  "sex": 0.02037149667739868
}
Dense Document Representation: Array of length 384
Colbert Document Representation: Tensor of shape (11, 128)
Query: Yes
Representation: {
  "yes": 2.6260664463043213,
  "yeah": 1.5697352886199951,
  "no": 1.314880609512329
}
Dense Query Representation: Array of length 384
Colbert Query Representation: Tensor of shape (4, 128)


In [67]:
colbert_tokenizer = Tokenizer.from_pretrained("colbert-ir/colbertv2.0")


tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [63]:
def compute_relevance_scores(query_embedding: np.array, document_embeddings: np.array, k: int):
    scores = np.matmul(query_embedding, document_embeddings.transpose(0, 2, 1))

    # Apply max-pooling across document terms (axis=2) to find the max similarity per query term
    # Shape after max-pool: [num_documents, num_query_terms]
    max_scores_per_query_term = np.max(scores, axis=2)

    # Sum the scores across query terms to get the total score for each document
    # Shape after sum: [num_documents]
    total_scores = np.sum(max_scores_per_query_term, axis=1)

    # Sort the documents based on their total scores and get the indices of the top-k documents
    sorted_indices = np.argsort(total_scores)[::-1][:k]

    return sorted_indices

In [72]:
def get_colbert_token_scores(query, document, query_embedding, document_embedding, tokenizer):
    # Tokenize query and document
    query_tokens = tokenizer.encode(query).tokens
    doc_tokens = tokenizer.encode(document).tokens
    
    # Remove CLS and SEP tokens
    query_tokens = query_tokens[1:-1]  # Remove first and last token
    doc_tokens = doc_tokens[1:-1]  # Remove first and last token
    
    # Calculate scores
    scores = np.matmul(query_embedding[1:-1], document_embedding[1:-1].T)
    
    # Create a matrix of scores
    score_matrix = []
    for i, q_token in enumerate(query_tokens):
        row = []
        for j, d_token in enumerate(doc_tokens):
            row.append((d_token, scores[i, j]))
        score_matrix.append((q_token, row))
    
    return score_matrix

# Example usage
query_index = 1
doc_index = 1

query = queries[query_index]
document = documents[doc_index]
query_embedding = colbert_query_embeddings_list[query_index]
document_embedding = colbert_document_embeddings_list[doc_index]

token_scores = get_colbert_token_scores(query, document, query_embedding, document_embedding, colbert_tokenizer)

# Print the results
print(f"Query: {query}")
print(f"Document: {document}")
print("\nToken Interaction Scores:")
for q_token, d_scores in token_scores:
    print(f"\nQuery Token: {q_token}")
    for d_token, score in d_scores:
        print(f"Doc Token: {d_token}: {score:.4f}")

Query: No
Document: I am not sure about that

Token Interaction Scores:

Query Token: no
Doc Token: i: 0.5062
Doc Token: am: 0.3855
Doc Token: not: 0.4358
Doc Token: sure: 0.3073
Doc Token: about: 0.3195
Doc Token: that: 0.3722


In [64]:
scores = cross_encoder_model.predict([('How many people live in Berlin?', 'Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.'), 
                        ('How many people live in Berlin?', 'Berlin is well known for its museums.')])
scores

array([ 7.1523714, -6.287037 ], dtype=float32)