In [52]:
import numpy as np

from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore
import qdrant_client
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.retrievers.fusion_retriever import QueryFusionRetriever

In [64]:
client = QdrantClient(host="localhost", port=6333)
aclient = AsyncQdrantClient(host="localhost", port=6333)

vector_store = QdrantVectorStore(
    collection_name="my_collection",
    client=client,
    aclient=aclient,
    enable_hybrid=True,
    batch_size=20,  # controls sparse batch processing
    fastembed_sparse_model="prithivida/Splade_PP_en_v1",
)

In [65]:
from fastembed import SparseTextEmbedding
for i in SparseTextEmbedding.list_supported_models():
    print(i['model'])


prithivida/Splade_PP_en_v1
prithvida/Splade_PP_en_v1
Qdrant/bm42-all-minilm-l6-v2-attentions
Qdrant/bm25
Qdrant/minicoil-v1


In [66]:
dense_retriever = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    # Optional but recommended:
    normalize=True,       # cosine similarity friendly
    embed_batch_size=32,  # tune for your hardware
)

In [67]:
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-6-v2",
    top_n=3,
)

In [68]:
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=dense_retriever,
)

In [69]:
# 1) Retrieve (embedding similarity)
base_retriever = index.as_retriever(similarity_top_k=10)
initial_nodes = base_retriever.retrieve("As shown in figure 2, the semantic chunking algorithm works by first splitting")

# 2) Preserve similarity scores
for n in initial_nodes:
    n.node.metadata["similarity_score"] = n.score

# 3) Rerank
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-6-v2",
    top_n=3,
)

reranked_nodes = reranker.postprocess_nodes(initial_nodes, query_str="As shown in figure 2, the semantic chunking algorithm works by first splitting")

# 4) Print both scores
for n in reranked_nodes:
    print(f"Rerank score: {n.score:.4f}")
    print(f"Similarity score: {n.node.metadata['similarity_score']:.4f}")
    print("Text:", n.text)
    print("-" * 50)


Rerank score: 8.9718
Similarity score: 0.6959
Text: As shown in figure 2, the semantic chunking algorithm works by first splitting the input text into individual sentences, then encoding each sentence into a vector using a pre-trained language model. It calculates the cosine similarity between each sentence and the current chunk to determine semantic closeness. If the similarity is high, the sentence is grouped with the current chunk; otherwise, a new chunk is started. This results in contextually meaningful groups of sentences. These chunks can then be used for tasks like entity extraction, summarization, and building knowledge graphs, enabling structured understanding of long, unstructured text.
--------------------------------------------------
Rerank score: 0.3592
Similarity score: 0.6069
Text: The Semantic chunking implementation focuses on incorporating a semantic chunking methodology into a lightweight Graph RAG framework, due to its adaptability and reduced computational requir

In [70]:
# gives you relative relevance probabilities for a query


scores = np.array([n.score for n in reranked_nodes])

# temperature controls sharpness (lower = more confident)
temperature = 1.0
exp_scores = np.exp(scores / temperature)
probs = exp_scores / exp_scores.sum()

for n, p in zip(reranked_nodes, probs):
    print(f"prob={p:.3f}  rerank_score={n.score:.3f}")

prob=1.000  rerank_score=8.972
prob=0.000  rerank_score=0.359
prob=0.000  rerank_score=-0.131


# Resources

- https://developers.llamaindex.ai/python/framework/integrations/vector_stores/qdrant_hybrid/