In [3]:
import numpy as np
import os
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.retrievers.fusion_retriever import QueryFusionRetriever
from dotenv import load_dotenv
from llama_index.core.prompts import PromptTemplate
from llama_index.core.response_synthesizers import get_response_synthesizer
from llama_index.llms.openai import OpenAI
from llama_index.core.settings import Settings

from litellm import completion

In [2]:
load_dotenv()

True

# Retrieval

In [9]:
dense_retriever = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    # Optional but recommended:
    normalize=True,       # cosine similarity friendly
    embed_batch_size=32,  # tune for your hardware
)

Settings.embed_model = dense_retriever

In [10]:
client = QdrantClient(host="localhost", port=6333)
aclient = AsyncQdrantClient(host="localhost", port=6333)

vector_store = QdrantVectorStore(
    collection_name="my_collection",
    client=client,
    aclient=aclient,
    enable_hybrid=True,
    batch_size=20,  # controls sparse batch processing
    fastembed_sparse_model="prithivida/Splade_PP_en_v1",
)

In [11]:
from fastembed import SparseTextEmbedding
for i in SparseTextEmbedding.list_supported_models():
    print(i['model'])


prithivida/Splade_PP_en_v1
prithvida/Splade_PP_en_v1
Qdrant/bm42-all-minilm-l6-v2-attentions
Qdrant/bm25
Qdrant/minicoil-v1


In [12]:
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-6-v2",
    top_n=3,
)

In [13]:
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=dense_retriever,
)

In [16]:
# 1) Retrieve (embedding similarity)
base_retriever = index.as_retriever(similarity_top_k=10, sparse_top_k=10, vector_store_query_mode="hybrid", hybrid_top_k=7)
initial_nodes = base_retriever.retrieve("As shown in figure 2, the semantic chunking algorithm works by first splitting")

# 2) Preserve similarity scores
for n in initial_nodes:
    n.node.metadata["similarity_score"] = n.score

print(len(initial_nodes))
print(initial_nodes)

7
[NodeWithScore(node=TextNode(id_='6d735565-ae17-41e7-b949-ac0f0f5d761d', embedding=None, metadata={'uuid': 'ee95abd1-30ab-4f7e-be4c-3250076dcd69', 'file_path': 'data/processed/docling/2025.icnlsp-1.15.json', 'chunk_index': 0, 'section_path': ['4 Recursive Semantic Chunking'], 'section_refs': ['#/texts/35'], 'page_no': 3, 'tokens': 59, 'similarity_score': 0.7368821230294198}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='This section presents the Recursive Semantic Chunking framework in detail. The primary objective is to ensure the splitting of chunks is semantically coherent and maintains the integrity of the content. In addition, the size of the chunks should be optimal. The standard semantic chunking technique tends to generate large chunks, which', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}')

In [17]:
# 3) Rerank
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-6-v2",
    top_n=3,
)

reranked_nodes = reranker.postprocess_nodes(initial_nodes, query_str="As shown in figure 2, the semantic chunking algorithm works by first splitting")

# 4) Print both scores
for n in reranked_nodes:
    print(f"Rerank score: {n.score:.4f}")
    print(f"Similarity score: {n.node.metadata['similarity_score']:.4f}")
    print("Text:", n.text)
    print("-" * 50)


Rerank score: 7.7959
Similarity score: 0.5000
Text: As shown in figure 2, the semantic chunking algorithm works by first splitting the input text into individual sentences, then encoding each sentence into a vector using a pre-trained language model. It calculates the cosine similarity between each sentence and the current chunk to determine semantic closeness. If the similarity is high, the sentence is grouped with the current chunk; otherwise, a new chunk is started. This results in contextually meaningful groups of sentences. These chunks can then be used for tasks like entity extraction, summarization, and building knowledge graphs, enabling structured understanding of long, unstructured text.
--------------------------------------------------
Rerank score: 3.7247
Similarity score: 0.6033
Text: Although the recursive text split tends to keep the chunks semantically closed together, it does not directly account for semantic meaning. Conversely, semantic chunking (LangChain, 2024) gr

In [70]:
# gives you relative relevance probabilities for a query


scores = np.array([n.score for n in reranked_nodes])

# temperature controls sharpness (lower = more confident)
temperature = 1.0
exp_scores = np.exp(scores / temperature)
probs = exp_scores / exp_scores.sum()

for n, p in zip(reranked_nodes, probs):
    print(f"prob={p:.3f}  rerank_score={n.score:.3f}")

prob=1.000  rerank_score=8.972
prob=0.000  rerank_score=0.359
prob=0.000  rerank_score=-0.131


# Generation

In [72]:
reranked_nodes

[NodeWithScore(node=TextNode(id_='59bbfa5d-3076-4b3a-834d-9bd9d0ad663a', embedding=None, metadata={'uuid': '0e2453e3-c73e-4b19-8409-8ebd37158677', 'section_path': ['3 Methodology', '3.2 SemRAG', '3.2.3 Retrieval'], 'section_refs': ['#/texts/37', '#/texts/40', '#/texts/89'], 'page_no': 6, 'char_len': 654, 'similarity_score': 0.69592637}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e9ae230f-e254-458b-9b24-6ef3cc7de0af', node_type='4', metadata={'uuid': '0e2453e3-c73e-4b19-8409-8ebd37158677', 'section_path': ['3 Methodology', '3.2 SemRAG', '3.2.3 Retrieval'], 'section_refs': ['#/texts/37', '#/texts/40', '#/texts/89'], 'page_no': 6, 'char_len': 654}, hash='1b5eae9341ef355ccd2be15a53a0b0250dcba53075986e63ce200055e934e34e')}, metadata_template='{key}: {value}', metadata_separator='\n', text='As shown in figure 2, the semantic chunking algorithm works by first splitting the input text into individual s

In [100]:
def format_context(nodes):
    formatted = []
    for i, node in enumerate(nodes, start=1):
        formatted.append(
            f"""Source {i}: {node.text}"""
        )
    return "\n\n".join(formatted)


In [98]:
chunks_str = format_chunks(reranked_nodes)
chunks_str

'Chunk 1: As shown in figure 2, the semantic chunking algorithm works by first splitting the input text into individual sentences, then encoding each sentence into a vector using a pre-trained language model. It calculates the cosine similarity between each sentence and the current chunk to determine semantic closeness. If the similarity is high, the sentence is grouped with the current chunk; otherwise, a new chunk is started. This results in contextually meaningful groups of sentences. These chunks can then be used for tasks like entity extraction, summarization, and building knowledge graphs, enabling structured understanding of long, unstructured text.\n\nChunk 2: The Semantic chunking implementation focuses on incorporating a semantic chunking methodology into a lightweight Graph RAG framework, due to its adaptability and reduced computational requirements. Efforts to integrate this approach into a more resource-intensive platform proved challenging, prompting a shift to a simpler

In [105]:
SYSTEM_PROMPT = """You are a RAG-based assistant.
Use ONLY the provided context to answer the question.
If the answer is not contained in the context, say:
"I don’t know based on the provided information."
Cite sources using [Source X] notation.
Be concise and factual.
"""


USER_PROMPT_TEMPLATE = """Context:
{context}

Question:
{question}
"""



def generate_answer(
    question: str,
    retrieved_chunks: list[str],
    system_prompt: str = SYSTEM_PROMPT,
    user_prompt_template: str = USER_PROMPT_TEMPLATE,
) -> str:
    context = format_context(retrieved_chunks)

    user_prompt = user_prompt_template.format(
        context=context,
        question=question,
    )

    response = completion(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2, # low temp for grounded answers
        max_tokens=300,
    )

    return response

In [106]:
query = "What does figure 2 show?"

answer = generate_answer(query, reranked_nodes)

In [107]:
print(answer.choices[0].message.content)

Figure 2 shows how the semantic chunking algorithm works by splitting input text into individual sentences, encoding each sentence into a vector using a pre-trained language model, and calculating cosine similarity to determine semantic closeness for grouping sentences into meaningful chunks. These chunks can be used for various tasks like entity extraction and summarization, enabling a structured understanding of long, unstructured text [Source 1].


In [104]:
answer

ModelResponse(id='chatcmpl-D570HN9CFjYNvdLuTJipCwGCYOr3m', created=1770110933, model='gpt-4o-mini-2024-07-18', object='chat.completion', system_fingerprint='fp_6b23b3aa8a', choices=[Choices(finish_reason='stop', index=0, message=Message(content='Figure 2 shows how the semantic chunking algorithm works, which involves splitting the input text into individual sentences, encoding each sentence into a vector using a pre-trained language model, and calculating the cosine similarity between each sentence and the current chunk to determine semantic closeness. It illustrates the process of grouping sentences into contextually meaningful chunks based on their semantic similarity. [Source 1]', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), provider_specific_fields={})], usage=Usage(completion_tokens=76, prompt_tokens=396, total_tokens=472, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, au

In [88]:
metadata = {
    "request_time": "2026-02-03",
    "user_role": "risk_analyst",
    "top_k": len(reranked_nodes)
}

metadata_str = "\n".join(
    f"{k}: {v}" for k, v in metadata.items()
)

metadata_str

'request_time: 2026-02-03\nuser_role: risk_analyst\ntop_k: 3'

# Resources

- https://developers.llamaindex.ai/python/framework/integrations/vector_stores/qdrant_hybrid/