# Semantic Chunking

- Semantic Chunker is a document splitter that uses embedding similarity between sentences to decide chunk boundaries.

- It ensures that each chunk is semantically coherent and not cut off mid-thought, as can happen with traditional character/token splitters.

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:


model = SentenceTransformer('all-MiniLM-L6-v2')

## Sample text: three sentences about LangChain followed by two about Paris/France
text = """Langchain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agent, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

## Step 1: Split into sentences (one per line; lines that are empty after stripping are dropped)
sentences = [stripped for stripped in map(str.strip, text.split('\n')) if stripped]

## Step 2: Embed each sentence
embeddings = model.encode(sentences)

## Step 3: Initialize parameters
threshold = 0.7  # minimum similarity to the previous sentence required to stay in the same chunk
chunks = []
current_chunk = [sentences[0]]

## Step 4: Semantic chunking based on threshold
# Walk over consecutive sentence pairs: each sentence is compared only with the
# one immediately before it, never with the chunk as a whole.
for sentence, current_vec, previous_vec in zip(sentences[1:], embeddings[1:], embeddings[:-1]):
    similarity = cosine_similarity([current_vec], [previous_vec])[0][0]
    if similarity >= threshold:
        # Similar enough to its predecessor: keep growing the open chunk.
        current_chunk.append(sentence)
    else:
        # Topic shift: close the open chunk and start a new one with this sentence.
        chunks.append(' '.join(current_chunk))
        current_chunk = [sentence]

# The loop only closes a chunk when a new one starts, so flush the last one here.
chunks.append(' '.join(current_chunk))

## Output the chunks
print("Semantic Chunks:")
for number, chunk in enumerate(chunks, start=1):
    print(f"Chunk {number}: {chunk}")

Semantic Chunks:
Chunk 1: Langchain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
Chunk 2: You can create chains, agent, memory, and retrievers.
Chunk 3: The Eiffel Tower is located in Paris.
Chunk 4: France is a popular tourist destination.


In [7]:
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.chat_models import init_chat_model
from langchain_classic.schema.runnable import RunnablePassthrough, RunnableMap
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv
# Load variables from a .env file into the environment; the recorded output
# below (True) is this call's return value.
# NOTE(review): presumably this supplies the API key for the chat model created
# later in the notebook -- confirm which variables the .env file must define.
load_dotenv()

True

In [13]:
## Custom Semantic Chunker with Threshold

class ThresholdSemanticChunker:
    """Group consecutive sentences into chunks by embedding similarity.

    A sentence joins the current chunk when the cosine similarity between its
    embedding and the embedding of the sentence immediately before it is at
    least ``threshold``; otherwise it starts a new chunk.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', threshold=0.7):
        """Create the chunker.

        Args:
            model_name: Name passed to ``HuggingFaceEmbeddings`` to select the
                sentence-embedding model.
            threshold: Minimum cosine similarity between adjacent sentences for
                them to stay in the same chunk. Higher values give smaller,
                tighter chunks.
        """
        self.model = HuggingFaceEmbeddings(model_name=model_name)
        self.threshold = threshold

    def split(self, text: str):
        """Split ``text`` into a list of chunk strings.

        Args:
            text: The raw text to chunk.

        Returns:
            A list of strings, each the space-joined sentences of one chunk.
            Returns an empty list when ``text`` contains no sentences (empty,
            whitespace-only, or made up only of periods).
        """
        # Naive sentence segmentation: split on '.', which also drops the
        # periods themselves, so joined chunks carry no sentence punctuation.
        sentences = [s.strip() for s in text.split('.') if s.strip()]

        # Nothing to embed or chunk; without this guard `sentences[0]` below
        # would raise IndexError.
        if not sentences:
            return []

        embeddings = self.model.embed_documents(sentences)

        chunks = []
        current_chunk = [sentences[0]]

        for i in range(1, len(sentences)):
            # Compare each sentence only with its direct predecessor.
            sim = cosine_similarity([embeddings[i]], [embeddings[i-1]])[0][0]
            if sim >= self.threshold:
                current_chunk.append(sentences[i])
            else:
                # Similarity dropped below the threshold: close the chunk.
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentences[i]]

        # Flush the chunk that was still open when the loop ended.
        chunks.append(' '.join(current_chunk))
        return chunks

    def split_documents(self, documents):
        """Chunk every document and wrap each chunk in a new ``Document``.

        Args:
            documents: Iterable of objects exposing ``page_content`` (str) and
                ``metadata`` (dict), e.g. LangChain ``Document`` instances.

        Returns:
            A flat list of ``Document`` objects, in input order, one per chunk.
            Each chunk gets a copy of its source document's metadata.
        """
        result = []
        for doc in documents:
            for chunk in self.split(doc.page_content):
                # Shallow-copy the metadata so chunks do not share one dict
                # with each other or with the source document.
                result.append(Document(page_content=chunk, metadata=dict(doc.metadata)))
        return result

In [9]:
## Sample text: one sentence per line, each line ending in a newline
text = (
    "Langchain is a framework for building applications with LLMs.\n"
    "Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\n"
    "You can create chains, agent, memory, and retrievers.\n"
    "The Eiffel Tower is located in Paris.\n"
    "France is a popular tourist destination.\n"
)

# Wrap the raw text in a Document (no metadata) so it can go through split_documents.
doc = Document(metadata={}, page_content=text)
doc

Document(metadata={}, page_content='Langchain is a framework for building applications with LLMs.\nLangchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agent, memory, and retrievers.\nThe Eiffel Tower is located in Paris.\nFrance is a popular tourist destination.\n')

In [14]:
# Use a looser similarity threshold (0.6) than the class default (0.7).
chunker = ThresholdSemanticChunker(threshold=0.6)
# NOTE: this rebinds `chunks`; it now holds Document objects rather than the
# plain strings produced by the first chunking cell.
chunks = chunker.split_documents([doc])
chunks

[Document(metadata={}, page_content='Langchain is a framework for building applications with LLMs Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone'),
 Document(metadata={}, page_content='You can create chains, agent, memory, and retrievers'),
 Document(metadata={}, page_content='The Eiffel Tower is located in Paris'),
 Document(metadata={}, page_content='France is a popular tourist destination')]

In [15]:
# NOTE: this rebinds `embeddings`; in the first chunking cell it held the
# sentence vectors, here it is the embedding model object. The model name is
# the same one ThresholdSemanticChunker uses by default.
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
# Embed the chunk Documents and index them in a FAISS vector store.
vectorstore = FAISS.from_documents(chunks, embeddings)
# Retriever with default search settings.
# NOTE(review): with only four chunks indexed, the default number of results
# may return every chunk for any query -- confirm and pass search_kwargs if needed.
retriever = vectorstore.as_retriever()

In [16]:
## Prompt Template

# Two placeholders are filled in at run time: {context} (the retrieved
# material) and {question} (the user's question).
template = """Answer the question based on the following context:

{context}

Question: {question}
"""

# Build the prompt from the template string; the displayed repr lists
# 'context' and 'question' as its input variables.
prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\n')

In [20]:
llm = init_chat_model("groq:openai/gpt-oss-20b")

### LCEL chain with retrieval

def format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt's {context} slot is filled with the repr of a
    list of Document objects (metadata and all) instead of the chunk text.
    """
    return "\n\n".join(retrieved.page_content for retrieved in retrieved_docs)


rag_lcel_chain = (
    # Build the prompt inputs from the incoming {"question": ...} dict:
    # retrieve chunks for the question and pass the question through unchanged.
    RunnableMap({
        "context": lambda x: format_docs(retriever.invoke(x["question"])),
        "question": lambda x: x["question"],
    })
    | prompt
    | llm
    | StrOutputParser()  # reduce the chat model's message to a plain string
)

# Run the RAG chain
query = {"question": "What is Langchain used for?"}
response = rag_lcel_chain.invoke(query)

print("Response:")
print(response)

Response:
Langchain is a framework for building applications that use large language models (LLMs). It offers modular abstractions—such as chains, agents, memory, and retrievers—to combine LLMs with tools like OpenAI, Pinecone, and other services.
