In [None]:
#!pip install sentence_transformers
#!pip install langchain_community
#!pip install pypdf
#!pip install langchain-experimental
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    MarkdownHeaderTextSplitter,
    TokenTextSplitter
)
import os
import nltk
import pypdf
from nltk import sent_tokenize
from sentence_transformers import SentenceTransformer, util

# Chunking Methods for RAG Pipelines
---
This notebook demonstrates several methods to split text documents into useful chunks for retrieval-augmented generation (RAG) using LangChain and related libraries.

## 1. Sample Text

In [None]:
sample_text = """# Introduction
Welcome to our demo on text chunking! This section introduces chunking methods and explains why splitting long documents into smaller pieces is critical for efficient retrieval and generation.

## What is Chunking?
Chunking is the process of dividing text into manageable segments.
- Character-based chunking splits purely by length.
- Recursive chunking uses natural text breaks, like paragraphs or sentences.
- Semantic chunking finds topic changes.

Here is a list:
1. Fast and easy: character-based.
2. Natural: recursive or markdown-based.
3. Context-aware: semantic.

## An Example with a Superlongword
Sometimes, data includes strange artifacts like:
ThisIsASingleUnbreakableSupercalifragilisticexpialidociousWordThatExceedsTheChunkSizeLimitAndCausesTroubleForSplitters.

## Topic Change: Semantic Matters
Chunkers that consider **meaning** will split here, as the topic shifts from chunking methods to why semantics matter.
Semantic chunking is especially useful when there are clear boundaries in ideas or narrative flow, even if there's no line break or heading.

In summary, choose your chunker based on your data and task! """

## 2. Different Chunking Methods
### 2.1 Character Splitter
Splits text into fixed-length character chunks (with optional overlap).

In [None]:
char_splitter = CharacterTextSplitter(
    separator="",
    chunk_size=200,
    chunk_overlap=0
)
char_chunks = char_splitter.split_text(sample_text)
print(f"Total chunks (character splitter): {len(char_chunks)}")

Total chunks (character splitter): 6


In [None]:
for i, chunk in enumerate(char_chunks, 1):
    print(f"Chunk {i} (length: {len(chunk)}):")
    print(chunk)
    print("-" * 40 + "\n")

Chunk 1 (length: 200):
# Introduction
Welcome to our demo on text chunking! This section introduces chunking methods and explains why splitting long documents into smaller pieces is critical for efficient retrieval and gene
----------------------------------------

Chunk 2 (length: 200):
ration.

## What is Chunking?
Chunking is the process of dividing text into manageable segments. 
- Character-based chunking splits purely by length.
- Recursive chunking uses natural text breaks, lik
----------------------------------------

Chunk 3 (length: 200):
e paragraphs or sentences.
- Semantic chunking finds topic changes.

Here is a list:
1. Fast and easy: character-based.
2. Natural: recursive or markdown-based.
3. Context-aware: semantic.

## An Exam
----------------------------------------

Chunk 4 (length: 200):
ple with a Superlongword
Sometimes, data includes strange artifacts like: 
ThisIsASingleUnbreakableSupercalifragilisticexpialidociousWordThatExceedsTheChunkSizeLimitAndCausesTroub

### 2.2 Recursive Splitter
Attempts to split text by different logical separators (e.g., paragraphs, sentences) for more "natural" chunks.

In [None]:
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0,
    separators=["\n\n", "\n", ".", "!", "?", " "]
)
recursive_chunks = recursive_splitter.split_text(sample_text)
print(f"Total chunks (recursive splitter): {len(recursive_chunks)}")

Total chunks (recursive splitter): 10


In [None]:
for i, chunk in enumerate(recursive_chunks, 1):
    print(f"Chunk {i} (length: {len(chunk)}):")
    print(chunk)
    print("-" * 40 + "\n")

Chunk 1 (length: 14):
# Introduction
----------------------------------------

Chunk 2 (length: 192):
Welcome to our demo on text chunking! This section introduces chunking methods and explains why splitting long documents into smaller pieces is critical for efficient retrieval and generation.
----------------------------------------

Chunk 3 (length: 140):
## What is Chunking?
Chunking is the process of dividing text into manageable segments. 
- Character-based chunking splits purely by length.
----------------------------------------

Chunk 4 (length: 117):
- Recursive chunking uses natural text breaks, like paragraphs or sentences.
- Semantic chunking finds topic changes.
----------------------------------------

Chunk 5 (length: 119):
Here is a list:
1. Fast and easy: character-based.
2. Natural: recursive or markdown-based.
3. Context-aware: semantic.
----------------------------------------

Chunk 6 (length: 83):
## An Example with a Superlongword
Sometimes, data includes strange

### 2.3 Semantic Chunking
Chunking based on **semantic similarity**—splitting where the topic shifts rather than at fixed lengths.  
We'll use `sentence-transformers` to embed sentences and split at points with high semantic "distance".

In [None]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [None]:
emantic_chunker = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile", breakpoint_threshold_amount=70)
semantic_chunks = semantic_chunker.split_text(sample_text)

print(f"Total chunks: {len(semantic_chunks)}\n")
for i, chunk in enumerate(semantic_chunks, 1):
    print(f"Chunk {i} (length: {len(chunk)}):")
    print(chunk)
    print("-" * 40 + "\n")

Total chunks: 2

Chunk 1 (length: 484):
# Introduction
Welcome to our demo on text chunking! This section introduces chunking methods and explains why splitting long documents into smaller pieces is critical for efficient retrieval and generation. ## What is Chunking? Chunking is the process of dividing text into manageable segments. - Character-based chunking splits purely by length. - Recursive chunking uses natural text breaks, like paragraphs or sentences. - Semantic chunking finds topic changes. Here is a list:
1.
----------------------------------------

Chunk 2 (length: 661):
Fast and easy: character-based. 2. Natural: recursive or markdown-based. 3. Context-aware: semantic. ## An Example with a Superlongword
Sometimes, data includes strange artifacts like: 
ThisIsASingleUnbreakableSupercalifragilisticexpialidociousWordThatExceedsTheChunkSizeLimitAndCausesTroubleForSplitters. ## Topic Change: Semantic Matters
Chunkers that consider **meaning** will split here, as the topic shift

### 2.4 Semantic Chunking with Chunk Size Controlled
Using the same logic, but


*   Merge small chunks to the previous one
*   Set up the max chunk size



In [None]:
embedder = SentenceTransformer("all-mpnet-base-v2")

In [None]:
def semantic_chunking(text, max_chunk_size=300, min_chunk_size=100, sim_threshold=0.7):
    """
    Splits text into semantic chunks, then merges small chunks (<min_chunk_size) to the previous,
    and ensures no chunk exceeds max_chunk_size.
    """
    model = embedder
    sentences = sent_tokenize(text)
    sentence_embeddings = model.encode(sentences)

    chunks = []
    current_chunk = sentences[0]
    current_length = len(sentences[0])

    for i in range(1, len(sentences)):
        # Calculate similarity to previous sentence
        similarity = float(util.cos_sim(sentence_embeddings[i], sentence_embeddings[i-1]))
        # If semantic similarity is low OR adding would exceed max chunk size, start new chunk
        if similarity < sim_threshold or current_length + len(sentences[i]) > max_chunk_size:
            # Merge to previous chunk if too small
            if len(current_chunk) < min_chunk_size and chunks:
                chunks[-1] += " " + current_chunk
            else:
                chunks.append(current_chunk)
            current_chunk = sentences[i]
            current_length = len(sentences[i])
        else:
            current_chunk += " " + sentences[i]
            current_length += len(sentences[i])
    # Append the last chunk
    if len(current_chunk) < min_chunk_size and chunks:
        chunks[-1] += " " + current_chunk
    else:
        chunks.append(current_chunk)
    return chunks

In [67]:
semantic_chunks_2 = semantic_chunking(sample_text, max_chunk_size=300, min_chunk_size=100, sim_threshold=0.7)
print(f"Total chunks: {len(sem_chunks)}\n")
for i, chunk in enumerate(sem_chunks, 1):
    print(f"Chunk {i} (length: {len(chunk)}):")
    print("-" * 40)
    print(chunk)
    print("-" * 40 + "\n")

Total chunks: 5

Chunk 1 (length: 52):
----------------------------------------
# Introduction
Welcome to our demo on text chunking!
----------------------------------------

Chunk 2 (length: 532):
----------------------------------------
This section introduces chunking methods and explains why splitting long documents into smaller pieces is critical for efficient retrieval and generation. ## What is Chunking? Chunking is the process of dividing text into manageable segments. - Character-based chunking splits purely by length. - Recursive chunking uses natural text breaks, like paragraphs or sentences. - Semantic chunking finds topic changes. Here is a list:
1. Fast and easy: character-based. 2. Natural: recursive or markdown-based. 3. Context-aware: semantic.
----------------------------------------

Chunk 3 (length: 204):
----------------------------------------
## An Example with a Superlongword
Sometimes, data includes strange artifacts like: 
ThisIsASingleUnbreakableSupercalifrag