### RAG - Document Splitters

##### Boilerplate code

In [None]:
import langchain
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

load_dotenv()

google_api_key = os.getenv("GOOGLE_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

google_llm = ChatGoogleGenerativeAI(
    temperature=0, 
    model="gemini-2.0-flash", 
    api_key=google_api_key,
    max_tokens=200
)

openai_llm = ChatOpenAI(
    temperature=0, 
    model="gpt-4", 
    api_key=openai_api_key
)

##### TextLoader

In [81]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader('./docs_for_rag/coolie_large.txt')

text_documents = loader.load()

for document in text_documents:
    print(document)

# text_documents

page_content='Devaraj “Deva” runs a boarding house where he takes care of his late friend Rajasekhar’s daughter Preethi and although Rajasekhar’s sudden death is officially blamed on a heart attack Deva immediately suspects foul play and begins to investigate uncovering a dangerous criminal syndicate led by Simon and his men Dayalan (Dayal) and Kalyani who not only smuggle gold and luxury goods but also secretly kill people using a special cremation-chair device to dispose of bodies with Dayal even murdering an undercover policeman disguised as a coolie while Preethi who knows how the device works becomes a direct target of their sinister operations forcing Deva to step in to protect her gradually revealing shocking truths about the gang including Kalyani’s hidden identity Preethi’s actual relation as Deva’s daughter and Simon’s past connections to Deva’s own history recalling that years ago Deva was a union leader in Mandwa leading coolies against exploitation and that Simon is the so

## Splitting
### Length based splitting
- Token based
- Character based

##### Token based splitting

In [84]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base", 
    chunk_size=200, 
    chunk_overlap=0)

# cl100k_base → GPT-3.5-turbo, GPT-4, GPT-4-turbo
# p50k_base → Codex models, text-davinci-002/003, -> gpt 3 family
# r50k_base → GPT-3 models (davinci, curie, etc.)

texts = text_splitter.split_documents(text_documents)

print(f"Length: {len(texts)}")
texts

Created a chunk of size 222, which is longer than the specified 200


Length: 2


[Document(metadata={'source': './docs_for_rag/coolie_large.txt'}, page_content='Devaraj “Deva” runs a boarding house where he takes care of his late friend Rajasekhar’s daughter Preethi and although Rajasekhar’s sudden death is officially blamed on a heart attack Deva immediately suspects foul play and begins to investigate uncovering a dangerous criminal syndicate led by Simon and his men Dayalan (Dayal) and Kalyani who not only smuggle gold and luxury goods but also secretly kill people using a special cremation-chair device to dispose of bodies with Dayal even murdering an undercover policeman disguised as a coolie while Preethi who knows how the device works becomes a direct target of their sinister operations forcing Deva to step in to protect her gradually revealing shocking truths about the gang including Kalyani’s hidden identity Preethi’s actual relation as Deva’s daughter and Simon’s past connections to Deva’s own history recalling that years ago Deva was a union leader in Ma

In [85]:
from langchain_text_splitters import TokenTextSplitter

text_splitter = TokenTextSplitter(encoding_name="cl100k_base", chunk_size=200, chunk_overlap=30)

texts = text_splitter.split_documents(text_documents)

print(f"Length: {len(texts)}")
texts

Length: 2


[Document(metadata={'source': './docs_for_rag/coolie_large.txt'}, page_content='Devaraj “Deva” runs a boarding house where he takes care of his late friend Rajasekhar’s daughter Preethi and although Rajasekhar’s sudden death is officially blamed on a heart attack Deva immediately suspects foul play and begins to investigate uncovering a dangerous criminal syndicate led by Simon and his men Dayalan (Dayal) and Kalyani who not only smuggle gold and luxury goods but also secretly kill people using a special cremation-chair device to dispose of bodies with Dayal even murdering an undercover policeman disguised as a coolie while Preethi who knows how the device works becomes a direct target of their sinister operations forcing Deva to step in to protect her gradually revealing shocking truths about the gang including Kalyani’s hidden identity Preethi’s actual relation as Deva’s daughter and Simon’s past connections to Deva’s own history recalling that years ago Deva was a union leader in Ma

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base", 
    chunk_size=200, chunk_overlap=30)

texts = text_splitter.split_documents(text_documents)

print(f"Length: {len(texts)}")
texts

Length: 3


[Document(metadata={'source': './docs_for_rag/coolie_large.txt'}, page_content='Devaraj “Deva” runs a boarding house where he takes care of his late friend Rajasekhar’s daughter Preethi and although Rajasekhar’s sudden death is officially blamed on a heart attack Deva immediately suspects foul play and begins to investigate uncovering a dangerous criminal syndicate led by Simon and his men Dayalan (Dayal) and Kalyani who not only smuggle gold and luxury goods but also secretly kill people using a special cremation-chair device to dispose of bodies with Dayal even murdering an undercover policeman disguised as a coolie while Preethi who knows how the device works becomes a direct target of their sinister operations forcing Deva to step in to protect her gradually revealing shocking truths about the gang including Kalyani’s hidden identity Preethi’s actual relation as Deva’s daughter and Simon’s past connections to Deva’s own history recalling that years ago Deva was a union leader in Ma

In [None]:
import tiktoken

# This works without any OpenAI API key
encoding = tiktoken.get_encoding("cl100k_base")
tokens = encoding.encode("Hello world!")
print(f"Tokens: {tokens}")
print(f"Token count: {len(tokens)}")

Tokens: [9906, 1917, 0]
Token count: 3


##### Character based splitting

In [89]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=30)

texts = text_splitter.split_documents(text_documents)

print(f"Length: {len(texts)}")
texts

Length: 11


[Document(metadata={'source': './docs_for_rag/coolie_large.txt'}, page_content='Devaraj “Deva” runs a boarding house where he takes care of his late friend Rajasekhar’s daughter Preethi and although Rajasekhar’s sudden death is officially blamed on a heart attack Deva immediately'),
 Document(metadata={'source': './docs_for_rag/coolie_large.txt'}, page_content='heart attack Deva immediately suspects foul play and begins to investigate uncovering a dangerous criminal syndicate led by Simon and his men Dayalan (Dayal) and Kalyani who not only smuggle gold and'),
 Document(metadata={'source': './docs_for_rag/coolie_large.txt'}, page_content='who not only smuggle gold and luxury goods but also secretly kill people using a special cremation-chair device to dispose of bodies with Dayal even murdering an undercover policeman disguised as a'),
 Document(metadata={'source': './docs_for_rag/coolie_large.txt'}, page_content='policeman disguised as a coolie while Preethi who knows how the device w

### Document based splitting
- MD ```.split_text()```
- JSON ```.split_json()```
- HTML ```.split_text()```
- Code ```.create_documents()```

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

markdown_document = """# LangChain  
## What is it?  
A framework to **build apps with LLMs** — think AI meets Lego blocks.  

### Core idea  
Combine **prompts**, **chains**, and **agents** to make smart workflows.  

#### Example  
`Translate: "Hello" → "Bonjour"`  
"""

headers_to_split_on = [
    ("#", "Primary header"),
    ("##", "Secondary header"),
    ("###", "Tertiary header"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on, 
    strip_headers=True, 
    return_each_line=True
    )
md_header_splits = markdown_splitter.split_text(markdown_document)

print(f"Length: {len(md_header_splits)}")
md_header_splits

for document in md_header_splits:
    print(f"Medatada: {document.metadata}")
    print(document.page_content, "\n")

Length: 2
Medatada: {'Primary header': 'LangChain', 'Secondary header': 'What is it?'}
A framework to **build apps with LLMs** — think AI meets Lego blocks. 

Medatada: {'Primary header': 'LangChain', 'Secondary header': 'What is it?', 'Tertiary header': 'Core idea'}
Combine **prompts**, **chains**, and **agents** to make smart workflows.  
#### Example
`Translate: "Hello" → "Bonjour"` 



##### Adding RecursiveCharacterTextSplitter on top of MarkdownHeaderTextSplitter

In [93]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)

text_documents = text_splitter.split_documents(md_header_splits)
print(f"Length: {len(text_documents)}")
text_documents

Length: 8


[Document(metadata={'Primary header': 'LangChain'}, page_content='# LangChain'),
 Document(metadata={'Primary header': 'LangChain', 'Secondary header': 'What is it?'}, page_content='## What is it?'),
 Document(metadata={'Primary header': 'LangChain', 'Secondary header': 'What is it?'}, page_content='A framework to **build apps with LLMs** — think'),
 Document(metadata={'Primary header': 'LangChain', 'Secondary header': 'What is it?'}, page_content='— think AI meets Lego blocks.'),
 Document(metadata={'Primary header': 'LangChain', 'Secondary header': 'What is it?', 'Tertiary header': 'Core idea'}, page_content='### Core idea'),
 Document(metadata={'Primary header': 'LangChain', 'Secondary header': 'What is it?', 'Tertiary header': 'Core idea'}, page_content='Combine **prompts**, **chains**, and **agents**'),
 Document(metadata={'Primary header': 'LangChain', 'Secondary header': 'What is it?', 'Tertiary header': 'Core idea'}, page_content='to make smart workflows.'),
 Document(metadata=

### RecursiveJsonSplitter
uses ```splitter.split_json()```
##### Example 1: Using Car JSON

In [102]:
import json
from langchain_text_splitters import RecursiveJsonSplitter
from pprint import pprint

json_data_1 = """{
  "cars":[
    {
      "make": "Maruti Suzuki",
      "model": "Baleno",
      "year": 2023,
      "price_in_inr": 900000,
      "fuel_type": "Petrol"
    },
    {
      "make": "Tata",
      "model": "Punch",
      "year": 2024,
      "price_in_inr": 750000,
      "fuel_type": "Petrol"
    },
    {
      "make": "Mahindra",
      "model": "XUV700",
      "year": 2023,
      "price_in_inr": 2000000,
      "fuel_type": "Diesel"
    }
]}"""


json_data_2 = """[
    {
      "make": "Maruti Suzuki",
      "model": "Baleno",
      "year": 2023,
      "price_in_inr": 900000,
      "fuel_type": "Petrol"
    },
    {
      "make": "Tata",
      "model": "Punch",
      "year": 2024,
      "price_in_inr": 750000,
      "fuel_type": "Petrol"
    },
    {
      "make": "Mahindra",
      "model": "XUV700",
      "year": 2023,
      "price_in_inr": 2000000,
      "fuel_type": "Diesel"
    }
]"""

json_data = json.loads(json_data_1)

splitter = RecursiveJsonSplitter(max_chunk_size=100)

# Recursively split json data - If you need to access/manipulate the smaller json chunks
json_chunks = splitter.split_json(json_data=json_data, convert_lists=True)
print(f"Length: {len(json_chunks)}")
json_chunks

# for chunk in json_chunks[:3]:
#     pprint(chunk)

Length: 5


[{'cars': {'0': {'make': 'Maruti Suzuki',
    'model': 'Baleno',
    'year': 2023,
    'price_in_inr': 900000}}},
 {'cars': {'0': {'fuel_type': 'Petrol'},
   '1': {'make': 'Tata', 'model': 'Punch', 'year': 2024}}},
 {'cars': {'1': {'price_in_inr': 750000, 'fuel_type': 'Petrol'}}},
 {'cars': {'2': {'make': 'Mahindra',
    'model': 'XUV700',
    'year': 2023,
    'price_in_inr': 2000000}}},
 {'cars': {'2': {'fuel_type': 'Diesel'}}}]

### RecursiveJsonSplitter
uses ```splitter.split_json()```
##### Example 2: Using Chennai city JSON

In [None]:
import json
from langchain_text_splitters import RecursiveJsonSplitter
from pprint import pprint

json_data = """{
    "city": "Chennai",
    "state": "Tamil Nadu",
    "country": "India",
    "population": 11484000,
    "area_sq_km": 426,
    "famous_for": [
        "Marina Beach",
        "Filter Coffee",
        "Classical Music",
        "Temples",
        "IT Industry"
    ],
    "climate": {
        "type": "Tropical Wet and Dry",
        "average_temperature_c": 30,
        "monsoon_months": ["October", "November", "December"]
    },
    "landmarks": [
        {
        "name": "Marina Beach",
        "type": "Beach",
        "description": "One of the longest urban beaches in the world."
        },
        {
        "name": "Kapaleeshwarar Temple",
        "type": "Temple",
        "description": "Historic temple dedicated to Lord Shiva in Mylapore."
        },
        {
        "name": "Fort St. George",
        "type": "Historical Site",
        "description": "Built by the British East India Company in 1644."
        }
    ],
    "language": ["Tamil", "English"],
    "transport": {
        "airport": "Chennai International Airport",
        "metro": true,
        "railway_stations": ["Chennai Central", "Egmore", "Tambaram"]
    },
    "timezone": "IST (UTC+5:30)"
}"""

json_data = json.loads(json_data)

splitter = RecursiveJsonSplitter(max_chunk_size=100)

# Recursively split json data - If you need to access/manipulate the smaller json chunks
json_chunks = splitter.split_json(json_data=json_data, convert_lists=True)

print(f"Length: {len(json_chunks)}")
json_chunks

# for chunk in json_chunks[:3]:
#     pprint(chunk)

Length: 8


[{'city': 'Chennai',
  'state': 'Tamil Nadu',
  'country': 'India',
  'population': 11484000},
 {'area_sq_km': 426,
  'famous_for': ['Marina Beach',
   'Filter Coffee',
   'Classical Music',
   'Temples',
   'IT Industry']},
 {'climate': {'type': 'Tropical Wet and Dry', 'average_temperature_c': 30}},
 {'climate': {'monsoon_months': ['October', 'November', 'December']}},
 {'landmarks': [{'name': 'Marina Beach',
    'type': 'Beach',
    'description': 'One of the longest urban beaches in the world.'},
   {'name': 'Kapaleeshwarar Temple',
    'type': 'Temple',
    'description': 'Historic temple dedicated to Lord Shiva in Mylapore.'},
   {'name': 'Fort St. George',
    'type': 'Historical Site',
    'description': 'Built by the British East India Company in 1644.'}]},
 {'language': ['Tamil', 'English'],
  'transport': {'airport': 'Chennai International Airport'}},
 {'transport': {'metro': True,
   'railway_stations': ['Chennai Central', 'Egmore', 'Tambaram']}},
 {'timezone': 'IST (UTC+5:3

### HTMLHeaderTextSplitter
uses ```splitter.split_text()```

In [106]:
from langchain_text_splitters import HTMLHeaderTextSplitter

html_data = """
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Chennai — City Snapshot</title>
</head>
<body>
  <header>
    <h1>Chennai — City Snapshot</h1>
    <p>Tamil Nadu's coastal capital — culture, coast, and filter coffee.</p>
  </header>

  <main>
    <section>
      <h2>Overview</h2>
      <p>
        <strong>City:</strong> Chennai<br>
        <strong>State:</strong> Tamil Nadu, India<br>
        <strong>Population:</strong> 11.48 million<br>
        <strong>Area:</strong> 426 km²<br>
        <strong>Timezone:</strong> IST (UTC+5:30)
      </p>
    </section>

    <section>
      <h2>Quick Facts</h2>
      <ul>
        <li>Famous for: Marina Beach, classical music, temples, filter coffee</li>
        <li>Climate: Tropical — hot and humid, monsoon in Oct–Dec</li>
        <li>Languages: Tamil (primary), English widely used</li>
        <li>Economy: IT, automobile, healthcare, port industries</li>
      </ul>
      <p>Transport: Chennai International Airport, suburban & long-distance rail hubs (Chennai Central, Egmore), Chennai Metro, buses, and a busy seaport.</p>
    </section>

    <section>
      <h2>Landmarks</h2>
      <ul>
        <li><strong>Marina Beach:</strong> One of the longest urban beaches in the world — sunrise walks and local snacks.</li>
        <li><strong>Kapaleeshwarar Temple:</strong> Iconic Dravidian-style temple in Mylapore, vibrant festivals.</li>
        <li><strong>Fort St. George:</strong> 17th-century British fort — museum and colonial history.</li>
      </ul>
    </section>

    <section>
      <h2>Culture & Food</h2>
      <p>
        Chennai is a hub for Carnatic music and classical dance (Bharatanatyam). Food highlights include idli, dosa, sambar, and strong filter coffee. Temple festivals, classical sabhas, and film culture are big here.
      </p>
    </section>

    <section>
      <h2>Visitor Tips</h2>
      <ol>
        <li>Carry light clothing and stay hydrated — it’s warm most of the year.</li>
        <li>Traffic can be heavy — plan extra travel time across the city.</li>
        <li>Try local snacks along Marina and explore Mylapore for temples & music.</li>
      </ol>
    </section>
  </main>

  <footer>
    <p>Snapshot generated for quick reference — not exhaustive. © Chennai</p>
  </footer>
</body>
</html>
"""

headers_to_split_on = [
    ("h1", "Primary header"),
    ("h2", "Secondary header"),
    ("h3", "Tertiary header")
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)

html_chunks = html_splitter.split_text(html_data)

print(f"Length of html splitter: {len(html_chunks)}")

# from pprint import pprint

for chunk in html_chunks:
    print(f"Metadata: {chunk.metadata}")
    print(f"{chunk.page_content}")
    print("\n")


### Further splitting HTML header chunks with RecursiveCharacterTextSplitter

print("FURTHER SPLITTING BASED ON CHARACTER", "\n\n\n")

from langchain_text_splitters import RecursiveCharacterTextSplitter

character_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
html_chunks_2 = character_splitter.split_documents(html_chunks)

print(f"Length of character splitter: {len(html_chunks_2)}")

for chunk in html_chunks_2:
    print(f"Metadata: {chunk.metadata}")
    print(f"{chunk.page_content}")
    print("\n")

Length of html splitter: 12
Metadata: {'Primary header': 'Chennai — City Snapshot'}
Chennai — City Snapshot


Metadata: {'Primary header': 'Chennai — City Snapshot'}
Tamil Nadu's coastal capital — culture, coast, and filter coffee.


Metadata: {'Primary header': 'Chennai — City Snapshot', 'Secondary header': 'Overview'}
Overview


Metadata: {'Primary header': 'Chennai — City Snapshot', 'Secondary header': 'Overview'}
Chennai Tamil Nadu, India 11.48 million 426 km² IST (UTC+5:30)  
City:  
State:  
Population:  
Area:  
Timezone:


Metadata: {'Primary header': 'Chennai — City Snapshot', 'Secondary header': 'Quick Facts'}
Quick Facts


Metadata: {'Primary header': 'Chennai — City Snapshot', 'Secondary header': 'Quick Facts'}
Famous for: Marina Beach, classical music, temples, filter coffee  
Climate: Tropical — hot and humid, monsoon in Oct–Dec  
Languages: Tamil (primary), English widely used  
Economy: IT, automobile, healthcare, port industries  
Transport: Chennai International Airpo

### Coding language splitters
uses ```splitter.create_documents()```
##### Using Language

In [None]:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# for language in Language:
#     print(language.value)

# print(*[lang.value for lang in Language], sep="\n")

sample_python_code = """
    def is_palindrome(s):
        s = s.lower().replace(" ", "")  # ignore case and spaces
        return s == s[::-1]             # reverse string and compare

    # Test
    words = ["level", "Racecar", "hello", "madam"]
    for word in words:
        print(word, "->", is_palindrome(word))
"""

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, 
    chunk_size=200, 
    chunk_overlap=30)

python_chunks = python_splitter.create_documents([sample_python_code],[{"file": "palindrome.py"}])

python_chunks

# print(f"Length of python chunks: {len(python_chunks)}")
# for chunk in python_chunks:
#     print(f"Metadata: {chunk.metadata}")
#     print(f"{chunk.page_content}")
#     print("\n")

# python_chunks = python_splitter.split_text(sample_python_code)

# print(f"Length of python chunks: {len(python_chunks)}")
# for chunk in python_chunks:
#     print(chunk)
#     print("\n")

Length of python chunks: 2
Metadata: {'file': 'palindrome.py'}
def is_palindrome(s):
        s = s.lower().replace(" ", "")  # ignore case and spaces
        return s == s[::-1]             # reverse string and compare


Metadata: {'file': 'palindrome.py'}
# Test
    words = ["level", "Racecar", "hello", "madam"]
    for word in words:
        print(word, "->", is_palindrome(word))




##### Attaching metadata to language code instead of empty {}

In [None]:
# Single code snippet - still needs to be in a list
python_chunks = python_splitter.create_documents([sample_python_code])

# Multiple code snippets at once
code1 = "def hello(): print('Hello')"
code2 = "def goodbye(): print('Goodbye')"
python_chunks = python_splitter.create_documents([code1, code2])

# With metadata for each
python_chunks = python_splitter.create_documents(
    [code1, code2], 
    [{"file": "hello.py"}, {"file": "goodbye.py"}]
)

python_chunks

### MarkdownHeaderTextSplitter
uses ```splitter.split_text()```

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter


sample_md_data = """
# Chennai: The Gateway to South India

![Chennai Beach](https://upload.wikimedia.org/wikipedia/commons/5/50/Marina_Beach_Chennai.jpg)

## Overview
Chennai is the capital city of the Indian state of Tamil Nadu. Located on the Coromandel Coast of the Bay of Bengal, it is one of the largest cultural, economic, and educational centers in South India.

## History
- Founded in 1639 as **Madras** by the British East India Company.
- Renamed Chennai in 1996.
- Rich colonial history with iconic landmarks like Fort St. George.

## Culture
- **Language:** Tamil (official), English widely spoken.
- **Cuisine:** Famous for **idli, dosa, sambar, filter coffee**, and spicy Chettinad dishes.
- **Festivals:** Pongal, Diwali, Tamil New Year, and Chennai Music Season.

## Attractions
1. **Marina Beach** – One of the longest urban beaches in the world.  
2. **Kapaleeshwarar Temple** – Stunning Dravidian architecture.  
3. **Fort St. George** – Historic British fort and museum.  
4. **Santhome Cathedral** – Famous for its Gothic architecture.  
5. **Government Museum** – Rich collection of archaeology and art.

## Economy
- Major IT hub with **Tidel Park** and **Infosys, Cognizant** offices.  
- Strong automobile and manufacturing industry.  
- Busy port contributing to trade.

## Transportation
- **Chennai International Airport** – Connects to major global cities.  
- Extensive **bus and suburban train network**.  
- **Chennai Metro** is expanding rapidly.

## Fun Facts
- Chennai is often called the "**Detroit of India**" due to its automobile industry.  
- The city has a vibrant **classical music and dance scene**.  
- Home to **India’s first public library**.

> *Chennai is a city where tradition meets modernity, where every street has a story, and every corner has culture.*
"""


headers_to_split_on = [
    ("#", "Primary header"),
    ("##", "Secondary header"),
    ("###", "Tertiary header")
]

md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)


md_chunks = md_splitter.split_text(sample_md_data)


for chunk in md_chunks:
    pprint(f"Metadata: {chunk.metadata}")
    pprint(f"{chunk.page_content}")
    print("\n")