### RAG - Document Splitters

##### Boilerplate code

In [None]:
import langchain
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Run dotenv's loader first so the lookups below can see values it provides.
load_dotenv()

# API keys come from the environment; each is None when its variable is unset.
google_api_key = os.environ.get("GOOGLE_API_KEY")
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Gemini chat model configured with temperature 0 and max_tokens=200.
google_llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    api_key=google_api_key,
    temperature=0,
    max_tokens=200,
)

# OpenAI chat model configured with temperature 0.
openai_llm = ChatOpenAI(
    model="gpt-4",
    api_key=openai_api_key,
    temperature=0,
)

##### TextLoader

In [None]:
from langchain_community.document_loaders import TextLoader

# Load the sample text file and print every document the loader returns.
# NOTE(review): no encoding is passed to TextLoader — confirm the default
# suits this file on every platform the notebook is run on.
loader = TextLoader(file_path="./docs_for_rag/coolie_large.txt")
text_documents = loader.load()

for loaded_document in text_documents:
    print(loaded_document)

##### CSVLoader

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Load the cars CSV and print the text of each returned document,
# followed by a blank line.
loader = CSVLoader(file_path="./docs_for_rag/cars.csv")
csv_documents = loader.load()

for csv_document in csv_documents:
    print(csv_document.page_content, "\n")

##### WebBaseLoader

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Single-page variant, kept for reference:
# loader = WebBaseLoader("https://www.orkut.com/")
# docs = loader.load()

# Several pages handled by one loader.
page_urls = [
    "https://www.orkut.com/",
    "https://google.com",
    "https://facebook.com",
    "https://linkedin.com",
    "https://x.com",
]
loader_multiple_pages = WebBaseLoader(page_urls)

# NOTE(review): lazy_load() presumably yields documents one at a time, so
# web_documents would be exhausted after the loop below — confirm before
# reusing it in a later cell.
web_documents = loader_multiple_pages.lazy_load()

for web_document in web_documents:
    print(web_document, "\n")

##### UnstructuredLoader - Loading Images and PDFs

In [None]:
from langchain_unstructured import UnstructuredLoader

# One image and one PDF are handed to the same loader.
file_paths = [
    "./docs_for_rag/images.jpeg",
    "./docs_for_rag/nexon_brochure.pdf",
]

# Any exception raised while loading or printing is reported, not re-raised.
# NOTE(review): the fallback message says "image" although file_paths also
# contains a PDF.
try:
    loader = UnstructuredLoader(file_paths)
    unstructured_docs = loader.load()
    for doc in unstructured_docs:
        if not doc.page_content:
            print("No text content found in the image")
            continue
        print(doc.page_content, "\n")
except Exception as e:
    print(f"Error: {e}")

## Splitting
### Length-based splitting
- Token-based
- Character-based

##### Token-based splitting

In [None]:
from langchain_text_splitters import CharacterTextSplitter

# CharacterTextSplitter built through its tiktoken constructor
# (cl100k_base encoding, chunk_size=50, no overlap).
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=50,
    chunk_overlap=0,
    encoding_name="cl100k_base",
)

# Only the first loaded text document is split.
first_document = text_documents[0]
texts = text_splitter.split_text(first_document.page_content)

texts

In [None]:
from langchain_text_splitters import TokenTextSplitter

# TokenTextSplitter on the cl100k_base encoding with chunk_size=100 and
# chunk_overlap=30, applied to the first loaded text document.
text_splitter = TokenTextSplitter(
    chunk_size=100,
    chunk_overlap=30,
    encoding_name="cl100k_base",
)
texts = text_splitter.split_text(text_documents[0].page_content)

texts

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# RecursiveCharacterTextSplitter built through its tiktoken constructor,
# using the same encoding and sizes as the TokenTextSplitter cell.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100,
    chunk_overlap=30,
    encoding_name="cl100k_base",
)
texts = text_splitter.split_text(text_documents[0].page_content)

texts

In [None]:
import tiktoken

# Encode a short sample with tiktoken directly; no OpenAI API key is needed.
sample_text = "Hello world!"
encoding = tiktoken.get_encoding("cl100k_base")
tokens = encoding.encode(sample_text)

# Show the token ids and how many there are.
print(f"Tokens: {tokens}")
print(f"Token count: {len(tokens)}")

### Document-based splitting
- HTML
- JSON
- Markdown
- Code

In [29]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

markdown_document = """# LangChain  
## What is it?  
A framework to **build apps with LLMs** — think AI meets Lego blocks.  

### Core idea  
Combine **prompts**, **chains**, and **agents** to make smart workflows.  

#### Example  
`Translate: "Hello" → "Bonjour"`  
"""

# Header markers to split on, each paired with the metadata key that records
# the heading text. "####" is not listed, so the "#### Example" heading stays
# ordinary content under "Header 3" (visible in the cell output).
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]

# strip_headers=False keeps the heading lines inside page_content.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    return_each_line=True,
    strip_headers=False,
)
md_header_splits = markdown_splitter.split_text(markdown_document)
md_header_splits

[Document(metadata={'Header 1': 'LangChain'}, page_content='# LangChain'),
 Document(metadata={'Header 1': 'LangChain', 'Header 2': 'What is it?'}, page_content='## What is it?\nA framework to **build apps with LLMs** — think AI meets Lego blocks.'),
 Document(metadata={'Header 1': 'LangChain', 'Header 2': 'What is it?', 'Header 3': 'Core idea'}, page_content='### Core idea\nCombine **prompts**, **chains**, and **agents** to make smart workflows.'),
 Document(metadata={'Header 1': 'LangChain', 'Header 2': 'What is it?', 'Header 3': 'Core idea'}, page_content='#### Example\n`Translate: "Hello" → "Bonjour"`')]

##### Adding RecursiveCharacterTextSplitter on top of MarkdownHeaderTextSplitter

In [30]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Second pass: run the header-based splits through a
# RecursiveCharacterTextSplitter (chunk_size=100, chunk_overlap=30).
# NOTE(review): this rebinds text_documents, which the TextLoader cell also
# assigns and the token-splitting cells read via text_documents[0] — confirm
# the reuse is intended before re-running those cells.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=30,
    chunk_size=100,
)
text_documents = text_splitter.split_documents(md_header_splits)

text_documents

[Document(metadata={'Header 1': 'LangChain'}, page_content='# LangChain'),
 Document(metadata={'Header 1': 'LangChain', 'Header 2': 'What is it?'}, page_content='## What is it?\nA framework to **build apps with LLMs** — think AI meets Lego blocks.'),
 Document(metadata={'Header 1': 'LangChain', 'Header 2': 'What is it?', 'Header 3': 'Core idea'}, page_content='### Core idea\nCombine **prompts**, **chains**, and **agents** to make smart workflows.'),
 Document(metadata={'Header 1': 'LangChain', 'Header 2': 'What is it?', 'Header 3': 'Core idea'}, page_content='#### Example\n`Translate: "Hello" → "Bonjour"`')]