In [3]:
import os
from typing import List, Tuple, Dict, Any
import pandas as pd

In [7]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter,CharacterTextSplitter,TokenTextSplitter


## Understanding the Document Structure in LangChian



In [None]:
doc=Document(
    page_content="This is a sample document used to demonstrate the Document structure in LangChain. It contains text data that can be processed and manipulated using various tools and libraries within the LangChain ecosystem.",
    metadata={"source": "sample_document.txt", "author": "OpenAI"}
)
# meta data is  important part of Document Structure in the Langchain because it helps to understand where the chunk came from extra meta
# data it helps RAG to distinguish different chunks
print(f"Document content:{doc.page_content}")

Document content:This is a sample document used to demonstrate the Document structure in LangChain. It contains text data that can be processed and manipulated using various tools and libraries within the LangChain ecosystem.


### Reading a simple text file (both creation from code as well as reading)

In [1]:
import os
os.makedirs("data/textfiles", exist_ok=True)

In [3]:
sample_text={
    "data/textfiles/doc1.txt":"""Modern organizations increasingly rely on data-driven decision-making to improve efficiency and accuracy. As businesses scale, they accumulate vast amounts of operational, transactional, and behavioral data. Turning this raw information into actionable insights requires a combination of strong data engineering pipelines, analytical models, and visualization tools. By building systems that automate data collection and transform it into structured formats, companies can better understand trends, detect anomalies, and respond proactively to changing conditions.""",
    "data/textfiles/doc2.txt":"""In recent years, the field of artificial intelligence (AI) has seen significant advancements, particularly in machine learning and deep learning techniques. These technologies enable computers to learn from data and make predictions or decisions without being explicitly programmed. AI applications span various industries, including healthcare, finance, and transportation, where they enhance capabilities such as image recognition, natural language processing, and autonomous driving. As AI continues to evolve, ethical considerations around bias, privacy, and transparency remain critical topics of discussion.""",
}
for filepath, text in sample_text.items():
    with open (filepath,"w",encoding="utf-8") as f:
        f.write(text)

print("Sample text files created in data/textfiles/ directory.")

Sample text files created in data/textfiles/ directory.


### Reading file


In [8]:
from langchain_community.document_loaders import TextLoader
loader1=TextLoader("data/textfiles/doc1.txt",encoding="utf-8")
loader2=TextLoader("data/textfiles/doc2.txt",encoding="utf-8")
documents1=loader1.load() # list of Document objects
print(type(documents1))
print(documents1)
print(f"Loaded {len(documents1)} document(s).")
for doc in documents1:
    print(f"Document content: {doc.page_content}")
documents2=loader2.load() # list of Document objects
print(f"Loaded {len(documents2)} document(s).")
for doc in documents2:
    print(f"Document content: {doc.page_content}")

<class 'list'>
[Document(metadata={'source': 'data/textfiles/doc1.txt'}, page_content='Modern organizations increasingly rely on data-driven decision-making to improve efficiency and accuracy. As businesses scale, they accumulate vast amounts of operational, transactional, and behavioral data. Turning this raw information into actionable insights requires a combination of strong data engineering pipelines, analytical models, and visualization tools. By building systems that automate data collection and transform it into structured formats, companies can better understand trends, detect anomalies, and respond proactively to changing conditions.')]
Loaded 1 document(s).
Document content: Modern organizations increasingly rely on data-driven decision-making to improve efficiency and accuracy. As businesses scale, they accumulate vast amounts of operational, transactional, and behavioral data. Turning this raw information into actionable insights requires a combination of strong data engin

### Reading raw data from the directory using DirectoryLoader(Multiple files read)

In [None]:
from langchain_community.document_loaders import DirectoryLoader
loader=DirectoryLoader("data/textfiles",glob="**/*.txt",loader_cls=TextLoader,loader_kwargs={"encoding":"utf-8"})
# glob is basically file pattern matching
# if only text files present in the directory then glob can be "*.txt" and loader_cls is TextLoader
# if multiple file types are present in the directory then glob can be "**/*.*" and loader_cls can be UnstructuredFileLoader
documents=loader.load() # list of Document objects
print(f"Loaded {len(documents)} document(s) from directory.")
for doc in documents:
    print(f"Document content: {doc.page_content}")

# For DirectoryLoader the directory must consist of same type of files

Loaded 2 document(s) from directory.
Document content: Modern organizations increasingly rely on data-driven decision-making to improve efficiency and accuracy. As businesses scale, they accumulate vast amounts of operational, transactional, and behavioral data. Turning this raw information into actionable insights requires a combination of strong data engineering pipelines, analytical models, and visualization tools. By building systems that automate data collection and transform it into structured formats, companies can better understand trends, detect anomalies, and respond proactively to changing conditions.
Document content: In recent years, the field of artificial intelligence (AI) has seen significant advancements, particularly in machine learning and deep learning techniques. These technologies enable computers to learn from data and make predictions or decisions without being explicitly programmed. AI applications span various industries, including healthcare, finance, and tra

### Chunking Techniques
## Different splitting techniques which is used to split into Chunks

In [None]:
print(documents[0].page_content)

Modern organizations increasingly rely on data-driven decision-making to improve efficiency and accuracy. As businesses scale, they accumulate vast amounts of operational, transactional, and behavioral data. Turning this raw information into actionable insights requires a combination of strong data engineering pipelines, analytical models, and visualization tools. By building systems that automate data collection and transform it into structured formats, companies can better understand trends, detect anomalies, and respond proactively to changing conditions.


### Recursive Character Text Splitter
✅ MOST RECOMMENDED splitter
✅ Smartly splits text using a hierarchy of separators
✅ Falls back gradually until chunk size is satisfied

✅ How it works (conceptually):

It tries to split in this order:

["\n\n", "\n", ".", " ", ""]


Meaning:

Try splitting by paragraphs

If too long → split by lines

If too long → split by sentences

If too long → split by spaces

If still too long → force split by character

In [20]:
from langchain_text_splitters import RecursiveCharacterTextSplitter,CharacterTextSplitter,TokenTextSplitter
print("----- RecursiveCharacterTextSplitter -----")
text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    separators=["\n\n","\n","."," ",""],
    length_function=len
)
chunks=text_splitter.split_documents(documents)
print(type(chunks))
print(f"Total chunks created: {len(chunks)}")
for i,chunk in enumerate(chunks):
    print(f"--- Chunk {i+1} ---")
    print(chunk.page_content)
    print("\n") 

----- RecursiveCharacterTextSplitter -----
<class 'list'>
Total chunks created: 9
--- Chunk 1 ---
Modern organizations increasingly rely on data-driven decision-making to improve efficiency and accuracy


--- Chunk 2 ---
. As businesses scale, they accumulate vast amounts of operational, transactional, and behavioral data


--- Chunk 3 ---
. Turning this raw information into actionable insights requires a combination of strong data engineering pipelines, analytical models, and visualization tools


--- Chunk 4 ---
. By building systems that automate data collection and transform it into structured formats, companies can better understand trends, detect anomalies, and respond proactively to changing conditions.


--- Chunk 5 ---
In recent years, the field of artificial intelligence (AI) has seen significant advancements, particularly in machine learning and deep learning techniques


--- Chunk 6 ---
. These technologies enable computers to learn from data and make predictions or decisio

In [21]:
from langchain_text_splitters import CharacterTextSplitter
print("----- CharacterTextSplitter -----")
text_splitter=CharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    separator=" ",
    length_function=len
)
chunks=text_splitter.split_documents(documents)
print(f"Total chunks created: {len(chunks)}")
for i,chunk in enumerate(chunks):
    print(f"--- Chunk {i+1} ---")
    print(chunk.page_content)
    print("\n")



----- CharacterTextSplitter -----
Total chunks created: 8
--- Chunk 1 ---
Modern organizations increasingly rely on data-driven decision-making to improve efficiency and accuracy. As businesses scale, they accumulate vast amounts of operational, transactional, and


--- Chunk 2 ---
transactional, and behavioral data. Turning this raw information into actionable insights requires a combination of strong data engineering pipelines, analytical models, and visualization tools. By


--- Chunk 3 ---
tools. By building systems that automate data collection and transform it into structured formats, companies can better understand trends, detect anomalies, and respond proactively to changing


--- Chunk 4 ---
to changing conditions.


--- Chunk 5 ---
In recent years, the field of artificial intelligence (AI) has seen significant advancements, particularly in machine learning and deep learning techniques. These technologies enable computers to


--- Chunk 6 ---
enable computers to learn from dat

In [22]:
from langchain_text_splitters import TokenTextSplitter
print("----- TokenTextSplitter -----")
text_splitter=TokenTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
)
chunks=text_splitter.split_documents(documents)
print(f"Total chunks created: {len(chunks)}")   
for i,chunk in enumerate(chunks):
    print(f"--- Chunk {i+1} ---")
    print(chunk.page_content)
    print("\n")

----- TokenTextSplitter -----
Total chunks created: 5
--- Chunk 1 ---
Modern organizations increasingly rely on data-driven decision-making to improve efficiency and accuracy. As businesses scale, they accumulate vast amounts of operational, transactional, and behavioral data. Turning this raw information into actionable insights requires a combination of strong data engineering


--- Chunk 2 ---
 actionable insights requires a combination of strong data engineering pipelines, analytical models, and visualization tools. By building systems that automate data collection and transform it into structured formats, companies can better understand trends, detect anomalies, and respond proactively to changing conditions.


--- Chunk 3 ---
In recent years, the field of artificial intelligence (AI) has seen significant advancements, particularly in machine learning and deep learning techniques. These technologies enable computers to learn from data and make predictions or decisions without bein