# Document Splitting

In [89]:
import openai
openai.api_key  = "xxx"

In [54]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [55]:
max_segment_length = 50
segment_overlap_size = 10

In [56]:
rec_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=max_segment_length,
    chunk_overlap=segment_overlap_size
)
char_text_splitter = CharacterTextSplitter(
    chunk_size=max_segment_length,
    chunk_overlap=segment_overlap_size
)

In [57]:
sample_text1 = "Lorem Ipsum Dolor Sit Amet, Consectetur Adipiscing Elit. Sed Do Eiusmod Tempor Incididunt Ut Labore Et Dolore Magna."
print(rec_text_splitter.split_text(sample_text1))

sample_text2 = "Nulla Facilisi. Phasellus Imperdiet, Nulla Et Dictum Interdum, Nisi Lorem Efficitur Ipsum, At Volutpat Augue Nulla Ut Ipsum."
print(rec_text_splitter.split_text(sample_text2))

['Lorem Ipsum Dolor Sit Amet, Consectetur Adipiscing', 'Elit. Sed Do Eiusmod Tempor Incididunt Ut Labore', 'Ut Labore Et Dolore Magna.']
['Nulla Facilisi. Phasellus Imperdiet, Nulla Et', 'Nulla Et Dictum Interdum, Nisi Lorem Efficitur', 'Efficitur Ipsum, At Volutpat Augue Nulla Ut', 'Nulla Ut Ipsum.']


In [58]:
complex_text = "The quick brown fox jumps over the lazy dog. In a distant galaxy, far beyond the stars, the intergalactic council convenes to discuss peace treaties between warring factions. Meanwhile, Earth continues its cycle of life, unaffected by the grand schemes of the universe."

In [59]:
rec_text_splitter.split_text(complex_text)

['The quick brown fox jumps over the lazy dog. In a',
 'dog. In a distant galaxy, far beyond the stars,',
 'stars, the intergalactic council convenes to',
 'to discuss peace treaties between warring',
 'warring factions. Meanwhile, Earth continues its',
 'its cycle of life, unaffected by the grand',
 'the grand schemes of the universe.']

In [60]:
char_text_splitter.split_text(complex_text)

['The quick brown fox jumps over the lazy dog. In a distant galaxy, far beyond the stars, the intergalactic council convenes to discuss peace treaties between warring factions. Meanwhile, Earth continues its cycle of life, unaffected by the grand schemes of the universe.']

In [61]:
char_text_splitter = CharacterTextSplitter(
    chunk_size=max_segment_length,
    chunk_overlap=segment_overlap_size,
    separator=' '
)
char_text_splitter.split_text(complex_text)

['The quick brown fox jumps over the lazy dog. In a',
 'dog. In a distant galaxy, far beyond the stars,',
 'the stars, the intergalactic council convenes to',
 'to discuss peace treaties between warring',
 'warring factions. Meanwhile, Earth continues its',
 'its cycle of life, unaffected by the grand schemes',
 'schemes of the universe.']

# Recursive splitting details

In [62]:
large_text_block = """
In the world of artificial intelligence, models are rapidly evolving.
The process of training these models involves vast amounts of data,
which must be divided into manageable chunks for efficient processing.

Researchers and engineers work tirelessly to refine algorithms, ensuring
that AI systems can comprehend and generate human-like text. Despite
significant advancements, challenges such as bias, data privacy, and
contextual understanding remain prominent.

The future of AI depends on addressing these concerns while pushing the
boundaries of what technology can achieve. New techniques continue to
emerge, each offering potential solutions to longstanding problems.
"""

In [63]:
len(large_text_block)

684

In [64]:
char_text_splitter = CharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50,
    separator=' '
)
rec_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50,
    separators=["\n\n", "\n", " ", ""]
)

In [65]:
char_text_splitter.split_text(large_text_block)

['In the world of artificial intelligence, models are rapidly evolving. \nThe process of training these models involves vast amounts of data, \nwhich must be divided into manageable chunks for efficient processing. \n\nResearchers and engineers work tirelessly to refine algorithms, ensuring \nthat AI systems can comprehend and generate human-like text. Despite \nsignificant advancements, challenges such',
 'Despite \nsignificant advancements, challenges such as bias, data privacy, and \ncontextual understanding remain prominent.\n\nThe future of AI depends on addressing these concerns while pushing the \nboundaries of what technology can achieve. New techniques continue to \nemerge, each offering potential solutions to longstanding problems.']

In [66]:
rec_text_splitter.split_text(large_text_block)

['In the world of artificial intelligence, models are rapidly evolving. \nThe process of training these models involves vast amounts of data, \nwhich must be divided into manageable chunks for efficient processing.',
 'Researchers and engineers work tirelessly to refine algorithms, ensuring \nthat AI systems can comprehend and generate human-like text. Despite \nsignificant advancements, challenges such as bias, data privacy, and \ncontextual understanding remain prominent.',
 'The future of AI depends on addressing these concerns while pushing the \nboundaries of what technology can achieve. New techniques continue to \nemerge, each offering potential solutions to longstanding problems.']

Let's reduce the chunk size a bit and add a period to our separators:

In [67]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "\. ", " ", ""]
)
r_splitter.split_text(large_text_block)

['In the world of artificial intelligence, models are rapidly evolving. \nThe process of training these models involves vast amounts of data,',
 'which must be divided into manageable chunks for efficient processing.',
 'Researchers and engineers work tirelessly to refine algorithms, ensuring \nthat AI systems can comprehend and generate human-like text. Despite',
 'significant advancements, challenges such as bias, data privacy, and \ncontextual understanding remain prominent.',
 'The future of AI depends on addressing these concerns while pushing the \nboundaries of what technology can achieve. New techniques continue to',
 'emerge, each offering potential solutions to longstanding problems.']

In [68]:
# !pip install -U langchain-community

In [69]:
# !pip install pypdf

In [70]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("sample_data/sample_textbook.pdf")
pages = loader.load()

In [72]:
docs = text_splitter.split_documents(pages)

In [73]:
len(docs)

189

In [74]:
len(pages)

4

In [75]:
print(pages[0])

page_content='This is a sample document to
showcase page-based formatting. It
contains a chapter from a Wikibook
called Sensory Systems. None of the
content has been changed in this
article, but some content has been
removed.
Anatomy of the Somatosensory System
FROM WIKIBOOKS1
Our somatosensory system consists of sensors in the skin
and sensors in our muscles, tendons, and joints. The re-
ceptors in the skin, the so called cutaneous receptors, tell
us about temperature (thermoreceptors), pressure and sur-
face texture (mechano receptors), and pain (nociceptors).
The receptors in muscles and joints provide information
about muscle length, muscle tension, and joint angles.
Cutaneous receptors
Sensory information from Meissner corpuscles and rapidly
adapting afferents leads to adjustment of grip force when
objects are lifted. These afferents respond with a brief
burst of action potentials when objects move a small dis-
tance during the early stages of lifting. In response to
Figure 1: Rec

# Token-Based Text Splitting
Token splitting is useful when dealing with LLMs, which often have context windows defined by tokens. Each token typically averages around 4 characters.

In [76]:
from langchain.text_splitter import TokenTextSplitter

In [77]:
# !pip install tiktoken

In [79]:
token_splitter = TokenTextSplitter(chunk_size=2, chunk_overlap=0)
sample_text = "Quantum computing is fascinating and highly complex. Understanding it requires patience and dedication."
token_splitter.split_text(sample_text)

token_splitter = TokenTextSplitter(chunk_size=15, chunk_overlap=0)
split_text_docs = token_splitter.split_documents(pages)

In [80]:
split_text_docs[0]

Document(metadata={'producer': 'Prince 20150210 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Anatomy of the Somatosensory System', 'source': 'sample_data/sample_textbook.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='This is a sample document to\nshowcase page-based formatting. It')

In [82]:
pages[0].metadata

{'producer': 'Prince 20150210 (www.princexml.com)',
 'creator': 'PyPDF',
 'creationdate': '',
 'title': 'Anatomy of the Somatosensory System',
 'source': 'sample_data/sample_textbook.pdf',
 'total_pages': 4,
 'page': 0,
 'page_label': '1'}

# Context aware splitting
Chunking aims to keep text with common context together.

A text splitting often uses sentences or other delimiters to keep related text together but many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting.

We can use MarkdownHeaderTextSplitter to preserve header metadata in our chunks, as show below.

In [83]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [84]:
markdown_text = """# Research Report\n\n \
## Introduction\n\n \
The purpose of this study is to explore the impacts of climate change on marine ecosystems. \
## Background\n\n \
Global temperatures are rising at unprecedented rates.\n\n \
### Causes\n\n \
Primarily due to greenhouse gas emissions.\n\n \
### Effects\n\n \
Rising sea levels, ocean acidification, and coral bleaching.\n\n \
## Conclusion\n\n \
Mitigation efforts must be accelerated to prevent irreversible damage."""

In [85]:
headers_to_segment_on = [
    ("#", "Section 1"),
    ("##", "Section 2"),
    ("###", "Subsection")
]

In [86]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_segment_on
)
segmented_texts = markdown_splitter.split_text(markdown_text)

In [87]:
segmented_texts[0]

Document(metadata={'Section 1': 'Research Report', 'Section 2': 'Introduction'}, page_content='The purpose of this study is to explore the impacts of climate change on marine ecosystems. ## Background  \nGlobal temperatures are rising at unprecedented rates.')

In [88]:
segmented_texts[1]

Document(metadata={'Section 1': 'Research Report', 'Section 2': 'Introduction', 'Subsection': 'Causes'}, page_content='Primarily due to greenhouse gas emissions.')