In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter

In [2]:
# Limits shared by the first splitter demos: chunks of at most 26 units with
# 4 units repeated between neighbouring chunks.
chunk_size, chunk_overlap = 26, 4

# text1 is exactly chunk_size characters long; text2 appends seven more
# characters so that it no longer fits in a single chunk.
text1 = "abcdefghijklmnopqrstuvwxyz"
text2 = text1 + "abcdefg"

In [3]:
# Build both splitters from the same limits so their outputs can be compared
# side by side in the following cells.
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

## Text splitting

In [4]:
# Run the recursive splitter over both alphabet samples and show the two
# chunk lists together as one tuple.
tuple(r_splitter.split_text(sample) for sample in (text1, text2))

(['abcdefghijklmnopqrstuvwxyz'], ['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg'])

In [5]:
# Same two samples through the plain character splitter, for comparison with
# the recursive splitter's result.
tuple(c_splitter.split_text(sample) for sample in (text1, text2))

(['abcdefghijklmnopqrstuvwxyz'], ['abcdefghijklmnopqrstuvwxyzabcdefg'])

In [6]:
# The alphabet again, this time with a single space between every letter.
text3 = " ".join("abcdefghijklmnopqrstuvwxyz")

In [7]:
# text3 is 51 characters long, so it cannot fit in one 26-character chunk; the
# recorded output shows the recursive splitter breaking it at spaces, with the
# last letters of each chunk repeated at the start of the next.
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [8]:
# Same input through the character splitter as configured so far (no separator
# argument was passed): the recorded output is one 51-character chunk, i.e. the
# text was not split even though it is longer than chunk_size.
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [9]:
# CharacterTextSplitter returned text3 as a single chunk in the previous cell,
# so rebuild it with an explicit " " separator; it can then split between the
# letters.
c_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

## Recursive splitting details

- RecursiveCharacterTextSplitter() is recommended for generic text.


In [10]:
# Two-paragraph sample used to compare the splitters.  The paragraphs are
# separated by an embedded "\n\n".  The wording (including its spelling and the
# missing space after "have a space.") is kept exactly as is, because the
# recorded length and chunk outputs depend on every character.
some_text = (
    "When writing documents, writers will use document structure to group content. "
    "This can convey to the reader, which idea's are related. For example, closely related ideas "
    "are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  "
    "Paragraphs are often delimited with a carriage return or two carriage returns. "
    'Carriage returns are the "backslash n" you see embedded in this string. '
    "Sentences have a period at the end, but also, have a space."
    "and words are separated by space."
)

In [11]:
# Total character count of the sample; it is larger than the chunk_size of 450
# used in the next cells, so each splitter has to produce more than one chunk.
len(some_text)

496

In [12]:
# Recursive splitter with an explicit separator list, from coarsest to finest:
# paragraph break, line break, space, and finally the empty string.
# The recorded output shows the text being cut at the "\n\n" paragraph break.
r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=450,
    chunk_overlap=0,
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [13]:
# Same limits, but the character splitter is given only the single " "
# separator; in the recorded output the first chunk runs across the paragraph
# break and stops mid-sentence.
c_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=450,
    chunk_overlap=0,
)
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

## Token splitting

In [14]:
from langchain.text_splitter import TokenTextSplitter

In [15]:
# TokenTextSplitter measures chunk_size / chunk_overlap in tokens rather than
# characters; with chunk_size=1 each chunk in the recorded output is one token.
token_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

# Use a dedicated name instead of rebinding `text1`: the first splitting cells
# read `text1`, and overwriting it here would change what they return if they
# are re-run later in the same kernel.
# The misspellings are intentional input: the recorded output shows them being
# broken into several sub-word tokens.
token_text_short = "ii wwanna know how people are roming in foreign countries"
token_splitter.split_text(token_text_short)

['ii',
 ' w',
 'w',
 'anna',
 ' know',
 ' how',
 ' people',
 ' are',
 ' r',
 'oming',
 ' in',
 ' foreign',
 ' countries']

In [16]:
# Larger token chunks with an overlap of one token; in the recorded output the
# last token of each chunk reappears at the start of the next one.
token_splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=1)

# Use a dedicated name instead of rebinding `text1`: the first splitting cells
# read `text1`, and overwriting it here would change what they return if they
# are re-run later in the same kernel.
# The misspellings are kept as-is; the recorded output depends on them.
token_text_long = "hello gguys, Goodd morning. i wanna know how people are roming in foreign countries"
token_splitter.split_text(token_text_long)

['hello gguys,',
 ', Goodd morning.',
 '. i wanna know how',
 ' how people are roming',
 'oming in foreign countries']

## Context aware splitting

Chunking aims to keep text with common context together.

Text splitting often uses sentences or other delimiters to keep related text together, but many documents (such as Markdown) have structure (headers) that can be used explicitly when splitting.

We can use `MarkdownHeaderTextSplitter` to preserve header metadata in our chunks, as shown below.


In [22]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [18]:
# Sample Markdown for the header-aware splitter.  Inside the literal, a
# backslash at the very end of a line joins that line to the next one, and
# "\n\n" inserts a blank line.
# The backslash in front of "Age of Michael" is a literal character of the
# sample (it appears in the last recorded chunk).  It is spelled "\\" because a
# bare backslash followed by a space is an invalid escape sequence, which
# Python reports with a warning when the cell is compiled.
markdown_document = """
#### Title\n\n \
Introduction of candidate
### Chapter_1\n\n \
Hi this is Arjun \n\n Hi this is Michael\n\n \
## Section\n\n \
Their jobs on IT field\n\n \
### Chapter_2\n\n \
Age of Arjun is 25\n\n \\ Age of Michael is 49
"""

In [19]:
# (header marker, metadata key) pairs handed to the Markdown splitter; the key
# is the name under which a matching header's text is stored on each chunk.
# NOTE(review): the keys number the levels from the deepest marker upwards
# ("####" is Header_1), the reverse of Markdown's own hierarchy where fewer
# "#" means a higher level.  In the recorded output the "##" Section is
# therefore carried as a parent of the later "###" Chapter_2 chunk.
header__to_split = list(
    {
        "####": "Header_1",
        "###": "Header_2",
        "##": "Header_3",
    }.items()
)

In [20]:
# Split the sample document on the configured header markers.  Each resulting
# chunk keeps the header text it falls under in its metadata.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=header__to_split)
header_split = markdown_splitter.split_text(markdown_document)

In [21]:
# Show the resulting chunks: one Document per header section, with the section
# body as page_content and the enclosing header names in metadata.
header_split

[Document(page_content='Introduction of candidate', metadata={'Header_1': 'Title'}),
 Document(page_content='Hi this is Arjun  \nHi this is Michael', metadata={'Header_2': 'Chapter_1'}),
 Document(page_content='Their jobs on IT field', metadata={'Header_3': 'Section'}),
 Document(page_content='Age of Arjun is 25  \n\\ Age of Michael is 49', metadata={'Header_3': 'Section', 'Header_2': 'Chapter_2'})]