In [27]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [28]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

#### Chunk Size
Determines how much of the text is included in a single chunk. Larger chunk sizes contain more content, while smaller chunk sizes break the text into finer pieces.

#### Chunk Overlap
Ensures that important information spanning chunk boundaries is preserved across chunks. This is to ensure that no context is lost when splitting text, particularly for tasks that require an understanding of the relationships between different parts of the text.

In [29]:
# number of tokens in each chunk of text
chunk_size =26

# amount of overlap between consecutive chunks of text
chunk_overlap = 4

#### RecursiveCharacterTextSplitter
Splits text into chunks by recursively attempting to break at natural boundaries (paragraphs, sentences, words) for better coherence while respecting chunk size and overlap constraints.

#### CharacterTextSplitter
Splits text into chunks strictly based on character count, without considering natural language boundaries, using chunk size and overlap settings.

In [30]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

Recall that the chunk size is 26 and the chunk overlap is 4

In [31]:
text1 = 'abcdefghijklmnopqrstuvwxyz'
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [32]:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

You can see below that if a space is introduced, it will affect the chunks returned by the splitter. In the below example, the string 'l m ' is the 4 characters overlap

In [33]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

The character text splitter splits on a single character and by default that character is newline character '\n', but there's no newlines.

In [34]:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [35]:
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

### Demo different text splitters

#### Markdown

In [42]:
from langchain.text_splitter import MarkdownTextSplitter

# Example Markdown text
markdown_text = """
# Header 1
This is the first section.

## Subheader 1.1
Details under subheader 1.1.

# Header 2
Content for the second header.
"""

# Split the Markdown text
markdown_splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=50)
markdown_chunks = markdown_splitter.split_text(markdown_text)

# Print the results
for i, chunk in enumerate(markdown_chunks, 1):
    print(f"Chunk {i}:\n{chunk}\n")


Chunk 1:
# Header 1
This is the first section.

## Subheader 1.1
Details under subheader 1.1.

# Header 2
Content for the second header.



#### Python Code

In [44]:
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Example Python code
python_code = """
def func1():
    print("Function 1")
    
class MyClass:
    def method1(self):
        print("Method 1")
        
def func2():
    print("Function 2")
"""

# Split Python code by "def" and "class"
def python_code_splitter(code):
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_text(code)

# Split the Python code
chunks = python_code_splitter(python_code)

# Print the results
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}:\n{chunk}\n")


Chunk 1:
def func1():
    print("Function 1")
    
class MyClass:
    def method1(self):
        print("Method 1")
        
def func2():
    print("Function 2")



#### HTML

In [45]:
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Example HTML content
html_content = """
<html>
    <body>
        <h1>Title</h1>
        <p>This is the first paragraph.</p>
        <div>
            <p>Content inside a div.</p>
        </div>
        <p>This is another paragraph.</p>
    </body>
</html>
"""

# Parse and extract text from HTML
soup = BeautifulSoup(html_content, "html.parser")
html_text = soup.get_text()

# Split the extracted text
html_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
html_chunks = html_splitter.split_text(html_text)

# Print the results
for i, chunk in enumerate(html_chunks, 1):
    print(f"Chunk {i}:\n{chunk}\n")


Chunk 1:
Title
This is the first paragraph.

Content inside a div.

This is another paragraph.



In [48]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

len(some_text)

c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0, 
    separators=["\n\n", "\n", " ", ""]
)

In [49]:
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [50]:
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

Let's reduce the chunk size a bit and add a period to our separators:

In [51]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "\. ", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [52]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

#### PDF File

In [56]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/example.pdf")
pages = loader.load()

from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

docs = text_splitter.split_documents(pages)

print(len(docs), len(pages))

78 22


#### Token splitting
We can also split on token count explicity, if we want.

This can be useful because LLMs often have context windows designated in tokens.

Tokens are often ~4 characters.

In [57]:
from langchain.text_splitter import TokenTextSplitter
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
text1 = "foo bar bazzyfoo"
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [61]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)
docs = text_splitter.split_documents(pages)
print(docs[0])
print(pages[0].metadata)

page_content='MachineLearning-Lecture01  
' metadata={'source': 'docs/example.pdf', 'page': 0}
{'source': 'docs/example.pdf', 'page': 0}


#### Context aware splitting
Chunking aims to keep text with common context together.

A text splitting often uses sentences or other delimiters to keep related text together but many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting.

We can use MarkdownHeaderTextSplitter to preserve header metadata in our chunks, as show below.

# Title
## Chapter 1
Hi this is Jim
Hi this is Joe
### Section 
Hi this is Lance 

## Chapter 2
Hi this is Molly

In [76]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

print(md_header_splits[0],'\n')
print(md_header_splits[1])

page_content='Hi this is Jim  
Hi this is Joe' metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'} 

page_content='Hi this is Lance' metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}


In [78]:
loader = NotionDirectoryLoader("docs/Notion_DB")
docs = loader.load()
txt = ' '.join([d.page_content for d in docs])

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)

md_header_splits = markdown_splitter.split_text(txt)
md_header_splits[0]

IndexError: list index out of range