In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter

## `RecursiveCharacterTextSplitter`

The default recommended text splitter is the `RecursiveCharacterTextSplitter`. This text splitter takes a list of separators. It tries to create chunks by splitting on the first separator, but if any chunks are too large it moves on to the next separator, and so forth. By default the separators it tries to split on are `["\n\n", "\n", " ", ""]`.

In [None]:
## Loading the file into Text

# Pin the encoding explicitly: without it, open() uses the platform's locale
# encoding, so the same file can decode differently (or fail to decode) on
# another machine when it contains non-ASCII characters.
with open("state_of_the_union.txt", encoding="utf-8") as f:
    info = f.read()

# Show what kind of object was read (displayed as the cell output).
type(info)

In [None]:
## Initializing the `RecursiveCharacterTextSplitter`

text_splitter = RecursiveCharacterTextSplitter(
    # Upper bound on the size of each chunk, in units of `length_function`.
    chunk_size=100,
    # Upper bound on the overlap between neighbouring chunks; a little overlap
    # helps keep some continuity from one chunk to the next.
    chunk_overlap=20,
    # Callable used to calculate the length of a chunk.
    length_function=len,
)

In [None]:
## Splitting the Text

# The whole text is passed as a one-element list; the result is bound to `texts`.
texts = text_splitter.create_documents([info])

# Inspect the container type and the type of its first element
# (displayed as the cell output).
(type(texts), type(texts[0]))

In [None]:
# Display the first element of `texts`.
texts[0]

In [None]:
# Display the second element of `texts`.
texts[1]

## `CharacterTextSplitter`

In [None]:
# Configure a `CharacterTextSplitter` with one explicit separator (a single
# space). This rebinds `text_splitter`, replacing the splitter created earlier.
text_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=100,
    chunk_overlap=22,
    length_function=len,
)

In [None]:
# Split the text again with the current `text_splitter`; this rebinds `texts`.
texts = text_splitter.create_documents([info])

# Print the first two resulting documents, one per line.
for chunk_index in (0, 1):
    print(texts[chunk_index])

## More

You can see more about TextSplitters here: https://docs.langchain.com/docs/components/indexing/text-splitters