# Text Splitting

## Libraries

In [1]:
from langchain.embeddings import GPT4AllEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, Language

## Corpus

In [2]:
with open("../../datasets/raw/TheAdventuresofPinocchio.txt") as f:
    corpus = f.read()

## Fixed Character Text Splitting

In [3]:
text_splitter = CharacterTextSplitter(
    separator=" ", # To avoid splitting in the middle of a word
    chunk_size=150,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

In [4]:
texts = text_splitter.create_documents([corpus])

In [5]:
print(texts[0])
print(texts[1])

page_content='The Project Gutenberg eBook of The Adventures of Pinocchio\n\nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of'
page_content='other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the'


## Recursive Character Splitting

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100, # Set a small chunk size, just to show.
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

In [7]:
texts = text_splitter.create_documents([corpus])

In [8]:
print(texts[0])
print(texts[1])

page_content='The Project Gutenberg eBook of The Adventures of Pinocchio'
page_content='This ebook is for the use of anyone anywhere in the United States and'


## Document Based

### Python Code

In [9]:
PYTHON_CODE = """
def hello_world() -> None:
    print("Hello, World!")

if __name__ == "__main__":
    hello_world()
"""

In [10]:
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)
python_docs = python_splitter.create_documents([PYTHON_CODE])
python_docs

[Document(page_content='def hello_world() -> None:'),
 Document(page_content='print("Hello, World!")'),
 Document(page_content='if __name__ == "__main__":\n    hello_world()')]

### Scala Code

In [11]:
SCALA_CODE = """
package enough.scala.spark
package get_started

import org.apache.spark.sql.types._

object Schema extends SparkSessionWrapper {
  def main(args: Array[String]): Unit = {
//  Without Inferring Schema
    var df = spark.read
      .option("header", true)
      .csv("data/raw/AAPL.csv")
    df.printSchema()

//  Inferring Schema
    df = spark.read
      .option("header", true)
      .option("inferSchema", true)
      .csv("data/raw/AAPL.csv")

    df.printSchema()

//  Define Schema
    val schema = StructType(
      Array(
        StructField("Date", DateType, true),
        StructField("Open", DoubleType, true),
        StructField("High", DoubleType, true),
        StructField("Low", DoubleType, true),
        StructField("Close", DoubleType, true),
        StructField("Volume", DoubleType, true)))

    df = spark.read
      .option("header", true)
      .schema(schema)
      .csv("data/raw/AAPL.csv")
    df.printSchema()
  }
}
"""

In [12]:
scala_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.SCALA, chunk_size=50, chunk_overlap=0
)
scala_docs = python_splitter.create_documents([SCALA_CODE])
scala_docs[0:5]

[Document(page_content='package enough.scala.spark\npackage get_started'),
 Document(page_content='import org.apache.spark.sql.types._'),
 Document(page_content='object Schema extends SparkSessionWrapper {'),
 Document(page_content='def main(args: Array[String]): Unit = {'),
 Document(page_content='//  Without Inferring Schema')]

### HTML

In [13]:
with open("../../datasets/raw/medium_blog.html") as f:
    html_corpus = f.read()

In [14]:
html_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.HTML, chunk_size=50, chunk_overlap=0
)
html_docs = python_splitter.create_documents([html_corpus])
html_docs[0:5]

[Document(page_content='<!doctype html><html lang="en"><head><title'),
 Document(page_content='data-rh="true">Effortless Document Extraction: A'),
 Document(page_content='Guide to Using Unstructured API and Data'),
 Document(page_content='Connectors | by Unstructured | Unstructured |'),
 Document(page_content='Medium</title><meta data-rh="true"')]

### Markdown

In [15]:
with open("../../README.md") as f:
    mkdown_corpus = f.read()

In [16]:
mkdown_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=50, chunk_overlap=0
)
mkdown_docs = python_splitter.create_documents([mkdown_corpus])
mkdown_docs[0:5]

[Document(page_content='# Advanced Retrieval Methods for RAG'),
 Document(page_content='## Introduction'),
 Document(page_content='This repository contains the code for carrying'),
 Document(page_content='out different "Advanced Retrieval Methods for'),
 Document(page_content='RAG" such as')]

### Other languages

In [17]:
[lang for lang in Language]

[<Language.CPP: 'cpp'>,
 <Language.GO: 'go'>,
 <Language.JAVA: 'java'>,
 <Language.KOTLIN: 'kotlin'>,
 <Language.JS: 'js'>,
 <Language.TS: 'ts'>,
 <Language.PHP: 'php'>,
 <Language.PROTO: 'proto'>,
 <Language.PYTHON: 'python'>,
 <Language.RST: 'rst'>,
 <Language.RUBY: 'ruby'>,
 <Language.RUST: 'rust'>,
 <Language.SCALA: 'scala'>,
 <Language.SWIFT: 'swift'>,
 <Language.MARKDOWN: 'markdown'>,
 <Language.LATEX: 'latex'>,
 <Language.HTML: 'html'>,
 <Language.SOL: 'sol'>,
 <Language.CSHARP: 'csharp'>,
 <Language.COBOL: 'cobol'>,
 <Language.C: 'c'>,
 <Language.LUA: 'lua'>,
 <Language.PERL: 'perl'>]


## Semantic Chunking

This chunking strategy requires an embeddings model, in this case `all-MiniLM-L6-v2-f16.gguf` from [ChatGPT4All](https://gpt4all.io/index.html) was used.

In [18]:
model = GPT4AllEmbeddings()

In [19]:
text_splitter = SemanticChunker(model, breakpoint_threshold_type="percentile")
docs = text_splitter.create_documents([corpus])

In [20]:
print(docs[0].page_content)

The Project Gutenberg eBook of The Adventures of Pinocchio

This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook. Title: The Adventures of Pinocchio

Author: Carlo Collodi

Translator: Carol Della Chiesa

Release date: January 12, 2006 [eBook #500]
                Most recently updated: September 28, 2020

Language: English

Credits: Produced by Charles Keller (for Tina); and David Widger


*** START OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF PINOCCHIO ***




Produced by Charles Keller (for Tina); and David Widger







Dashes; small checks; quick pass; gutchecked twice; jeebies; spellcheck


THE

In [21]:
print(len(docs))

117


In [22]:
text_splitter = SemanticChunker(model, breakpoint_threshold_type="standard_deviation")
docs = text_splitter.create_documents([corpus])

In [23]:
print(docs[0].page_content)

The Project Gutenberg eBook of The Adventures of Pinocchio

This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook. Title: The Adventures of Pinocchio

Author: Carlo Collodi

Translator: Carol Della Chiesa

Release date: January 12, 2006 [eBook #500]
                Most recently updated: September 28, 2020

Language: English

Credits: Produced by Charles Keller (for Tina); and David Widger


*** START OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF PINOCCHIO ***




Produced by Charles Keller (for Tina); and David Widger







Dashes; small checks; quick pass; gutchecked twice; jeebies; spellcheck


THE

In [24]:
print(len(docs))

22


In [25]:
text_splitter = SemanticChunker(model, breakpoint_threshold_type="interquartile")
docs = text_splitter.create_documents([corpus])

In [26]:
print(docs[0].page_content)

The Project Gutenberg eBook of The Adventures of Pinocchio

This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook. Title: The Adventures of Pinocchio

Author: Carlo Collodi

Translator: Carol Della Chiesa

Release date: January 12, 2006 [eBook #500]
                Most recently updated: September 28, 2020

Language: English

Credits: Produced by Charles Keller (for Tina); and David Widger


*** START OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF PINOCCHIO ***




Produced by Charles Keller (for Tina); and David Widger







Dashes; small checks; quick pass; gutchecked twice; jeebies; spellcheck


THE

In [27]:
print(len(docs))

120


## Agentic Chunking