Character Splitter

In [11]:
from langchain.text_splitter import CharacterTextSplitter

# Sample passage for the fixed-size character splitter. The two adjacent
# literals are concatenated into one string.
text = (
    "Why did ther scarecrow win an award? "
    "Because he was outstanding in his field! This is the example trext for this fun exercise."
)

# Chunks of at most 35 characters with a 2-character overlap; the empty
# separator means chunks may be cut in the middle of a word (see the printed
# result of the next cell).
text_splitter = CharacterTextSplitter(
    separator='',
    chunk_size=35,
    chunk_overlap=2,
)
documents = text_splitter.create_documents([text])

In [12]:
# Show the text of every chunk, one chunk per output line.
for chunk in documents:
    print(chunk.page_content)

Why did ther scarecrow win an award
rd? Because he was outstanding in h
his field! This is the example tre
rext for this fun exercise.


 Recursive Character Splitter

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sample passage for the recursive splitter.
# NOTE: the last three literals are joined without a separating space, so the
# resulting string contains "exercise.Another" and "text.Splitting".
text = (
    "Why did the scarecrow win an award? "
    "Because he was outstanding in his field! This is the example text for this fun exercise."
    "Another sentence for the example text."
    "Splitting text can be fun and educational."
)

# Config 1: small chunks, no overlap.
splitter1 = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=50)
documents1 = splitter1.create_documents([text])

# Config 2: large chunks, no overlap.
splitter2 = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=100)
documents2 = splitter2.create_documents([text])

# Config 3: small chunks, 10-character overlap.
splitter3 = RecursiveCharacterTextSplitter(chunk_overlap=10, chunk_size=50)
documents3 = splitter3.create_documents([text])

# Config 4: large chunks, 20-character overlap.
splitter4 = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=100)
documents4 = splitter4.create_documents([text])


def print_documents(documents,config_number):
    """Print a header for a configuration followed by its numbered chunks.

    Args:
        documents: Iterable of objects exposing a ``page_content`` attribute.
        config_number: Value shown in the ``config ...`` header line.
    """
    print(f"config {config_number}")
    position = 0
    for entry in documents:
        position += 1  # chunk numbering is 1-based
        print(f"Chunk{position}: {entry.page_content}")

In [16]:
# Print the chunks of all four recursive-splitter configurations in order.
for config_number, config_documents in enumerate(
    (documents1, documents2, documents3, documents4), 1
):
    print_documents(config_documents, config_number)

config 1
Chunk1: Why did the scarecrow win an award? Because he was
Chunk2: outstanding in his field! This is the example
Chunk3: text for this fun exercise.Another sentence for
Chunk4: the example text.Splitting text can be fun and
Chunk5: educational.
config 2
Chunk1: Why did the scarecrow win an award? Because he was outstanding in his field! This is the example
Chunk2: text for this fun exercise.Another sentence for the example text.Splitting text can be fun and
Chunk3: educational.
config 3
Chunk1: Why did the scarecrow win an award? Because he was
Chunk2: he was outstanding in his field! This is the
Chunk3: is the example text for this fun exercise.Another
Chunk4: sentence for the example text.Splitting text can
Chunk5: text can be fun and educational.
config 4
Chunk1: Why did the scarecrow win an award? Because he was outstanding in his field! This is the example
Chunk2: This is the example text for this fun exercise.Another sentence for the example text.Splitting text
Chunk3: t

Sentence Splitter

In [17]:
import spacy

class SpacySentenceTokenizer:
    """Group spaCy-detected sentences into windows of ``stride`` sentences.

    Each window starts ``stride - overlap`` sentences after the previous one,
    so consecutive windows share ``overlap`` sentences. Windows are produced
    for every start position below the sentence count, which means the final
    window can be shorter than ``stride`` and, when ``overlap > 0``, may
    contain only sentences that the previous window already covered.
    """

    def __init__(self,stride=2,overlap=0):
        """Store the window settings and load the ``en_core_web_sm`` pipeline.

        Args:
            stride: Number of sentences per chunk; must be at least 1.
            overlap: Number of sentences shared by consecutive chunks; must
                satisfy ``0 <= overlap < stride``.

        Raises:
            ValueError: If the settings would keep the window from advancing.
        """
        # Validate first so bad settings fail before the model is loaded.
        self._check_window(stride, overlap)
        self.stride=stride
        self.overlap=overlap
        self.nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def _check_window(stride, overlap):
        """Reject window settings for which the start index never advances."""
        if stride < 1:
            raise ValueError(f"stride must be at least 1, got {stride}")
        if not 0 <= overlap < stride:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < stride, got overlap={overlap}, stride={stride}"
            )

    def create_documents(self,text):
        """Split ``text`` into chunks of space-joined sentences.

        Args:
            text: The string to run through the spaCy pipeline.

        Returns:
            A list of strings, one per sentence window, in document order.
            Empty when no sentences are detected.

        Raises:
            ValueError: If ``stride``/``overlap`` were changed to values that
                would make the window step zero or negative.
        """
        # Re-check because the attributes are public and may have been mutated;
        # a non-positive step would otherwise loop forever.
        self._check_window(self.stride, self.overlap)
        step = self.stride - self.overlap

        sentences = [sent.text for sent in self.nlp(text).sents]
        return [
            ' '.join(sentences[start:start + self.stride])
            for start in range(0, len(sentences), step)
        ]
    
# Sample passage for the sentence-window tokenizer.
# NOTE: the last three literals are joined without a separating space, so the
# resulting string contains "exercise.Another" and "variety.Splitting".
text = (
    "Why did the scarecrow win an award? "
    "Because he was outstanding in his field! This is the example text for this fun exercise."
    "Another sentence to add more variety."
    "Splitting text can be fun and educational."
)

# Configuration I: windows of 2 sentences that do not overlap.
tokenizer1 = SpacySentenceTokenizer(overlap=0, stride=2)
documents1 = tokenizer1.create_documents(text)

# Configuration II: windows of 3 sentences, each sharing 1 sentence with the
# previous window.
tokenizer2 = SpacySentenceTokenizer(overlap=1, stride=3)
documents2 = tokenizer2.create_documents(text)

def print_documents(documents,config_number):
    """Print a ``config`` header and then every chunk with a 1-based label.

    Args:
        documents: Iterable of chunks; each one is formatted directly into
            the output line.
        config_number: Value shown in the header line.
    """
    print(f"config {config_number}")
    for offset, chunk in enumerate(documents):
        label = offset + 1  # enumerate is 0-based; labels start at 1
        print(f"Chunk{label}: {chunk}")

# Print both sentence-window configurations in order.
for config_number, config_chunks in ((1, documents1), (2, documents2)):
    print_documents(config_chunks, config_number)

config 1
Chunk1: Why did the scarecrow win an award? Because he was outstanding in his field!
Chunk2: This is the example text for this fun exercise. Another sentence to add more variety.
Chunk3: Splitting text can be fun and educational.
config 2
Chunk1: Why did the scarecrow win an award? Because he was outstanding in his field! This is the example text for this fun exercise.
Chunk2: This is the example text for this fun exercise. Another sentence to add more variety. Splitting text can be fun and educational.
Chunk3: Splitting text can be fun and educational.


In [2]:
from sentence_transformers import SentenceTransformer,util

class SimilarSentenceSplitter:
    """Group consecutive sentences whose embeddings are similar enough.

    Sentences are obtained by splitting on ``'. '``. Walking through them in
    order, a sentence joins the current group unless the group already holds
    ``group_max_sentences`` sentences or its cosine similarity with the
    previous sentence is below ``similarity_threshold``; in either case a new
    group is started.
    """

    def __init__(self,group_max_sentences=3,similarity_threshold=0.8):
        """Store the grouping settings and load the ``all-MiniLM-L6-v2`` model.

        Args:
            group_max_sentences: Maximum number of sentences per chunk.
            similarity_threshold: Minimum cosine similarity between adjacent
                sentences for them to stay in the same chunk.
        """
        self.group_max_sentences=group_max_sentences
        self.similarity_threshold=similarity_threshold
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    @staticmethod
    def _split_sentences(text):
        """Split ``text`` on ``'. '`` and restore the consumed periods.

        Every piece except the last lost its period to the split, so it gets
        one back. The last piece only gets a period when it does not already
        end with '.', '!' or '?', which avoids results such as "done..".
        Blank pieces are dropped.
        """
        pieces = text.split('. ')
        last_index = len(pieces) - 1
        sentences = []
        for index, piece in enumerate(pieces):
            piece = piece.strip()
            if not piece:
                continue
            if index < last_index or not piece.endswith(('.', '!', '?')):
                piece += '.'
            sentences.append(piece)
        return sentences

    def create_documents(self,text):
        """Split ``text`` into chunks of similar, consecutive sentences.

        Args:
            text: The string to split.

        Returns:
            A list of strings, each made of space-joined sentences. Empty when
            ``text`` contains no sentences.
        """
        sentences = self._split_sentences(text)
        if not sentences:
            # Nothing to embed; also avoids indexing into an empty list.
            return []

        embeddings=self.model.encode(sentences,convert_to_tensor=True)
        chunks,current_chunk=[],[sentences[0]]

        for i in range(1,len(sentences)):
            group_full = len(current_chunk)>=self.group_max_sentences
            # The similarity is only computed when the group still has room.
            if group_full or util.pytorch_cos_sim(embeddings[i-1],embeddings[i]).item()<self.similarity_threshold:
                chunks.append(' '.join(current_chunk))
                current_chunk=[sentences[i]]
            else:
                current_chunk.append(sentences[i])

        # current_chunk always holds at least one sentence at this point.
        chunks.append(' '.join(current_chunk))
        return chunks
    
    
# Sample passage for the similarity-based splitter.
# NOTE: the second and third literals are joined without a separating space,
# so the resulting string contains "variety.Splitting" and the '. ' split does
# not separate those two sentences.
text = (
    "Why did the scarecrow win an award? Because he was outstanding in his field! "
    "This is the example text for this fun exercise. Another sentence to add more variety."
    "Splitting text can be fun and educational. Here's another sentence. And one more to check the clustering."
)

# Config 1: up to 3 sentences per chunk, strict similarity threshold.
splitter1 = SimilarSentenceSplitter(
    similarity_threshold=0.8,
    group_max_sentences=3,
)
documents1 = splitter1.create_documents(text)

# Config 2: up to 2 sentences per chunk, lenient similarity threshold.
splitter2 = SimilarSentenceSplitter(
    similarity_threshold=0.2,
    group_max_sentences=2,
)
documents2 = splitter2.create_documents(text)


def print_documents(documents,config_number):
    """Write the configuration header, then one labelled line per chunk.

    Args:
        documents: Iterable of chunks; each is interpolated into its line.
        config_number: Identifier printed after ``config`` in the header.
    """
    header = f"config {config_number}"
    print(header)
    for number, text_chunk in enumerate(documents, start=1):
        line = f"Chunk{number}: {text_chunk}"
        print(line)

# Print the chunks of both similarity-splitter configurations in order.
for config_number, config_chunks in enumerate((documents1, documents2), start=1):
    print_documents(config_chunks, config_number)

config 1
Chunk1: Why did the scarecrow win an award? Because he was outstanding in his field! This is the example text for this fun exercise.
Chunk2: Another sentence to add more variety.Splitting text can be fun and educational.
Chunk3: Here's another sentence.
Chunk4: And one more to check the clustering..
config 2
Chunk1: Why did the scarecrow win an award? Because he was outstanding in his field! This is the example text for this fun exercise. Another sentence to add more variety.Splitting text can be fun and educational.
Chunk2: Here's another sentence.
Chunk3: And one more to check the clustering..
