In [1]:
from langchain_experimental.text_splitter import SemanticChunker # Imports the semantic chunker
from langchain_openai.embeddings import OpenAIEmbeddings # Imports OpenAI's embedding model
from dotenv import load_dotenv # For loading environment variables

# Load environment variables from a .env file. 🌍
# This is crucial for securely loading your API keys (e.g., OPENAI_API_KEY),
# which are required for OpenAIEmbeddings.
load_dotenv()

True

In [2]:
# Initialize the SemanticChunker. ✂️
# This splitter is different from character-based splitters because it uses
# embeddings to understand the meaning of the text.
text_splitter = SemanticChunker(
    OpenAIEmbeddings(), # The embedding model used to convert text into numerical vectors.
                        # SemanticChunker calculates embeddings for sentences/paragraphs
                        # to measure their semantic similarity.
    breakpoint_threshold_type="standard_deviation", # Determines how break points are identified.
                                                    # "standard_deviation" means it looks for
                                                    # points where the cosine similarity between
                                                    # consecutive sentences deviates significantly
                                                    # from the average.
    breakpoint_threshold_amount=3 # The multiplier for the standard deviation. A higher value
                                  # means it will be less sensitive to minor changes and
                                  # create larger, fewer chunks. A lower value (e.g., 1)
                                  # would create more, smaller chunks, being more sensitive
                                  # to subtle semantic shifts.
)

In [3]:
# Define the sample text to be split. 📄
# This text contains distinct topics: farming, cricket (IPL), and terrorism.
# The expectation is that SemanticChunker will identify these topic shifts.
sample = """
Farmers were working hard in the fields, preparing the soil and planting seeds for the next season. The sun was bright, and the air smelled of earth and fresh grass. The Indian Premier League (IPL) is the biggest cricket league in the world. People all over the world watch the matches and cheer for their favourite teams.
Terrorism is a big danger to peace and safety. It causes harm to people and creates fear in cities and villages. When such attacks happen, they leave behind pain and sadness. To fight terrorism, we need strong laws, alert security forces, and support from people who care about peace and safety.
"""

In [4]:
# Create documents (chunks) from the sample text. 📝
# The `create_documents` method processes the input text, calculates embeddings,
# identifies semantic breakpoints based on the configured threshold, and returns
# a list of `Document` objects, where each document is a semantically coherent chunk.
docs = text_splitter.create_documents([sample])

In [5]:
# Print the number of chunks created. 📏
# This will show how many distinct semantic segments the text was broken into.
print(f"Number of chunks: {len(docs)}")

Number of chunks: 1


In [6]:
# Print the content of each generated chunk. 📊
# You'll notice that the chunks ideally separate the different topics present in the `sample` text.
print("Generated Chunks:")
for i, doc in enumerate(docs):
    print(f"--- Chunk {i+1} ---")
    print(doc.page_content)
    print("-------------------")

Generated Chunks:
--- Chunk 1 ---

Farmers were working hard in the fields, preparing the soil and planting seeds for the next season. The sun was bright, and the air smelled of earth and fresh grass. The Indian Premier League (IPL) is the biggest cricket league in the world. People all over the world watch the matches and cheer for their favourite teams. Terrorism is a big danger to peace and safety. It causes harm to people and creates fear in cities and villages. When such attacks happen, they leave behind pain and sadness. To fight terrorism, we need strong laws, alert security forces, and support from people who care about peace and safety. 
-------------------
