In [1]:
# Sample corpus for the chunking demo: four short paragraphs on unrelated
# topics (AI in healthcare, climate change, Python, space exploration),
# separated by blank lines. Passed to semantic_chunk_with_size_langchain below.
text = """
Artificial intelligence is revolutionizing the healthcare industry. Machine learning algorithms 
can now analyze medical images with remarkable accuracy. Doctors are leveraging AI tools to 
make faster and more precise diagnoses. This technological advancement is saving countless lives.

Climate change poses a significant threat to our planet. Rising global temperatures are causing 
polar ice caps to melt at an alarming rate. Scientists worldwide are calling for immediate action 
to reduce carbon emissions. The consequences of inaction could be catastrophic for future generations.

Python programming has become increasingly popular in recent years. Its simple syntax and vast 
library ecosystem make it ideal for beginners and experts alike. Data scientists particularly 
favor Python for machine learning and data analysis tasks. The language continues to evolve with 
regular updates and improvements.

Space exploration has entered a new era with private companies. SpaceX and Blue Origin are making 
space travel more accessible. Mars colonization is no longer just science fiction but a realistic 
goal. The next decade promises exciting developments in aerospace technology.
"""

In [3]:
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
# Notebook-level embedding model (BAAI/bge-base-en-v1.5 via FastEmbed).
# semantic_chunk_with_size_langchain reads this global and hands it to
# SemanticChunker. This cell's output shows the model files being fetched.
embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

model_optimized.onnx:   0%|          | 0.00/218M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [13]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Hybrid approach: Semantic first, then size-based refinement
def semantic_chunk_with_size_langchain(text, max_size, overlap, embeddings=None):
    """
    Chunk ``text`` semantically, then re-split any chunk longer than ``max_size``.

    Args:
        text: Raw text to chunk.
        max_size: Target maximum chunk length in characters (measured with
            ``len``). Must be positive.
        overlap: Number of characters shared between neighbouring pieces when
            an oversized semantic chunk is re-split. Overlap is applied only
            among pieces of the same semantic chunk, never across two of them.
            Must satisfy ``0 <= overlap <= max_size``.
        embeddings: Embedding model passed to ``SemanticChunker``. When
            omitted, the notebook-level ``embed_model`` is used, so existing
            three-argument calls keep working.

    Returns:
        A list of chunk strings in document order. Semantic chunks that
        already fit are stripped of surrounding whitespace and dropped if
        empty; oversized ones are replaced by whatever pieces the size
        splitter returns for them. Blank input yields ``[]``.

    Raises:
        ValueError: If ``max_size <= 0``, ``overlap < 0`` or
            ``overlap > max_size``.
    """
    # Validate up front so a bad size fails before the embedding pass rather
    # than afterwards, when the size splitter is constructed.
    if max_size <= 0:
        raise ValueError(f"max_size must be > 0, got {max_size}")
    if overlap < 0:
        raise ValueError(f"overlap must be >= 0, got {overlap}")
    if overlap > max_size:
        raise ValueError(
            f"overlap ({overlap}) must not exceed max_size ({max_size})"
        )

    # Nothing to embed or split; avoid returning a lone empty/blank chunk.
    if not text or not text.strip():
        return []

    # Resolve the default lazily so the global is only required when the
    # caller does not supply a model.
    if embeddings is None:
        embeddings = embed_model

    # Step 1: Semantic chunking
    semantic_splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=95
    )
    semantic_chunks = semantic_splitter.create_documents([text])

    # Step 2: Further split large chunks
    size_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_size,
        chunk_overlap=overlap,
        length_function=len,
    )

    final_chunks = []
    for chunk in semantic_chunks:
        content = chunk.page_content
        if len(content) > max_size:
            # Split oversized chunks
            final_chunks.extend(size_splitter.split_text(content))
        else:
            # Strip pass-through chunks so they are formatted like the
            # re-split pieces (the size splitter strips whitespace by
            # default), and skip ones that are only whitespace.
            stripped = content.strip()
            if stripped:
                final_chunks.append(stripped)

    return final_chunks

# Usage


In [16]:
# Chunk the sample text: semantic chunks longer than 100 characters are
# re-split into pieces of at most ~100 characters, with a 20-character
# overlap between pieces.
chunks = semantic_chunk_with_size_langchain(text, max_size=100, overlap=20)

In [15]:
# Display the resulting list of chunk strings.
chunks

['Artificial intelligence is revolutionizing the healthcare industry. Machine learning algorithms',
 'can now analyze medical images with remarkable accuracy. Doctors are leveraging AI tools to',
 'make faster and more precise diagnoses. This technological advancement is saving countless lives.',
 'countless lives. Climate change poses a significant threat to our planet. Rising global',
 'Rising global temperatures are causing',
 'polar ice caps to melt at an alarming rate. Scientists worldwide are calling for immediate action',
 'to reduce carbon emissions. The consequences of inaction could be catastrophic for future',
 'for future generations. Python programming has become increasingly popular in recent years. Its',
 'recent years. Its simple syntax and vast',
 'library ecosystem make it ideal for beginners and experts alike. Data scientists particularly',
 'favor Python for machine learning and data analysis tasks. The language continues to evolve with',
 'regular updates and impro