In [1]:
Document_A = """ Q: What is the return policy?
A: Items can be returned within 30 days of purchase with original receipt.

Q: Do you offer international shipping?
A: Yes, we ship to over 50 countries worldwide. Shipping times vary by location.

Q: How do I track my order?
A: Use the tracking number sent to your email after shipment.
"""
Document_B = """ Installation Guide

Step 1: Download the installer from our website.
Extract the zip file to your desired location.

Step 2: Run setup.exe as administrator.
Follow the on-screen instructions.

Step 3: Configure your API key in the settings file.
The settings file is located at config/settings.json.
"""
Document_C = """ The Future of Renewable Energy

Solar and wind power have seen tremendous growth in recent years. As technology improves
and costs decrease, renewable energy becomes increasingly competitive with fossil fuels.

Energy storage solutions are critical for renewable adoption. Battery technology advances
enable better grid management and reliability. This addresses the intermittent nature of
solar and wind power.

Policy support and public awareness continue to drive the transition. Many countries have
set ambitious renewable energy targets for the coming decades.
"""

In [4]:
def chunk_by_paragraphs(text, min_chunk_size=50):
    """
    Split text by paragraphs (double newlines).
    For FAQs, we want to keep Q&A pairs together but separate from other pairs.
    """
    # Split by double newlines to isolate Q&A pairs
    paragraphs = text.split('\n\n')
    
    chunks = []
    current_chunk = ""
    
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
            
        # LOGIC: Check if adding this paragraph helps meet the minimum size
        # OR if we should just keep it separate (to preserve FAQ boundaries)
        if len(current_chunk) + len(para) < min_chunk_size:
            current_chunk += "\n\n" + para if current_chunk else para
        else:
            # If we have a chunk ready, save it
            if current_chunk:
                chunks.append(current_chunk.strip())
            # Start the new chunk with the current paragraph
            current_chunk = para
    
    # Save the final chunk if anything is left
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# 3. Test it
# We use a small min_chunk_size (50) to ensure we don't accidentally merge two different questions.
chunks = chunk_by_paragraphs(Document_A, min_chunk_size=50)

print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i} ({len(chunk)} chars):")
    print(chunk)
    print("-" * 80)

Number of chunks: 3

Chunk 1 (104 chars):
Q: What is the return policy?
A: Items can be returned within 30 days of purchase with original receipt.
--------------------------------------------------------------------------------
Chunk 2 (120 chars):
Q: Do you offer international shipping?
A: Yes, we ship to over 50 countries worldwide. Shipping times vary by location.
--------------------------------------------------------------------------------
Chunk 3 (89 chars):
Q: How do I track my order?
A: Use the tracking number sent to your email after shipment.
--------------------------------------------------------------------------------


In [13]:
def chunk_by_paragraphs(text, min_chunk_size=100):
    """
    Split text by paragraphs (double newlines).
    Args:
        text: The text to chunk
        min_chunk_size: Minimum characters per chunk (combine small paragraphs)
    Returns:
        List of text chunks
    """
    # Split by double newlines (paragraph separator)
    paragraphs = text.split('\n\n')
    
    chunks = []
    current_chunk = ""
    
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
            
        # LOGIC: Check if the paragraph is small. 
        # If it's small (<100 chars), append it to the current buffer to avoid orphan lines.
        # If it's large (>=100 chars), flush the buffer and start a new chunk.
        if len(para) < min_chunk_size:
            current_chunk += "\n\n" + para if current_chunk else para
        else:
            # Save previous chunk if exists
            if current_chunk:
                chunks.append(current_chunk.strip())
            # Start new chunk with this paragraph
            current_chunk = para
    
    # Don't forget the last chunk
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# 3. Test it
chunks = chunk_by_paragraphs(Document_B, min_chunk_size=50)

print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i} ({len(chunk)} chars):")
    print(chunk)
    print("-" * 80)

Number of chunks: 4

Chunk 1 (18 chars):
Installation Guide
--------------------------------------------------------------------------------
Chunk 2 (95 chars):
Step 1: Download the installer from our website.
Extract the zip file to your desired location.
--------------------------------------------------------------------------------
Chunk 3 (74 chars):
Step 2: Run setup.exe as administrator.
Follow the on-screen instructions.
--------------------------------------------------------------------------------
Chunk 4 (106 chars):
Step 3: Configure your API key in the settings file.
The settings file is located at config/settings.json.
--------------------------------------------------------------------------------


In [5]:
def chunk_by_sentences(text, max_chunk_size=500):
    """
    Split text into chunks by sentences, keeping sentences intact.
    
    Args:
        text: The text to chunk
        max_chunk_size: Maximum characters per chunk
    
    Returns:
        List of text chunks
    """
    # Simple sentence splitting (split on . ! ?)
    import re
    sentences = re.split(r'(?<=[.!?])\s+', text)
    
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        # Check if adding this sentence would exceed max size
        if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
            # Save current chunk and start new one
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            # Add sentence to current chunk
            current_chunk += " " + sentence if current_chunk else sentence
    
    # Don't forget the last chunk
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# Test it
chunks = chunk_by_sentences(Document_C, max_chunk_size=400)

print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i} ({len(chunk)} chars):")
    print(chunk)
    print("-" * 80)

Number of chunks: 2

Chunk 1 (346 chars):
The Future of Renewable Energy

Solar and wind power have seen tremendous growth in recent years. As technology improves
and costs decrease, renewable energy becomes increasingly competitive with fossil fuels. Energy storage solutions are critical for renewable adoption. Battery technology advances
enable better grid management and reliability.
--------------------------------------------------------------------------------
Chunk 2 (216 chars):
This addresses the intermittent nature of
solar and wind power. Policy support and public awareness continue to drive the transition. Many countries have
set ambitious renewable energy targets for the coming decades.
--------------------------------------------------------------------------------


### Answers

```python
# Document A: FAQ
strategy_A = Paragraphy chunking
reason_A = Paragraph splits ensure the Q and A stay locked together
chunks_A = ?      # Your implementation

# Document B: Technical Documentation
strategy_B = "?"
reason_B = Due to orderliness here is important hence need for chunking together
chunks_B = ?

# Document C: Article
strategy_C = Sentence Chunking
reason_C = sentence chunking is the best for large document texts
chunks_C = ?
```
