<a href="https://colab.research.google.com/github/PassionateAbdullah/RAG-LLM-Langchain/blob/main/Semantic_chunking_with_diff_method_and_thrashold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Method 1: Percentile Method
# Runs compute_breakpoints with method="percentile" at four thresholds
# (95, 90, 80, 70) and reports the number and positions of the breakpoints.
# NOTE(review): the notebook intends 95 to be the most conservative setting
# (fewest breakpoints) and 70 the most aggressive. Whether that holds depends
# on how compute_breakpoints compares similarities against the percentile
# value, which is defined in another cell — confirm.
print("=== PERCENTILE METHOD ===")

# threshold=95 (intended: conservative)
breakpoints_percentile_95 = compute_breakpoints(
    similarities, method="percentile", threshold=95
)
print(
    f"Percentile 95% threshold: {len(breakpoints_percentile_95)} breakpoints",
    f"Breakpoints: {breakpoints_percentile_95}",
    sep="\n",
)

# threshold=90 (intended: moderate); reused by the comparison cells below
breakpoints_percentile_90 = compute_breakpoints(
    similarities, method="percentile", threshold=90
)
print(
    f"Percentile 90% threshold: {len(breakpoints_percentile_90)} breakpoints",
    f"Breakpoints: {breakpoints_percentile_90}",
    sep="\n",
)

# threshold=80 (intended: aggressive); reused by the comparison cell below
breakpoints_percentile_80 = compute_breakpoints(
    similarities, method="percentile", threshold=80
)
print(
    f"Percentile 80% threshold: {len(breakpoints_percentile_80)} breakpoints",
    f"Breakpoints: {breakpoints_percentile_80}",
    sep="\n",
)

# threshold=70 (intended: very aggressive)
breakpoints_percentile_70 = compute_breakpoints(
    similarities, method="percentile", threshold=70
)
print(
    f"Percentile 70% threshold: {len(breakpoints_percentile_70)} breakpoints",
    f"Breakpoints: {breakpoints_percentile_70}",
    sep="\n",
)

# Visual separator between method sections.
print("\n" + "=" * 50 + "\n")


In [None]:

# Method 2: Standard Deviation Method
# The threshold is a multiplier X: a breakpoint is reported where similarity
# falls X standard deviations below the mean (as described in this notebook;
# compute_breakpoints itself is defined in another cell).
# NOTE(review): under that rule a larger X pushes the cut-off further below
# the mean, so it should yield FEWER breakpoints, not more. That would make
# threshold=1 the most sensitive run and threshold=2.5 the least sensitive,
# the reverse of the "conservative -> very aggressive" labels this cell
# originally carried. Confirm against compute_breakpoints.

print("=== STANDARD DEVIATION METHOD ===")

# X = 1 (originally labelled "conservative"; see the review note above)
breakpoints_std_1 = compute_breakpoints(
    similarities, method="standard_deviation", threshold=1
)
print(
    f"1 standard deviation threshold: {len(breakpoints_std_1)} breakpoints",
    f"Breakpoints: {breakpoints_std_1}",
    sep="\n",
)

# X = 1.5 (originally labelled "moderate"); reused by the comparison cells below
breakpoints_std_1_5 = compute_breakpoints(
    similarities, method="standard_deviation", threshold=1.5
)
print(
    f"1.5 standard deviation threshold: {len(breakpoints_std_1_5)} breakpoints",
    f"Breakpoints: {breakpoints_std_1_5}",
    sep="\n",
)

# X = 2 (originally labelled "aggressive"); reused by the comparison cell below
breakpoints_std_2 = compute_breakpoints(
    similarities, method="standard_deviation", threshold=2
)
print(
    f"2 standard deviation threshold: {len(breakpoints_std_2)} breakpoints",
    f"Breakpoints: {breakpoints_std_2}",
    sep="\n",
)

# X = 2.5 (originally labelled "very aggressive"; see the review note above)
breakpoints_std_2_5 = compute_breakpoints(
    similarities, method="standard_deviation", threshold=2.5
)
print(
    f"2.5 standard deviation threshold: {len(breakpoints_std_2_5)} breakpoints",
    f"Breakpoints: {breakpoints_std_2_5}",
    sep="\n",
)

# Visual separator between method sections.
print("\n" + "=" * 50 + "\n")

In [None]:
# Method 3: Interquartile Range (IQR) Method
# Described in this notebook as the IQR outlier rule: the cut-off is
# Q1 - 1.5 * IQR, with the 1.5 multiplier fixed rather than taken from the
# `threshold` argument.
# NOTE(review): the threshold value passed below is therefore presumably
# ignored for this method, and using a different multiplier would mean editing
# compute_breakpoints (defined in another cell) — confirm.

print("=== INTERQUARTILE RANGE METHOD ===")

# threshold=90 is passed only to mirror the other calls; see the note above.
breakpoints_iqr = compute_breakpoints(
    similarities, method="interquartile", threshold=90
)
print(
    f"Standard IQR method: {len(breakpoints_iqr)} breakpoints",
    f"Breakpoints: {breakpoints_iqr}",
    sep="\n",
)

# Visual separator between method sections.
print("\n" + "=" * 50 + "\n")

In [None]:

# Side-by-side summary of breakpoint counts from the earlier cells.
# Only a subset of the runs is listed: percentile 90/80, standard deviation
# 1.5/2 and the IQR run. Requires those cells to have been executed first.
print(
    "=== COMPARISON OF ALL METHODS ===",
    f"Percentile 90%:        {len(breakpoints_percentile_90)} breakpoints",
    f"Percentile 80%:        {len(breakpoints_percentile_80)} breakpoints",
    f"Standard Dev 1.5:      {len(breakpoints_std_1_5)} breakpoints",
    f"Standard Dev 2.0:      {len(breakpoints_std_2)} breakpoints",
    f"IQR method:            {len(breakpoints_iqr)} breakpoints",
    sep="\n",  # one line per entry, same output as separate print calls
)

In [None]:

# Build chunks from three of the breakpoint sets computed earlier and print a
# short preview of each chunk: the first 100 items followed by "..." when the
# chunk is longer than 100, otherwise the whole chunk.
# Requires sentences, split_into_chunks and the breakpoint lists from the
# earlier cells.
print("\n=== CHUNK COMPARISON ===")

# --- Percentile, threshold 90 ---
chunks_percentile_90 = split_into_chunks(sentences, breakpoints_percentile_90)
print(f"\nPercentile 90% - {len(chunks_percentile_90)} chunks:")
for i, chunk in enumerate(chunks_percentile_90):
    if len(chunk) > 100:
        print(f"  Chunk {i+1}: {chunk[:100]}...")
    else:
        print(f"  Chunk {i+1}: {chunk}")

# --- Standard deviation, threshold 1.5 ---
chunks_std_1_5 = split_into_chunks(sentences, breakpoints_std_1_5)
print(f"\nStandard Dev 1.5 - {len(chunks_std_1_5)} chunks:")
for i, chunk in enumerate(chunks_std_1_5):
    if len(chunk) > 100:
        print(f"  Chunk {i+1}: {chunk[:100]}...")
    else:
        print(f"  Chunk {i+1}: {chunk}")

# --- Interquartile range ---
chunks_iqr = split_into_chunks(sentences, breakpoints_iqr)
print(f"\nIQR method - {len(chunks_iqr)} chunks:")
for i, chunk in enumerate(chunks_iqr):
    if len(chunk) > 100:
        print(f"  Chunk {i+1}: {chunk[:100]}...")
    else:
        print(f"  Chunk {i+1}: {chunk}")