In [1]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
from rich import print
from rich.console import Console

console = Console(width=140)

In [3]:
# file_path = "boericke_full_text.txt"

# with open(file_path, "r", encoding="utf-8") as f:
#     text = f.read()

# print(text[:100])

text="""MEDUSA
Jelly-fish
Whole face puffed œdematous-eyes, nose, ears, lips.
Skin.––Numbness; burning, pricking heat. Vesicular eruption especially on face, arms, shoulders, and breasts. Nettlerash (Apis; Chloral; Dulc).
Female.––Marked action on lacteal glands. The secretion of milk was established after lack of it in all previous confinements.
Relationship.––Compare: Pyrarara, Physalia (urticaria); Urtica, Homar, Sep.

MEL CUM SALE
Honey with Salt
Prolapsus uteri and chronic metritis, especially when associated with subinvolution and inflammation of the cervix. The special symptom leading to its selection is a feeling of soreness across the hypogastrium from ileum to ileum.
Uterine displacements, and in the commencement of metritis Sensation as if bladder were too full. Pain from sacrum towards pubes. Pain as if in ureters.
Dose.––Third to sixth potency. Honey for itching of anus and worms.

METHYLENUM COERULEUM
Aniline Dye
Methylene Blue
A remedy for neuralgia, neurasthenia, malaria; typhoid, here it diminishes the tympanites, delirium, and fever; pus infection. Tendency to tremor, chorea and epilepsy. Nephritis (acute parenchymatous), scarlatinal nephritis. Urine acquires a green color. Bladder irritation from its use antidoted by a little nutmeg.
Surgical kidney with large amount of pus in urine. Gonorrhœal rheumatism and cystitis. Backache, sciatica. Later states of apoplexy (Gisevius).
Dose.––3x attenuation. A 2 per cent solution locally, in chronic otitis with foul smelling discharge.
A 1 per cent aqueous solution for ulcers and abscesses of cornea."""

# General Models

In [4]:
all_mini_lm_l6_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs = {"device": "mps"}
)

In [5]:
baai_bge_small_embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs = {"device": "mps"}
)

In [6]:
jina_ai_small_embeddings = HuggingFaceEmbeddings(
    model_name="jinaai/jina-embeddings-v2-small-en",
    model_kwargs = {"device": "mps"}
)

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

### Semantic Chunking

In [7]:
from time import time

def semantic_chunking(text,embeddings,breakpoint_threshold_type,breakpoint_threshold_amount,n=10):

    start = time()
    text_splitter = SemanticChunker(
        embeddings=embeddings,
        breakpoint_threshold_type=breakpoint_threshold_type,  
        breakpoint_threshold_amount=breakpoint_threshold_amount,          
    )
    chunks = text_splitter.create_documents([text])
    end = time()
    console.print(f"Time taken for chunking: {(end - start):.3f} s")
    console.print(f"No of chunks: {len(chunks)}")

    n = min(n,len(chunks))

    console.print(f"First {n} chunks with {embeddings.model_name}")

    

    for i, chunk in enumerate(chunks[:n]):
        print(f"\n--- Chunk {i+1} ---")
        console.print(chunk.page_content)

## MiniLM

In [8]:
semantic_chunking(text,all_mini_lm_l6_embeddings,"percentile",90)

In [9]:
semantic_chunking(text,all_mini_lm_l6_embeddings,"standard_deviation",1.3)

## Jina AI

In [10]:
semantic_chunking(text,jina_ai_small_embeddings,"percentile",90)



In [11]:
semantic_chunking(text,jina_ai_small_embeddings,"standard_deviation",1.3)

## BAAI

In [12]:
semantic_chunking(text,baai_bge_small_embeddings,"percentile",90)

In [13]:
semantic_chunking(text,baai_bge_small_embeddings,"standard_deviation",1.3)

### Split Each Medicine and then Make Chunks

In [14]:
import re
from typing import List

def split_by_caps_headers(text: str) -> List[str]:
    """
    Splits text into sections whenever an ALL-CAPS medicine name appears.
    """

    # Normalize newlines
    text = re.sub(r"\r\n", "\n", text)

    # Regex: line with mostly capital letters and spaces
    header_pattern = re.compile(
        r"\n(?=[A-Z][A-Z\s\-]{3,}\n)"
    )

    sections = header_pattern.split("\n" + text)

    # Clean and return
    return [s.strip() for s in sections if s.strip()]


In [15]:
docs = split_by_caps_headers(text)

console.print(f"No of docs: {len(docs)}")

for i,doc in enumerate(docs[:3]):
    console.print(f"Doc No: {i+1}")
    console.print(doc)

In [16]:
embeddings = all_mini_lm_l6_embeddings

breakpoint_threshold_type = "percentile"  
breakpoint_threshold_amount = 90          

text_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type=breakpoint_threshold_type,
    breakpoint_threshold_amount=breakpoint_threshold_amount,
)

In [17]:

lc_documents = []

for i, doc in enumerate(docs):
    lines = doc.splitlines()
    medicine_name = lines[0].strip() if lines else "Unknown"

    semantic_docs = text_splitter.create_documents([doc])

    for d in semantic_docs:
        d.metadata["medicine"] = medicine_name
        lc_documents.append(d)

print(f"Total semantic chunks created: {len(lc_documents)}")

# Optional: preview first few chunks
for i, d in enumerate(lc_documents[:10]):
    print(f"\n--- Chunk {i+1} ---")
    print("Medicine:", d.metadata["medicine"])
    print(d.page_content)
