In [None]:
import os
from typing import List


# Point the Hugging Face cache (HF_HOME) at a folder inside the current working directory.
# NOTE(review): this is set before the Hugging Face-related imports below, presumably so
# those libraries pick up the cache location when they are imported — confirm.
hf_model_cache = os.path.join(os.getcwd(), '.hg_model_cache')
os.environ['HF_HOME'] = hf_model_cache

import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from BioClinicalBERTEmbeddings import BioClinicalBERTEmbeddings
from langchain_openai import AzureOpenAIEmbeddings

In [None]:
# Load the notes table and the per-patient label table from CSV.
notes = pd.read_csv('data/2025_ADRD_case_finding/data/adrd_study_700_notes_all.csv')
label = pd.read_csv('data/2025_ADRD_case_finding/data/adrd_study_700_label.csv')

# Keep only the label rows whose rand_ind is strictly greater than 33.
has_label = label.loc[label['rand_ind'].gt(33)]
# rule_based_chunk = pd.read_csv('rule_based_33toEnd.csv', header = None, names=['report_number', 'section_name', 'content'])

In [None]:
# Number of label rows that passed the rand_ind filter.
has_label.shape[0]

# Before Chunking

Add each of a patient's doctor notes as a separate document, along with the patient's metadata.

For raw documents

In [None]:
# Build one Document per note for every patient in `has_label`, attaching
# patient-level (dob, empi) and note-level fields as metadata.
documents = []

# The row variable is called `label_row` rather than `label` so that it does not
# overwrite the `label` DataFrame loaded from CSV, which is indexed again later.
for _, label_row in has_label.iterrows():
    patient_notes = notes[notes['empi'] == label_row['empi']]
    # Some patients have ADRD labels but no notes; for them `patient_notes` is
    # empty and the inner loop simply does not run.
    for _, note in patient_notes.iterrows():
        content = note['notetxt']
        metadata = {
            'dob': label_row['dob'],
            'empi': label_row['empi'],
            'notetype': note['notetype'],
            'report_date': note['report_date'],
            'report_description': note['report_description'],
            'report_number': note['report_number'],
        }
        documents.append(Document(page_content=content, metadata=metadata))

print("Documents done")
print(len(documents))

# Splitter settings shared by the RecursiveCharacterTextSplitter cells below.
chunk_size = 600
chunk_overlap = 100

In [None]:
from langchain_chroma import Chroma

# Alternative embedding backend, kept for reference:
# embedding_model = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2', model_kwargs={'device': 'cuda'}, encode_kwargs={'batch_size': 512, 'normalize_embeddings': False})
embedding_model = BioClinicalBERTEmbeddings(device='cuda')

# Embed `documents` with the model above and store them in a Chroma collection
# under the given persist directory.
# NOTE(review): the directory name mentions "rule_based", but `documents` at this
# point holds the raw notes built in the previous cell — confirm the name is intended.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory="chroma_bioclinicalbert_rule_based_33toEnd",
)

# Chunk using `RecursiveCharacterTextSplitter`

Use the default separators and print the chunking results.

In [None]:
import time

# Recursive character splitter with no explicit `separators`, so the library's
# defaults apply. Chunk length is measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    add_start_index=True,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Split every document and report how many chunks came out.
chunks_1 = text_splitter.split_documents(documents)
print(f"{len(documents)} documents split into {len(chunks_1)} chunks.\n")


# Use different separators

In [None]:
import time

# Same splitter settings as before, but with an explicit separator list:
# CRLF runs first (three, two, one), then '.', ',', ' ', and finally ''.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\r\n\r\n\r\n', '\r\n\r\n', '\r\n', '.', ',', ' ', ''],
    length_function=len,
    add_start_index=True,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Time only the split itself.
start = time.perf_counter()
chunks_2 = text_splitter.split_documents(documents)
end = time.perf_counter()

print(f"{len(documents)} documents split into {len(chunks_2)} chunks.\n")
print(f"Splitting took {end-start} seconds.")

# embedding_model = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2', model_kwargs={'device': 'cuda'}, encode_kwargs={'batch_size': 512, 'normalize_embeddings': False})
#
# from langchain_chroma import Chroma
# vectorstore = Chroma.from_documents(
#     chunks_2,
#     embedding_model,
#     persist_directory=f'chroma_mpnet_{chunk_size}_{chunk_overlap}_33toEnd_separators'
# )

# Semantic Chunker

https://python.langchain.com/api_reference/experimental/text_splitter/langchain_experimental.text_splitter.SemanticChunker.html#langchain_experimental.text_splitter.SemanticChunker.split_documents

In [None]:

# Sentence-transformer embeddings used by the semantic chunker.
embedding_model = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2', model_kwargs={'device': 'cuda'}, encode_kwargs={'batch_size': 256, 'normalize_embeddings': False})

text_splitter = SemanticChunker(embeddings=embedding_model, buffer_size=2)

chunks_3 = text_splitter.split_documents(documents)

# The message now states 5, matching the [:5] slice that is actually printed
# (it previously said 3).
print(f"{len(documents)} documents split into {len(chunks_3)} chunks. Showing first 5 chunks:\n")
print(chunks_3[:5])

In [None]:


# Same semantic chunking as above, using the PubMedBERT-based embedding model
# and the chunker's default settings.
embedding_model = HuggingFaceEmbeddings(model_name='neuml/pubmedbert-base-embeddings', model_kwargs={'device': 'cuda'}, encode_kwargs={'batch_size': 256, 'normalize_embeddings': False})

text_splitter = SemanticChunker(embeddings=embedding_model)

chunks_4 = text_splitter.split_documents(documents)

# The message now states 5, matching the [:5] slice that is actually printed
# (it previously said 3).
print(f"{len(documents)} documents split into {len(chunks_4)} chunks. Showing first 5 chunks:\n")
print(chunks_4[:5])

# Different sentence splitting

In [None]:
import re
from typing import List

class CustomSemanticChunker(SemanticChunker):
    """SemanticChunker variant whose chunks are re-joined with CRLF.

    ``split_text`` splits the input with ``self.sentence_split_regex``, scores
    the distance between neighbouring pieces, cuts wherever the score exceeds
    the computed threshold, and joins the pieces of each chunk with ``"\\r\\n"``.
    """

    def __init__(self, **kwargs):
        # Keyword-only pass-through to the parent constructor.
        super().__init__(**kwargs)

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into chunks of consecutive pieces.

        Args:
            text: The string to split.

        Returns:
            A list of chunk strings. When the regex yields a single piece (or
            two pieces with the "gradient" threshold type), the pieces are
            returned as they are.
        """
        pieces = re.split(self.sentence_split_regex, text)

        # A single piece would make the percentile computation fail, and two
        # pieces would make the gradient computation fail, so return early.
        if len(pieces) == 1:
            return pieces
        if self.breakpoint_threshold_type == "gradient" and len(pieces) == 2:
            return pieces

        distances, sentences = self._calculate_sentence_distances(pieces)

        if self.number_of_chunks is None:
            threshold, scores = self._calculate_breakpoint_threshold(distances)
        else:
            threshold = self._threshold_from_clusters(distances)
            scores = distances

        # Positions whose score exceeds the threshold end a chunk.
        cut_positions = [pos for pos, score in enumerate(scores) if score > threshold]

        chunks = []
        group_start = 0
        for cut in cut_positions:
            merged = "\r\n".join(
                entry["sentence"] for entry in sentences[group_start : cut + 1]
            )
            # When a minimum size is set, an undersized group is not emitted
            # and group_start is left alone, so it is absorbed by the next cut.
            undersized = (
                self.min_chunk_size is not None
                and len(merged) < self.min_chunk_size
            )
            if undersized:
                continue
            chunks.append(merged)
            group_start = cut + 1

        # Whatever remains after the last cut becomes the final chunk.
        if group_start < len(sentences):
            tail = "\r\n".join(entry["sentence"] for entry in sentences[group_start:])
            chunks.append(tail)
        return chunks

# Embedding model handed to the custom semantic chunker.
embedding_model = HuggingFaceEmbeddings(
    model_name='all-mpnet-base-v2',
    model_kwargs={'device': 'cuda'},
    encode_kwargs={'batch_size': 512, 'normalize_embeddings': False},
)

# Split on CRLF instead of sentence punctuation, with buffer_size=3.
text_splitter = CustomSemanticChunker(
    buffer_size=3,
    sentence_split_regex=r"\r\n",
    embeddings=embedding_model,
)

# Keep only documents whose text is non-empty after stripping whitespace.
filtered_docs = list(filter(lambda doc: doc.page_content.strip(), documents))

print("Start splitting now")
start = time.perf_counter()
chunks_5 = text_splitter.split_documents(filtered_docs)
end = time.perf_counter()

print(f"{len(filtered_docs)} documents split into {len(chunks_5)} chunks.\n")
print(f"Splitting took {end-start} seconds.")
# from langchain_chroma import Chroma
# vectorstore = Chroma.from_documents(
#     chunks_5,
#     embedding_model,
#     persist_directory=f'chroma_text-embedding-3-large_rn_separators_semantic_300toEnd'
# )
# print(f"Created Chroma vector store with {vectorstore._collection.count()} embeddings.")

# Section-based chunking

In [None]:
import time

# Pre-computed sections, one row per (report_number, section_name, content).
# The CSV has no header row, so column names are supplied here.
rule_based_chunk = pd.read_csv('rule_based_33toEnd.csv', header = None, names=['report_number', 'section_name', 'content'])


start_time = time.perf_counter()

# Rule based chunks save to ChromaDB
# NOTE: this rebinds `documents`; the per-note documents built earlier are no
# longer reachable under that name after this cell runs.
documents = []


# The row variable is called `label_row` rather than `label` so that it does not
# overwrite the `label` DataFrame loaded from CSV, which is indexed again later.
for _, label_row in has_label.iterrows():
    patient_notes = notes[notes['empi'] == label_row['empi']]
    if len(patient_notes) > 0:
        print(label_row['empi'])
        for _, note in patient_notes.iterrows():
            note_id = note['report_number']
            # One Document per section row whose report_number matches this note.
            for _, section in rule_based_chunk[rule_based_chunk['report_number'] == note_id].iterrows():
                metadata = {
                    'dob': label_row['dob'],
                    'empi': label_row['empi'],
                    'notetype': note['notetype'],
                    'report_date': note['report_date'],
                    'report_description': note['report_description'],
                    'report_number': note['report_number'],
                    'section_name': section['section_name']
                }
                documents.append(Document(page_content=section['content'], metadata=metadata))

print(f"Loading documents took {time.perf_counter() - start_time} seconds.")
print(len(documents))

In [None]:
# Timer start for this cell; read below when accumulating total_time.
start_time = time.perf_counter()
# NOTE(review): no arguments are passed here; presumably the credentials and
# endpoint are expected to come from the environment — confirm, and keep keys
# out of the notebook itself.
embedding_model = AzureOpenAIEmbeddings(
    # api key here
)

# Number of documents per batch produced by batch_documents().
batch_size = 50
# Pause, in seconds, passed to time.sleep() between batches.
delay_in_seconds = 1

def batch_documents(docs, size):
    """Lazily yield consecutive slices of ``docs``, each at most ``size`` long.

    Args:
        docs: A sliceable sequence supporting ``len()``.
        size: Step between slice starts, and the maximum slice length.

    Yields:
        ``docs[k:k + size]`` for ``k`` in ``0, size, 2*size, ...``; the final
        slice may be shorter.
    """
    yield from (docs[lo:lo + size] for lo in range(0, len(docs), size))

# Slice the recursive-splitter chunks (chunks_2) into batches of `batch_size`.
document_batches = list(batch_documents(chunks_2, batch_size))
if not document_batches:
    # Fail with a clear message instead of an IndexError on document_batches[0].
    raise ValueError("chunks_2 is empty; there is nothing to embed")

from langchain_chroma import Chroma

# NOTE(review): hard-coded starting value — presumably elapsed seconds carried
# over from an earlier run; confirm before trusting the reported total.
total_time = 10095.399431978352

# Upper bound on attempts per batch, so a permanent failure (for example bad
# credentials) surfaces as an error instead of looping forever.
MAX_ADD_ATTEMPTS = 5


def add_batch_with_retry(store, batch, max_attempts=MAX_ADD_ATTEMPTS, base_delay=delay_in_seconds):
    """Add ``batch`` to ``store``, retrying a bounded number of times.

    Args:
        store: Object exposing ``add_documents(documents=...)``.
        batch: The documents to add.
        max_attempts: Total attempts before giving up.
        base_delay: Seconds to wait after the first failure; doubled after
            each further failure.

    Returns:
        Seconds spent inside ``add_documents`` calls (failed attempts
        included, back-off waits excluded).

    Raises:
        Exception: Re-raises the last error once ``max_attempts`` is reached.
    """
    busy_seconds = 0.0
    for attempt in range(1, max_attempts + 1):
        attempt_start = time.perf_counter()
        try:
            store.add_documents(documents=batch)
        except Exception as exc:
            busy_seconds += time.perf_counter() - attempt_start
            print(exc)
            if attempt == max_attempts:
                raise
            # Back off before retrying rather than retrying immediately.
            time.sleep(base_delay * 2 ** (attempt - 1))
        else:
            return busy_seconds + (time.perf_counter() - attempt_start)


# The first batch creates the collection; the remaining batches are appended.
vectorstore = Chroma.from_documents(
    documents=document_batches[0],
    embedding=embedding_model,
    persist_directory=f'chroma_text-embedding-3-large_recursive_33toEnd_time_measurement_final'
)

# `start_time` was taken at the top of this cell, so this covers everything up
# to and including the first batch.
total_time += time.perf_counter() - start_time

print("First batch processed and DB created")

start_index = 1

for i, batch in enumerate(document_batches[start_index:], start=start_index):
    print(f"Processing batch {i+1}/{len(document_batches)}...")
    total_time += add_batch_with_retry(vectorstore, batch)
    print(f"Total time so far: {total_time}")
    # Pause between batches; not counted in total_time.
    time.sleep(delay_in_seconds)

print(f"Created Chroma vector store with {vectorstore._collection.count()} embeddings.")
print(f"Total time: {total_time}")
# NOTE(review): `start_time` is the time.perf_counter() reading taken at the top
# of this cell, not a calendar date — confirm what this line is meant to report.
print(f"Start date: {start_time}")
with open("time recursive", 'w') as f:
    f.write(str(total_time))


In [None]:
# `has_label` is reused exactly as filtered in the data-loading cell. It is
# deliberately not re-derived from `label` here: once a loop has rebound `label`
# to a single row, `label[label['rand_ind'] > 33]` no longer indexes the
# label DataFrame.
rule_based_chunk = pd.read_csv('rule_based_33toEnd.csv', header = None, names=['report_number', 'section_name', 'content'])

# Rule based chunks save to ChromaDB
documents = []

# The row variable is called `label_row` rather than `label` so that the
# `label` DataFrame is not overwritten by this loop.
for _, label_row in has_label.iterrows():
    patient_notes = notes[notes['empi'] == label_row['empi']]
    if len(patient_notes) > 0:
        print(label_row['empi'])
        for _, note in patient_notes.iterrows():
            note_id = note['report_number']
            # One Document per section row whose report_number matches this note.
            for _, section in rule_based_chunk[rule_based_chunk['report_number'] == note_id].iterrows():
                metadata = {
                    'dob': label_row['dob'],
                    'empi': label_row['empi'],
                    'notetype': note['notetype'],
                    'report_date': note['report_date'],
                    'report_description': note['report_description'],
                    'report_number': note['report_number'],
                    'section_name': section['section_name']
                }
                documents.append(Document(page_content=section['content'], metadata=metadata))

print(len(documents))

# NOTE(review): `text_splitter` and `embedding_model` are whatever earlier cells
# last assigned; confirm they are the ones intended for the
# 'text-embedding-3-large' directory named below.
chunk_7 = text_splitter.split_documents(documents)

print(f"{len(documents)} documents split into {len(chunk_7)} chunks. Showing first 5 chunks:\n")
print(chunk_7[:5])
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(
    chunk_7,
    embedding_model,
    persist_directory=f'chroma_text-embedding-3-large_rule_based_33toEnd'
)
print(f"Created Chroma vector store with {vectorstore._collection.count()} embeddings.")