In [None]:
import os
import time

# The Hugging Face libraries resolve HF_HOME into their cache paths when they
# are first imported, so the project-local cache directory has to be exported
# BEFORE sentence_transformers / langchain_huggingface are loaded. Setting it
# after those imports leaves the default (~/.cache/huggingface) in effect.
hf_model_cache = os.path.join(os.getcwd(), '.hg_model_cache')
os.environ['HF_HOME'] = hf_model_cache

import pandas as pd
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from ollama import generate
from openai import AzureOpenAI
from sentence_transformers import CrossEncoder

from BioClinicalBERTEmbeddings import BioClinicalBERTEmbeddings

In [None]:
# Load the four study extracts in one pass:
#   notes          - free-text clinical notes
#   label          - per-patient study labels
#   diagnosis_list - result of the Diagnosis query
#   problem_list   - result of the Problem List query
notes, label, diagnosis_list, problem_list = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/2025_ADRD_case_finding/data/adrd_study_700_notes_all.csv',
        'data/2025_ADRD_case_finding/data/adrd_study_700_label.csv',
        "data/2025_ADRD_case_finding/data/adrd_combined_diagnosis.csv",
        "data/2025_ADRD_case_finding/data/adrd_combined_problems.csv",
    )
)

# Keep only the rows whose rand_ind is at most 33; the cell displays how many
# patients remain.
has_label = label.loc[label['rand_ind'].le(33)]
len(has_label)

In [None]:
# Cut-off date shared by the structured-data filters: entries dated after it
# are dropped.
filter_date = pd.to_datetime('2024-01-31')

# Parse the date column, then keep problem-list rows on or before the cut-off.
problem_list = (
    problem_list
    .assign(diagnosis_date=lambda frame: pd.to_datetime(frame['diagnosis_date']))
    .loc[lambda frame: frame['diagnosis_date'] <= filter_date]
)

In [None]:
# Parse the date column, then keep diagnosis rows on or before `filter_date`
# (the cut-off defined alongside the problem-list filter).
diagnosis_list = (
    diagnosis_list
    .assign(diagnosis_date=lambda frame: pd.to_datetime(frame['diagnosis_date']))
    .loc[lambda frame: frame['diagnosis_date'] <= filter_date]
)

In [None]:
# Build one langchain Document per clinical note for every labelled patient,
# and count the labelled patients that have no notes at all.
documents = []
no_notes = 0
no_notes_yes_structured = 0

# The loop variable is deliberately NOT called `label`: that name is the label
# DataFrame loaded earlier, and rebinding it to a row Series here would break
# any later re-run of the cell that derives `has_label` from it.
for _, patient_row in has_label.iterrows():
    patient_empi = patient_row['empi']
    patient_notes = notes[notes['empi'] == patient_empi]
    patient_diagnosis = diagnosis_list[diagnosis_list['empi'] == patient_empi]
    patient_problem = problem_list[problem_list['empi'] == patient_empi]

    if len(patient_notes) > 0:  # some patients have ADRD labels but no notes. Filtering them out
        for _, note in patient_notes.iterrows():
            # Patient-level fields (dob, empi) come from the label row; the
            # remaining metadata describes the individual note.
            metadata = {
                'dob': patient_row['dob'],
                'empi': patient_empi,
                'notetype': note['notetype'],
                'report_date': note['report_date'],
                'report_description': note['report_description'],
                'report_number': note['report_number'],
            }
            documents.append(Document(page_content=note['notetxt'], metadata=metadata))
    else:
        no_notes += 1
        if len(patient_problem) > 0 or len(patient_diagnosis) > 0:
            no_notes_yes_structured += 1
            print(f"{patient_empi} has no notes but have structured data" )
        else:
            print(f"{patient_empi} has no notes nor any structured data" )


print("Documents done")
print(len(documents))
print(f"{no_notes} have no notes")
print(f"{no_notes_yes_structured} have no notes but have structured data")

In [None]:
from langchain_openai import AzureOpenAIEmbeddings
# Embedding back-ends understood by run_experiment.
_EMBEDDING_NAMES = ("mpnet", "bioclinicalbert", "text-embedding-3-large")

# Retrieval strategies understood by run_experiment.
_SEARCH_METHODS = ("Cosine similarity", "Max marginal relevance")

# Chroma persist directories for every supported (embedding_name, chunk_strategy)
# pair, as (first store, fallback store). The second store is only queried when
# the first returns nothing for a patient.
_VECTORSTORE_DIRS = {
    ("mpnet", "rule_based"): (
        "chroma_mpnet_rule_based_0to300",
        "chroma_mpnet_rule_based_300toEnd",
    ),
    ("mpnet", "rn_separators_semantic"): (
        "chroma_mpnet_0to300_rn_separators_semantic",
        "chroma_mpnet_300toEnd_rn_separators_semantic",
    ),
    ("mpnet", "recursive"): (
        "chroma_mpnet_600_100_0to300_separators",
        "chroma_mpnet_600_100_300toEnd_separators",
    ),
    ("bioclinicalbert", "rule_based"): (
        "chroma_bioclinicalbert_rule_based_0to33",
        "chroma_bioclinicalbert_rule_based_33toEnd",
    ),
    ("bioclinicalbert", "rn_separators_semantic"): (
        "chroma_bioclinicalbert_rn_separators_semantic_0to300",
        "chroma_bioclinicalbert_rn_separators_semantic_300toEnd",
    ),
    ("text-embedding-3-large", "rule_based"): (
        "chroma_text-embedding-3-large_rule_based_0to33",
        "chroma_text-embedding-3-large_rule_based_33toEnd",
    ),
    # NOTE(review): both entries point at the same directory, exactly as in the
    # previous hard-coded version. Confirm whether a separate "0to33" store was
    # intended for the first slot.
    ("text-embedding-3-large", "recursive"): (
        "chroma_text-embedding-3-large_recursive_33toEnd_time_measurement_REALLY_final",
        "chroma_text-embedding-3-large_recursive_33toEnd_time_measurement_REALLY_final",
    ),
}

# Value written to the 'chunk_method' column. "rule_based" keeps its historical
# label; any other strategy is recorded under its own name.
_CHUNK_METHOD_LABELS = {"rule_based": "section based"}

_CROSS_ENCODER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L6-v2"  # Or another Cross-encoder
_RETRIEVAL_QUERY = 'Dementia'
_RETRIEVE_K = 30       # chunks requested from the vector store per patient
_MMR_FETCH_K = 40      # candidate pool size for max-marginal-relevance search
_TOP_N_RERANKED = 15   # chunks kept after cross-encoder reranking

# Column order of the result CSV.
_RESULT_COLUMNS = [
    'empi',
    'prediction',
    'label',
    'prompt',
    'top-k',
    'response',
    'result',
    'chunk_method',
    'embedding_model',
    'cross_encoder_model',
    'search_method',
    'llm_model',
    'retrieval_time (s)',
    'generation_time (s)',
    'total_time (s)',
    'system_prompt',
    'query_phrase',
]

# (actual, predicted) -> confusion-matrix cell.
_CONFUSION_CELLS = {
    ('yes', 'yes'): 'tp',
    ('yes', 'no'): 'fn',
    ('no', 'no'): 'tn',
    ('no', 'yes'): 'fp',
}

_LLM_QUERY = """Step 1: Analyze the Timeline of Documentation
    Review the complete timeline of clinical notes and diagnoses to determine if a diagnosis of dementia was explicitly made for this patient (not for a family member).
    Prioritize more recent clinical notes and diagnoses, ensuring they align with and confirm earlier findings. If there is only one dementia-related diagnosis code (e.g., Alzheimer’s disease or related dementia), subsequent documentation must continue to support or confirm the diagnosis for it to be valid.


    Step 2: Assess Supporting Evidence
    Consider Cognitive Test Scores: Include scores from cognitive assessments (e.g., MoCA, MMSE). Low scores alone do not confirm dementia but, if consistently impaired over time or paired with a diagnosis, they strengthen the case.
    Evaluate Functional Status: If the patient is described as functionally independent or improving and there is no corroborating evidence of progressive decline, carefully reassess the validity of the earlier dementia diagnosis.
    Transitions in Diagnosis: If earlier records list dementia but newer documentation clarifies mild cognitive impairment (MCI) or indicates improvement, favor the most recent clinically-supported status only if it contradicts earlier findings.


    Step 3: Confirm Continuity of Evidence
    Explicitly check whether the latest clinical documentation reaffirms an earlier dementia-related diagnosis. If the most recent evidence does not explicitly confirm or support the earlier diagnosis, reassess its validity.


    Step 4: Disregard Irrelevant Mentions
    Ignore the following unless explicitly tied to a clinical confirmation of dementia:
    Memory concerns or subjective complaints without diagnosis.
    Medications (e.g., donepezil, rivastigmine) unless paired with a documented diagnosis.
    Phrases like “rule out dementia,” “referred for cognitive workup,” or “possible Alzheimer’s” unless subsequent documentation confirms diagnosis.
    Isolated billing codes or problem list entries unless supported by clinical notes or functional assessments.


    Step 5: Answer Classification
    Provide your answer based on the following criteria:

    "Yes" if:
    There is an explicit dementia diagnosis in the most recent clinical documentation and it aligns with or confirms earlier findings.
    The diagnosis is supported by clinical observations, cognitive testing, or evidence of progressive functional decline.
    "No" if:
    There is no confirmed dementia diagnosis in the most recent documentation.
    The latest evidence indicates MCI or contradicts earlier mentions of dementia.
    The earlier dementia diagnosis is unsupported by subsequent clinical findings, particularly when there is only one dementia-related diagnosis code."""

# Filled with str.format; the only placeholders are {context} and {llm_query}.
_PROMPT_TEMPLATE = """
    Below are some excerpts from clinical notes for this patient

    ------------------------------------------

    Patient Notes:
    {context}

    ------------------------------------------


    Given information above, answer this question: {llm_query}
    """

_SYSTEM_PROMPT = '''You are a knowledgeable and elite professional healthcare specialist, able to answer complicated and medical related questions from clinical notes. When answering questions, answer based on your knowledge and the provided patient notes. The question will be answered by yes or no format. After answering the question, give thorough explanations and reasons for your decision. Include specific information from the notes and cite the report number when it\'s helpful. Make your answer evidence based. Start your response with either YES or NO, fall back to NO if unsure. ONLY START YOUR RESPONSE WITH \"YES\" or \"NO\", fall back to NO if unsure. Proceed with explanations AFTER you've given your answer. Example: \"Yes, the patient has dementia...\" or \"No, the patient does not have dementia\", but NEVER start with \"Based on the provided documentation\"... because the response did not immediately start with the answer.'''


def _load_embedding_model(embedding_name):
    """Instantiate the bi-encoder for an already-validated embedding name."""
    if embedding_name == "mpnet":
        return HuggingFaceEmbeddings(
            model_name='all-mpnet-base-v2',
            model_kwargs={'device': 'cuda'},
            encode_kwargs={'batch_size': 256, 'normalize_embeddings': False},
        )
    if embedding_name == "bioclinicalbert":
        return BioClinicalBERTEmbeddings(device="cuda")
    # "text-embedding-3-large": constructed without explicit arguments.
    return AzureOpenAIEmbeddings()


def _search_patient_chunks(stores, search_method, query, empi):
    """Return the first non-empty search result for one patient, trying each store in order."""
    for store in stores:
        if search_method == "Cosine similarity":
            found = store.similarity_search(query, k=_RETRIEVE_K, filter={"empi": empi})
        else:
            found = store.max_marginal_relevance_search(
                query, k=_RETRIEVE_K, fetch_k=_MMR_FETCH_K, filter={"empi": empi}
            )
        if len(found) > 0:
            return found
    print(f"{empi} found no notes")
    return []


def _format_context(docs):
    """Render reranked chunks as the delimited excerpt block placed in the prompt."""
    excerpts = []
    for doc in docs:
        meta = doc.metadata
        # Built outside the header f-string: nesting a same-quote f-string inside
        # another one only parses on Python 3.12+.
        section_part = f", Section Name: {meta['section_name']}" if meta.get('section_name') else " "
        header = (
            f"--- Note Type: {meta['notetype']}, Report Date: {meta['report_date']}, "
            f"Report Description: {meta.get('report_description')}, Date of Birth: {meta['dob']}, "
            f"Report Number: {meta['report_number']}{section_part} ---"
        )
        excerpts.append(f"{header}\nEXCERPT START:\n\n{doc.page_content}\n\nEXCERPT END.")
    return "\n\n\n".join(excerpts)


def _ask_llm(llm_name, client, deployment, system_prompt, prompt):
    """Query the LLM once and return (response_text, elapsed_seconds).

    For "GPT-4o" any exception is printed and turned into an empty response.
    For every other name the ollama call is left unguarded, so its errors
    propagate to the caller.
    """
    started = time.perf_counter()
    if llm_name == "GPT-4o":
        try:
            completion = client.chat.completions.create(
                messages=[
                    {'role': 'system', 'content': system_prompt},
                    {'role': 'user', 'content': prompt},
                ],
                max_tokens=4096,
                temperature=0,
                model=deployment,
            )
            # content may be None; normalise so downstream string handling is safe.
            answer = completion.choices[0].message.content or ""
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            answer = ""
    else:
        answer = generate(
            model=llm_name,
            system=system_prompt,
            prompt=prompt,
            options={'temperature': 0},
        ).response
    return answer, time.perf_counter() - started


def _predict_label(llm_response):
    """Map a response to 'yes' if its first 10 characters contain "yes", otherwise 'no'."""
    return 'yes' if 'yes' in llm_response.lower()[:10] else 'no'


def run_experiment(chunk_strategy: str, embedding_name: str, search_method: str, llm_name: str, result_file: str):
    """
    Reports the confusion matrix, time spent, model settings, and response for the given setting in csv format.

    For every patient in the module-level ``has_label`` frame that has at least one
    row in ``notes``, the function retrieves up to 30 chunks for the query
    "Dementia" from the Chroma stores, reranks them with a cross-encoder, keeps the
    top 15, asks the LLM the dementia question and compares the yes/no answer with
    ``final_label``. One row per processed patient is written to ``result_file``.
    Only note excerpts are placed in the prompt; the structured diagnosis and
    problem lists are not.

    Invalid settings are reported with a printed message and the function returns
    without loading any model.

    Args:
        chunk_strategy (str): "rule_based", "rn_separators_semantic" or "recursive"
            (not every strategy exists for every embedding; see _VECTORSTORE_DIRS)
        embedding_name (str): "mpnet" or "bioclinicalbert" or "text-embedding-3-large"
        search_method (str): "Cosine similarity" or "Max marginal relevance"
        llm_name (str): "GPT-4o" or any ollama model name
        result_file (str): The output file in csv format.

    Returns:
        None

    """
    # Validate every setting up front so a typo is reported before models are loaded.
    if embedding_name not in _EMBEDDING_NAMES:
        print("Embedding name unknown")
        return
    store_dirs = _VECTORSTORE_DIRS.get((embedding_name, chunk_strategy))
    if store_dirs is None:
        print("Combination doesn't exist")
        return
    if search_method not in _SEARCH_METHODS:
        print("Search method unknown")
        return

    bi_encoder_model = _load_embedding_model(embedding_name)
    vectorstore = Chroma(persist_directory=store_dirs[0], embedding_function=bi_encoder_model)
    vectorstore2 = Chroma(persist_directory=store_dirs[1], embedding_function=bi_encoder_model)

    print(f"Created Chroma vector store with {vectorstore._collection.count()} embeddings.")
    print(f"Created Chroma vector store with {vectorstore2._collection.count()} embeddings.")

    cross_encoder = CrossEncoder(_CROSS_ENCODER_MODEL_NAME)

    if embedding_name == "text-embedding-3-large":
        embedding_label = embedding_name
    else:
        # Fall back to the short name if the embedding object exposes no model_name.
        embedding_label = getattr(bi_encoder_model, "model_name", embedding_name)
    chunk_method_label = _CHUNK_METHOD_LABELS.get(chunk_strategy, chunk_strategy)

    client = None
    deployment = None
    if llm_name == "GPT-4o":
        client = AzureOpenAI(
        )
        deployment = "gpt-4o-2"

    counts = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
    # One dict per processed patient. Building whole rows (instead of appending to
    # parallel per-column lists) guarantees every column has the same length.
    rows = []

    for _, patient in has_label.iterrows():
        target_empi = patient['empi']
        # Patients without any clinical note are skipped entirely.
        if not (notes['empi'] == target_empi).any():
            continue
        print(f'Patient with EMPI {target_empi}')

        # The retrieval timer covers the vector search and the cross-encoder rerank.
        start_timer = time.perf_counter()
        retrieved_docs = _search_patient_chunks(
            (vectorstore, vectorstore2), search_method, _RETRIEVAL_QUERY, target_empi
        )

        if len(retrieved_docs) > 0:
            rerank_scores = cross_encoder.predict(
                [[_RETRIEVAL_QUERY, doc.page_content] for doc in retrieved_docs]
            )
            ranked = sorted(zip(rerank_scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)
            final_retrieved_docs = [doc for _score, doc in ranked[:_TOP_N_RERANKED]]
            retrieval_done_time = time.perf_counter() - start_timer

            retrieval_fields = {
                'top-k': _TOP_N_RERANKED,
                'chunk_method': chunk_method_label,
                'embedding_model': embedding_label,
                'cross_encoder_model': _CROSS_ENCODER_MODEL_NAME,
                'search_method': search_method,
                'retrieval_time (s)': retrieval_done_time,
            }
            context = _format_context(final_retrieved_docs)
        else:
            retrieval_done_time = time.perf_counter() - start_timer
            retrieval_fields = {
                'top-k': "NA",
                'chunk_method': "NA",
                'embedding_model': "NA",
                'cross_encoder_model': "NA",
                'search_method': "NA",
                'retrieval_time (s)': "NA",
            }
            # Kept as the literal "[]" so the prompt text for patients without
            # retrievable chunks is unchanged from earlier runs.
            context = "[]"

        prompt = _PROMPT_TEMPLATE.format(context=context, llm_query=_LLM_QUERY)

        llm_response, generation_done_time = _ask_llm(
            llm_name, client, deployment, _SYSTEM_PROMPT, prompt
        )
        print(f'\n\n{llm_response}\n\n')

        predicted_label = _predict_label(llm_response)
        actual_label = str(patient['final_label']).strip().lower()

        outcome = _CONFUSION_CELLS.get((actual_label, predicted_label))
        if outcome is None:
            # A label other than yes/no cannot be scored; keep the row so the
            # output stays rectangular, but leave it out of the confusion matrix.
            print(f"{target_empi} has unexpected label {actual_label!r}; not counted in confusion matrix")
            outcome = "NA"
        else:
            counts[outcome] += 1

        row = {
            'empi': target_empi,
            'prediction': predicted_label,
            'label': actual_label,
            'prompt': prompt,
            'response': llm_response,
            'result': outcome,
            'llm_model': llm_name,
            'generation_time (s)': generation_done_time,
            'total_time (s)': retrieval_done_time + generation_done_time,
            'system_prompt': _SYSTEM_PROMPT,
            'query_phrase': _RETRIEVAL_QUERY,
        }
        row.update(retrieval_fields)
        rows.append(row)

    print(f"TP = {counts['tp']}, FP = {counts['fp']}, TN = {counts['tn']}, FN = {counts['fn']}")

    saving = pd.DataFrame(rows, columns=_RESULT_COLUMNS)
    print(f"Total latency: {saving['total_time (s)'].sum()} seconds. Remember to run reanalysis.ipynb")
    saving.to_csv(result_file)

In [None]:
# Single run: rule-based chunks, Azure text-embedding-3-large embeddings,
# MMR retrieval and GPT-4o generation; results are written to cost_analysis.csv.
run_experiment(
    chunk_strategy="rule_based",
    embedding_name="text-embedding-3-large",
    search_method="Max marginal relevance",
    llm_name="GPT-4o",
    result_file="cost_analysis.csv",
)