In [25]:
# set path for testing
import sys
import os
parent_dir = os.path.abspath("../")
print(parent_dir)
sys.path.append(parent_dir)

import os
from dotenv import load_dotenv
import json
import pandas as pd
import anthropic
from embeddings import snomed_store, loinc_store
from embeddings.embeddings import SearchResult

load_dotenv(override=True)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

snomed_store = snomed_store.connect_snomed()
loinc_store = loinc_store.connect_loinc()

# import data to temporarily plug in missing metadata that will be in the vector store
df = pd.read_csv("/Users/hanna/openfn/ai_experiments/data/LOINC-Clinical-Terminology/LoincTableCore.csv")
loinc_num_dict = dict(zip(df.LONG_COMMON_NAME.to_list(), df.LOINC_NUM.to_list()))

# use LLM to get input text CT synonym then search vector store. Also add hybrid search.

synonym_expand_system_prompt = """
You are an assisstant for matching terminology between health record systems. 
You will be given a source text from one health system, which might be in another language.
You should output its equivalent name (the long name, not the code) in LOINC clinical terminology with no further explanation.
"""

synonym_expand_user_prompt = """
Source text to output a LOINC term for: "{input_text}"
"""

synonym_expand_select_system_prompt = """
You are an assisstant for matching terminology between health record systems. 
You will be given a LOINC entry to look for in a list of similar records. You will also be given the original soruce term for context.
You should select the correct LOINC entry from the options and output just the LONG_COMMON_NAME and the LOINC_NUMBER separated by a semicolon.
"""

synonym_expand_select_user_prompt = """
Original source term for context: {input_text}
LOINC entry to look for: {LLM_guess}
LOINC entries to select from: "{retreived_texts}"
"""

def synonym_method_2(input_query):
    # input_query = "malaria_RDT_result"

    synonym_expand_user_prompt_formatted = synonym_expand_user_prompt.format(input_text=input_query)
    message = client.messages.create(
        model= "claude-3-5-sonnet-20241022", #"claude-3-5-sonnet-20241022", #claude-3-5-haiku-20241022
        max_tokens=500,
        temperature=0,
        system=synonym_expand_system_prompt,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": synonym_expand_user_prompt_formatted
                    }
                ]
            }
        ]
    )
    synonym_expand_answer = message.content[0].text
    print(f"LLM guess for the CT: {synonym_expand_answer}")
    synonym_expand_retreived_data = loinc_store.search(input_query, search_kwargs={"k": 100})

    # add keyword search to conduct hybrid search
    keyword_search_results = df[df["LONG_COMMON_NAME"].str.contains(synonym_expand_answer)].LONG_COMMON_NAME.to_list()
    keyword_search_results = [SearchResult(text=json.dumps({"LONG_COMMON_NAME": s}), metadata={}, score=None) for s in keyword_search_results]
    search_results_combined = keyword_search_results+synonym_expand_retreived_data

    # temp fix for missing loinc numbers
    search_results_combined = [SearchResult(text=r.text, metadata={**r.metadata, 'LOINC_NUMBER': loinc_num_dict.get(json.loads(r.text)['LONG_COMMON_NAME'])}, score=r.score) for r in search_results_combined]
    synonym_expand_select_user_prompt_formatted = synonym_expand_select_user_prompt.format(input_text=input_query, LLM_guess=synonym_expand_answer, retreived_texts=search_results_combined)
    message = client.messages.create(
        model= "claude-3-5-sonnet-20241022", #"claude-3-5-sonnet-20241022", #claude-3-5-haiku-20241022
        max_tokens=500,
        temperature=0,
        system=synonym_expand_select_system_prompt,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": synonym_expand_select_user_prompt_formatted
                    }
                ]
            }
        ]
    )
    synonym_expand_select_answer = message.content[0].text
    
    return synonym_expand_select_answer

body_site_step_1_system_prompt = """
You are an assistant for tagging diagnoses and processes in health records with
the correct body sites as defined in SNOMED Clinical Terminology.
You will be given an input text from a health record. 
Simply state the SNOMED body site term for the input.
"""
# 
body_site_step_1_user_prompt = """
Input to return a body site for: "{input_text}"
"""


def get_snomed_bodysite(input_query):

    body_site_step_1_user_prompt_formatted = body_site_step_1_user_prompt.format(input_text=input_query.replace("_", " "))

    message = client.messages.create(
        model= "claude-3-5-sonnet-20241022", #"claude-3-5-sonnet-20241022", #"claude-3-5-haiku-20241022"
        max_tokens=500,
        temperature=0,
        system=body_site_step_1_system_prompt,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": body_site_step_1_user_prompt_formatted
                    }
                ]
            }
        ]
    )
    body_site_answer = message.content[0].text
    body_site_snomed_entry = snomed_store.search(body_site_answer, search_kwargs={"k": 1})
    found_name = body_site_snomed_entry[0].metadata["Description"]
    found_code = body_site_snomed_entry[0].metadata["Code"]

    return f"llm guess: {body_site_answer}; matched entry: {found_name};{found_code}"


/Users/hanna/openfn/ai_experiments/apollo/services


  df = pd.read_csv("/Users/hanna/openfn/ai_experiments/data/LOINC-Clinical-Terminology/LoincTableCore.csv")


# You can test the mapping functions below this cell by changing the text in input_query and running the cell.

In [None]:
# Synonyms – mapping an input to LOINC terms
# Method 2 – LLM outputs what it thinks the term is, then we use it to retreive results using hybrid search (similarity search + keyword search).
# We then use the LLM again to pick the correct exact term and the LOINC code.

input_query = "smoking_history"
synonym_method_2(input_query)

LLM guess for the CT: Tobacco smoking status


'Tobacco smoking status;72166-2'

In [22]:
# Snomed body site mapping
# Method 1 – LLM outputs what it thinks the term is, then we use it to retreive results.
# Note: Currently the SNOMED code retreival part fails - the retreival step needs fixing.

input_query = "epilepsy"
get_snomed_bodysite(input_query)

'llm guess: For "epilepsy", the relevant SNOMED body site is:\n\nBrain structure (entire brain); matched entry: Entire ependyma of brain ventricle (body structure);731442007.0'