In [1]:
import os
from dotenv import load_dotenv

# Read the local .env file into the process environment. override=True is
# passed so values from .env replace variables that are already set.
load_dotenv(override=True)

# Each key is None when the corresponding environment variable is absent.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")

In [2]:
# Set the import path for testing: put the parent directory on sys.path
# (presumably so the `embeddings` imports in the following cells resolve).
import sys
import os

parent_dir = os.path.abspath("../")
print(parent_dir)

# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

/Users/hanna/openfn/ai_experiments/apollo/services


In [92]:
from embeddings import snomed_store, loinc_store

In [None]:
from embeddings.embeddings import SearchResult

In [20]:
# Connect to the SNOMED vector store.
# The name `snomed_store` is imported above as a module and is rebound here to
# the connected store, which is what the later cells call `.search` on.
# Re-importing the module under an alias keeps this cell re-runnable: without
# it, a second run would call connect_snomed() on the store object instead of
# the module.
from embeddings import snomed_store as snomed_store_module

snomed_store = snomed_store_module.connect_snomed()

In [93]:
# Connect to the LOINC vector store.
# The name `loinc_store` is imported above as a module and is rebound here to
# the connected store, which is what the later cells call `.search` on.
# Re-importing the module under an alias keeps this cell re-runnable: without
# it, a second run would call connect_loinc() on the store object instead of
# the module.
from embeddings import loinc_store as loinc_store_module

loinc_store = loinc_store_module.connect_loinc()

In [6]:
# Smoke-test the SNOMED store with a free-text query; the cell displays the
# returned list of SearchResult objects.
snomed_store.search("nasal")

  store = self.VectorStoreClass(


[SearchResult(text='{"Value Set Name": "Body Site Value Set", "Description": "Entire mucous membrane of nasal cavity and nasal sinus (body structure)", "Purpose: Clinical Focus": "All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site."}', metadata={'Code': 736508000.0, 'Description': 'Entire mucous membrane of nasal cavity and nasal sinus (body structure)', 'Purpose: Clinical Focus': 'All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site.', 'Value Set Name': 'Body Site Value Set'}, score=None),
 SearchResult(text='{"Value Set Name": "Body Site Value Set", "Description": "Entire mucous membrane of nasal cavity and nasal sinus (body structure)", "Purpose: Clinical Focus": "All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site."}', metadata={'Code': 736508000.0, 'Description': 'Entire mucous me

In [118]:
# Temporary workaround: load the LOINC core table so that metadata missing
# from the vector store (the LOINC number) can be plugged in by name.
import pandas as pd

df = pd.read_csv(
    "/Users/hanna/openfn/ai_experiments/data/LOINC-Clinical-Terminology/LoincTableCore.csv"
)

# Lookup LONG_COMMON_NAME -> LOINC_NUM; if a name occurs more than once the
# last row wins.
loinc_num_dict = {
    long_name: loinc_num
    for long_name, loinc_num in zip(df["LONG_COMMON_NAME"].to_list(), df["LOINC_NUM"].to_list())
}

  df = pd.read_csv("/Users/hanna/openfn/ai_experiments/data/LOINC-Clinical-Terminology/LoincTableCore.csv")


Pipeline for searching for approximate synonyms or other textual similarities

Approach:
Step 1: Use input term to search the vector store.
Step 2: Use an LLM to pick one result from the search results. It will give its reasoning as well which could perhaps be consulted by the user.
Step 3: Use an LLM to extract only the name from the Step 2 answer to select the full entry from the DB & use the correct code.


Problems: 
- Some inputs do not look similar to their target term. E.g. avpu_score should map to "Level of responsiveness" in LOINC
    Possible solutions:
    - Add another LLM step before the search. Could selectively add disambiguation.

- The similarity search is biased against the simplest answer (e.g. it hits "Self-Reported Maximum Adult Height" etc. rather than "Body height")
    Possible solutions:
    - check missing entries in db; try different search methods and thresholds/Ks

In [160]:
# Source field name to map; swap in the commented line to try another input.
input_query = "avpu_score"
# input_query = "hepatitis_b_result"

# Turn the snake_case field name into space-separated words before searching.
input_query = " ".join(input_query.split("_"))

# Score-threshold retrieval against the LOINC store; the cell displays the hits.
retreived_data = loinc_store.search(
    input_query,
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.01},
)
retreived_data


[SearchResult(text='{"LONG_COMMON_NAME": "Papova virus SV 40 DNA [#/volume] (viral load) in Urine by NAA with probe detection", "METHOD_TYP": "Probe.amp.tar", "CLASS": "MICRO", "SYSTEM": "Urine"}', metadata={'CLASS': 'MICRO', 'LONG_COMMON_NAME': 'Papova virus SV 40 DNA [#/volume] (viral load) in Urine by NAA with probe detection', 'METHOD_TYP': 'Probe.amp.tar', 'SYSTEM': 'Urine'}, score=None),
 SearchResult(text='{"LONG_COMMON_NAME": "Parvovirus B19 DNA [#/volume] (viral load) in Upper respiratory specimen by NAA with probe detection", "METHOD_TYP": "Probe.amp.tar", "CLASS": "MICRO", "SYSTEM": "Respiratory.upper"}', metadata={'CLASS': 'MICRO', 'LONG_COMMON_NAME': 'Parvovirus B19 DNA [#/volume] (viral load) in Upper respiratory specimen by NAA with probe detection', 'METHOD_TYP': 'Probe.amp.tar', 'SYSTEM': 'Respiratory.upper'}, score=None),
 SearchResult(text='{"LONG_COMMON_NAME": "Papova virus SV 40 DNA [#/volume] (viral load) in Serum or Plasma by NAA with probe detection", "METHOD_TY

In [108]:
# Plain top-k retrieval (k=50) for the current input_query, as an alternative
# to the score-threshold search. This overwrites `retreived_data`, which is the
# candidate list later formatted into the selection prompt.
retreived_data = loinc_store.search(input_query, search_kwargs={"k": 50})
retreived_data

[SearchResult(text='{"LONG_COMMON_NAME": "Hepatitis B virus e Ag [Presence] in Serum, Plasma or Blood by Rapid immunoassay", "METHOD_TYP": "IA.rapid", "CLASS": "MICRO", "SYSTEM": "Ser/Plas/Bld"}', metadata={'CLASS': 'MICRO', 'LONG_COMMON_NAME': 'Hepatitis B virus e Ag [Presence] in Serum, Plasma or Blood by Rapid immunoassay', 'METHOD_TYP': 'IA.rapid', 'SYSTEM': 'Ser/Plas/Bld'}, score=None),
 SearchResult(text='{"LONG_COMMON_NAME": "Hepatitis B virus surface Ab [Presence] in Serum, Plasma or Blood by Rapid immunoassay", "METHOD_TYP": "IA.rapid", "CLASS": "MICRO", "SYSTEM": "Ser/Plas/Bld"}', metadata={'CLASS': 'MICRO', 'LONG_COMMON_NAME': 'Hepatitis B virus surface Ab [Presence] in Serum, Plasma or Blood by Rapid immunoassay', 'METHOD_TYP': 'IA.rapid', 'SYSTEM': 'Ser/Plas/Bld'}, score=None),
 SearchResult(text='{"LONG_COMMON_NAME": "Hepatitis B virus e Ab [Presence] in Serum, Plasma or Blood by Rapid immunoassay", "METHOD_TYP": "IA.rapid", "CLASS": "MICRO", "SYSTEM": "Ser/Plas/Bld"}', m

In [None]:
# TODO missing metadata in LOINC - codes
# TODO add missing sections to snomed embeddings
# TODO check all terms are def in embeddings. Spot check respiratory rate 9279-1
# TODO maybe pick threshold first, if not enough results use top k

In [None]:
# Approach 1, selection step: system prompt telling the LLM to pick exactly one
# target record from the candidates it is shown.
synonym_system_prompt = """
You are an assisstant for matching fields between health record systems. 
You will be given a source text, and a set of candidate
target records to pick from. Pick only one target record.
"""

# User prompt template; the {input_text} and {retreived_data} placeholders are
# filled in with str.format in a later cell.
synonym_user_prompt = """
Source text to search for: "{input_text}"

Here are the candidate target health record fields:

"{retreived_data}"
"""

In [144]:
# Fill the selection prompt with the current query and the retrieved candidates
# (the candidate list is inserted via its string representation).
synonym_user_prompt_formatted = synonym_user_prompt.format(input_text=input_query, retreived_data=retreived_data)


In [145]:
# Approach 1, selection step: send the formatted candidate list to the LLM and
# show its free-text answer (chosen record plus rationale).
# NOTE(review): `client` is not created in any cell shown here - confirm where
# it is defined before running on a fresh kernel.
message = client.messages.create(
    model="claude-3-5-sonnet-20241022",  # alternative tried: claude-3-5-haiku-20241022
    max_tokens=500,
    temperature=0,
    system=synonym_system_prompt,
    messages=[
        dict(
            role="user",
            content=[dict(type="text", text=synonym_user_prompt_formatted)],
        )
    ],
)

# The answer is the text of the first content block of the reply.
answer = message.content[0].text
answer

'For the source text "hepatitis b result", I would select the following target record:\n\n{"LONG_COMMON_NAME": "Hepatitis B virus surface Ag [Presence] in Serum, Plasma or Blood by Rapid immunoassay", "METHOD_TYP": "IA.rapid", "CLASS": "MICRO", "SYSTEM": "Ser/Plas/Bld"}\n\nRationale:\n- The surface antigen (HBsAg) test is typically the first and most basic screening test for hepatitis B infection\n- When someone refers to a general "hepatitis B result" without specifying which marker, the surface antigen test is most commonly meant\n- The other options test for specific antibodies (Ab) or e antigen, which are more specialized markers\n- Surface antigen testing is considered the primary test for detecting hepatitis B infection'

Synonym approach 2) Guess & Search

This approach aims to address the problems noted above for the first method (when input texts do not look at all like the output). Leverage the LLM's in-built intuition for clinical terminology.

Step 1 - Use LLM to output the LOINC text
Step 2 - Use the text from the LLM output to search for the entry. (option b if we see issues: use both original query + the LLM output to search the db)

In [170]:
# Approach 2, step 1: system prompt asking the LLM to output the LOINC long
# name for a source term, with no explanation.
synonym_expand_system_prompt = """
You are an assisstant for matching terminology between health record systems. 
You will be given a source text from one health system, which might be in another language.
You should output its equivalent name (the long name, not the code) in LOINC clinical terminology with no further explanation.
"""

# User prompt template; {input_text} is filled in with str.format.
synonym_expand_user_prompt = """
Source text to output a LOINC term for: "{input_text}"
"""

In [244]:
# Approach 2, step 2: system prompt asking the LLM to pick the matching LOINC
# entry from the candidates and answer as "LONG_COMMON_NAME;LOINC_NUMBER".
synonym_expand_select_system_prompt = """
You are an assisstant for matching terminology between health record systems. 
You will be given a LOINC entry to look for in a list of similar records. You will also be given the original soruce term for context.
You should select the correct LOINC entry from the options and output just the LONG_COMMON_NAME and the LOINC_NUMBER separated by a semicolon.
"""

# User prompt template; {input_text}, {LLM_guess} and {retreived_texts} are
# filled in with str.format.
synonym_expand_select_user_prompt = """
Original source term for context: {input_text}
LOINC entry to look for: {LLM_guess}
LOINC entries to select from: "{retreived_texts}"
"""

In [252]:
# Approach 2 ("guess & search"): use the LLM to get a clinical-terminology (CT)
# synonym for the input text, build a hybrid candidate list (vector search +
# keyword search), then let the LLM select the final entry.
# NOTE(review): `client` is not created in any cell shown here - confirm where
# it is defined before running on a fresh kernel.
import json  # json.dumps / json.loads are used below; no earlier cell shown imports it

input_query = "malaria_RDT_result"

# Step 1: ask the LLM for the LOINC long name of the input term.
synonym_expand_user_prompt_formatted = synonym_expand_user_prompt.format(input_text=input_query)
message = client.messages.create(
    model= "claude-3-5-sonnet-20241022", #"claude-3-5-sonnet-20241022", #claude-3-5-haiku-20241022
    max_tokens=500,
    temperature=0,
    system=synonym_expand_system_prompt,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": synonym_expand_user_prompt_formatted
                }
            ]
        }
    ]
)
synonym_expand_answer = message.content[0].text
print(f"LLM guess for the CT: {synonym_expand_answer}")

# Step 2a: vector search.
# NOTE(review): this queries the store with the raw input_query, not with the
# LLM guess described in the notes for this approach - confirm this is intended.
synonym_expand_retreived_data = loinc_store.search(input_query, search_kwargs={"k": 100})

# Step 2b: add keyword search to conduct hybrid search.
# - regex=False: the guess is matched as a literal substring; LOINC names
#   contain "[...]" and "(...)", which would otherwise be read as regex syntax.
# - na=False: rows with a missing name count as "no match" instead of putting
#   NaN into the boolean mask.
# - an empty guess is skipped, because an empty pattern matches every row and
#   would put the whole table into the prompt.
keyword = synonym_expand_answer.strip()
if keyword:
    keyword_mask = df["LONG_COMMON_NAME"].str.contains(keyword, regex=False, na=False)
    keyword_search_results = df.loc[keyword_mask, "LONG_COMMON_NAME"].to_list()
else:
    keyword_search_results = []
keyword_search_results = [SearchResult(text=json.dumps({"LONG_COMMON_NAME": s}), metadata={}, score=None) for s in keyword_search_results]
search_results_combined = keyword_search_results+synonym_expand_retreived_data

# temp fix for missing loinc numbers: look each candidate's code up by its long
# name (None when the name is not in loinc_num_dict).
search_results_combined = [SearchResult(text=r.text, metadata={**r.metadata, 'LOINC_NUMBER': loinc_num_dict.get(json.loads(r.text)['LONG_COMMON_NAME'])}, score=r.score) for r in search_results_combined]

# tried this rule-based retreival, but there are unexpected issues e.g. "Body height" hits "Body height Mother"
# final_search_result = ""
# for search_result in synonym_expand_retreived_data:
#     if synonym_expand_answer in search_result.metadata["LONG_COMMON_NAME"]:
#         final_search_result = (search_result, loinc_num_dict[search_result.metadata["LONG_COMMON_NAME"]])
# final_search_result

# Step 3: let the LLM select the final entry from the combined candidates; the
# prompt asks for "LONG_COMMON_NAME;LOINC_NUMBER".
synonym_expand_select_user_prompt_formatted = synonym_expand_select_user_prompt.format(input_text=input_query, LLM_guess=synonym_expand_answer, retreived_texts=search_results_combined)
message = client.messages.create(
    model= "claude-3-5-sonnet-20241022", #"claude-3-5-sonnet-20241022", #claude-3-5-haiku-20241022
    max_tokens=500,
    temperature=0,
    system=synonym_expand_select_system_prompt,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": synonym_expand_select_user_prompt_formatted
                }
            ]
        }
    ]
)
synonym_expand_select_answer = message.content[0].text
synonym_expand_select_answer

LLM guess for the CT: Malaria rapid diagnostic test


'Plasmodium sp Ag [Identifier] in Blood by Rapid immunoassay;70569-9'

In [None]:
# # remove additional text if LLM included it
# synonym_expand_select_answer = synonym_expand_select_answer.replace('{"LONG_COMMON_NAME": "', '').replace('"}', '')
# final_search_result = ""
# for search_result in synonym_expand_retreived_data:
#     if synonym_expand_select_answer in search_result.metadata["LONG_COMMON_NAME"]:
#         final_search_result = (search_result, loinc_num_dict[search_result.metadata["LONG_COMMON_NAME"]])
# final_search_result

Need to test more inputs to understand pros and cons of each approach.

Pipeline for searching for a body site (NOT synonym)

Searching for the correct body site for a diagnosis or process presents a different kind of problem from searching for a similar term (the mapping issue above). We cannot use RAG directly, because the input and output will not be "similar".

- - -

The tested method will use the following steps to try and generate a body site:

Step 1: Use an LLM to directly guess the correct SNOMED body site. From previous experiments with ChatGPT, we found that the clinical term is often accurate, but the code will be hallucinated.

Step 2: Use the LLM output from step 1 to search the SNOMED terms using a similarity search (remove digits from the answer; filter for body site). Take the top 1 match. The code field in the metadata is our SNOMED body site code output. We use 

- - -
Problem: There are a lot of (reasonable) errors in the output. Not sure if the answers require contextual knowledge of the conventions of the specific health record system in use, or medical knowledge. Or, are there many correct outputs in some cases?

Errors found:

migraine -> head (nervous system)
constipation -> large intestine (intestinal system)
hypokalaemia -> Blood (Structure of cardiovascular system)
Depressive_episode,_unspecified -> Brain structure (Central nervous system)  [note that e.g. bipolar and schizophrenia are brain structure]

In [73]:
# Body-site pipeline, step 1: system prompt asking the LLM to name the SNOMED
# body site for a diagnosis or process taken from a health record.
body_site_step_1_system_prompt = """
You are an assistant for tagging diagnoses and processes in health records with
the correct body sites as defined in SNOMED Clinical Terminology.
You will be given an input text from a health record. 
Simply state the SNOMED body site term for the input.
"""

# User prompt template; {input_text} is filled in below.
body_site_step_1_user_prompt = """
Input to return a body site for: "{input_text}"
"""

# Underscores in the input term are turned into spaces before it is inserted.
body_site_step_1_user_prompt_formatted = body_site_step_1_user_prompt.format(
    input_text=" ".join("Constipation".split("_"))
)

In [None]:
# "You should return the
# correct SNOMED term for the input, or several options if the condition involves
# several parts of the body.""

In [None]:
# Body-site pipeline, step 1: ask the LLM for the SNOMED body-site term and
# show its answer.
# NOTE(review): `client` is not created in any cell shown here - confirm where
# it is defined before running on a fresh kernel.
message = client.messages.create(
    model="claude-3-5-sonnet-20241022",  # alternative tried: "claude-3-5-haiku-20241022"
    max_tokens=500,
    temperature=0,
    system=body_site_step_1_system_prompt,
    messages=[
        dict(
            role="user",
            content=[dict(type="text", text=body_site_step_1_user_prompt_formatted)],
        )
    ],
)

# The answer is the text of the first content block of the reply.
body_site_answer = message.content[0].text
body_site_answer

'Structure of entire gastrointestinal tract'

In [76]:
# Body-site pipeline, step 2: search the SNOMED store with the LLM's term and
# keep only the single best match (k=1).
# NOTE(review): no digit stripping or body-site filtering is applied here yet.
body_site_snomed_entry = snomed_store.search(body_site_answer, search_kwargs={"k": 1})
body_site_snomed_entry

[SearchResult(text='{"Value Set Name": "Body Site Value Set", "Description": "Gastrointestinal tract structure (body structure)", "Purpose: Clinical Focus": "All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site."}', metadata={'Code': 122865005.0, 'Description': 'Gastrointestinal tract structure (body structure)', 'Purpose: Clinical Focus': 'All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site.', 'Value Set Name': 'Body Site Value Set'}, score=None)]

In [None]:
# Full metadata dict of the top match.
body_site_snomed_entry[0].metadata

{'Code': 122865005.0,
 'Description': 'Gastrointestinal tract structure (body structure)',
 'Purpose: Clinical Focus': 'All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site.',
 'Value Set Name': 'Body Site Value Set'}

In [89]:
# Human-readable SNOMED description of the top match.
body_site_snomed_entry[0].metadata["Description"]

'Gastrointestinal tract structure (body structure)'

In [90]:
# SNOMED code of the top match - the output of the body-site pipeline.
# NOTE(review): the store returns the code as a float (122865005.0 in the
# recorded output); convert it before using it as an identifier.
body_site_snomed_entry[0].metadata["Code"]

122865005.0

In [None]:
# TODO add filtering for body site

Examine the data

In [54]:
import pandas as pd

In [59]:
# Load the SNOMED CT value-set table, keep only the "Body Site Value Set" rows,
# and save that subset as its own CSV (without the index column).
df = pd.read_csv(
    "/Users/hanna/openfn/ai_experiments/data/SNOMED-CT-Code-Value-Semantic-Set/SNOMED-CT-Code-Value-Semantic-Set.csv"
)
df_body = df.loc[df["Value Set Name"].eq("Body Site Value Set")]
df_body.to_csv(
    "/Users/hanna/openfn/ai_experiments/data/SNOMED-CT-Code-Value-Semantic-Set/SNOMED-CT-Code-Value-Semantic-Set_body_site.csv",
    index=False,
)

In [60]:
# Browse the body-site rows whose Description contains "blood". str.contains is
# case-sensitive by default, so descriptions with only "Blood" are not matched.
df_body[df_body["Description"].str.contains("blood")]

Unnamed: 0,Code System,Value Set Name,Code,Description,Purpose: Clinical Focus,Value Set OID,Code System OID
281,SNOMEDCT,Body Site Value Set,91747007,Structure of lumen of blood vessel (body struc...,"All SNOMED CT anatomic structures, locations, ...",2.16.840.1.113883.3.88.12.3221.8.9,2.16.840.1.113883.6.96
755,SNOMEDCT,Body Site Value Set,89615005,Abnormal cellular component of blood (morpholo...,"All SNOMED CT anatomic structures, locations, ...",2.16.840.1.113883.3.88.12.3221.8.9,2.16.840.1.113883.6.96
1142,SNOMEDCT,Body Site Value Set,871850003,Entire blood vessel within duodenum (body stru...,"All SNOMED CT anatomic structures, locations, ...",2.16.840.1.113883.3.88.12.3221.8.9,2.16.840.1.113883.6.96
1143,SNOMEDCT,Body Site Value Set,871851004,Structure of blood vessel within esophagus (bo...,"All SNOMED CT anatomic structures, locations, ...",2.16.840.1.113883.3.88.12.3221.8.9,2.16.840.1.113883.6.96
1144,SNOMEDCT,Body Site Value Set,871852006,Entire blood vessel within esophagus (body str...,"All SNOMED CT anatomic structures, locations, ...",2.16.840.1.113883.3.88.12.3221.8.9,2.16.840.1.113883.6.96
...,...,...,...,...,...,...,...
38983,SNOMEDCT,Body Site Value Set,1149178005,Entire blood vessel in colonic submucosa and c...,"All SNOMED CT anatomic structures, locations, ...",2.16.840.1.113883.3.88.12.3221.8.9,2.16.840.1.113883.6.96
38984,SNOMEDCT,Body Site Value Set,1149179002,Structure of lymphatic vessel and/or small blo...,"All SNOMED CT anatomic structures, locations, ...",2.16.840.1.113883.3.88.12.3221.8.9,2.16.840.1.113883.6.96
38985,SNOMEDCT,Body Site Value Set,1149181000,Entire lymphatic vessel and small blood vessel...,"All SNOMED CT anatomic structures, locations, ...",2.16.840.1.113883.3.88.12.3221.8.9,2.16.840.1.113883.6.96
39464,SNOMEDCT,Body Site Value Set,111014003,Dual red blood cell population (morphologic ab...,"All SNOMED CT anatomic structures, locations, ...",2.16.840.1.113883.3.88.12.3221.8.9,2.16.840.1.113883.6.96


In [115]:
# Drop the `df` name so the full SNOMED table is no longer referenced.
# NOTE(review): `df` is also the name of the LOINC table used by the keyword
# search earlier in the notebook; re-run the LOINC loading cell before using
# that search again.
del df