Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 43 additions & 22 deletions src/omop_mcp/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,40 @@ async def run_agent(user_prompt: str, llm_provider: str = "azure_openai") -> dic

# Step 1: Get LLM reasoning about keyword interpretation and extract components
reasoning_prompt = f"""
You are an OMOP concept mapping expert. Analyze this request and extract the key components:

User request: "{user_prompt}"

Output format:
KEYWORD: [the main clinical term/keyword to map]
OMOP_TABLE: [the OMOP table mentioned or implied]
OMOP_FIELD: [the OMOP field mentioned or implied]
ADJUSTED_KEYWORD: [the keyword you would actually search for]
REASONING: [brief explanation of any keyword adjustments or interpretation]

Keep the REASONING concise - just note if you made any changes to the keyword and why.
"""
You are an OMOP concept mapping expert with deep clinical knowledge. Your task is to analyze this request and determine what the medical keyword actually means in clinical context, even if the user doesn't specify exact OMOP details.

User request: "{user_prompt}"

**CRITICAL: You must interpret the medical keyword clinically, not just extract it literally.**

Consider:
- What does this keyword mean in medical terminology?
- What is the most likely clinical concept the user is looking for?
- Are there common medical abbreviations that need expansion?
- What would a clinician understand this to mean?
- What OMOP table/field would be most appropriate if not specified?

Examples of proper interpretation:
- "CP" in condition context → "chest pain" (not just "CP")
- "temp" in measurement context → "temperature" (not just "temp")
- "BP" in measurement context → "blood pressure" (not just "BP")
- "MI" in condition context → "myocardial infarction" (not just "MI")

**Handle natural language flexibly:**
- "Map chest pain" → infer condition_occurrence.condition_concept_id
- "Find concept for diabetes" → infer condition_occurrence.condition_concept_id
- "What's the OMOP code for aspirin?" → infer drug_exposure.drug_concept_id
- "Temperature measurement" → infer measurement.measurement_concept_id

Output format:
KEYWORD: [the main clinical term/keyword to map]
OMOP_TABLE: [the OMOP table mentioned or implied - infer if not specified]
OMOP_FIELD: [the OMOP field mentioned or implied - infer if not specified]
INFERRED_KEYWORD: [the keyword you would actually search for - this should be the CLINICAL interpretation, not the literal input]
REASONING: [explain your clinical interpretation, why you expanded/changed the keyword, and how you inferred the OMOP table/field if not specified]

**Remember: The INFERRED_KEYWORD should be what you would actually search for in a medical database, not the literal user input. If OMOP details aren't specified, make intelligent inferences based on the clinical concept.**
"""

reasoning_result = await agent.run(reasoning_prompt)
reasoning_response = (
Expand All @@ -75,7 +96,7 @@ async def run_agent(user_prompt: str, llm_provider: str = "azure_openai") -> dic
keyword = ""
omop_table = ""
omop_field = ""
adjusted_keyword = ""
inferred_keyword = ""
reasoning = ""

for line in reasoning_response.split("\n"):
Expand All @@ -86,26 +107,26 @@ async def run_agent(user_prompt: str, llm_provider: str = "azure_openai") -> dic
omop_table = line.replace("OMOP_TABLE:", "").strip()
elif line.startswith("OMOP_FIELD:"):
omop_field = line.replace("OMOP_FIELD:", "").strip()
elif line.startswith("ADJUSTED_KEYWORD:"):
adjusted_keyword = line.replace("ADJUSTED_KEYWORD:", "").strip()
elif line.startswith("INFERRED_KEYWORD:"):
inferred_keyword = line.replace("INFERRED_KEYWORD:", "").strip()
elif line.startswith("REASONING:"):
reasoning = line.replace("REASONING:", "").strip()

# If parsing failed, use fallbacks
if not keyword:
keyword = "unknown"
if not adjusted_keyword:
adjusted_keyword = keyword
if not inferred_keyword:
inferred_keyword = keyword
if not reasoning:
reasoning = f"Used keyword '{adjusted_keyword}' as provided."
reasoning = f"Used keyword '{inferred_keyword}' as provided."

# Step 2: Use the extracted information in a tool call
final_prompt = f"""
{MCP_DOC_INSTRUCTION}

Original user request: {user_prompt}

Based on your analysis, find concepts for `{adjusted_keyword}` for `{omop_field}` in the `{omop_table}` table.
Based on your analysis, find concepts for `{inferred_keyword}` for `{omop_field}` in the `{omop_table}` table.

Your previous reasoning for this keyword was: {reasoning}

Expand Down Expand Up @@ -138,9 +159,9 @@ async def run_agent(user_prompt: str, llm_provider: str = "azure_openai") -> dic
"keyword": keyword,
"omop_table": omop_table,
"omop_field": omop_field,
"adjusted_keyword": adjusted_keyword,
"inferred_keyword": inferred_keyword,
},
"original_reasoning": reasoning,
"keyword_interpretation_reasoning": reasoning,
},
}

Expand Down
15 changes: 12 additions & 3 deletions src/omop_mcp/prompts.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
MCP_DOC_INSTRUCTION = """
When selecting the best OMOP concept and vocabulary, always refer to the official OMOP CDM v5.4 documentation: https://ohdsi.github.io/CommonDataModel/faq.html and https://ohdsi.github.io/CommonDataModel/vocabulary.html.
Use the mapping conventions, standard concept definitions, and vocabulary guidance provided there to ensure your selection is accurate and consistent with OMOP best practices. Prefer concepts that are marked as 'Standard' and 'Valid', and use the recommended vocabularies for each domain (e.g., SNOMED for conditions, RxNorm for drugs, LOINC for measurements, etc.) unless otherwise specified.
When selecting the best OMOP concept and vocabulary, ALWAYS check the omop://documentation resource first for official OMOP CDM v5.4 vocabulary rules and mapping guidelines.

Prefer concepts that are marked as 'Standard' and 'Valid'. When selecting the best OMOP concept and vocabulary,

**IMPORTANT: Before making any vocabulary decisions, access omop://documentation and omop://preferred_vocabularies to see the current guidelines and preferences.**

**USER PREFERENCE HANDLING:**
- If user says "Find from LOINC vocabulary" → ONLY consider LOINC concepts
- If user says "Prefer SNOMED" → Prioritize SNOMED concepts
- If user specifies a vocabulary → That vocabulary becomes the PRIMARY choice
- Default preferences only apply when user doesn't specify a preference

The find_omop_concept tool will return multiple candidate concepts with their metadata. You must evaluate and select the most appropriate concept based on:

1. **Clinical Appropriateness**: Does the concept accurately represent the clinical term?
2. **Context Requirements**: Any specific vocabulary, validity, or other requirements mentioned in the prompt
3. **OMOP Best Practices**: Generally prefer Standard + Valid concepts from recommended vocabularies
3. **OMOP Best Practices**: Generally prefer Standard + Valid concepts from preferred vocabularies
4. **Use Case Considerations**: Research needs, granularity requirements, etc.

**IMPORTANT: You are not limited to Standard/Valid concepts if the context requires otherwise (e.g., mapping legacy data, specific vocabulary requirements, research needs).**
Expand Down
71 changes: 49 additions & 22 deletions src/omop_mcp/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import io
import json
import logging
import time
from pathlib import Path
from typing import Any, Dict, List

from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO)
import aiohttp
import mcp.types as types
from mcp.server.fastmcp import FastMCP
Expand All @@ -29,6 +31,50 @@ async def list_omop_tables() -> Dict[str, List[str]]:
return OMOP_CDM


@mcp.resource("omop://documentation")
async def omop_documentation() -> str:
    """Fetch the live OMOP CDM vocabulary documentation as plain text.

    Downloads the OHDSI vocabulary page, removes ``<script>`` and ``<style>``
    elements, and flattens the remaining text into words separated by single
    spaces.

    Returns:
        The cleaned page text on success. When the request fails, the server
        answers with a non-200 status, or the page has no extractable content
        node, a short message describing the failure is returned instead, so
        the declared ``str`` return type always holds (the function never
        falls through and returns ``None``).
    """
    url = "https://ohdsi.github.io/CommonDataModel/vocabulary.html"

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status != 200:
                    logging.warning(
                        "OMOP documentation request returned HTTP %s",
                        response.status,
                    )
                    return (
                        f"Failed to fetch OMOP documentation from {url}: "
                        f"HTTP {response.status}"
                    )
                html_content = await response.text()
    except aiohttp.ClientError as e:
        # Report transport failures as data rather than raising, matching how
        # the Athena query errors are surfaced elsewhere in this module.
        logging.warning("OMOP documentation request failed: %s", e)
        return f"Failed to fetch OMOP documentation from {url}: {str(e)}"

    soup = BeautifulSoup(html_content, "html.parser")

    # Script and style bodies are not documentation text; drop them before
    # extracting the visible content.
    for element in soup(["script", "style"]):
        element.decompose()

    # Prefer the page's main container; fall back to the whole body.
    main_content = (
        soup.find("div", class_="container-fluid main-container") or soup.body
    )
    if main_content is None:
        return f"Failed to fetch OMOP documentation from {url}: no content found"

    # Strip every line, split it on spaces, and rejoin the non-empty pieces so
    # that runs of blank lines and spaces collapse to single separators.
    text = main_content.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    return " ".join(chunk for chunk in chunks if chunk)


@mcp.resource("omop://preferred_vocabularies")
async def get_vocabulary_preference() -> Dict[str, List[str]]:
    """Return the preferred vocabularies per OMOP table, most preferred first."""
    # (table name, vocabularies in descending order of preference)
    ranked_preferences = (
        ("measurement", ("LOINC", "SNOMED")),
        ("condition_occurrence", ("SNOMED", "ICD10CM", "ICD9CM")),
        ("drug_exposure", ("RxNorm", "RxNorm Extension", "SNOMED")),
        ("procedure_occurrence", ("SNOMED", "CPT4", "ICD10PCS")),
        ("observation", ("SNOMED",)),
        ("device_exposure", ("SNOMED",)),
    )
    # Build a fresh dict of fresh lists on every call so callers never share
    # mutable state.
    return {table: list(vocabularies) for table, vocabularies in ranked_preferences}


@mcp.prompt()
async def map_clinical_concept() -> types.GetPromptResult:
"""Create a prompt for mapping clinical concepts."""
Expand Down Expand Up @@ -103,6 +149,7 @@ async def find_omop_concept(
"error": f"Failed to query Athena: {str(e)}",
}

logging.debug(f"Athena response: {data}")
concepts = []
if isinstance(data, list):
concepts = data
Expand Down Expand Up @@ -133,37 +180,17 @@ async def find_omop_concept(
}
candidates.append(candidate)

# Provide metadata to help LLM make informed decisions
domain_mapping = {
"drug_exposure": "Drug",
"condition_occurrence": "Condition",
"measurement": "Measurement",
"procedure_occurrence": "Procedure",
"observation": "Observation",
"device_exposure": "Device",
}

vocab_recommendations = {
"drug_exposure": ["RxNorm", "RxNorm Extension", "SNOMED"],
"condition_occurrence": ["SNOMED", "ICD10CM", "ICD9CM"],
"measurement": ["LOINC", "SNOMED"],
"procedure_occurrence": ["SNOMED", "CPT4", "ICD10PCS"],
"observation": ["SNOMED"],
"device_exposure": ["SNOMED"],
}

return {
"candidates": candidates,
"search_metadata": {
"keyword_searched": keyword,
"omop_table": omop_table,
"omop_field": omop_field,
"expected_domain": domain_mapping.get(omop_table, "Unknown"),
"recommended_vocabularies": vocab_recommendations.get(omop_table, []),
"total_found": len(concepts),
"candidates_returned": len(candidates),
"selection_guidance": (
"Select the most appropriate concept based on clinical context. "
"Access omop://preferred_vocabularies for vocabulary preferences. "
"Generally prefer Standard + Valid concepts from recommended vocabularies, "
"but context may require different choices (e.g., research needs, "
"specific vocabulary requirements, or non-standard mappings)."
Expand Down
7 changes: 5 additions & 2 deletions tests/fetch_test_data.sql
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,12 @@ GROUP BY SRC_NAME
ORDER BY count DESC;

-- Find medication keywords
-- Resolve concept_id by joining CODE to concept_code in v5_4.concept (RxNorm concepts only)
SELECT DISTINCT SRC_CODE AS keyword,
NULL AS count,
CODE AS concept_id_manual_mapping
FROM mappings.master_drug_mappings_index
c.concept_id AS concept_id_manual_mapping
FROM mappings.master_drug_mappings_index id
JOIN (SELECT concept_id, concept_code FROM v5_4.concept WHERE vocabulary_id='RxNorm') c
ON id.CODE = c.concept_code
WHERE CODE <> '' AND SRC <> 'MHH_COVID';