OHNLP · JaerongA · Aug 21, 2025 · Aug 12, 2025 · Aug 14, 2025 · Aug 14, 2025
diff --git a/src/omop_mcp/agent.py b/src/omop_mcp/agent.py
@@ -50,19 +50,40 @@ async def run_agent(user_prompt: str, llm_provider: str = "azure_openai") -> dic
 
     # Step 1: Get LLM reasoning about keyword interpretation and extract components
     reasoning_prompt = f"""
-You are an OMOP concept mapping expert. Analyze this request and extract the key components:
-
-User request: "{user_prompt}"
-
-Output format:
-KEYWORD: [the main clinical term/keyword to map]
-OMOP_TABLE: [the OMOP table mentioned or implied]
-OMOP_FIELD: [the OMOP field mentioned or implied] 
-ADJUSTED_KEYWORD: [the keyword you would actually search for]
-REASONING: [brief explanation of any keyword adjustments or interpretation]
-
-Keep the REASONING concise - just note if you made any changes to the keyword and why.
-"""
+    You are an OMOP concept mapping expert with deep clinical knowledge. Your task is to analyze this request and determine what the medical keyword actually means in clinical context, even if the user doesn't specify exact OMOP details.
+
+    User request: "{user_prompt}"
+
+    **CRITICAL: You must interpret the medical keyword clinically, not just extract it literally.**
+
+    Consider:
+    - What does this keyword mean in medical terminology?
+    - What is the most likely clinical concept the user is looking for?
+    - Are there common medical abbreviations that need expansion?
+    - What would a clinician understand this to mean?
+    - What OMOP table/field would be most appropriate if not specified?
+
+    Examples of proper interpretation:
+    - "CP" in condition context → "chest pain" (not just "CP")
+    - "temp" in measurement context → "temperature" (not just "temp") 
+    - "BP" in measurement context → "blood pressure" (not just "BP")
+    - "MI" in condition context → "myocardial infarction" (not just "MI")
+
+    **Handle natural language flexibly:**
+    - "Map chest pain" → infer condition_occurrence.condition_concept_id
+    - "Find concept for diabetes" → infer condition_occurrence.condition_concept_id
+    - "What's the OMOP code for aspirin?" → infer drug_exposure.drug_concept_id
+    - "Temperature measurement" → infer measurement.measurement_concept_id
+
+    Output format:
+    KEYWORD: [the main clinical term/keyword to map]
+    OMOP_TABLE: [the OMOP table mentioned or implied - infer if not specified]
+    OMOP_FIELD: [the OMOP field mentioned or implied - infer if not specified] 
+    INFERRED_KEYWORD: [the keyword you would actually search for - this should be the CLINICAL interpretation, not the literal input]
+    REASONING: [explain your clinical interpretation, why you expanded/changed the keyword, and how you inferred the OMOP table/field if not specified]
+
+    **Remember: The INFERRED_KEYWORD should be what you would actually search for in a medical database, not the literal user input. If OMOP details aren't specified, make intelligent inferences based on the clinical concept.**
+    """
 
     reasoning_result = await agent.run(reasoning_prompt)
     reasoning_response = (
@@ -75,7 +96,7 @@ async def run_agent(user_prompt: str, llm_provider: str = "azure_openai") -> dic
     keyword = ""
     omop_table = ""
     omop_field = ""
-    adjusted_keyword = ""
+    inferred_keyword = ""
     reasoning = ""
 
     for line in reasoning_response.split("\n"):
@@ -86,26 +107,26 @@ async def run_agent(user_prompt: str, llm_provider: str = "azure_openai") -> dic
             omop_table = line.replace("OMOP_TABLE:", "").strip()
         elif line.startswith("OMOP_FIELD:"):
             omop_field = line.replace("OMOP_FIELD:", "").strip()
-        elif line.startswith("ADJUSTED_KEYWORD:"):
-            adjusted_keyword = line.replace("ADJUSTED_KEYWORD:", "").strip()
+        elif line.startswith("INFERRED_KEYWORD:"):
+            inferred_keyword = line.replace("INFERRED_KEYWORD:", "").strip()
         elif line.startswith("REASONING:"):
             reasoning = line.replace("REASONING:", "").strip()
 
     # If parsing failed, use fallbacks
     if not keyword:
         keyword = "unknown"
-    if not adjusted_keyword:
-        adjusted_keyword = keyword
+    if not inferred_keyword:
+        inferred_keyword = keyword
     if not reasoning:
-        reasoning = f"Used keyword '{adjusted_keyword}' as provided."
+        reasoning = f"Used keyword '{inferred_keyword}' as provided."
 
     # Step 2: Use the extracted information in a tool call
     final_prompt = f"""
 {MCP_DOC_INSTRUCTION}
 
 Original user request: {user_prompt}
 
-Based on your analysis, find concepts for `{adjusted_keyword}` for `{omop_field}` in the `{omop_table}` table.
+Based on your analysis, find concepts for `{inferred_keyword}` for `{omop_field}` in the `{omop_table}` table.
 
 Your previous reasoning for this keyword was: {reasoning}
 
@@ -138,9 +159,9 @@ async def run_agent(user_prompt: str, llm_provider: str = "azure_openai") -> dic
                 "keyword": keyword,
                 "omop_table": omop_table,
                 "omop_field": omop_field,
-                "adjusted_keyword": adjusted_keyword,
+                "inferred_keyword": inferred_keyword,
             },
-            "original_reasoning": reasoning,
+            "keyword_interpretation_reasoning": reasoning,
         },
     }
 

diff --git a/src/omop_mcp/prompts.py b/src/omop_mcp/prompts.py
@@ -1,12 +1,21 @@
 MCP_DOC_INSTRUCTION = """
-When selecting the best OMOP concept and vocabulary, always refer to the official OMOP CDM v5.4 documentation: https://ohdsi.github.io/CommonDataModel/faq.html and https://ohdsi.github.io/CommonDataModel/vocabulary.html.
-Use the mapping conventions, standard concept definitions, and vocabulary guidance provided there to ensure your selection is accurate and consistent with OMOP best practices. Prefer concepts that are marked as 'Standard' and 'Valid', and use the recommended vocabularies for each domain (e.g., SNOMED for conditions, RxNorm for drugs, LOINC for measurements, etc.) unless otherwise specified.
+When selecting the best OMOP concept and vocabulary, ALWAYS check the omop://documentation resource first for official OMOP CDM v5.4 vocabulary rules and mapping guidelines.
+
+Prefer concepts that are marked as 'Standard' and 'Valid'.
+
+**IMPORTANT: Before making any vocabulary decisions, access omop://documentation and omop://preferred_vocabularies to see the current guidelines and preferences.**
+
+**USER PREFERENCE HANDLING:**
+- If user says "Find from LOINC vocabulary" → ONLY consider LOINC concepts
+- If user says "Prefer SNOMED" → Prioritize SNOMED concepts
+- If user specifies a vocabulary → That vocabulary becomes the PRIMARY choice
+- Default preferences only apply when user doesn't specify a preference
 
 The find_omop_concept tool will return multiple candidate concepts with their metadata. You must evaluate and select the most appropriate concept based on:
 
 1. **Clinical Appropriateness**: Does the concept accurately represent the clinical term?
 2. **Context Requirements**: Any specific vocabulary, validity, or other requirements mentioned in the prompt
-3. **OMOP Best Practices**: Generally prefer Standard + Valid concepts from recommended vocabularies
+3. **OMOP Best Practices**: Generally prefer Standard + Valid concepts from preferred vocabularies
 4. **Use Case Considerations**: Research needs, granularity requirements, etc.
 
 **IMPORTANT: You are not limited to Standard/Valid concepts if the context requires otherwise (e.g., mapping legacy data, specific vocabulary requirements, research needs).**

diff --git a/src/omop_mcp/server.py b/src/omop_mcp/server.py
@@ -2,10 +2,12 @@
 import io
 import json
 import logging
-import time
 from pathlib import Path
 from typing import Any, Dict, List
 
+from bs4 import BeautifulSoup
+
+logging.basicConfig(level=logging.INFO)
 import aiohttp
 import mcp.types as types
 from mcp.server.fastmcp import FastMCP
@@ -29,6 +31,50 @@ async def list_omop_tables() -> Dict[str, List[str]]:
     return OMOP_CDM
 
 
+@mcp.resource("omop://documentation")
+async def omop_documentation() -> str:
+    """Fetch the live OMOP CDM vocabulary page and return its text content.
+
+    Returns:
+        The page's main content as whitespace-normalised plain text, or a
+        "Failed to fetch OMOP documentation: ..." message when the request
+        does not return HTTP 200 or the page has no extractable body.
+    """
+    url = "https://ohdsi.github.io/CommonDataModel/vocabulary.html"
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            if response.status != 200:
+                # Bail out early: a non-200 response has no page text to parse.
+                return f"Failed to fetch OMOP documentation: HTTP {response.status}"
+            html_content = await response.text()
+
+    soup = BeautifulSoup(html_content, "html.parser")
+    for tag in soup(["script", "style"]):  # drop non-documentation markup
+        tag.decompose()
+
+    content = soup.find("div", class_="container-fluid main-container") or soup.body
+    if content is None:
+        return "Failed to fetch OMOP documentation: page has no readable content"
+
+    # Strip each line, split on double spaces, and re-join the non-empty pieces.
+    lines = (line.strip() for line in content.get_text().splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    return " ".join(chunk for chunk in chunks if chunk)
+
+
+@mcp.resource("omop://preferred_vocabularies")
+async def get_vocabulary_preference() -> Dict[str, List[str]]:
+    """Preferred vocabularies for each OMOP table, listed in order of preference."""
+    return dict(
+        measurement=["LOINC", "SNOMED"],
+        condition_occurrence=["SNOMED", "ICD10CM", "ICD9CM"],
+        drug_exposure=["RxNorm", "RxNorm Extension", "SNOMED"],
+        procedure_occurrence=["SNOMED", "CPT4", "ICD10PCS"],
+        observation=["SNOMED"],
+        device_exposure=["SNOMED"],
+    )
+
+
 @mcp.prompt()
 async def map_clinical_concept() -> types.GetPromptResult:
     """Create a prompt for mapping clinical concepts."""
@@ -103,6 +149,7 @@ async def find_omop_concept(
                 "error": f"Failed to query Athena: {str(e)}",
             }
 
+        logging.debug(f"Athena response: {data}")
         concepts = []
         if isinstance(data, list):
             concepts = data
@@ -133,37 +180,17 @@ async def find_omop_concept(
             }
             candidates.append(candidate)
 
-        # Provide metadata to help LLM make informed decisions
-        domain_mapping = {
-            "drug_exposure": "Drug",
-            "condition_occurrence": "Condition",
-            "measurement": "Measurement",
-            "procedure_occurrence": "Procedure",
-            "observation": "Observation",
-            "device_exposure": "Device",
-        }
-
-        vocab_recommendations = {
-            "drug_exposure": ["RxNorm", "RxNorm Extension", "SNOMED"],
-            "condition_occurrence": ["SNOMED", "ICD10CM", "ICD9CM"],
-            "measurement": ["LOINC", "SNOMED"],
-            "procedure_occurrence": ["SNOMED", "CPT4", "ICD10PCS"],
-            "observation": ["SNOMED"],
-            "device_exposure": ["SNOMED"],
-        }
-
         return {
             "candidates": candidates,
             "search_metadata": {
                 "keyword_searched": keyword,
                 "omop_table": omop_table,
                 "omop_field": omop_field,
-                "expected_domain": domain_mapping.get(omop_table, "Unknown"),
-                "recommended_vocabularies": vocab_recommendations.get(omop_table, []),
                 "total_found": len(concepts),
                 "candidates_returned": len(candidates),
                 "selection_guidance": (
                     "Select the most appropriate concept based on clinical context. "
+                    "Access omop://preferred_vocabularies for vocabulary preferences. "
                     "Generally prefer Standard + Valid concepts from recommended vocabularies, "
                     "but context may require different choices (e.g., research needs, "
                     "specific vocabulary requirements, or non-standard mappings)."

diff --git a/tests/fetch_test_data.sql b/tests/fetch_test_data.sql
@@ -53,9 +53,12 @@ GROUP BY SRC_NAME
 ORDER BY count DESC;
 
 -- Find medication keywords
+-- concept id should be fetched from v5_4.concept table by joining on concept_code
 SELECT DISTINCT SRC_CODE AS keyword,
        NULL AS count,
-       CODE AS concept_id_manual_mapping
-FROM mappings.master_drug_mappings_index
+    c.concept_id AS concept_id_manual_mapping
+FROM mappings.master_drug_mappings_index id
+JOIN (SELECT concept_id, concept_code FROM v5_4.concept WHERE vocabulary_id='RxNorm') c
+ON id.CODE = c.concept_code
 WHERE CODE <> '' AND SRC <> 'MHH_COVID';