In [2]:
import pandas as pd
import os 

from pymongo import MongoClient

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import json
import re
from sklearn.model_selection import train_test_split


# Root credentials come from the environment so they are never hard-coded in the notebook.
mongouser = os.getenv('MONGO_INITDB_ROOT_USERNAME')
mongopass = os.getenv('MONGO_INITDB_ROOT_PASSWORD')

# Hand the credentials to the driver as options instead of formatting them into
# the URI: a username/password containing reserved characters ('@', ':', '/', '%')
# would otherwise need percent-escaping or the connection string is mis-parsed.
# If the variables are unset, None is passed and no credentials are sent
# (previously the literal text "None:None" was used as the login).
client = MongoClient("mongodb://mongodb:27017", username=mongouser, password=mongopass)


from openai import OpenAI
 
openai_client = OpenAI()

In [11]:
# /pipeline_datalake is our datalake volume, mounted from the local machine
terms = pd.read_excel('/pipeline_datalake/List of clinical definitions lookups.xlsx')
my_terms_list = terms['Concept Name'].tolist()
# Lower-cased copies of every concept name, used for the STR_LOWER lookup in Mongo
lower_terms = [concept.lower() for concept in my_terms_list]
# The annotation pass was already run once; leave this off unless re-annotating
run_annotation = False

## Purpose of this notebook is just to prep 20 examples for me to annotate

The output is two files for use in modeling in the next two notebooks which will be at the base of the repo

- Annoted terms.xlsx -> has the labels created from a rough estimation of entity types
- Annoted terms With Set.xlsx -> me taking that output and using the UMLS browser, the Athena web browser from OHDSI, Google, and Gemini to validate those 20
- Lessons learned were key to certain decisions and clarity around what is correct
- The next notebook (2) will build a default prompt model


### Step 1 - A Quick EDA to know what we've got

- use a basic entity mapping LLM call to create "estimates" for entities
- This will allow me to roughly stratify a validation/test set
- Plan is to do a brief annotation of a small number (20) to learn difficulties and figure out what EXACTLY I would want to see -> without this we are flying blind
  - I considered using ontology databases to generate labels to validate on and then use the annotated as test but I thought that would be creating "easy" examples to validate from

In [1]:
# Check which of our concept names already exist, as exact lower-cased strings,
# in the "mrconso" collection of the "umls" database.
db = client["umls"]
collection = db["mrconso"]

# Stage 1: keep only documents whose lower-cased string is one of our terms
match_stage = {"$match": {"STR_LOWER": {"$in": lower_terms}}}

# Stage 2: return just the string, its lower-cased form, the source (SAB) and
# the code; the Mongo-generated _id is dropped
project_stage = {
    "$project": {
        "_id": 0,
        "STR": 1,
        "SAB": 1,
        "STR_LOWER": 1,
        "CODE": 1,
    }
}

pipeline = [match_stage, project_stage]

# Run the aggregation and materialise the cursor straight into a DataFrame
matched_documents = list(collection.aggregate(pipeline))
aggregation_result_df = pd.DataFrame(matched_documents)

NameError: name 'client' is not defined

In [14]:
# (rows, columns) of the exact-match lookup result
aggregation_result_df.shape
# These exact matches could serve as a fallback source of labels.

# I will not use them, though: the annotated data is probably all I will have time for.

(3032, 4)

### Step 2) Process and prep annotated labels for modeling work

#### Define Functions for annotation and premodeling and later use

In [2]:
# Helper functions used by the annotation cells below.
# The OpenAI calls here are a pre-modeling step that only assigns rough entity types to help annotation;
# if you downloaded the repo you do not need to run them unless you plan on annotating.
import json
import pandas as pd
# Off by default: the annotation pass calls the OpenAI API and was already run once.
# Set to True only to regenerate the annotations.
run_annotation = False

def create_entity_type_prompt(term_list):
    '''Build the LLM prompt that asks for entity type(s) and a regimen flag per term.

    Args:
        term_list: JSON-serialisable list of concept names to classify.

    Returns:
        str: the instruction text followed by the terms serialised as
        ``{"entities": [...]}`` (indent=2), to be sent as one user message.
    '''

    entity_json_string = json.dumps({"entities": term_list}, indent=2)

    # This is a plain string (neither an f-string nor passed through .format()),
    # so braces are written singly: doubled braces would reach the model
    # literally and misdescribe the expected JSON shape. The ```json example
    # fence is also closed so the example and the input stay visually separate.
    prompt = """
    You are an expert in clinical informatics.
    
    Given a JSON list of concept names, return a JSON where each concept is assigned:
    1. Its most appropriate **entity type(s)** from the following list:
       - 'diagnosis'
       - 'procedure'
       - 'measurements/labs'
       - 'medication'
       - 'drug_class'
       If more than one type applies, return them as a **comma-separated string** (e.g., "medication,drug_class").
    
    2. A boolean `is_regime` flag that is `true` if the term appears to refer to a multi-drug **regimen** or **combination therapy** (e.g., "folfirinox", "FOLFOX", "triple therapy") — otherwise, return `false`.
    
    Return the result in the following JSON format only:
    ```json
    {
      "entities": {
        "[ENTITY_NAME]": {
          "entity_name": "[ENTITY_NAME]",
          "types": "[ENTITY_TYPE]",
          "is_regime": true | false
        }
      }
    }
    ```

    Here is the input
    """

    return prompt + entity_json_string

def flatten_entity_types_to_df(response_json):
    '''Turn the parsed entity-type response into one DataFrame row per entity.

    Each value under the top-level "entities" key becomes a row with the
    columns entity_name, types and is_regime. Missing fields fall back to the
    dictionary key, an empty string and False respectively. A response with no
    "entities" yields an empty DataFrame.
    '''
    records = [
        {
            "entity_name": payload.get("entity_name", key),
            "types": payload.get("types", ""),
            "is_regime": payload.get("is_regime", False),
        }
        for key, payload in response_json.get("entities", {}).items()
    ]
    return pd.DataFrame(records)



def get_completion(prompt, model="gpt-4-turbo"):
    """Send `prompt` as a single user message and return the reply text.

    Uses the module-level `openai_client` with temperature 0 and returns the
    message content of the first choice.
    """
    completion = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def strip_markdown_fences(text):
    """Remove markdown code-fence markers from an LLM reply.

    After trimming surrounding whitespace, deletes a line-initial "```json"
    (with any whitespace that follows it) and any "```" sitting at the end of
    a line. Everything else is returned unchanged.
    """
    fence_pattern = re.compile(r"^```json\s*|```$", flags=re.MULTILINE)
    trimmed = text.strip()
    return fence_pattern.sub("", trimmed)


#### Run basic annotation LLM calls to allow for stratifications 

In [3]:
batch_size = 20
model = "gpt-4-turbo"

# The annotation pass calls the OpenAI API and its result is stored on disk,
# so it really only needs to run once; it is skipped unless run_annotation is set.
if run_annotation:

    # Not real batching (no batch API involved): the term list is just sent in
    # chunks of `batch_size` so each prompt stays small.
    all_raw_results = []
    for i in range(0, len(terms), batch_size):
        batch_terms = terms["Concept Name"].iloc[i:i+batch_size].tolist()
        prompt = create_entity_type_prompt(batch_terms)
        raw_response = get_completion(prompt, model=model)  # raw reply text, may be wrapped in a markdown fence
        all_raw_results.append(raw_response)

    # Strip any markdown, parse each reply, and build the full frame with one
    # concat (growing a DataFrame inside a loop re-copies it on every iteration).
    batch_frames = [
        flatten_entity_types_to_df(json.loads(strip_markdown_fences(raw)))
        for raw in all_raw_results
    ]
    df_full = pd.concat(batch_frames)

    annotated_names = set(df_full['entity_name'].to_list())
    source_names = set(terms['Concept Name'].to_list())
    # NOTE(review): this passes as soon as ONE name is shared between the two
    # sets; it does not prove every term came back. Tighten to
    # `annotated_names == source_names` if full alignment is required.
    assert len(annotated_names & source_names), 'Assertion Error: Dataframes do not align'
    # keep it the same for storage
    df_full = df_full.reset_index(drop=True)

    # Write once only: an existing annotation file is never overwritten.
    if not os.path.isfile('/pipeline_datalake/Annoted terms.xlsx'):
        df_full.to_excel('/pipeline_datalake/Annoted terms.xlsx')


#### Take annotation results and create validation/test sets for annotating

In [None]:
# Load the stored entity-type annotations from the datalake
df_full = pd.read_excel('/pipeline_datalake/Annoted terms.xlsx')

In [None]:
# Sizes and seed of the annotation split, named so the comments cannot drift
# away from the numbers actually used.
SINGLE_TYPES = ['medication', 'diagnosis', 'measurements/labs', 'procedure']
N_ANNOTATE = 26   # rows held out for manual annotation (validation + test)
N_VAL = 13        # of those, rows assigned to validation; the rest become test
SPLIT_SEED = 42

# Only rows with exactly one of the four single types are split here;
# rows with combined, other, or missing types are excluded by this filter.
df_full_use = df_full[df_full['types'].isin(SINGLE_TYPES)]

# Step 1: split off N_ANNOTATE samples, stratified by 'types'; everything else is "output"
df_26, df_output = train_test_split(
    df_full_use,
    train_size=N_ANNOTATE,
    stratify=df_full_use['types'],
    random_state=SPLIT_SEED,
)

# Step 2: split those samples into N_VAL validation rows and the remaining test rows, stratified by 'types'
df_val, df_test = train_test_split(
    df_26,
    train_size=N_VAL,
    stratify=df_26['types'],
    random_state=SPLIT_SEED,
)

print(f"Validation set size: {len(df_val)}")
print(f"Test set size: {len(df_test)}")
print(f"Output set size: {len(df_output)}")


#### NOTE I later chose only 20 to be my validation set

That's all I really had time for. Will annotate as many as I can for test but we'll do this in google docs

Of course this assumes that the model did the entity assignments vaguely correctly but I can tweak if I need to 

In [None]:
# First row whose assigned type is the combined 'medication,drug_class'
df_full[df_full['types'] == 'medication,drug_class'].head(1)

In [None]:
# Last row whose assigned type is the combined 'medication,drug_class'
df_full[df_full['types'] == 'medication,drug_class'].tail(1)

In [None]:
# Rows typed as both medication and drug class, and rows typed purely as drug class
both_types_rows = df_full[df_full['types'] == 'medication,drug_class']
drug_class_rows = df_full[df_full['types'] == 'drug_class']

# The first combined row is appended to validation, the last one to test
df_val = pd.concat([df_val, both_types_rows.head(1)])
df_test = pd.concat([df_test, both_types_rows.tail(1)])
# and test also gets the first drug_class row (per the original note, the only one found)
df_test = pd.concat([df_test, drug_class_rows.head(1)])

In [None]:
# Tag each frame (in place) with the split it belongs to, then stack them
for split_frame, split_name in ((df_val, 'val'), (df_test, 'test'), (df_output, 'output')):
    split_frame['set'] = split_name
final_labeled = pd.concat([df_val, df_test, df_output])

# Row count, distinct source concepts and distinct labelled entities must all agree
n_concepts = terms['Concept Name'].nunique()
assert final_labeled.shape[0] == n_concepts == final_labeled['entity_name'].nunique()

In [None]:
# Row counts per assigned type; dropna=False also counts rows with no type
df_full['types'].value_counts(dropna=False)

In [None]:
# Write the split assignment once only; an existing file is left untouched
# (presumably so manual annotations made in it are not clobbered - confirm).
set_path = '/pipeline_datalake/Annoted terms With Set.xlsx'
if not os.path.isfile(set_path):
    final_labeled.to_excel(set_path)


### Step 3: Post-Annotation form evaluation dataframe for modeling 
Ready to go
We continue in the next notebook



In [4]:
# Load the 'entity_clean' sheet of the labelled workbook
annotated_df = pd.read_excel('/pipeline_datalake/Annoted terms With Labels.xlsx', sheet_name='entity_clean')

In [5]:
# (rows, columns) of the subset flagged validated == 1
annotated_df[annotated_df['validated'] == 1].shape

(31, 9)

In [6]:
# Number of distinct entity names among the validated rows
annotated_df[annotated_df['validated'] == 1]['entity_name'].nunique()

20

In [12]:
# Preview the first rows of the labelled data
annotated_df.head()

Unnamed: 0,entity_name,types,is_regime,set,should_say_no,codes_pipe,vocabulary,text,validated
0,Liver Transplant Rejection,diagnosis,False,val,,T86.41,ICD-10,Liver transplant rejection,1.0
1,Oseltamivir,medication,False,val,,260101,RxNorm,oseltamivir,1.0
2,Lurbinectedin,medication,False,val,,2374729,RxNorm,lurbinectedin,1.0
3,Wheezing,diagnosis,False,val,,R06.2,ICD-10,Wheezing,1.0
4,eptifibatide,medication,False,val,,75635,RxNorm,eptifibatide,1.0


#### Reflect on annotations

6/20 were difficult

green nails	- diagnosis

Dilation of hypoglossal nerve, open approach - 	procedure

Methylxanthine - 	medication,drug_class	

long-acting beta agonist	- drug_class	

Census Subregion - Mountain	-  NaN


#### Can I make more samples automatically with a database?

Probably not a good idea: I would have to keep the ratio of difficult terms balanced.
I could use tools to look up values if I have time.
