# Admission extraction

In [1]:
from typing import List, Optional

from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field, validator

import os

from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate

# Use vLLM, should be faster than pipeline
from vllm import LLM, SamplingParams

# To download the models
from huggingface_hub import snapshot_download

import pandas as pd

from transformers import AutoTokenizer

import torch

INFO 06-03 15:42:00 [__init__.py:243] Automatically detected platform cuda.


In [1]:
!pip freeze -l > requirements.txt

In [None]:
# Never hardcode the Hugging Face API token in the notebook: reuse it when it is
# already exported in the environment, otherwise prompt for it without echoing.
import getpass

if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Hugging Face API token: ")

In [None]:
# Relative location of the MIMIC-IV v2.2 files (mimic base folder).
# Annotated as `str`, which is what os.path.join returns; `os.path` is a module, not a type.
MIMIC_FILES_BASE_FOLDER: str = os.path.join("mimic-iv", "physionet", "files", "mimiciv", "2.2")

MIMIC_FILES_BASE_FOLDER

('mimic-iv/physionet/hosp/mimiciv_hosp.db',
 'mimic-iv/physionet/files/mimiciv/2.2')

In [None]:
# Path to the gzip-compressed discharge file, resolved against the current
# working directory. Annotated as `str`: `os.path` is a module, not a type.
NOTES_PATH: str = os.path.join(os.getcwd(), MIMIC_FILES_BASE_FOLDER, "hosp", "discharge.csv.gz")
NOTES_PATH

'/orfeo/cephfs/scratch/area/egoat/mimic-iv/physionet/files/mimiciv/2.2/hosp/discharge.csv.gz'

In [None]:
def get_iter(path: str, chunksize: int = 100000) -> "pd.io.parsers.TextFileReader":
    """
    Open a gzip-compressed CSV for chunked reading.

    The file is too large to load directly as a single pandas DataFrame, so it
    is exposed as an iterator over DataFrame chunks instead.

    Parameters
    ----------
    path : str
        Location of the gzip-compressed CSV file; it is passed straight to
        ``pd.read_csv``.
    chunksize : int, default 100000
        Number of rows requested per chunk (the last chunk may be shorter).

    Returns
    -------
    pandas.io.parsers.TextFileReader
        Reader object; iterating over it yields successive ``pd.DataFrame`` chunks.
    """
    return pd.read_csv(path, compression="gzip", chunksize=chunksize)

In [6]:
class AdmissionPhysicalExam(BaseModel):
    """Represents the findings of an admission physical examination across various systems."""

    # One optional free-text field per physical-exam (PE) section, each defaulting to None.
    # The class docstring, the field order and the descriptions are runtime text that
    # pydantic carries into the model's schema, so they are kept exactly as they were.
    heent: Optional[str] = Field(None, description="PE Section: Head, Eyes, Ears, Nose, Throat Exam or HEENT")
    neuro: Optional[str] = Field(None, description="PE Section: Neurological Exam")
    vs: Optional[str] = Field(None, description="PE Section: Vital Signs or Vitals")
    general: Optional[str] = Field(None, description="PE Section: General Appearance Exam")
    neck: Optional[str] = Field(None, description="PE Section: Neck Exam")
    skin: Optional[str] = Field(None, description="PE Section: Skin Exam")
    lymph: Optional[str] = Field(None, description="PE Section: Lymphatic System Exam")
    ext: Optional[str] = Field(None, description="PE Section: Extremities Exam")
    abd: Optional[str] = Field(None, description="PE Section: Abdominal Exam")
    psych: Optional[str] = Field(None, description="PE Section: Psychiatric Exam")
    cv: Optional[str] = Field(None, description="PE Section: Cardiovascular Exam")
    resp: Optional[str] = Field(None, description="PE Section: Respiratory Exam")
    ent: Optional[str] = Field(None, description="PE Section: Ears, Nose, and Throat Exam")
    back: Optional[str] = Field(None, description="PE Section: Back Exam")
    chest: Optional[str] = Field(None, description="PE Section: Chest Exam")
    gu: Optional[str] = Field(None, description="PE Section: Genitourinary Exam")
    spine: Optional[str] = Field(None, description="PE Section: Spine Exam")
    head: Optional[str] = Field(None, description="PE Section: Head Exam")
    gi: Optional[str] = Field(None, description="PE Section: Gastrointestinal Exam")

In [None]:
# Optional (run once): download the whole MedGemma model to a local folder.
# local_dir = snapshot_download(repo_id="google/medgemma-27b-text-it")

In [None]:
# Hugging Face repo id of the model to serve:
# "google/medgemma-27b-text-it" or "google/medgemma-4b-it".
model_id = "google/medgemma-27b-text-it"

llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,    # bfloat16 precision
    tensor_parallel_size=2,  # enable multi-GPU
    max_model_len=10_000,    # max context window in tokens; an int (previously the float 1e4)
    max_num_seqs=10,         # limit batch size
    enforce_eager=True,      # disable CUDA graphs to reduce memory, at the cost of performance
)

# temperature=0.0 enables greedy decoding; each completion is capped at 1000 tokens.
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=1000,
)

INFO 06-03 15:42:02 [__init__.py:31] Available plugins for group vllm.general_plugins:
INFO 06-03 15:42:02 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
INFO 06-03 15:42:02 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
INFO 06-03 15:42:13 [config.py:793] This model supports multiple tasks: {'score', 'embed', 'generate', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 06-03 15:42:13 [config.py:1875] Defaulting to use mp for distributed inference
INFO 06-03 15:42:13 [config.py:2118] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 06-03 15:42:17 [core.py:438] Waiting for init message from front-end.
INFO 06-03 15:42:17 [core.py:65] Initializing a V1 LLM engine (v0.9.0.1) with config: model='google/medgemma-27b-text-it', speculative_config=None, tokenizer='google/medgemma-27b-text-it', skip_tokenizer_init=False, tokenizer_mo

Loading safetensors checkpoint shards:   0% Completed | 0/11 [00:00<?, ?it/s]


[1;36m(VllmWorker rank=1 pid=2887148)[0;0m INFO 06-03 15:42:42 [default_loader.py:280] Loading weights took 21.96 seconds
[1;36m(VllmWorker rank=1 pid=2887148)[0;0m INFO 06-03 15:42:43 [gpu_model_runner.py:1549] Model loading took 25.4906 GiB and 22.849613 seconds
[1;36m(VllmWorker rank=0 pid=2887147)[0;0m INFO 06-03 15:42:43 [default_loader.py:280] Loading weights took 22.08 seconds
[1;36m(VllmWorker rank=0 pid=2887147)[0;0m INFO 06-03 15:42:43 [gpu_model_runner.py:1549] Model loading took 25.4906 GiB and 23.683553 seconds
INFO 06-03 15:42:50 [kv_cache_utils.py:637] GPU KV cache size: 29,264 tokens
INFO 06-03 15:42:50 [kv_cache_utils.py:640] Maximum concurrency for 10,000 tokens per request: 2.93x
INFO 06-03 15:42:50 [kv_cache_utils.py:637] GPU KV cache size: 29,264 tokens
INFO 06-03 15:42:50 [kv_cache_utils.py:640] Maximum concurrency for 10,000 tokens per request: 2.93x
INFO 06-03 15:42:52 [core.py:167] init engine (profile, create kv cache, warmup model) took 8.01 seconds


In [10]:
# Prompt text for the extraction task. It contains two placeholders,
# {format_instructions} and {text}, that are filled in when the prompt is
# formatted. The literal below is runtime text sent to the model: edit with care.
template = r"""
You are a data‐extraction assistant.

Task:
1. From the provided clinical note, locate the "Physical Exam" section.
2. Within that section, extract **only** the text under the **Admission** subsection.

## Critical Extraction Rules:
- **START ONLY** at "Admission:" heading - ignore ALL content before this heading
- **EXCLUDE** any preceding content  
- **STOP** extraction at next heading (e.g., "Findings:", "Impression:", "Assessment:") or clear section break
- **WHEN VERY UNCERTAIN** about text boundaries: **EXCLUDE rather than include**
- **PRESERVE** multi-line sentences within the Admission subsection

The extracted text data **must** be outputted in the following format:
**{format_instructions}**

Text:
{text}
"""

In [None]:
# Output parser bound to the AdmissionPhysicalExam pydantic model.
parser = PydanticOutputParser(pydantic_object=AdmissionPhysicalExam)

# Prompt template: `text` is supplied on every call, while the parser's format
# instructions are filled in once, up front, as a partial variable.
creation_prompt = PromptTemplate(
    input_variables=["text"],
    partial_variables=dict(format_instructions=parser.get_format_instructions()),
    template=template,
)

# Extraction of the first 100 discharge notes

In [None]:
BATCH_SIZE: int = 2   # notes sent to the model per llm.chat call
N_NOTES: int = 100    # number of discharge notes to process

# First N_NOTES rows of the discharge file (file order, not a random sample).
# Requesting a single chunk of exactly that size avoids parsing a 100k-row chunk
# only to keep 100 rows, and the `with` block closes the reader afterwards.
with get_iter(NOTES_PATH, chunksize=N_NOTES) as notes_reader:
    random_discharge_batch: pd.DataFrame = next(iter(notes_reader))

# One entry per generated output, in the order the outputs are returned;
# None marks an output that could not be parsed into AdmissionPhysicalExam.
responces: List[Optional[AdmissionPhysicalExam]] = []

# Bound the loop by the rows actually read so a short file never yields empty batches.
for idx in range(0, len(random_discharge_batch), BATCH_SIZE):
    # `text` column slice for this batch (a Series of note texts).
    cbatch: pd.Series = random_discharge_batch['text'].iloc[idx: idx + BATCH_SIZE]

    # One conversation per note: a system prompt followed by the formatted user prompt.
    messages_batch = [
        [
            {
                "role": "system",
                "content": "You are a helpful medical assistant."
            },
            {
                "role": "user",
                "content": creation_prompt.format(text=text)
            },
        ]
        for text in cbatch
    ]

    outputs = llm.chat(
        messages_batch,
        sampling_params=sampling_params,
        use_tqdm=True,
        add_generation_prompt=True,
        continue_final_message=False,
    )

    for offset, out_data in enumerate(outputs):
        try:
            responces.append(parser.parse(out_data.outputs[0].text))
        except ValueError as err:
            # LangChain's OutputParserException derives from ValueError. A single
            # malformed completion should not abort the remaining batches, so record
            # the failure and keep going. Only the exception type is printed, to
            # avoid echoing note content into the cell output.
            print(f"Output {idx + offset}: could not parse model output ({type(err).__name__})")
            responces.append(None)

In [None]:
# Print the full list of collected extraction results.
print(responces)