# Starter: Zero shot data extraction using LLMs

In [17]:
from ollama import chat
from ollama import ChatResponse
import pandas as pd
import json
import ast

## prompts

In [18]:
system_prompt = "As a medical professional, please analyze the following medical notes carefully and extract relevant information in the specified JSON format. Do not include any values that are not mentioned in the notes."

In [19]:
# Define the template with a placeholder
prompt_template = """
For each entry:
- Accurately identify and extract patient data, visit motivation, symptoms, and vital signs.
- Only include keys that have corresponding information in the notes, omitting any keys that are not mentioned.
- Follow this JSON structure:

```json
{
  "patient_info": {
    "age": <number>,
    "gender": <string>
  },
  "visit_motivation": "",
  "symptoms": [<string>],
  "vital_signs": {
    "heart_rate": {
      "value": <number>,
      "unit": "bpm"
    },
    "oxygen_saturation": {
      "value": <number>,
      "unit": "%"
    },
    "cholesterol_level": {
      "value": <number>,
      "unit": "mg/dL"
    },
    "glucose_level": {
      "value": <number>,
      "unit": "mg/dL"
    },
    "temperature": {
      "value": <number>,
      "unit": "°C"
    },
    "respiratory_rate": {
      "value": <number>,
      "unit": "breaths/min"
    },
    "blood_pressure": {
      "systolic": {
        "value": <number>,
        "unit": "mmHg"
      },
      "diastolic": {
        "value": <number>,
        "unit": "mmHg"
      }
    }
  }
}

Here are the clinical notes to review:

{medical_notes}

"""

In [20]:
# load test data
test_df = pd.read_csv("medical-note-extraction-h-2-o-gen-ai-world-ny/test.csv")

In [21]:
test_df.head()

Unnamed: 0,ID,Note
0,10823,**Clinical Notes**\n\n**Patient Information:**...
1,10824,**Clinical Notes**\n\n**Patient Information:**...
2,10825,Clinical Note:\n\nPatient: 71-year-old male\nC...
3,10826,**Clinical Notes**\n\n**Patient Information:**...
4,10827,**Clinical Notes**\n\n**Patient Information:**...


In [22]:
# Try a small subset
trial_df = test_df.iloc[:10].copy()

In [23]:
#Build prompts
all_prompts = []
for index,row in trial_df.iterrows():
    
    medical_notes = row.Note
    filled_prompt = f"{prompt_template}".replace("{medical_notes}", medical_notes)    
    message=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": filled_prompt}
    ]
    all_prompts.append(message)
    

In [24]:
print(all_prompts[0])

[{'role': 'system', 'content': 'As a medical professional, please analyze the following medical notes carefully and extract relevant information in the specified JSON format. Do not include any values that are not mentioned in the notes.'}, {'role': 'user', 'content': '\nFor each entry:\n- Accurately identify and extract patient data, visit motivation, symptoms, and vital signs.\n- Only include keys that have corresponding information in the notes, omitting any keys that are not mentioned.\n- Follow this JSON structure:\n\n```json\n{\n  "patient_info": {\n    "age": <number>,\n    "gender": <string>\n  },\n  "visit_motivation": "",\n  "symptoms": [<string>],\n  "vital_signs": {\n    "heart_rate": {\n      "value": <number>,\n      "unit": "bpm"\n    },\n    "oxygen_saturation": {\n      "value": <number>,\n      "unit": "%"\n    },\n    "cholesterol_level": {\n      "value": <number>,\n      "unit": "mg/dL"\n    },\n    "glucose_level": {\n      "value": <number>,\n      "unit": "mg/dL

In [25]:
def print_outputs(outputs):
    for output in outputs:
        print(f"Generated text: {output!r}")
        print("-" * 80)

## zero shot prediction

In [26]:
%%time
model = "gpt-oss:20b"  # or your model name
options = {
    "temperature": 0.2,
    "top_p": 0.9,
    "seed": 777,
    "num_predict": 1024,  # vLLM's max_tokens
}

def run_batch(all_prompts):
    outputs = []
    for messages in all_prompts:
        resp: ChatResponse = chat(model=model, messages=messages, options=options)
        outputs.append(resp["message"]["content"])
    return outputs

from time import time
start = time()

# responses = llm.generate(
#     all_prompts[0],
#     sampling_params,
#     use_tqdm = True
# )

outputs = run_batch(all_prompts)

end = time()
elapsed = (end-start)/60. #minutes
print(f"Inference took {elapsed} minutes!")

Inference took 2.6742958863576254 minutes!
CPU times: total: 15.6 ms
Wall time: 2min 40s


In [27]:
print_outputs(outputs)

Generated text: '```json\n{\n  "patient_info": {\n    "age": 48,\n    "gender": "Male"\n  },\n  "visit_motivation": "Uncontrolled Type 2 Diabetes",\n  "symptoms": [\n    "fatigue",\n    "increased thirst",\n    "frequent urination",\n    "blurred vision",\n    "pale skin"\n  ],\n  "vital_signs": {\n    "oxygen_saturation": {\n      "value": 99.8,\n      "unit": "%"\n    },\n    "glucose_level": {\n      "value": 155.4,\n      "unit": "mg/dL"\n    },\n    "temperature": {\n      "value": 37.1,\n      "unit": "°C"\n    },\n    "respiratory_rate": {\n      "value": 12,\n      "unit": "breaths/min"\n    },\n    "blood_pressure": {\n      "systolic": {\n        "value": 104,\n        "unit": "mmHg"\n      },\n      "diastolic": {\n        "value": 73,\n        "unit": "mmHg"\n      }\n    }\n  }\n}\n```'
--------------------------------------------------------------------------------
Generated text: '```json\n{\n  "patient_info": {\n    "age": 53,\n    "gender": "Male"\n  },\n  "visit_motiv

## simple cleanup the outputs

In [34]:

def clean_preds(preds:str):
    try:
        preds = preds.split('```json\n')[1].split('\n```')[0]
    except Exception as e:
        
        print(e)
        try: 
            preds = preds.split('\n\n')[1]
        except Exception as e:
            print(e)
            preds = preds
 
    return preds

In [36]:
json_list = []

for output in outputs:
    res = output
    
    try:
        json_list.append(json.loads(clean_preds(res)))
    except:
        print("Error")
        json_list.append(res)
    
trial_df['json'] = json_list
sub_df = trial_df[['ID','json']]
# test_trial_df.to_csv("test_with_json2_final.csv", index=False)

Error


In [None]:
# take a look at a prediction
sub_df.iloc[2]['json']