# Starter: Zero shot data extraction using LLMs

In [38]:
from ollama import chat
from ollama import ChatResponse
import pandas as pd
import json
import ast
import json_metric
import re

## prompts

In [2]:
system_prompt = "As a medical professional, please analyze the following medical notes carefully and extract relevant information in the specified JSON format. Do not include any values that are not mentioned in the notes."

In [3]:
# Define the template with a placeholder
prompt_template = """
For each entry:
- Accurately identify and extract patient data, visit motivation, symptoms, and vital signs.
- Only include keys that have corresponding information in the notes, omitting any keys that are not mentioned.
- Follow this JSON structure:

```json
{
  "patient_info": {
    "age": <number>,
    "gender": <string>
  },
  "visit_motivation": <string>,
  "symptoms": [<string>],
  "vital_signs": {
    "heart_rate": {
      "value": <number>,
      "unit": "bpm"
    },
    "oxygen_saturation": {
      "value": <number>,
      "unit": "%"
    },
    "cholesterol_level": {
      "value": <number>,
      "unit": "mg/dL"
    },
    "glucose_level": {
      "value": <number>,
      "unit": "mg/dL"
    },
    "temperature": {
      "value": <number>,
      "unit": "°C"
    },
    "respiratory_rate": {
      "value": <number>,
      "unit": "breaths/min"
    },
    "blood_pressure": {
      "systolic": {
        "value": <number>,
        "unit": "mmHg"
      },
      "diastolic": {
        "value": <number>,
        "unit": "mmHg"
      }
    }
  }
}

Here are the clinical notes to review:

{medical_notes}

"""

In [15]:
# load test data
test_df = pd.read_csv("medical-note-extraction-h-2-o-gen-ai-world-ny/test.csv")
train_df = pd.read_csv("medical-note-extraction-h-2-o-gen-ai-world-ny/train.csv")

In [16]:
test_df.head()

Unnamed: 0,ID,Note
0,10823,**Clinical Notes**\n\n**Patient Information:**...
1,10824,**Clinical Notes**\n\n**Patient Information:**...
2,10825,Clinical Note:\n\nPatient: 71-year-old male\nC...
3,10826,**Clinical Notes**\n\n**Patient Information:**...
4,10827,**Clinical Notes**\n\n**Patient Information:**...


In [18]:
# Try a small subset
trial_df = test_df.iloc[:10].copy()
train_sample_df = train_df.iloc[:10].copy()

In [19]:
#Build prompts
all_prompts = []
for index,row in train_sample_df.iterrows():
    
    medical_notes = row.Note
    filled_prompt = f"{prompt_template}".replace("{medical_notes}", medical_notes)    
    message=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": filled_prompt}
    ]
    all_prompts.append(message)
    

In [20]:
print(all_prompts[0])

[{'role': 'system', 'content': 'As a medical professional, please analyze the following medical notes carefully and extract relevant information in the specified JSON format. Do not include any values that are not mentioned in the notes.'}, {'role': 'user', 'content': '\nFor each entry:\n- Accurately identify and extract patient data, visit motivation, symptoms, and vital signs.\n- Only include keys that have corresponding information in the notes, omitting any keys that are not mentioned.\n- Follow this JSON structure:\n\n```json\n{\n  "patient_info": {\n    "age": <number>,\n    "gender": <string>\n  },\n  "visit_motivation": "",\n  "symptoms": [<string>],\n  "vital_signs": {\n    "heart_rate": {\n      "value": <number>,\n      "unit": "bpm"\n    },\n    "oxygen_saturation": {\n      "value": <number>,\n      "unit": "%"\n    },\n    "cholesterol_level": {\n      "value": <number>,\n      "unit": "mg/dL"\n    },\n    "glucose_level": {\n      "value": <number>,\n      "unit": "mg/dL

In [21]:
def print_outputs(outputs):
    for output in outputs:
        print(f"Generated text: {output!r}")
        print("-" * 80)

## zero shot prediction

In [22]:
%%time
model = "gpt-oss:20b"  # or your model name
options = {
    "temperature": 0.2,
    "top_p": 0.9,
    "seed": 777,
    "num_predict": 1024,  # vLLM's max_tokens
}

def run_batch(all_prompts):
    outputs = []
    for messages in all_prompts:
        resp: ChatResponse = chat(model=model, messages=messages, options=options)
        outputs.append(resp["message"]["content"])
    return outputs

from time import time
start = time()

# responses = llm.generate(
#     all_prompts[0],
#     sampling_params,
#     use_tqdm = True
# )

outputs = run_batch(all_prompts)

end = time()
elapsed = (end-start)/60. #minutes
print(f"Inference took {elapsed} minutes!")

Inference took 2.498507340749105 minutes!
CPU times: total: 0 ns
Wall time: 2min 29s


In [11]:
print_outputs(outputs)

Generated text: '```json\n{\n  "patient_info": {\n    "age": 48,\n    "gender": "Male"\n  },\n  "visit_motivation": "Uncontrolled Type 2 Diabetes",\n  "symptoms": [\n    "fatigue",\n    "increased thirst",\n    "frequent urination",\n    "blurred vision",\n    "pale skin"\n  ],\n  "vital_signs": {\n    "oxygen_saturation": {\n      "value": 99.8,\n      "unit": "%"\n    },\n    "glucose_level": {\n      "value": 155.4,\n      "unit": "mg/dL"\n    },\n    "temperature": {\n      "value": 37.1,\n      "unit": "°C"\n    },\n    "respiratory_rate": {\n      "value": 12,\n      "unit": "breaths/min"\n    },\n    "blood_pressure": {\n      "systolic": {\n        "value": 104,\n        "unit": "mmHg"\n      },\n      "diastolic": {\n        "value": 73,\n        "unit": "mmHg"\n      }\n    }\n  }\n}\n```'
--------------------------------------------------------------------------------
Generated text: '```json\n{\n  "patient_info": {\n    "age": 53,\n    "gender": "Male"\n  },\n  "visit_motiv

## simple cleanup the outputs

In [None]:
def oneside_strip_code_fence(s:str)->str:
    s = re.sub(r"```$","",s.strip())
    s = re.sub(r"^```[a-zA-Z]*\n?", "", s.strip())
    return s

def SingleQuotetoDouble(s:str) -> str:
    s = s.replace("'",'"')
    return s

def rm_trailing_commas(s:str)-> str:
    s=  re.sub(r",\s*([\]}])", r"\1", s.strip())
    return s

def balance_brackets(s:str)-> str:
    """Append missing closing braces/brackets if the string is cut off."""
    # count curly braces
    opens, closes = s.count("{"), s.count("}")
    if closes < opens:
        s+= "}"* (opens-closes)
    opens, closes = s.count("["), s.count("]")
    if closes < opens:
        s+= "]"* (opens-closes)
    return s

def sanitize_text(s: str) -> str:
    s = SingleQuotetoDouble(s)
    s = rm_trailing_commas(s)
    s = balance_brackets(s)
    return s

def json_with_duplicates(s: str):
    """
    Parse JSON but keep duplicated keys as list values.
    """
    def parse_object_pairs(pairs):
        obj = defaultdict(list)
        for k, v in pairs:
            obj[k].append(v)
        # if only one value, unwrap from list
        return {k: v[0] if len(v) == 1 else v for k, v in obj.items()}
    
    return json.loads(s, object_pairs_hook=parse_object_pairs)

In [27]:
def clean_preds(preds:str):
    try:
        preds = preds.split('```json\n')[1].split('\n```')[0]
    except Exception as e:
        
        print(e)
        try: 
            preds = preds.split('\n\n')[1]
        except Exception as e:
            print(e)
            preds = preds
 
    return preds

In [35]:
json_list = []

for output in outputs:
    res = output
    
    try:
        json_list.append(json.loads(clean_preds(res)))
    except Exception as e:
        print(e)
        json_list.append(res)
    
pred_sample_df = train_sample_df[['ID','json']].copy()
pred_sample_df['json'] = json_list 
# test_trial_df.to_csv("test_with_json2_final.csv", index=False)

In [36]:
pred_sample_df

Unnamed: 0,ID,json
0,1,"{'patient_info': {'age': 41, 'gender': 'Male'}..."
1,2,"{'patient_info': {'age': 56, 'gender': 'male'}..."
2,3,"{'patient_info': {'age': 73, 'gender': 'female..."
3,4,"{'patient_info': {'age': 32, 'gender': 'Female..."
4,5,"{'patient_info': {'age': 51, 'gender': 'Male'}..."
5,6,"{'patient_info': {'age': 74, 'gender': 'Male'}..."
6,7,"{'patient_info': {'age': 21, 'gender': 'Female..."
7,8,"{'patient_info': {'age': 90, 'gender': 'Male'}..."
8,9,"{'patient_info': {'age': 25, 'gender': 'female..."
9,10,"{'patient_info': {'age': 44, 'gender': 'female..."


In [37]:
similarity = json_metric.score(train_sample_df.drop(columns="Note"), pred_sample_df, "ID")
print(similarity)

Error converting to JSON: {'patient_info': {'age': 41, 'gender': 'Male'}, 'visit_motivation': 'Anemia evaluation', 'symptoms': ['anemia', 'fever', 'fatigue', 'difficulty breathing (dyspnea)', 'vomiting', 'dizziness', 'blurred vision', 'wheezing', 'pale skin'], 'vital_signs': {'heart_rate': {'value': 114, 'unit': 'bpm'}, 'oxygen_saturation': {'value': 98.4, 'unit': '%'}, 'cholesterol_level': {'value': 132.8, 'unit': 'mg/dL'}, 'glucose_level': {'value': 110.6, 'unit': 'mg/dL'}}}
Error converting to JSON: {'patient_info': {'age': 56, 'gender': 'male'}, 'visit_motivation': 'Allergies', 'symptoms': ['runny nose', 'frequent sneezing', 'itchy eyes', 'blurred vision', 'wheezing'], 'vital_signs': {'temperature': {'value': 36.6, 'unit': '°C'}, 'respiratory_rate': {'value': 13, 'unit': 'breaths/min'}, 'glucose_level': {'value': 99.0, 'unit': 'mg/dL'}}}
Error converting to JSON: {'patient_info': {'age': 73, 'gender': 'female'}, 'visit_motivation': 'Follow-up for Type 2 Diabetes', 'symptoms': ['fev