### Retrieval evaluation

In [59]:
import json
import pickle
import hashlib
import time
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
from dotenv import load_dotenv
from collections import defaultdict

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key read by OpenAI() later on — confirm.
load_dotenv()

True

In [60]:
def generate_document_id(doc, index=None):
    """Build a short, reproducible identifier for a document chunk.

    The id is the first 8 hex characters of the MD5 digest of the chunk's
    source, page, position and full text. It depends only on these inputs
    (no timestamp), so re-running the notebook yields the same ids and
    artefacts saved earlier that are keyed by id stay valid.

    Args:
        doc: Mapping with an optional 'metadata' mapping (keys 'source' and
            'page') and an optional 'page_content' string.
        index: Optional position of the chunk in the corpus; it keeps ids
            distinct for chunks that share source, page and text.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    metadata = doc.get('metadata', {})
    source = metadata.get('source', 'unknown_source')
    page = metadata.get('page', 'unknown_page')
    text = doc.get('page_content', '')

    # Hash the whole text rather than a short preview so that chunks which
    # merely start with the same characters still get different digests.
    combined = f"{source}-{page}-{index}-{text}"

    return hashlib.md5(combined.encode()).hexdigest()[:8]


In [61]:
# Load the pre-chunked documents from JSON.
# The encoding is passed explicitly so the read does not depend on the
# platform's default text encoding (the chunk text contains non-ASCII characters).
with open("../chunked_data_two.json", 'r', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [62]:
# Attach an 'id' field to every document, passing its position in the list along.
for position, document in enumerate(documents):
    document['id'] = generate_document_id(document, index=position)

In [63]:
documents[0]

{'page_content': '123\nXue-Hong Wan\nRui Zeng\nEditors \nHandbook of Clinical \nDiagnostics',
 'metadata': {'source': '../data/Handbook_of_clinical_diagnostics.pdf',
  'page': 0,
  'page_label': 'C1'},
 'id': '855102eb'}

In [64]:
# Group the documents by id so that an id shared by several chunks is easy to spot.
hashes = defaultdict(list)

for document in documents:
    hashes[document['id']].append(document)

In [65]:
# Compare the number of distinct ids with the number of documents;
# equal counts mean that there are no duplicate ids.
len(hashes), len(documents)

(5406, 5406)

In [66]:
# Print every id that maps to more than one document, together with the group size.
for hash_key, bucket in hashes.items():
    bucket_size = len(bucket)
    if bucket_size > 1:
        print(hash_key, bucket_size)

In [67]:
# Persist the documents, now carrying their 'id' field, as indented JSON.
with open('documents_with_ids.json', mode='w') as ids_file:
    json.dump(documents, ids_file, indent=4)

In [68]:
# OpenAI API client; generate_case_prompt uses it for its chat completion requests.
client = OpenAI()

In [69]:
# Prompt sent to the chat model for each document chunk. The {source},
# {page_label} and {page_content} placeholders are filled in with str.format;
# the doubled braces ({{ and }}) come out as literal braces in the JSON example.
prompt_template = """
You are a medical examiner preparing a **clinical case scenario** for an OSCE (Objective Structured Clinical Examination).  
Your task is to **predict a body system or clinical topic** a medical student might enter and generate a **realistic case prompt** based on the retrieved medical records.  

### Instructions:
- **Step 1**: Infer a relevant **student query** based on the `page_content` of the retrieved medical record.
- **Step 2**: Construct a **concise** case prompt that aligns with the student’s level of understanding.
- **Step 3**: The case prompt should follow **OSCE standards** for **history taking and physical examination**.
- **Step 4**: The case prompt should include **only the presenting complaint**; do not provide additional history details.
- **Step 5**: The scenario should **mimic a real OSCE station**, requiring the student to take a full history and perform a relevant examination.

### Retrieved Medical Record:
- **Source**: {source}
- **Page Number**: {page_label}
- **Extracted Content**:  
{page_content}

### Expected Output:
Provide the output in **parsable JSON** without using code blocks:

{{
    "student_query": "Likely topic/disease/examination the student might ask about",
    "case_prompt": "Generated case prompt based on the student query and retrieved data, give the presenting complaint eg presents to the clinic with complaints of chest discomfort for the past 3 days "
}}
""".strip()

In [70]:
len(documents)

5406

In [71]:
# Evaluation sample: four fixed slices of the corpus (250 chunks in total),
# concatenated in this order.
sample = [
    *documents[2100:2150],
    *documents[4100:4200],
    *documents[1100:1150],
    *documents[3100:3150],
]

In [72]:
len(sample)

250

In [73]:
sample

[{'page_content': 'CHAPTER 5   THE CARdiAC ExAminATion 83\nRate of pulse\nPractised observers can estimate the rate quickly. Formal \ncounting over 30 seconds is accurate and requires only \nsimple mathematics to obtain the rate per minute. The \nnormal resting heart rate in adults is usually said to be \nbetween 60 and 100 beats per minute but a more \nsensible range is probably 55 to 95 (95% of normal \npeople). Bradycardia (from the Greek bradys ‘slow’ , \nkardia ‘heart’) is defined as a heart rate of less than 60 \nbeats per minute. Tachycardia (from the Greek tachys \n‘swift’ , kardia ‘heart’) is defined as a heart rate over 100 \nbeats per minute (see the OSCE ECGs nos 2, 3 and 4 \nat ). The causes of bradycardia and \ntachycardia are listed in Table 5.1.\nRhythm\nThe rhythm of the pulse can be regular or irregular. An \nirregular rhythm can be completely irregular with no \npattern (irregularly irregular or chaotic rhythm); this is \nusually due to atrial fibrillation (see Table

In [74]:
def generate_case_prompt(doc):
    """Ask the chat model for a student query and an OSCE case prompt for one chunk.

    Args:
        doc: Mapping with an optional 'metadata' mapping (keys 'source' and
            'page_label') and an optional 'page_content' string.

    Returns:
        The JSON object parsed from the model reply (the prompt asks for the
        keys 'student_query' and 'case_prompt'), or
        {"error": "Invalid JSON response from model"} when the reply is
        empty, is not valid JSON, or is not a JSON object.
    """
    metadata = doc.get("metadata", {})
    source = metadata.get("source", "Unknown source")
    page_label = metadata.get("page_label", "Unknown page")
    page_content = doc.get("page_content", "No content available.")

    prompt = prompt_template.format(
        source=source,
        page_label=page_label,
        page_content=page_content
    )

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: have the API return a single JSON object rather than
        # free text or a fenced code block (the prompt mentions JSON, as this mode requires).
        response_format={"type": "json_object"}
    )

    raw_reply = response.choices[0].message.content

    try:
        json_response = json.loads(raw_reply)
    except (TypeError, json.JSONDecodeError):
        # TypeError covers a reply without content (None); JSONDecodeError covers malformed text.
        json_response = {"error": "Invalid JSON response from model"}

    # Callers index the result by key, so anything but a JSON object counts as invalid.
    if not isinstance(json_response, dict):
        json_response = {"error": "Invalid JSON response from model"}

    return json_response

In [75]:
# Trial run on five chunks of the sample before generating cases for all of it.
results = {}

for document in tqdm(sample[100:105]):
    key = document['id']
    # Only call the model for ids that have no result yet.
    if key not in results:
        results[key] = generate_case_prompt(document)

100%|██████████| 5/5 [00:07<00:00,  1.58s/it]


In [76]:
results

{'93482d80': {'student_query': 'rheumatological diseases with associated symptoms like dry eyes and limb pain',
  'case_prompt': 'A 45-year-old female presents to the clinic with complaints of dry eyes and persistent limb pain for the past two months.'},
 'e9a0c17a': {'student_query': 'urinary tract infection',
  'case_prompt': 'A 28-year-old female presents to the clinic with complaints of dysuria, frequency, and urgency of urination for the past 2 days.'},
 'cb999057': {'student_query': 'abdominal pain and its differential diagnosis',
  'case_prompt': 'A 35-year-old female presents to the clinic with complaints of abdominal pain that started 2 days ago. The pain is localized to the right upper quadrant and is associated with nausea.'},
 '9f64d32d': {'student_query': 'abnormal movements or seizures',
  'case_prompt': 'A 25-year-old male presents to the emergency department with a history of recurrent seizures over the past month. He reports experiencing generalized tonic-clonic seizur

In [77]:
# Generate one case per sampled chunk, keyed by document id (starts from an empty dict).
results = {}

for chunk in tqdm(sample):
    chunk_id = chunk['id']
    already_generated = chunk_id in results
    if not already_generated:
        generated_case = generate_case_prompt(chunk)
        results[chunk_id] = generated_case

100%|██████████| 250/250 [08:37<00:00,  2.07s/it]


In [78]:
# Keep a pickled copy of the generated cases on disk.
with open('results.bin', mode='wb') as results_file:
    pickle.dump(results, results_file)

In [81]:
results

{'e84c82d5': {'student_query': 'Assessment of bradycardia or tachycardia',
  'case_prompt': 'A 68-year-old male presents to the clinic with complaints of lightheadedness and palpitations for the past week.'},
 'abf43004': {'student_query': 'Atrial fibrillation and its clinical implications',
  'case_prompt': 'A 65-year-old female presents to the clinic with complaints of palpitations and shortness of breath that started 2 days ago. She describes the palpitations as a rapid and irregular sensation in her chest.'},
 '97f15a47': {'student_query': 'Atrial fibrillation and its examination',
  'case_prompt': 'A 68-year-old male presents to the clinic with complaints of palpitations and intermittent dizziness for the past week.'},
 '561fa7b4': {'student_query': 'Cardiac arrhythmias, especially focused on irregular pulse patterns',
  'case_prompt': 'A 55-year-old female presents to the clinic with complaints of palpitations and a sensation of her heart racing intermittently for the past week.'

In [83]:
# Flatten the generated cases into (question, case_prompt, document id) rows.
# Entries lacking either field (for example the {"error": ...} fallback that
# generate_case_prompt returns for an unparsable reply) are skipped instead of
# raising KeyError.
required_fields = ('student_query', 'case_prompt')
data = [
    (v['student_query'], v['case_prompt'], k)
    for k, v in results.items()
    if isinstance(v, dict) and all(field in v for field in required_fields)
]

df = pd.DataFrame(data, columns=['question', 'case_prompt', 'document'])

In [84]:
df.head()

Unnamed: 0,question,case_prompt,document
0,Assessment of bradycardia or tachycardia,A 68-year-old male presents to the clinic with...,e84c82d5
1,Atrial fibrillation and its clinical implications,A 65-year-old female presents to the clinic wi...,abf43004
2,Atrial fibrillation and its examination,A 68-year-old male presents to the clinic with...,97f15a47
3,"Cardiac arrhythmias, especially focused on irr...",A 55-year-old female presents to the clinic wi...,561fa7b4
4,Symptoms and examination findings related to c...,A 25-year-old male presents to the clinic comp...,ffc52230


In [85]:
# Write the question table to questions.csv, leaving out the DataFrame index column.
df.to_csv('questions.csv', index=False)

In [86]:
!head questions.csv

question,case_prompt,document
Assessment of bradycardia or tachycardia,A 68-year-old male presents to the clinic with complaints of lightheadedness and palpitations for the past week.,e84c82d5
Atrial fibrillation and its clinical implications,A 65-year-old female presents to the clinic with complaints of palpitations and shortness of breath that started 2 days ago. She describes the palpitations as a rapid and irregular sensation in her chest.,abf43004
Atrial fibrillation and its examination,A 68-year-old male presents to the clinic with complaints of palpitations and intermittent dizziness for the past week.,97f15a47
"Cardiac arrhythmias, especially focused on irregular pulse patterns",A 55-year-old female presents to the clinic with complaints of palpitations and a sensation of her heart racing intermittently for the past week.,561fa7b4
Symptoms and examination findings related to coarctation of the aorta,A 25-year-old male presents to the clinic complaining of intermittent claudicat