### Retrieval evaluation

In [141]:
import json
import pickle
import hashlib
import time
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
from dotenv import load_dotenv
from collections import defaultdict

load_dotenv()

True

In [2]:
def generate_document_id(doc, index=None):
    source = doc.get('metadata', {}).get('source', 'unknown_source')
    page = doc.get('metadata', {}).get('page', 'unknown_page')
    text = doc.get('page_content', '')
    timestamp = str(time.time())


    text_preview = text[:10] if len(text) >= 10 else text

    combined = f"{source}-{page}-{text_preview}-{index}-{timestamp}"

    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]

    return document_id


In [3]:
with open("../chunked_data_two.json", 'r') as f_in:
    documents = json.load(f_in)

In [4]:
for i, doc in enumerate(documents):
    doc_id = generate_document_id(doc, index=i)
    doc['id'] = doc_id

In [5]:
documents[0]

{'page_content': '123\nXue-Hong Wan\nRui Zeng\nEditors \nHandbook of Clinical \nDiagnostics',
 'metadata': {'source': '../data/Handbook_of_clinical_diagnostics.pdf',
  'page': 0,
  'page_label': 'C1'},
 'id': '84a3b541'}

In [6]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [7]:
# there are not duplicate id
len(hashes), len(documents)

(5406, 5406)

In [8]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

In [9]:
# save document with id
with open('documents_with_ids.json', 'w') as f_out:
    json.dump(documents, f_out, indent=4)

In [63]:
client = OpenAI()

In [115]:
prompt_template = """
You are a medical examiner preparing a **clinical case scenario** for an OSCE (Objective Structured Clinical Examination).  
Your task is to **predict a likely query** a medical student might enter and generate a **realistic case prompt** based on the retrieved medical records.  

### Instructions:
- **Step 1**: Infer a relevant **student query** based on the `page_content` of the retrieved medical record.
- **Step 2**: Construct a **concise** case prompt that aligns with the student’s level of understanding.
- **Step 3**: The case prompt should follow **OSCE standards** for **history taking and physical examination**.
- **Step 4**: The case prompt should include **only the presenting complaint**; do not provide additional history details.
- **Step 5**: The scenario should **mimic a real OSCE station**, requiring the student to take a full history and perform a relevant examination.

### Retrieved Medical Record:
- **Source**: {source}
- **Page Number**: {page_label}
- **Extracted Content**:  
{page_content}

### Expected Output:
Provide the output in **parsable JSON** without using code blocks:

{{
    "student_query": "Likely topic/disease/examination the student might ask about",
    "case_prompt": "Generated case prompt based on the student query and retrieved data, give the presenting complaint eg presents to the clinic with complaints of chest discomfort for the past 3 days "
}}
""".strip()

In [103]:
len(documents)

5406

In [104]:
# choose the random document chunk for indexing
sample = documents[2100: 2150] + documents[4100: 4200] + documents[1100: 1150] + documents[3100: 3150]

In [105]:
len(sample)

250

In [106]:
def generate_case_prompt(doc):
    source = doc.get("metadata", {}).get("source", "Unknown source")
    page_label = doc.get("metadata", {}).get("page_label", "Unknown page")
    page_content = doc.get("page_content", "No content available.")

   
    prompt = prompt_template.format(
        source=source,
        page_label=page_label,
        page_content=page_content
    )

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    try:
        json_response = json.loads(response.choices[0].message.content)
    except json.JSONDecodeError:
        json_response = {"error": "Invalid JSON response from model"}

    return json_response


In [118]:
results = {}

for doc in tqdm(sample): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_case_prompt(doc)
    results[doc_id] = questions

100%|██████████| 250/250 [07:37<00:00,  1.83s/it]


In [121]:
with open('results.bin', 'wb') as f_in:
    pickle.dump(results, f_in)

In [123]:
results

{'3d02524a': {'student_query': 'What are the possible causes and examination findings for a patient with bradycardia?',
  'case_prompt': 'A 71-year-old man presents to the clinic with complaints of dizziness and fatigue for the past week. He reports that these symptoms have worsened over the last few days, and he denies any chest pain or palpitations.'},
 'b0ee85e6': {'student_query': 'What are the clinical features and examination findings in a patient with atrial fibrillation?',
  'case_prompt': 'A 68-year-old male presents to the clinic with complaints of palpitations and shortness of breath that have worsened over the past week.'},
 '033df4d0': {'student_query': 'What are the clinical features and examination findings in a patient with atrial fibrillation?',
  'case_prompt': 'A 68-year-old male presents to the emergency department with complaints of palpitations and shortness of breath that started suddenly 2 hours ago.'},
 'f29f9cdb': {'student_query': 'How to assess and manage a 

In [124]:
doc_index = {d['id']: d for d in sample}

In [142]:
data = [(v['student_query'], v['case_prompt'], k) for k, v in results.items()]

df = pd.DataFrame(data, columns=['question', 'case_prompt', 'document'])

In [143]:
df.head()

Unnamed: 0,question,case_prompt,document
0,What are the possible causes and examination f...,A 71-year-old man presents to the clinic with ...,3d02524a
1,What are the clinical features and examination...,A 68-year-old male presents to the clinic with...,b0ee85e6
2,What are the clinical features and examination...,A 68-year-old male presents to the emergency d...,033df4d0
3,How to assess and manage a patient with irregu...,A 58-year-old male presents to the clinic with...,f29f9cdb
4,How do I assess for coarctation of the aorta i...,A 16-year-old male presents to the clinic with...,5c22a143


In [144]:
df.to_csv('questions.csv', index=False)

In [145]:
!head questions.csv

question,case_prompt,document
What are the possible causes and examination findings for a patient with bradycardia?,"A 71-year-old man presents to the clinic with complaints of dizziness and fatigue for the past week. He reports that these symptoms have worsened over the last few days, and he denies any chest pain or palpitations.",3d02524a
What are the clinical features and examination findings in a patient with atrial fibrillation?,A 68-year-old male presents to the clinic with complaints of palpitations and shortness of breath that have worsened over the past week.,b0ee85e6
What are the clinical features and examination findings in a patient with atrial fibrillation?,A 68-year-old male presents to the emergency department with complaints of palpitations and shortness of breath that started suddenly 2 hours ago.,033df4d0
How to assess and manage a patient with irregular heart rhythms?,A 58-year-old male presents to the clinic with complaints of palpitations and a feeling of skipped h