# DSPy X Progress Note Validation - Minimal Working Example

## 1) Setup

### Import OpenAI library, read API key

In [3]:
#%pip install openai

In [1]:
import os

# Working directory for every relative path used in this notebook
# (API key file, progress-note PDFs, HL7 text file).
# Set the DSPY_PROJECT_DIR environment variable to run the notebook on another
# machine; when it is unset, the original author's local folder is used.
PROJECT_DIR = os.environ.get(
    'DSPY_PROJECT_DIR',
    'C:/Users/jcviscom/OneDrive - Intermountain Healthcare/BMI Degree/Practicum/DSPy',
)
os.chdir(PROJECT_DIR)

In [2]:
from openai import OpenAI

def get_api_key(filepath):
    """Read an API key from a text file.

    Args:
        filepath: Path of the file whose whole content is the key.

    Returns:
        str: The file content with leading/trailing whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file contains nothing but whitespace.
    """
    # 'utf-8-sig' reads plain ASCII/UTF-8 and also drops a leading byte-order
    # mark, which some Windows editors prepend and str.strip() does not remove.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        api_key = file.read().strip()
    if not api_key:
        raise ValueError(f"No API key found in {filepath!r}")
    return api_key

# Set the path to your API key file
# (a relative path, so it is resolved against the current working directory)
api_key_path = 'OpenAIKey.txt'


# OpenAI client used by the chat-completion request later in the notebook;
# the key is read from the file above rather than written into the notebook.
client = OpenAI(api_key = get_api_key(api_key_path))

## 2) Data Wrangling

### Load Progress Note PDF

In [3]:
import fitz  # PyMuPDF for PDF parsing
import re
import csv


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        str: The text of all pages joined with no separator; an empty string
        when the document has no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_file) as doc:
        return "".join(page.get_text() for page in doc)

# Parse the sample sepsis progress note once; later cells reuse this string as model input
progress_notes_text = extract_text_from_pdf('MWE - Sepsis/Progress Notes/Progress Notes.pdf')

In [4]:
# Display the raw extracted text to check how the PDF was parsed
progress_notes_text

"Progress Notes \nPatient Name: John Doe \nMRN: 12345678 \nDate of Admission: 2024-06-10 \nDate of Note: 2024-06-13 \nAttending Physician: Dr. Jane Smith \nUnit: ICU \n \nSubjective: \nPatient is a 65-year-old male with a history of hypertension and type 2 diabetes mellitus who was \nadmitted three days ago with high fever, chills, and confusion. The patient's family reports that \nhe has been feeling progressively worse over the past week and had a fever of 102°F at home. He \nwas brought to the emergency department where he was found to be hypotensive and \ntachycardic. \nObjective: \n• \nVitals: \no Temperature: 101.8°F \no Heart Rate: 110 bpm \no Blood Pressure: 85/55 mmHg \no Respiratory Rate: 24 breaths/min \no SpO2: 92% on room air \n• \nPhysical Exam: \no General: Ill-appearing, diaphoretic \no HEENT: No significant findings \no Cardiovascular: Tachycardia, no murmurs \no Respiratory: Tachypneic, coarse crackles at bases \no Abdomen: Soft, non-tender \no Extremities: No edema \

### Load HL7 Message, Extract Primary Diagnosis and DRG

In [5]:
# Load the whole HL7 message from disk as one string
with open('MWE - Sepsis/HL7.txt', 'r') as file:
    hl7_message = file.read()

# One regular expression per field of interest; each has a single capture group
drg_pattern = r"DRG\|(\d+)\|"  # digits right after "DRG|"
PD_pattern = r"DG1\|1\|\|(.*?)\|"  # content of the field after "DG1|1||"
# A 12-digit field on the EVN line.
# NOTE(review): for the sample message printed below this captures 202406131030,
# the same value as the MSH timestamp -- confirm this is the intended encounter ID.
encounter_id_pattern = r"EVN\|.*\|([0-9]{12})\|"

# DRG code, or a placeholder when the pattern is absent
drg_match = re.search(drg_pattern, hl7_message)
drg_code = drg_match.group(1) if drg_match else "Not found"

# Primary diagnosis: keep only the component before the first "^" (the ICD code)
PD_match = re.search(PD_pattern, hl7_message)
PD_ICD = PD_match.group(1).partition("^")[0] if PD_match else "Not found"

# Encounter ID, or a placeholder when the pattern is absent
encounter_id_match = re.search(encounter_id_pattern, hl7_message)
encounter_id = encounter_id_match.group(1) if encounter_id_match else "Not found"

# Report: the raw message first, then the extracted values,
# with one blank line after each heading and after the message
print('HL7 Message:', hl7_message, 'RegEx Extraction:', sep='\n\n', end='\n\n')
print(f"Encounter ID: {encounter_id}")
print(f"DRG: {drg_code}")
print(f"Primary Diagnosis: {PD_ICD}")

HL7 Message:

MSH|^~\&|HIS|RIH|EKG|EKG|202406131030||ADT^A08|MSG00001|P|2.3
EVN|A08|202406131030|||1234567
PID|1||12345678^^^HIS^MR||DOE^JOHN^A||19580615|M|||123 MAIN ST^^HOMETOWN^CA^99999^USA||(555)555-5555|||M|S||123456789|987-65-4320
PV1|1|I|ICU^101^1^RIH||||1234^Smith^Jane^A|||MED|||||12345678|A|||||||||||||||||||20240610
DG1|1||A41.9^Sepsis, unspecified organism^I10|Sepsis|20240610|A
DRG|871|0|20240613
PR1|1|ICD10PCS|5A1D60Z^Respiratory Ventilation, Less than 24 Consecutive Hours^I10|20240611


RegEx Extraction:

Encounter ID: 202406131030
DRG: 871
Primary Diagnosis: A41.9


### Option A) Use GPT-3.5 to compile information into a structured table

#### Prep Few-Shot Learning Examples

In [6]:
#%pip install langchain
from langchain.prompts.few_shot import FewShotPromptTemplate

In [22]:
import ast

#plain text examples
progress_notes2_text = extract_text_from_pdf('MWE - Sepsis/Progress Notes/Progress Notes 2.pdf')
progress_notes3_text = extract_text_from_pdf('MWE - Sepsis/Progress Notes/Progress Notes 3.pdf')

#desired array outputs for examples
# Decode explicitly as UTF-8. Without `encoding=`, open() falls back to the locale
# default (cp1252 on a typical Windows setup), which turns UTF-8 characters such
# as '°' and '•' into 'Â°' and 'â€¢' inside the few-shot examples.
# NOTE(review): assumes the *_array.txt files are saved as UTF-8 -- confirm.
with open('MWE - Sepsis/Progress Notes/progress_notes2_array.txt', 'r', encoding='utf-8') as file:
    progress_notes2_array_raw = file.read()
with open('MWE - Sepsis/Progress Notes/progress_notes3_array.txt', 'r', encoding='utf-8') as file:
    progress_notes3_array_raw = file.read()

#read in the notes literally and convert to array
# (ast.literal_eval only evaluates Python literals; it does not execute code)
progress_notes2_array = ast.literal_eval(progress_notes2_array_raw)
progress_notes3_array = ast.literal_eval(progress_notes3_array_raw)

#set up few-shot examples with lists
# Only the first example is active; the second one is kept commented out.
examples = [
    {"Note": progress_notes2_text
    ,"Array": progress_notes2_array}
    #,{"Note": progress_notes3_text
    # ,"Array": progress_notes3_array}
]

#### Create formatter for the few-shot examples

In [27]:
from langchain_core.prompts.prompt import PromptTemplate

# Template that renders one few-shot example: the note text followed by its target array
example_prompt = PromptTemplate(
    template="Progress Note: {Note}\n Array: {Array}",
    input_variables=["Note", "Array"],
)

# Preview how the first example is rendered
print(example_prompt.format(**examples[0]))

Progress Note: Day 1 - Admission 
Date: 2024-06-10 
Time: 10:00 AM 
Physician: Dr. Jane Smith 
Subjective: 
• 
Patient is a 65-year-old male with a history of COPD and hypertension, presenting with 
shortness of breath, fever, and productive cough for the past 3 days. 
Objective: 
• 
Vital Signs: BP 140/85, HR 100, RR 24, Temp 101.5°F, SpO2 88% on room air. 
• 
General: Appears ill and in mild respiratory distress. 
• 
Lungs: Decreased breath sounds with crackles in the right lower lobe. 
• 
Cardiac: Regular rhythm, no murmurs. 
• 
Labs: Elevated WBC count of 15,000/µL. 
• 
Imaging: Chest X-ray shows right lower lobe infiltrate consistent with pneumonia. 
Assessment: 
• 
Community-acquired pneumonia, likely bacterial. 
• 
COPD exacerbation. 
• 
Hypertension, stable. 
Plan: 
1. Admit to inpatient medical ward. 
2. Start IV antibiotics: Ceftriaxone and Azithromycin. 
3. Administer oxygen via nasal cannula to maintain SpO2 > 92%. 
4. Nebulizer treatments with albuterol every 4 hours. 
5. 

#### Feed examples and formatter to FewShotPromptTemplate

In [31]:
# Few-shot prompt built from `examples`, formatted with `example_prompt`,
# followed by a suffix that carries the new note in the `input` variable.
# NOTE(review): the name `prompt` is rebound to a plain string in a later cell.
prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    suffix="Turn this Progress Note into an Array: {input}",
    input_variables=["input"],
)

# Preview the full prompt text for the sepsis progress note
print(prompt.format(input=progress_notes_text))

Progress Note: Day 1 - Admission 
Date: 2024-06-10 
Time: 10:00 AM 
Physician: Dr. Jane Smith 
Subjective: 
• 
Patient is a 65-year-old male with a history of COPD and hypertension, presenting with 
shortness of breath, fever, and productive cough for the past 3 days. 
Objective: 
• 
Vital Signs: BP 140/85, HR 100, RR 24, Temp 101.5°F, SpO2 88% on room air. 
• 
General: Appears ill and in mild respiratory distress. 
• 
Lungs: Decreased breath sounds with crackles in the right lower lobe. 
• 
Cardiac: Regular rhythm, no murmurs. 
• 
Labs: Elevated WBC count of 15,000/µL. 
• 
Imaging: Chest X-ray shows right lower lobe infiltrate consistent with pneumonia. 
Assessment: 
• 
Community-acquired pneumonia, likely bacterial. 
• 
COPD exacerbation. 
• 
Hypertension, stable. 
Plan: 
1. Admit to inpatient medical ward. 
2. Start IV antibiotics: Ceftriaxone and Azithromycin. 
3. Administer oxygen via nasal cannula to maintain SpO2 > 92%. 
4. Nebulizer treatments with albuterol every 4 hours. 
5. 

#### Run FewShotPrompt on GPT API

In [25]:
#%pip install langchain_openai
# NOTE(review): this import rebinds the name `OpenAI`, which an earlier cell imported
# from the `openai` package to build `client`; from here on `OpenAI` is the LangChain class.
from langchain_openai import OpenAI

# No api_key argument is passed here, unlike the `client` built earlier
# (presumably the key is picked up from the environment -- confirm).
model = OpenAI()

In [33]:
# Use invoke() rather than calling the model object directly: the direct
# call form is deprecated in LangChain. The completion is stored in `output`.
output = model.invoke(prompt.format(input=progress_notes_text))

In [34]:
# Inspect the raw completion returned by the model
output

"Array: ['Progress Notes', 'Patient Name: John Doe', 'MRN: 12345678', 'Date of Admission: 2024-06-10', 'Date of Note: 2024-06-13', 'Attending Physician: Dr. Jane Smith', 'Unit: ICU', 'Subjective:', 'Patient is a 65-year-old male with a history of hypertension and type 2 diabetes mellitus who was', 'admitted three days ago with high fever, chills, and confusion. The patient\\'s family reports that', 'he has been feeling progressively worse over the past week and had a fever of 102Â°F at home. He', 'was brought to the emergency department where he was found to be hypotensive and', 'tachycardic.', 'Objective:', 'â€¢', 'Vitals:', 'o Temperature: 101.8Â°F', 'o Heart Rate: 110 bpm', 'o Blood Pressure: 85/55 mmHg', 'o Respiratory Rate: 24 breaths/min', 'o SpO2: 92% on room air', 'â€¢', 'Physical Exam:', 'o General: Ill-appearing, diaphoret"

This hasn't been working. I'm going to stick to HL7 inputs for reliability.

In [6]:
# Instruction followed by the first 100 characters of the note;
# the two pieces are concatenated with no separator between them.
prompt = (
    'Seperate each SOAP note with double quotes.'
    f'Use this text:{progress_notes_text[:100]}'
)

# Single-turn chat request carrying the prompt as the only user message
chat_completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
)

# Text of the first choice in the response
pipe_content = chat_completion.choices[0].message.content


APITimeoutError: Request timed out.

In [86]:
# Show the model's reply.
# NOTE(review): the request cell above is saved with an APITimeoutError and a
# different execution count, so this value presumably comes from an earlier
# run -- re-run the request cell to confirm.
pipe_content

'"Progress Notes \nPatient Name: John Doe \nMRN: 12345678 \nDate of Admission: 2024-06-10 \nDate of Note: 2024-06-13 \nAttending Physician: Dr. Jane Smith \nUnit: ICU \n \nSubjective: \nPatient is a 65-year-old male with a history of hypertension and type 2 diabetes mellitus who was \nadmitted three days ago with high fever, chills, and confusion. The patient\'s family reports that \nhe has been feeling progressively worse over the past week and had a fever of 102°F at home. He \nwas brought to the emergency department where he was found to be hypotensive and \ntachycardic. \nObjective: \n• \nVitals: \no Temperature: 101.8°F \no Heart Rate: 110 bpm \no Blood Pressure: 85/55 mmHg \no Respiratory Rate: 24 breaths/min \no SpO2: 92% on room air \n• \nPhysical Exam: \no General: Ill-appearing, diaphoretic \no HEENT: No significant findings \no Cardiovascular: Tachycardia, no murmurs \no Respiratory: Tachypneic, coarse crackles at bases \no Abdomen: Soft, non-tender \no Extremities: No edema

In [93]:
# Extract text within double quotes
# (each match is the run of non-quote characters between one pair of double quotes)
matches = re.findall(r'"([^"]*)"', pipe_content)
# First quoted span; raises IndexError when the reply contains no quoted text
matches[0]

"Progress Notes \nPatient Name: John Doe \nMRN: 12345678 \nDate of Admission: 2024-06-10 \nDate of Note: 2024-06-13 \nAttending Physician: Dr. Jane Smith \nUnit: ICU \n \nSubjective: \nPatient is a 65-year-old male with a history of hypertension and type 2 diabetes mellitus who was \nadmitted three days ago with high fever, chills, and confusion. The patient's family reports that \nhe has been feeling progressively worse over the past week and had a fever of 102°F at home. He \nwas brought to the emergency department where he was found to be hypotensive and \ntachycardic. \nObjective: \n• \nVitals: \no Temperature: 101.8°F \no Heart Rate: 110 bpm \no Blood Pressure: 85/55 mmHg \no Respiratory Rate: 24 breaths/min \no SpO2: 92% on room air \n• \nPhysical Exam: \no General: Ill-appearing, diaphoretic \no HEENT: No significant findings \no Cardiovascular: Tachycardia, no murmurs \no Respiratory: Tachypneic, coarse crackles at bases \no Abdomen: Soft, non-tender \no Extremities: No edema \

### Import DSPy, configure LLM

In [3]:
import dspy
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric

# Create the language model (completions capped at 250 tokens)
# and register it in the DSPy settings
turbo = dspy.OpenAI(max_tokens=250, model='gpt-3.5-turbo-instruct')
dspy.settings.configure(lm=turbo)

# GSM8K math questions: keep only the first 10 train and first 10 dev items
gsm8k = GSM8K()
gsm8k_trainset = gsm8k.train[:10]
gsm8k_devset = gsm8k.dev[:10]

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

100%|███████████████████████████████████████████████████████████████████████████| 7473/7473 [00:00<00:00, 29926.93it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1319/1319 [00:00<00:00, 40104.44it/s]


## 3) Define the Module

In [None]:
class CoT(dspy.Module):
    """DSPy module wrapping a chain-of-thought predictor for ``question -> answer``."""

    def __init__(self):
        """Create the ``dspy.ChainOfThought`` predictor and store it on ``self.prog``."""
        super().__init__()
        # Signature string: one input field (question) and one output field (answer)
        self.prog = dspy.ChainOfThought("question -> answer")

    def forward(self, question):
        """Run the predictor on ``question`` and return whatever it produces."""
        result = self.prog(question=question)
        return result