# DSPy X Progress Note Validation - Minimal Working Example

## 1) Setup

### Import OpenAI library, read API key

In [3]:
#%pip install openai

In [2]:
import os

# Directory that later cells resolve their relative paths against
# ('OpenAIKey.txt', 'MWE - Sepsis/...'). Set the DSPY_PROJECT_DIR environment
# variable to run the notebook from another machine or checkout; when it is
# unset, the original author's location is used so existing behaviour is kept.
PROJECT_DIR = os.environ.get(
    'DSPY_PROJECT_DIR',
    'C:/Users/jcviscom/OneDrive - Intermountain Healthcare/BMI Degree/Practicum/DSPy',
)
os.chdir(PROJECT_DIR)

In [3]:
from openai import OpenAI

# Helper that loads the API key from a plain-text file
def get_api_key(filepath):
    """Return the contents of ``filepath`` with surrounding whitespace removed.

    Args:
        filepath: Path of the text file that holds the API key.

    Returns:
        The file's text with leading and trailing whitespace stripped.
    """
    with open(filepath, 'r') as key_file:
        raw_key = key_file.read()
    return raw_key.strip()

# Text file (relative to the working directory) that stores the OpenAI API key
api_key_path = 'OpenAIKey.txt'

# Build the OpenAI client with the key read from that file
client = OpenAI(api_key=get_api_key(api_key_path))

## 2) Load progress notes, structured billing information

### 2.1 Load Progress Note PDF

In [4]:
import fitz  # PyMuPDF for PDF parsing
import re
import csv


# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_file: Path of the PDF to read; passed straight to ``fitz.open``.

    Returns:
        One string containing each page's ``get_text()`` output, joined in
        page order. An empty string is returned for a document with no pages.
    """
    # The context manager closes the document (and its underlying file handle)
    # even when a page fails to parse; previously the document was left open.
    with fitz.open(pdf_file) as doc:
        # join() builds the result once instead of re-copying it per page
        return "".join(page.get_text() for page in doc)

# Extract the text of the progress-note PDF; the path is relative to the current working directory
progress_notes_text = extract_text_from_pdf('MWE - Sepsis/Progress Notes/Progress Notes.pdf')

In [5]:
# Display the raw extracted string to inspect what the PDF parse produced
progress_notes_text

"Progress Notes \nPatient Name: John Doe \nMRN: 12345678 \nDate of Admission: 2024-06-10 \nDate of Note: 2024-06-13 \nAttending Physician: Dr. Jane Smith \nUnit: ICU \n \nSubjective: \nPatient is a 65-year-old male with a history of hypertension and type 2 diabetes mellitus who was \nadmitted three days ago with high fever, chills, and confusion. The patient's family reports that \nhe has been feeling progressively worse over the past week and had a fever of 102°F at home. He \nwas brought to the emergency department where he was found to be hypotensive and \ntachycardic. \nObjective: \n• \nVitals: \no Temperature: 101.8°F \no Heart Rate: 110 bpm \no Blood Pressure: 85/55 mmHg \no Respiratory Rate: 24 breaths/min \no SpO2: 92% on room air \n• \nPhysical Exam: \no General: Ill-appearing, diaphoretic \no HEENT: No significant findings \no Cardiovascular: Tachycardia, no murmurs \no Respiratory: Tachypneic, coarse crackles at bases \no Abdomen: Soft, non-tender \no Extremities: No edema \

### 2.2A Load HL7 Message, Extract Primary Diagnosis and DRG

In [12]:
# Read the raw HL7 message from a text file
with open('MWE - Sepsis/HL7.txt', 'r') as file:
    hl7_message = file.read()

# Regular expressions for the fields of interest
drg_pattern_HL7 = r"DRG\|(\d+)\|"  # digits in the first field after DRG|
PD_pattern_HL7 = r"DG1\|1\|\|(.*?)\|"  # coded field of DG1|1||<field>|
# NOTE(review): this captures a 12-digit field on the EVN line. On the sample
# message it yields 202406131030, which equals the timestamp in the MSH line
# rather than looking like an identifier - confirm this is the intended
# encounter ID before relying on it.
encounter_id_pattern = r"EVN\|.*\|([0-9]{12})\|"

# Run each search once; a field that is absent falls back to "Not found"
drg_match = re.search(drg_pattern_HL7, hl7_message)
drg_code = drg_match.group(1) if drg_match else "Not found"

PD_match = re.search(PD_pattern_HL7, hl7_message)
# Keep only the first ^-separated component (the ICD code without its description)
PD_ICD = PD_match.group(1).split("^")[0] if PD_match else "Not found"

encounter_id_match = re.search(encounter_id_pattern, hl7_message)
encounter_id = encounter_id_match.group(1) if encounter_id_match else "Not found"

# Echo the message followed by the extracted values, one item per line
print(
    'HL7 Message:',
    '',
    hl7_message,
    '',
    'RegEx Extraction:',
    '',
    f"Encounter ID: {encounter_id}",
    f"DRG: {drg_code}",
    f"Primary Diagnosis: {PD_ICD}",
    sep='\n',
)

HL7 Message:

MSH|^~\&|HIS|RIH|EKG|EKG|202406131030||ADT^A08|MSG00001|P|2.3
EVN|A08|202406131030|||1234567
PID|1||12345678^^^HIS^MR||DOE^JOHN^A||19580615|M|||123 MAIN ST^^HOMETOWN^CA^99999^USA||(555)555-5555|||M|S||123456789|987-65-4320
PV1|1|I|ICU^101^1^RIH||||1234^Smith^Jane^A|||MED|||||12345678|A|||||||||||||||||||20240610
DG1|1||A41.9^Sepsis, unspecified organism^I10|Sepsis|20240610|A
DRG|871|0|20240613
PR1|1|ICD10PCS|5A1D60Z^Respiratory Ventilation, Less than 24 Consecutive Hours^I10|20240611


RegEx Extraction:

Encounter ID: 202406131030
DRG: 871
Primary Diagnosis: A41.9


### 2.2B Load ASC EDI X12 837/835 Message, Extract Primary Diagnosis and DRG

In [18]:
# Read the raw ASC X12 EDI message from a text file
with open('MWE - Sepsis/ASC_EDI_X12.txt', 'r') as file:
    EDI_message = file.read()

# Regular expressions for the fields of interest
# NOTE(review): the sample file carries the DRG under the HI*ABF qualifier;
# in standard X12 837 usage ABF is, as far as I know, an additional-diagnosis
# qualifier and DRG is sent with DR - confirm against the real feed.
drg_pattern_EDI = r"HI\*ABF:([\w\d]+)~"  # value between HI*ABF: and the ~ terminator
PD_pattern_EDI = r"HI\*ABK:([A-Za-z0-9.]+)~"  # value between HI*ABK: and the ~ terminator
claim_id_pattern = r"CLM\*([\w\d]+)\*"  # first element after CLM*

# drg_match / drg_code / PD_match / PD_ICD reuse the names assigned in the HL7
# cell (2.2A), so whichever of the two cells ran last supplies the values that
# later cells read. A field that is absent falls back to "Not found".
drg_match = re.search(drg_pattern_EDI, EDI_message)
drg_code = drg_match.group(1) if drg_match else "Not found"

PD_match = re.search(PD_pattern_EDI, EDI_message)
PD_ICD = PD_match.group(1) if PD_match else "Not found"

claim_id_match = re.search(claim_id_pattern, EDI_message)
claim_id = claim_id_match.group(1) if claim_id_match else "Not found"

# Echo the message followed by the extracted values, one item per line
print(
    'EDI X12 Message:',
    '',
    EDI_message,
    '',
    'RegEx Extraction:',
    '',
    f"Claim ID: {claim_id}",
    f"DRG: {drg_code}",
    f"Primary Diagnosis: {PD_ICD}",
    sep='\n',
)

EDI X12 Message:

ISA*00* 	*00* 
*ZZ*SENDER ID 
*ZZ*RECEIVER ID *20240613*1030*^*00501*000000001*0*T*:~ 
GS*HC*SENDER CODE*RECEIVER 
CODE*20240613*1030*1*X*005010X222A1~ 
ST*837*0001*005010X222A1~ 
BHT*0019*00*202406131030*20240613*1030*CH~ 
NM1*41*2*SENDER NAME*****46*123456789~ 
PER*IC*CONTACT NAME*TE*5555555555~ 
NM1*40*2*RECEIVER NAME*****46*987654321~ 
HL*1**20*1~ NM1*85*2*CUH*****XX*1234567893~ 
N3*123 MAIN ST~ 
N4*ATLANTA*GA*30303~ 
REF*EI*123456789~ 
HL*2*1*22*0~ 
SBR*P*18*MEDICARE*****MA~ 
NM1*IL*1*DOE*JOHN****MI*12345678~ 
N3*123 MAIN ST~ 
N4*ATLANTA*GA*30303~ 
DMG*D8*19500101*M~ 
NM1*PR*2*MEDICARE*****PI*MEDICARE~ 
HL*3*2*23*0~ PAT*19~ 
NM1*QC*1*DOE*JOHN~ 
N3*123 MAIN ST~ 
N4*ATLANTA*GA*30303~ 
CLM*12345678*1000***11::1*Y*A*Y*I~ 
HI*ABK:A41.9~ 
HI*ABF:871~ 
NM1*82*1*SMITH*JANE****XX*9876543210~ 
SBR*S*18****CI~ 
NM1*PR*2*CUH*****PI*123456~ 
SE*28*0001~ 
GE*1*1~ 
IEA*1*000000001~

RegEx Extraction:

Claim ID: 12345678
DRG: 871
Primary Diagnosis: A41.9


### 2.3 Manually separate progress note into structured table

In [19]:
# Cut the extracted text in two at character offset 2198 and collect both parts.
# The offset is hand-picked for this sample PDF (the second part starts at the
# "Follow-Up Note" heading); it must be re-derived if the source document changes.
progress_note_1, progress_note_2 = progress_notes_text[:2198], progress_notes_text[2198:]
progress_note_array = [progress_note_1, progress_note_2]

In [20]:
import pandas as pd

# One row per note; every row carries the same diagnosis code and DRG, taken
# from whichever billing-message cell (HL7 or EDI) was executed last.
data = [(str(note), PD_ICD, drg_code) for note in progress_note_array]

# Build the table from the row tuples and show it
df = pd.DataFrame(data, columns=['ProgessNote', 'PrimaryDiagnosis', 'MSDRG'])
df

Unnamed: 0,ProgessNote,PrimaryDiagnosis,MSDRG
0,Progress Notes \nPatient Name: John Doe \nMRN:...,A41.9,871
1,Follow-Up Note (2024-06-14): \nSubjective: \nP...,A41.9,871


In [21]:
# Add a hand-entered label column, one value per row in row order (the list
# length must equal the number of rows). Presumably "GS" stands for the
# gold-standard judgement of whether the DRG is clinically valid - confirm.
# assign() returns a new frame, so df itself is left unchanged.
df_gs = df.assign(ClinValidDRG_GS = ['Y','N'])
df_gs

Unnamed: 0,ProgessNote,PrimaryDiagnosis,MSDRG,ClinValidDRG_GS
0,Progress Notes \nPatient Name: John Doe \nMRN:...,A41.9,871,Y
1,Follow-Up Note (2024-06-14): \nSubjective: \nP...,A41.9,871,N


### Import DSPy, configure LLM

In [1]:
import dspy
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric

# Configure the language model DSPy uses by default
# NOTE(review): no API key is passed here, so it is presumably picked up from
# the environment rather than from OpenAIKey.txt - confirm.
turbo = dspy.OpenAI(model='gpt-3.5-turbo-instruct', max_tokens=250)
dspy.settings.configure(lm=turbo)

# Load the GSM8K math-question dataset and keep the first 10 examples of the
# train and dev splits
gsm8k = GSM8K()
gsm8k_trainset = gsm8k.train[:10]
gsm8k_devset = gsm8k.dev[:10]

100%|███████████████████████████████████████████████████████████████████████████| 7473/7473 [00:00<00:00, 29191.29it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1319/1319 [00:00<00:00, 29916.87it/s]


In [10]:
# Print the sampled training examples to inspect their fields
print(gsm8k_trainset)

[Example({'question': "The result from the 40-item Statistics exam Marion and Ella took already came out. Ella got 4 incorrect answers while Marion got 6 more than half the score of Ella. What is Marion's score?", 'gold_reasoning': "Ella's score is 40 items - 4 items = <<40-4=36>>36 items. Half of Ella's score is 36 items / 2 = <<36/2=18>>18 items. So, Marion's score is 18 items + 6 items = <<18+6=24>>24 items.", 'answer': '24'}) (input_keys={'question'}), Example({'question': "Stephen made 10 round trips up and down a 40,000 foot tall mountain. If he reached 3/4 of the mountain's height on each of his trips, calculate the total distance he covered.", 'gold_reasoning': 'Up a mountain, Stephen covered 3/4*40000 = <<3/4*40000=30000>>30000 feet. Coming down, Stephen covered another 30000 feet, making the total distance covered in one round to be 30000+30000 = <<30000+30000=60000>>60000. Since Stephen made 10 round trips up and down the mountain, he covered 10*60000 = <<10*60000=600000>>60

## 3) Define the Module

In [None]:
class CoT(dspy.Module):
    """DSPy module that wraps a single ``question -> answer`` ChainOfThought step."""

    def __init__(self):
        super().__init__()
        # Signature string: one input field "question", one output field "answer"
        self.prog = dspy.ChainOfThought("question -> answer")

    def forward(self, question):
        """Run the wrapped predictor on ``question`` and return what it produces."""
        prediction = self.prog(question=question)
        return prediction