In [106]:
import base64
import copy
import json
import os
from io import BytesIO
from pathlib import Path

import openai
from openai import OpenAI
from pdf2image import convert_from_path

In [107]:
def pdf_to_base64(pdf_path):
    """Convert a PDF into images and return each one as a base64 PNG string.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``convert_from_path``.

    Returns:
        A list of ``str``, one per image produced by ``convert_from_path``,
        each holding the base64 text of that image saved in PNG format.
    """
    def _as_base64_png(page):
        # Serialise the image into memory as PNG, then base64-encode the bytes.
        png_bytes = BytesIO()
        page.save(png_bytes, format="PNG")
        return base64.b64encode(png_bytes.getvalue()).decode()

    return [_as_base64_png(page) for page in convert_from_path(pdf_path)]

In [108]:
def pdf_ingest_and_extract(pdf_path, prompt_path, openai_api_key=None):
    """Send a PDF's page images plus a text prompt to a vision model and return its reply.

    Args:
        pdf_path: PDF to read; turned into base64 PNG images by ``pdf_to_base64``.
        prompt_path: Text file whose full contents become the text part of the message.
        openai_api_key: Key handed to ``OpenAI(api_key=...)``. Optional: when ``None``
            the client resolves the key on its own (NOTE(review): the OpenAI SDK
            documents a fallback to the ``OPENAI_API_KEY`` environment variable;
            confirm for the installed version).

    Returns:
        The message content of the first choice in the chat completion response.
    """
    images = pdf_to_base64(pdf_path)
    with open(prompt_path, "r") as prompt_file:
        prompt = prompt_file.read()

    # One text part followed by one image part per page.
    content = [{"type": "text", "text": prompt}]
    # pdf_to_base64 encodes the pages as PNG, so the data URL must declare image/png
    # (it previously claimed image/jpeg, which did not match the payload).
    content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{page}"}}
        for page in images
    )

    client = OpenAI(api_key=openai_api_key)
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": content,
            }
        ],
        max_tokens=4000,
        temperature=0,
    )

    return response.choices[0].message.content

In [89]:
# pdf_ingest_and_extract takes the API key as its third argument; read it from the
# environment instead of omitting it (TypeError) or hard-coding a secret.
pdf_extract = pdf_ingest_and_extract(
    "pdfs/medical-record-3.pdf",
    "pdf_extraction_prompt.txt",
    os.environ["OPENAI_API_KEY"],
)

In [174]:
def text_to_json(text, prompt_path, openai_api_key=None):
    """Ask a chat model to turn extracted document text into a JSON object.

    Args:
        text: Text inserted into the prompt after ``Input:``.
        prompt_path: Text file holding the instruction prompt; the input text is
            appended to its contents.
        openai_api_key: Key handed to ``OpenAI(api_key=...)``. Optional: when ``None``
            the client resolves the key on its own (NOTE(review): presumably from
            the ``OPENAI_API_KEY`` environment variable; confirm).

    Returns:
        The Python object produced by ``json.loads`` on the model reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    with open(prompt_path, "r") as prompt_file:
        prompt = prompt_file.read() + "\n\n" + f"Input: {text}\nOutput:"

    client = OpenAI(api_key=openai_api_key)
    # Call through the client built above so the supplied key is actually used;
    # the module-level `openai.chat` API ignores it.
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        response_format={"type": "json_object"},
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
    )

    output = response.choices[0].message.content
    # Keep only the text between "<json>\n" and "\n</json>" when those markers are
    # present; a reply without them passes through unchanged.
    output = output.split("<json>\n")[-1].split("\n</json>")[0]

    return json.loads(output)

In [96]:
# text_to_json takes the API key as its third argument; read it from the environment.
pdf_json = text_to_json(
    pdf_extract,
    "text_to_json_prompt.txt",
    os.environ["OPENAI_API_KEY"],
)

In [97]:
# Display the object parsed by text_to_json for the single-PDF extraction.
pdf_json

{'CPT': ['47562', '45378', '43235', '43239'],
 'PatientMedicalRecord': {'Name': 'Michail Antonio',
  'DateOfBirth': '01/01/1975',
  'DateOfRecord': '12/06/2023',
  'Gender': 'Male',
  'Address': '101 Hospital Drive, Dallas, Texas 91126',
  'ContactNumber': '(545) 555-1111',
  'EmergencyContact': '(677) 234-8873',
  'ChiefComplaint': 'Persistent upper abdominal pain and heartburn since May 2023.'},
 'HistoryOfPresentIllness': {'Description': 'Mr. Doe, a 48-year-old male, reports a 6-month history of upper abdominal pain, heartburn, and occasional nausea beginning around May 2023. Symptoms worsen post-meal, especially after consuming spicy foods. He denies vomiting, weight loss, or changes in bowel habits. Over-the-counter antacids, started in June 2023, provide minimal relief. No history of gastrointestinal bleeding.'},
 'PastMedicalHistory': {'Conditions': ['Hypertension diagnosed in 2015, controlled with medication.'],
  'PreviousMedicalProcedures': [{'Procedure': 'Cholecystectomy on 

In [121]:
from collections import defaultdict

pdf_dir = Path('pdfs/')
# Column-oriented accumulator: each key maps to a list with one entry per PDF.
pdf_extracts = defaultdict(list)

# Grab all PDF files (glob order is kept as-is; later cells index into these lists)
pdf_files = list(pdf_dir.glob('*.pdf'))

# Read the key once from the environment; the extraction helper takes it as its third argument.
api_key = os.environ["OPENAI_API_KEY"]

# Record each file's name (without extension) and the text extracted from it
for pdf_path in pdf_files:
    # Path.stem drops the directory and the ".pdf" suffix on any OS, unlike splitting on "/".
    pdf_extracts["study"].append(pdf_path.stem)
    pdf_extracts["raw_text"].append(
        pdf_ingest_and_extract(pdf_path, "pdf_extraction_prompt.txt", api_key)
    )

In [199]:
# Start the "json" entry as an empty list; the conversion loop in the next cell appends to it.
pdf_extracts["json"] = []

In [200]:
# Convert every extracted text into a JSON object, keeping the same order as "raw_text".
# text_to_json takes the API key as its third argument; read it from the environment.
for raw_text in pdf_extracts["raw_text"]:
    pdf_extracts["json"].append(
        text_to_json(raw_text, "text_to_json_prompt.txt", os.environ["OPENAI_API_KEY"])
    )

In [201]:
# Blank guideline checklist. Inner nodes hold a "criteria" dict of sub-criteria;
# every leaf is a {"condition met?", "evidence"} pair that starts out as None.
# The leaf key is spelled "condition met?" everywhere (two leaves previously used
# "condition met" without the question mark, so the leaves disagreed on their key).
guidelines = {
    "guidelines": {
        "Colorectal cancer screening": {
            "criteria": {
                "Patient has average-risk or higher": {
                    "criteria": {
                        "Age 45 years or older?": {
                            "condition met?": None,
                            "evidence": None
                        },
                        "No colonoscopy in past 10 years?": {
                            "condition met?": None,
                            "evidence": None
                        },
                    }
                },
                "High risk family history": {
                    "criteria": {
                        "Colorectal cancer diagnosed in one or more first-degree relatives of any age": {
                            "criteria": {
                                "Age 40 years or older": {
                                    "condition met?": None,
                                    "evidence": None
                                },
                                "Symptomatic (eg, abdominal pain, iron deficiency anemia, rectal bleeding)": {
                                    "condition met?": None,
                                    "evidence": None
                                }
                            }
                        },
                        "Family member with colonic adenomatous polyposis of unknown etiology": {
                            "condition met?": None,
                            "evidence": None
                        }
                    }
                },
                "Juvenile polyposis syndrome diagnosis": {
                    "criteria": {
                        "Age 12 years or older and symptomatic (Abdominal pain, Iron deficiency anemia, Rectal bleeding, Telangiectasia)": {
                            "condition met?": None,
                            "evidence": None
                        },
                        "Age younger than 12 years and symptomatic (Abdominal pain, Iron deficiency anemia, Rectal bleeding, Telangiectasia)": {
                            "condition met?": None,
                            "evidence": None
                        }
                    }
                }
            }
        }
    }
}

In [210]:
# Blank decision template: each field under "decision" starts out as None.
decision = {
    "decision": dict.fromkeys(
        ("approve?", "rationale", "additional information required?")
    )
}

In [202]:
for j in pdf_extracts["json"]:
    pdf_extracts["updated_json"].append(j.update(guidelines))

In [211]:
# dict.update() returns None, so appending its result only added None values (and a
# second batch of them) to "updated_json". Rebuild the list instead: one new dict per
# record carrying the record plus its own copies of the guideline and decision templates.
# Assigning (not appending) keeps the list the same length as "json" on every re-run.
pdf_extracts["updated_json"] = [
    {**record, **copy.deepcopy(guidelines), **copy.deepcopy(decision)}
    for record in pdf_extracts["json"]
]

In [212]:
# Display the whole accumulator (every key and its list of per-PDF values).
pdf_extracts

defaultdict(list,
            {'study': ['medical-record-1',
              'medical-record-3',
              'medical-record-2'],
             'raw_text': ['MEDICAL RECORD\n\nPatient Name: James Freeman\n\nDOB: 06/16/1982\n\nMRN: 456789123\n\nSex: Male\n\nPRESENTING COMPLAINT\n\nSymptoms: Occasional rectal bleeding and abdominal discomfort for the past 6 months.\n\nDuration: 6 months.\n\nPATIENT INFORMATION\n\nName: James Freeman\n\nDOB: 06/16/1982\n\nGender: Male\n\nAddress: 4521 Maple Avenue, Dallas, Texas 75219\n\nContact Number: (214) 555-0123\n\nEmergency Contact: Not provided\n\nMEDICAL HISTORY\n\n- Family History: Father had colorectal cancer at age 68.\n- Personal Medical History: Hypertension, managed with medication.\n- Medications: Lisinopril 10mg daily.\n- Allergies: No known drug allergies.\n\nALLERGIES\n\n- Allergies not reviewed (last reviewed 11/28/2022)\n- NKDA\n\nVITALS\n\nHt: 6ft2\n\nWt: 165lbs\n\nBMI: 31.2\n\nPulse: 96bpm\n\nPHYSICAL EXAMINATION\n\nFindings: Mild te

In [184]:
import pandas as pd

# Tabulate only the three per-PDF columns that are filled in lockstep.
# Passing the whole accumulator fails as soon as any other key holds a list of a
# different length (pandas requires equal-length columns).
df = pd.DataFrame({column: pdf_extracts[column] for column in ("study", "raw_text", "json")})
df

Unnamed: 0,study,raw_text,json
0,medical-record-1,MEDICAL RECORD\n\nPatient Name: James Freeman\...,"{'CPT': ['44970', '29881', '45378'], 'MEDICAL ..."
1,medical-record-3,PATIENT MEDICAL RECORD\n\nName: Michail Antoni...,"{'CPT': ['47562', '45378', '43235', '43239'], ..."
2,medical-record-2,PATIENT MEDICAL RECORD\n\nName: James Maddison...,"{'Name': 'James Maddison', 'Date of Birth': '0..."


In [216]:
def create_json_schema(jsons, prompt_path, openai_api_key=None):
    """Ask a chat model to produce a JSON object (a schema) from example records.

    Args:
        jsons: Text inserted into the prompt after ``Input:``.
        prompt_path: Text file holding the instruction prompt; the input is
            appended to its contents.
        openai_api_key: Key handed to ``OpenAI(api_key=...)``. Optional: when ``None``
            the client resolves the key on its own (NOTE(review): presumably from
            the ``OPENAI_API_KEY`` environment variable; confirm). This used to be
            read from an undefined global name.

    Returns:
        The Python object produced by ``json.loads`` on the model reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    with open(prompt_path, "r") as prompt_file:
        prompt = prompt_file.read() + "\n\n" + f"Input: {jsons}\nOutput:"

    client = OpenAI(api_key=openai_api_key)
    # Call through the client built above so the supplied key is actually used;
    # the module-level `openai.chat` API ignores it.
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        response_format={"type": "json_object"},
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
    )

    output = response.choices[0].message.content
    # Keep only the text between "<json>\n" and "\n</json>" when those markers are
    # present; a reply without them passes through unchanged.
    output = output.split("<json>\n")[-1].split("\n</json>")[0]

    return json.loads(output)

In [217]:
# Serialise each record with json.dumps so the text inside the <json> tags is real JSON.
# str() on a dict yields Python repr (single quotes, None/True) rather than JSON.
concat_jsons = "<json>" + "\n".join(json.dumps(record) for record in pdf_extracts["json"]) + "</json>"

json_schema = create_json_schema(concat_jsons, "json_schema_prompt.txt")

In [218]:
# Display the object returned by create_json_schema.
json_schema

{'$schema': 'http://json-schema.org/draft-07/schema#',
 'type': 'object',
 'properties': {'CPT': {'type': 'array',
   'items': {'type': 'string'},
   'minItems': 1},
  'MEDICAL RECORD': {'type': 'object',
   'properties': {'Patient Name': {'type': 'string'},
    'DOB': {'type': 'string'},
    'MRN': {'type': 'string'},
    'Sex': {'type': 'string'}},
   'required': ['Patient Name', 'DOB', 'MRN', 'Sex']},
  'PRESENTING COMPLAINT': {'type': 'object',
   'properties': {'Symptoms': {'type': 'string'},
    'Duration': {'type': 'string'}},
   'required': ['Symptoms', 'Duration']},
  'PATIENT INFORMATION': {'type': 'object',
   'properties': {'Name': {'type': 'string'},
    'DOB': {'type': 'string'},
    'Gender': {'type': 'string'},
    'Address': {'type': 'string'},
    'Contact Number': {'type': 'string'},
    'Emergency Contact': {'type': 'string'}},
   'required': ['Name',
    'DOB',
    'Gender',
    'Address',
    'Contact Number',
    'Emergency Contact']},
  'MEDICAL HISTORY': {'type

In [221]:
# Persist the generated schema as JSON text in json_schema.json.
with open("json_schema.json", "w") as schema_file:
    schema_file.write(json.dumps(json_schema))

In [353]:
def text_to_unified_json(text, json_schema, prompt_path, model="gpt-4-1106-preview", openai_api_key=None):
    """Ask a chat model to turn medical document text into a JSON object.

    Args:
        text: Text inserted into the prompt after ``medical document input:``.
        json_schema: Accepted for interface compatibility but not inserted into
            the prompt by this function (NOTE(review): presumably the prompt file
            already embeds the schema; confirm).
        prompt_path: Text file holding the instruction prompt; the input text is
            appended to its contents.
        model: Chat model name passed to the completion call.
        openai_api_key: Key handed to ``OpenAI(api_key=...)``. Optional: when ``None``
            the client resolves the key on its own (NOTE(review): presumably from
            the ``OPENAI_API_KEY`` environment variable; confirm). This used to be
            read from an undefined global name.

    Returns:
        The Python object produced by ``json.loads`` on the model reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    with open(prompt_path, "r") as prompt_file:
        prompt = prompt_file.read() + "\n\n" + f"medical document input: {text}\noutput:"

    client = OpenAI(api_key=openai_api_key)
    # Call through the client built above so the supplied key is actually used;
    # the module-level `openai.chat` API ignores it.
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
    )

    output = response.choices[0].message.content
    # Keep only the text between "<json>\n" and "\n</json>" when those markers are
    # present; a reply without them passes through unchanged.
    output = output.split("<json>\n")[-1].split("\n</json>")[0]

    return json.loads(output)

In [320]:
# Append one schema-guided extraction per stored document text to "unified_json".
pdf_extracts["unified_json"].extend(
    text_to_unified_json(raw_text, json_schema, "text_to_json_schema_prompt.txt")
    for raw_text in pdf_extracts["raw_text"]
)

In [321]:
# Display the list of schema-guided extractions collected under "unified_json".
pdf_extracts["unified_json"]

[{'CPT': ['44970', '29881', '45378'],
  'MEDICAL RECORD': {'Patient Name': 'James Freeman',
   'DOB': '06/16/1982',
   'MRN': '456789123',
   'Sex': 'Male',
   'other': 'not provided'},
  'PRESENTING COMPLAINT': {'Symptoms': 'Occasional rectal bleeding and abdominal discomfort',
   'Duration': '6 months',
   'other': 'not provided'},
  'PATIENT INFORMATION': {'Name': 'James Freeman',
   'DOB': '06/16/1982',
   'Gender': 'Male',
   'Address': '4521 Maple Avenue, Dallas, Texas 75219',
   'Contact Number': '(214) 555-0123',
   'Emergency Contact': 'not provided',
   'other': 'not provided'},
  'MEDICAL HISTORY': {'Family History': ['Father had colorectal cancer at age 68'],
   'Personal Medical History': ['Hypertension, managed with medication'],
   'Medications': ['Lisinopril 10mg daily'],
   'Allergies': ['No known drug allergies'],
   'other': 'not provided'},
  'ALLERGIES': {'Allergies not reviewed': 'last reviewed 11/28/2022',
   'NKDA': True,
   'other': 'not provided'},
  'VITALS':

In [355]:
# Run the schema-guided extraction on every stored document text and print each result.
for raw_text in pdf_extracts["raw_text"]:
    unified_record = text_to_unified_json(raw_text, json_schema, "text_to_json_schema_prompt.txt")
    print(unified_record)

{'CPT': ['44970', '29881', '45378'], 'MEDICAL RECORD': {'Patient Name': 'James Freeman', 'DOB': '06/16/1982', 'MRN': '456789123', 'Sex': 'Male', 'other': 'not provided'}, 'PRESENTING COMPLAINT': {'Symptoms': 'Occasional rectal bleeding and abdominal discomfort', 'Duration': '6 months', 'other': 'not provided'}, 'PATIENT INFORMATION': {'Name': 'James Freeman', 'DOB': '06/16/1982', 'Gender': 'Male', 'Address': '4521 Maple Avenue, Dallas, Texas 75219', 'Contact Number': '(214) 555-0123', 'Emergency Contact': 'Not provided', 'other': 'not provided'}, 'MEDICAL HISTORY': {'Family History': ['Father had colorectal cancer at age 68'], 'Personal Medical History': ['Hypertension, managed with medication'], 'Medications': ['Lisinopril 10mg daily'], 'Allergies': ['No known drug allergies'], 'other': 'not provided'}, 'ALLERGIES': {'Allergies not reviewed': 'last reviewed 11/28/2022', 'NKDA': True, 'other': 'not provided'}, 'VITALS': {'Ht': '6ft2', 'Wt': '165lbs', 'BMI': 31.2, 'Pulse': '96bpm', 'oth

In [351]:
# Same extraction with gpt-3.5-turbo-1106, limited to the stored texts from index 2 onward.
for raw_text in pdf_extracts["raw_text"][2:]:
    unified_record = text_to_unified_json(
        raw_text,
        json_schema,
        "text_to_json_schema_prompt.txt",
        model="gpt-3.5-turbo-1106",
    )
    print(unified_record)

{'CPT': ['45378'], 'MEDICAL RECORD': {'Patient Name': 'James Maddison', 'DOB': '03/15/1965', 'MRN': 'not provided', 'Sex': 'Male', 'other': 'Address: 1234 Sunset Blvd, Los Angeles, California 90026\nContact Number: (214) 555-0123\nEmergency Contact: (214) 555-0456'}, 'PRESENTING COMPLAINT': {'Symptoms': 'Patient mentions occasional rectal bleeding and abdominal discomfort, with somewhat frequent occurrence and some severity.', 'Duration': 'The symptoms have been present for a few months', 'other': 'not provided'}, 'PATIENT INFORMATION': {'Name': 'James Maddison', 'DOB': '03/15/1965', 'Gender': 'Male', 'Address': '1234 Sunset Blvd, Los Angeles, California 90026', 'Contact Number': '(214) 555-0123', 'Emergency Contact': '(214) 555-0456', 'other': 'not provided'}, 'MEDICAL HISTORY': {'Family History': ['Father had colorectal cancer at a significantly advanced age.'], 'Personal Medical History': ['Hypertension, reportedly managed with medication.'], 'Medications': ['Lisinopril 10 mg daily,

In [304]:
# Run the extraction three times over all documents to compare run-to-run output.
for i in range(3):
    for raw_text in pdf_extracts["raw_text"]:
        try:
            print(text_to_unified_json(raw_text, json_schema, "text_to_json_schema_prompt.txt"))
        except json.JSONDecodeError as err:
            # The model reply is not always valid JSON. Report the failure and keep
            # going so one bad reply does not abort the remaining documents and runs.
            print(f"JSONDecodeError: {err}")
        print("\n\n")
    print("---------------------------------------------------")

{'CPT': ['44970', '29881', '45378'], 'MEDICAL RECORD': {'Patient Name': 'James Freeman', 'DOB': '06/16/1982', 'MRN': '456789123', 'Sex': 'Male'}, 'PRESENTING COMPLAINT': {'Symptoms': 'Occasional rectal bleeding and abdominal discomfort', 'Duration': '6 months'}, 'PATIENT INFORMATION': {'Name': 'James Freeman', 'DOB': '06/16/1982', 'Gender': 'Male', 'Address': '4521 Maple Avenue, Dallas, Texas 75219', 'Contact Number': '(214) 555-0123', 'Emergency Contact': 'not provided'}, 'MEDICAL HISTORY': {'Family History': ['Father had colorectal cancer at age 68'], 'Personal Medical History': ['Hypertension, managed with medication'], 'Medications': ['Lisinopril 10mg daily'], 'Allergies': ['No known drug allergies']}, 'ALLERGIES': {'Allergies not reviewed': 'last reviewed 11/28/2022', 'NKDA': True}, 'VITALS': {'Ht': '6ft2', 'Wt': '165lbs', 'BMI': 31.2, 'Pulse': '96bpm'}, 'PHYSICAL EXAMINATION': {'Findings': 'Mild tenderness in the lower abdomen, no palpable masses'}, 'DIAGNOSTIC TESTS AND RESULTS'

JSONDecodeError: Expecting ',' delimiter: line 1244 column 4 (char 10013)

In [None]:
# Draft prompt wording (notes, not code): "... but if it is merely a rewording of existing fields in the schema, you should incorporate it under the existing relevant key(s) of the schema."