# Studying Hospital Readmission Risk with Synthetic Patient Data

### Source: synthea.mitre.org

### Dataset: 100 Sample Synthetic Patient Records, CSV: 7 MB

## Libraries & Setup

In [96]:
# Load required libraries
import pandas as pd
from datetime import datetime
# Load required libraries for API requests
import requests
import json
# Load operating system library & utils
import os
import textwrap
import shutil

# Get the width of the console window (for formatting)
console_width = shutil.get_terminal_size().columns

## Load and Explore Patient Data

In [97]:
# Local data folder
data_path = 'synthea_sample_data_csv'  

# Load relevant CSVs
patients_df = pd.read_csv(os.path.join(data_path, 'patients.csv'))
encounters_df = pd.read_csv(os.path.join(data_path, 'encounters.csv'))
conditions_df = pd.read_csv(os.path.join(data_path, 'conditions.csv'))
medications_df = pd.read_csv(os.path.join(data_path, 'medications.csv'))

# Preview the datasets
print("Patients:", patients_df.shape)
print("Encounters:", encounters_df.shape)
print("Conditions:", conditions_df.shape)
print("Medications:", medications_df.shape)
    

Patients: (1163, 25)
Encounters: (61459, 15)
Conditions: (38094, 6)
Medications: (56430, 13)


## Pretty Print to Console

In [110]:
def pretty_print_patient_summary(ground_truth, llm_prediction):
    """
    Print a summary of the patient's information, including the ground truth and LLM prediction.
    Args:
        ground_truth (str): The ground truth information.
        llm_prediction_cot (str): The LLM prediction information.
    """
    # Print a separator line
    print("=" * console_width)

    # Print the ground truth, centered
    ground_truth_text = f"Ground Truth: {ground_truth}"
    print(ground_truth_text.center(console_width))

    # Print another separator line
    print("=" * console_width)

    # Print the LLM prediction, wrapped to fit the console width
    llm_prediction_text = f"LLM (CoT): {llm_prediction}"
    wrapped_prediction = textwrap.fill(llm_prediction_text, width=console_width)
    print(wrapped_prediction)

    # Print a final separator line
    print("=" * console_width)

## Feature Engineer Patient Ages

In [98]:
# Function to parse date strings safely
def safe_parse_date(date_str):
    try:
        if pd.isnull(date_str):
            return None
        return datetime.strptime(str(date_str), "%Y-%m-%d")
    except Exception:
        return None
    
# Function to calculate age from BIRTHDATE and DEATHDATE
def calculate_age(dob, dod=None):
    dob_date = safe_parse_date(dob)
    dod_date = safe_parse_date(dod) or datetime.today()

    if dob_date is None:
        return None
    
    age = (dod_date - dob_date).days // 365
    return age

# Apply calculate_age function to create "age" column
patients_df['AGE'] = patients_df.apply(lambda row: calculate_age(row['BIRTHDATE'], row['DEATHDATE']), axis=1)

# Filter out rows with age greater than or equal to 120
filtered_patients_df = patients_df[patients_df['AGE'] < 120]


## Sanity Check Section

In [None]:
conditions_df.head()

In [None]:
patients_df.head()

In [None]:
encounters_df.head()

In [None]:
# show latest date in STOP column
latest_stop_date = pd.to_datetime(encounters_df['STOP'].max())
print("Latest STOP date:", latest_stop_date)

In [None]:
medications_df.head()

## Define Readmission Risk Rule (Ground Truth)

In [211]:
# Define basic rule:
# "High Risk" if patient has >=2 chronic conditions AND >=2 inpatient encounters in the past year

# Define chronic condition keywords, according to LACE, Charlson, and CMS 30-day readmission measures
chronic_conditions = ['Hypertension', 'Diabetes', 'COPD', 'CHF', 'CKD']

# Filter for chronic conditions
chronic = conditions_df[conditions_df['DESCRIPTION'].isin(chronic_conditions)]

# Convert encounter dates
encounters_df['START'] = pd.to_datetime(encounters_df['START'], errors='coerce')

# Filter for recent inpatient encounters (e.g., in prev year from the most recent visit)
recent_date = encounters_df['START'].max() - pd.Timedelta(days=365)
recent_inpatient = encounters_df[
    (encounters_df['ENCOUNTERCLASS'] == 'inpatient') &
    (encounters_df['START'] >= recent_date)
]

# Count chronic conditions and inpatient encounters per patient
chronic_counts = chronic.groupby('PATIENT').size()
inpatient_counts = recent_inpatient.groupby('PATIENT').size()

# Merge counts and label high risk
risk_df = pd.DataFrame({
    'CHRONIC_COUNT': chronic_counts,
    'INPATIENT_COUNT': inpatient_counts
}).fillna(0)

# Merge in age from patients_df
risk_df = risk_df.merge(
    patients_df[['Id', 'AGE']],
    left_index=True,
    right_on='Id',
    how='left'
).set_index('Id')

risk_df['RISK_LABEL'] = (
    (risk_df['CHRONIC_COUNT'] >= 2) |
    (risk_df['INPATIENT_COUNT'] >= 3) |
    (((risk_df['CHRONIC_COUNT'] + risk_df['INPATIENT_COUNT']) >= 2) & (risk_df['AGE'] >= 65)) |
    (risk_df['AGE'] >= 80) 
).map({True: 'High', False: 'Low'})

risk_df.head()
    

Unnamed: 0_level_0,CHRONIC_COUNT,INPATIENT_COUNT,AGE,RISK_LABEL
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00126cb9-8460-4747-e302-c3609684531e,1.0,0.0,37,Low
00209bf2-8e4d-06d1-82a4-daad02f25829,2.0,0.0,82,High
0142b69f-57f0-9a08-4e2d-65a2b77fdea7,2.0,0.0,64,High
02ceca12-357f-981e-dcf3-3d26d3c1ff82,1.0,0.0,61,Low
0354da61-b0c8-3469-d3f6-c2fd0f508b7a,1.0,0.0,50,Low


## Ollama 3.2 Setup & Query Function Definition

In [210]:
# Set up Ollama endpoint & model
OLLAMA_ENDPOINT = 'http://localhost:11434/api/generate'
MODEL = 'llama3.2'

def get_llm_response(prompt):
    # Send request
    response = requests.post(
        OLLAMA_ENDPOINT,
        headers={'Content-Type': 'application/json'},
        json={
            'model': MODEL,
            'prompt': prompt,
            'temperature': 0.7, # Adjust the temperature for randomness
            'max_tokens': 50, # Adjust the max tokens for the response length
            # 'stream:': False # Set to true for streaming responses
        }
    )
    # Collecting data
    full_response = ""

    for line in response.iter_lines():
        if line:
            try:
                partial = json.loads(line.decode('utf-8'))
                full_response += partial.get('response', '')
            except json.JSONDecodeError as e:
                print("Error decoding JSON:", e)

    # Print the full response
    return full_response

## Test Prompt

In [102]:
# Define prompt
test_prompt = "What is the capital of France?"

test_response = get_llm_response(test_prompt)
print("Test response:", test_response)

Test response: The capital of France is Paris.


### Zero-Shot Prompt

In [153]:
# Define prompt
zero_shot_prompt = "Decide in a single word if a 72-year old patient who has a chronic condition and has been hospitalized twice in the past year is at high risk for readmission. Answer with 'high' or 'low'. Patient ID: 12345. Chronic conditions: Hypertension, Diabetes. Inpatient encounters: 3. Is this patient high risk or low risk?"

# Print the response
print("Expected answer: High")
print("================================")

# Send request
zero_shot_request = get_llm_response(zero_shot_prompt)
print("One-time request response:", zero_shot_request)


Expected answer: High
One-time request response: High.


### One-Shot Prompt

In [190]:
# Define prompt
one_shot_prompt = f"""
    Decide in a single word if a patient is high risk or low risk for hospital readmission based on the following example.
    
    Example 1: 
    - Age: 55 
    - Conditions: Asthma 
    - Medications: Albuterol, Lasix 
    - Encounters in last year: 2 inpatient 
    
    Answer: Low
    
    Now classify this patient: 
    - Age: 59
    - Conditions: Hypertension
    - Medications: Metformin 
    - Encounters in last year: 2 inpatient 
    
    Answer:"
"""

# Print expectation
print("Expected answer: Low")
print("================================")

# Send request
one_shot_request = get_llm_response(one_shot_prompt)
print("Zero-time request response:", one_shot_request)

Expected answer: Low
Zero-time request response: Low.


## Few-Shot Prompt

In [191]:
# Define prompt
few_shot_prompt = f"""
    Decide in a single word if a patient is high risk or low risk for hospital readmission based on the following example.

    Example 1: 
    - Age: 55 
    - Conditions: Asthma 
    - Medications: Albuterol 
    - Encounters in last year: 0 inpatient 
    Answer: Low 
    
    Example 2: 
    - Age: 81 
    - Conditions: CHF, Diabetes, Hypertension 
    - Medications: Metformin, Lisinopril, Lasix 
    - Encounters in last year: 3 inpatient 
    Answer: High 
    
    Example 3: 
    - Age: 43 
    - Conditions: Depression 
    - Medications: Sertraline 
    - Encounters in last year: 1 outpatient 
    Answer: Low 
    
    Example 4: 
    - Age: 73 
    - Conditions: Depression 
    - Medications: Sertraline 
    - Encounters in last year: 2 outpatient 
    Answer: High 
    
    Now classify this patient: 
    - Age: 72 
    - Conditions: Hypertension 
    - Medications: Metformin 
    - Encounters in last year: 2 inpatient 
    
    Answer:"
"""

# Print expectation
print("Expected answer: High")
print("================================")

# Send request
few_shot_request = get_llm_response(few_shot_prompt)
print("Few-shot request response:", few_shot_request)

Expected answer: High
Few-shot request response: High.


## Construct Patient Summaries for LLM Input

In [None]:
# Generate a patient profile summary for prompting
def generate_summary(patient_id):
    demo = patients_df[patients_df['Id'] == patient_id].iloc[0]
    chronic = conditions_df[
        (conditions_df['PATIENT'] == patient_id) &
        (conditions_df['DESCRIPTION'].isin(chronic_conditions))
    ]['DESCRIPTION'].tolist()
    med_list = medications_df[medications_df['PATIENT'] == patient_id]['DESCRIPTION'].unique().tolist()
    encounters_recent = recent_inpatient[recent_inpatient['PATIENT'] == patient_id]
    
    summary = f"""
    Patient age: {demo['AGE']}
    Gender: {demo['GENDER']}
    Chronic conditions: {', '.join(chronic) if chronic else 'None'}
    Number of inpatient hospitalizations in past year: {len(encounters_recent)}
    Current medications: {', '.join(med_list) if med_list else 'None'}
    """
    return summary.strip()

# Test it
sample_patient_id = risk_df.index[42]
sample_patient_summary = generate_summary(sample_patient_id)
print(sample_patient_summary)

pretty_print_patient_summary("HIGH", sample_patient_summary)
    

Patient age: 79
    Gender: F
    Chronic conditions: None
    Number of inpatient hospitalizations in past year: 2
    Current medications: Acetaminophen 325 MG Oral Tablet, Alendronic acid 10 MG Oral Tablet, Cyclophosphamide 1000 MG Injection, 5 ML fulvestrant 50 MG/ML Prefilled Syringe, palbociclib 100 MG Oral Capsule
                               Ground Truth: HIGH                               
LLM (CoT): Patient age: 79     Gender: F     Chronic conditions: None     Number
of inpatient hospitalizations in past year: 2     Current medications:
Acetaminophen 325 MG Oral Tablet, Alendronic acid 10 MG Oral Tablet,
Cyclophosphamide 1000 MG Injection, 5 ML fulvestrant 50 MG/ML Prefilled Syringe,
palbociclib 100 MG Oral Capsule


## Chain-of-Thought Prompt

In [217]:
# CoT prompt for one patient summary
summary = generate_summary(sample_patient_id)

# Chain-of-Thought Prompt
cot_prompt = f"""
You are a clinical reasoning model. Based on the following summary, assess whether the patient is at high or low risk of hospital readmission. Think step by step before answering.

{summary}
"""

print("Chain-of-Thought Prompt:\n", cot_prompt)
    

Chain-of-Thought Prompt:
 
You are a clinical reasoning model. Based on the following summary, assess whether the patient is at high or low risk of hospital readmission. Think step by step before answering.

Patient age: 79
    Gender: F
    Chronic conditions: None
    Number of inpatient hospitalizations in past year: 2
    Current medications: Acetaminophen 325 MG Oral Tablet, Alendronic acid 10 MG Oral Tablet, Cyclophosphamide 1000 MG Injection, 5 ML fulvestrant 50 MG/ML Prefilled Syringe, palbociclib 100 MG Oral Capsule



## Chain-of-Thought Response

In [220]:
# Classify one sample
llm_prediction_cot = get_llm_response(cot_prompt)

ground_truth = risk_df.loc[sample_patient_id]['RISK_LABEL']
print("Ground Truth: ", ground_truth)
print("===============================")
print("LLM (CoT):", llm_prediction_cot)

pretty_print_patient_summary(ground_truth, llm_prediction_cot)

Ground Truth:  High
LLM (CoT): To assess the patient's risk of hospital readmission, I'll consider various factors:

1. **Age**: The patient is 79 years old, which is considered elderly. Older adults are more likely to experience complications and have higher rates of readmissions due to comorbidities, functional decline, and polypharmacy.

2. **Chronic conditions**: The patient has no listed chronic conditions, which could reduce the risk of hospital readmission. However, it's essential to note that the presence or absence of chronic conditions is just one factor.

3. **Number of inpatient hospitalizations in past year**: The patient has had two inpatient hospitalizations in the past year. This indicates a higher risk of hospital readmission due to recent healthcare utilization and potential underlying health issues.

4. **Current medications**: The patient is taking multiple medications, including chemotherapy agents (cyclophosphamide and fulvestrant) and hormone therapy (palbociclib

## Tree-of-Thought Prompt

In [221]:
# Tot prompt for one patient summary
tot_prompt = f"""
You are a clinical reasoning model that considers multiple reasoning paths before deciding.

Patient summary:
{summary}

Path A: Analyze risk based on age.
Path B: Analyze risk based on chronic conditions.
Path C: Analyze risk based on hospitalizations.
Path D: Analyze risk based on medications or management.

Evaluate each path, then make a final decision: is the readmission risk high or low?
"""

print("\nTree-of-Thought Prompt:\n", tot_prompt)


Tree-of-Thought Prompt:
 
You are a clinical reasoning model that considers multiple reasoning paths before deciding.

Patient summary:
Patient age: 79
    Gender: F
    Chronic conditions: None
    Number of inpatient hospitalizations in past year: 2
    Current medications: Acetaminophen 325 MG Oral Tablet, Alendronic acid 10 MG Oral Tablet, Cyclophosphamide 1000 MG Injection, 5 ML fulvestrant 50 MG/ML Prefilled Syringe, palbociclib 100 MG Oral Capsule

Path A: Analyze risk based on age.
Path B: Analyze risk based on chronic conditions.
Path C: Analyze risk based on hospitalizations.
Path D: Analyze risk based on medications or management.

Evaluate each path, then make a final decision: is the readmission risk high or low?



## Tree of thought response

In [222]:
# Classify one sample
llm_prediction_tot = get_llm_response(tot_prompt)

ground_truth = risk_df.loc[sample_patient_id]['RISK_LABEL']
print("Ground Truth: ", ground_truth)
print("===============================")
print("LLM (ToT): ", llm_prediction_tot)

pretty_print_patient_summary(ground_truth, llm_prediction_tot)

Ground Truth:  High
LLM (ToT):  To evaluate the readmission risk, I will analyze each of the given paths and consider multiple reasoning paths before making a final decision.

**Path A: Age-based analysis**

As the patient is 79 years old, age can be considered as a significant factor in predicting readmission. Older adults are more susceptible to hospital readmissions due to declining physical function, comorbidities, and polypharmacy (multiple medications). However, without further information about the patient's functional status, cognitive abilities, or specific medical conditions, it is difficult to assign a high or low risk based solely on age.

**Path B: Chronic condition analysis**

The patient has no chronic conditions listed in the summary. The absence of chronic conditions may indicate that the patient's overall health status is relatively stable, which could contribute to a lower readmission risk. However, this analysis assumes that the absence of chronic conditions means t

In [None]:
# Define prompt
test_prompt = "What is the capital of France?"

# Send request
def generate_request(prompt):
    request = requests.post(
        OLLAMA_ENDPOINT,
        headers={'Content-Type': 'application/json'},
        json={
            'model': MODEL,
            'prompt': prompt,
            'temperature': 0.7, # Adjust the temperature for randomness
            'max_tokens': 50, # Adjust the max tokens for the response length
            # 'stream:': False # Set to true for streaming responses
        }
    )
    return request

print(generate_request(test_prompt))

test_request = generate_request(test_prompt)

# Get response
def formatted_response(response):
    raw_response = response.text
    json_objects = raw_response.splitlines()  # Split by lines if JSON objects are newline-separated

    for obj in json_objects:
        try:
            data = json.loads(obj)  # Parse each JSON object
            print("Parsed JSON:", data)
        except json.JSONDecodeError as e:
            print("Error decoding JSON:", e)
            
    # Collecting data
    full_response = ""

    for line in response.iter_lines():
        if line:
            try:
                partial = json.loads(line.decode('utf-8'))
                full_response += partial.get('response', '')
            except json.JSONDecodeError as e:
                print("Error decoding JSON:", e)
    return full_response
                
test_response = formatted_response(test_request)

# Print the full response
print("================================")
print("Answer:")
print(test_response)