In [1]:
from pathlib import Path

import pandas as pd

### Hypothesis
#### Patients whose condition worsened (triage severity increased) experienced a longer stay in the Emergency Department

In [5]:
# Read the filtered_data CSV (path relative to this notebook) into a DataFrame.
raw_data_path = "../data/raw/filtered_data.csv"
filtered_data = pd.read_csv(raw_data_path)

<p> In the code below we extract all patients whose entry triage (<b>triage_entry_severity</b>) differs from their exit triage (<b>triage_exit_severity</b>). This implies that the patient's condition changed during their stay at the Emergency Department: they either got better (their triage moved to a less severe colour, i.e. a higher rank number) or got worse (their triage moved to a more severe colour, i.e. a lower rank number). </p>
<br>
<p> We write two .csv files: <i>"patients_got_worse"</i> for patients whose exit triage is more severe than their entry triage, and <i>"patients_got_better"</i> for patients whose exit triage is less severe than their entry triage. </p>

In [None]:
# Collapse the records to one row per case_id, rank the entry/exit triage
# colours, compute the ER length of stay, and export the cases whose triage
# severity changed during the stay.

# --- Step 1: Define triage severity hierarchy ---
# Lower rank number = more severe: the split in Step 6 treats a smaller exit
# rank than entry rank as "got worse".
severity_rank = {
    'RED': 1,
    'ORANGE': 2,
    'BLUE': 3,
    'GREEN': 4,
    'WHITE': 5
}

# --- Step 2: Ensure timestamps are datetime ---
# NOTE: this converts the columns on `filtered_data` itself, so later cells
# see datetime columns as well.
filtered_data['arrival_ts'] = pd.to_datetime(filtered_data['arrival_ts'])
filtered_data['discharge_ts'] = pd.to_datetime(filtered_data['discharge_ts'])

# --- Step 3: Sort and get first/last values per case_id ---
filtered_data = filtered_data.sort_values(by=['case_id', 'arrival_ts'])
# Since the dataset is a combination of inputs that includes many duplicates
# per patient record, we group them by case_id. After the sort above, 'first'
# picks values from the earliest-arrival row and 'last' from the latest one.
grouped = (
    filtered_data.groupby('case_id')
    .agg(
        triage_entry_severity=('triage_entry_severity', 'first'),
        triage_exit_severity=('triage_exit_severity', 'last'),
        arrival_ts=('arrival_ts', 'first'),
        discharge_ts=('discharge_ts', 'last')
    )
    .reset_index()
)

# --- Step 4: Compute numeric ranks ---
# Labels that are not keys of `severity_rank` (including missing values)
# map to NaN.
grouped['entry_rank'] = grouped['triage_entry_severity'].map(severity_rank)
grouped['exit_rank'] = grouped['triage_exit_severity'].map(severity_rank)

# --- Step 5: Compute ER stay duration ---
grouped['er_stay_duration'] = grouped['discharge_ts'] - grouped['arrival_ts']

# Same duration expressed as fractional hours
grouped['er_stay_hours'] = grouped['er_stay_duration'].dt.total_seconds() / 3600

# --- Step 6: Split patients by outcome ---
# Comparisons against NaN are False, so a case with an unmapped entry or exit
# label ends up in none of the three groups below.
got_worse = grouped[grouped['exit_rank'] < grouped['entry_rank']].copy()
got_better = grouped[grouped['exit_rank'] > grouped['entry_rank']].copy()
same_severity = grouped[grouped['exit_rank'] == grouped['entry_rank']].copy()

# --- Step 7 Save to CSV ---
# Create the output folder first so the export also works on a fresh checkout
# where ../data/samples does not exist yet.
samples_dir = Path("../data/samples")
samples_dir.mkdir(parents=True, exist_ok=True)
got_worse.to_csv(samples_dir / "patients_got_worse.csv", index=False)
got_better.to_csv(samples_dir / "patients_got_better.csv", index=False)


<p> Once this is done, we compute the mean of <b>er_stay_hours</b> for each group and compare which group of patients experiences the longer stay on average. </p>

In [10]:
# Compare the mean ER stay (in hours) of the patients who got better with that
# of the patients who got worse, and report what it means for the hypothesis
# that worsening patients stay longer.
got_better_mean = got_better['er_stay_hours'].mean()
got_worse_mean = got_worse['er_stay_hours'].mean()
res = got_better_mean - got_worse_mean
if pd.isna(res):
    # .mean() is NaN for a group with no valid er_stay_hours values; without
    # this guard the comparison below would be False and the "supports"
    # message would be printed with a nan difference.
    print("Cannot compare stay durations: at least one group has no valid er_stay_hours values.")
elif res > 0:
    print(f"Patients that got better experience larger stay duration by {round(res,2)} hours. This refutes our hypothesis")
elif res < 0:
    print(f"Patients that got worse experience larger stay duration by {round((got_worse_mean-got_better_mean),2)} hours. This supports our hypothesis! ")
else:
    # Exactly equal means: neither group stays longer on average.
    print("Both groups experience the same mean stay duration. This neither supports nor refutes our hypothesis")

Patients that got better experience larger stay duration by 2.52 hours. This refutes our hypothesis
