In [8]:
import pandas as pd
import numpy as np

# Load dataset
triage_df = pd.read_csv("../data/ED_triage.csv")

# Filter triage grades 1–4 only (remove class 5)
df = triage_df[triage_df['TriageGrade'] < 5].copy()


In [9]:
# Features chosen based on usefulness and availability
features = [
    'age', 'gender', 'admission_hour', 'admission_weekday',
    'NeedFastExecute', 'PainGrade', 'MentalDistress',
    'MaterialDistress', 'CriticalStatus', 'StuporStatus',
    'AVPU'
]

# Add target column
df = df[features + ['TriageGrade']]


In [10]:
# Check for nulls before cleaning
df.isnull().sum().sort_values(ascending=False)

AVPU                 90549
MentalDistress       10380
MaterialDistress     10380
CriticalStatus       10380
StuporStatus         10380
PainGrade            10316
age                      0
gender                   0
admission_hour           0
admission_weekday        0
NeedFastExecute          0
TriageGrade              0
dtype: int64

In [12]:
# Step 1: Fill numerical missing values with median (safe assignment)
for col in ['PainGrade', 'MentalDistress', 'MaterialDistress', 'CriticalStatus', 'StuporStatus']:
    df[col] = df[col].fillna(df[col].median())

# Step 2: Fill AVPU with 'Unknown' safely
df['AVPU'] = df['AVPU'].fillna('Unknown')

# Step 3: Encode categorical features
from sklearn.preprocessing import LabelEncoder

df['gender'] = LabelEncoder().fit_transform(df['gender'])   # Male=1, Female=0
df['AVPU'] = LabelEncoder().fit_transform(df['AVPU'])       # A, V, P, U, Unknown

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Feature set
features = [
    'age', 'gender', 'admission_hour', 'admission_weekday',
    'NeedFastExecute', 'PainGrade', 'MentalDistress',
    'MaterialDistress', 'CriticalStatus', 'StuporStatus',
    'AVPU'
]

X = df[features]
y = df['TriageGrade']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       1.00      1.00      1.00      2063
           2       1.00      1.00      1.00     16221
           3       0.76      0.81      0.78      6873
           4       0.58      0.52      0.55      3556

    accuracy                           0.89     28713
   macro avg       0.84      0.83      0.83     28713
weighted avg       0.89      0.89      0.89     28713



In [17]:
# Shift target labels to start from 0
y_train_xgb = y_train - 1
y_test_xgb = y_test - 1

In [19]:
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

xgb_model.fit(X_train, y_train_xgb)
y_pred_xgb = xgb_model.predict(X_test)

# Shift predictions back up by 1 to match original labels
y_pred_xgb = y_pred_xgb + 1

# Evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           1       1.00      1.00      1.00      2063
           2       1.00      1.00      1.00     16221
           3       0.76      0.91      0.83      6873
           4       0.73      0.44      0.55      3556

    accuracy                           0.91     28713
   macro avg       0.87      0.84      0.84     28713
weighted avg       0.91      0.91      0.90     28713



In [20]:
from sklearn.utils import resample

# Combine features + target
df_model = df[features + ['TriageGrade']].copy()

# Separate minority and majority
df_class_4 = df_model[df_model['TriageGrade'] == 4]
df_other = df_model[df_model['TriageGrade'] < 4]

# Upsample class 4
df_class_4_upsampled = resample(
    df_class_4,
    replace=True,
    n_samples=10000,  # adjust if needed
    random_state=42
)

# Combine
df_balanced = pd.concat([df_other, df_class_4_upsampled])

# Redefine features/target
X = df_balanced[features]
y = df_balanced['TriageGrade']

In [21]:
# Shift y down by 1 to match XGBoost format
y = y - 1


In [22]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Model
model = XGBClassifier(
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    n_estimators=300,
    objective='multi:softmax',
    num_class=4,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Shift prediction back up
y_pred = y_pred + 1
y_test = y_test + 1

print(classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           1       1.00      1.00      1.00      2063
           2       1.00      1.00      1.00     16221
           3       0.83      0.98      0.90      6873
           4       0.84      0.31      0.45      2000

    accuracy                           0.94     27157
   macro avg       0.92      0.82      0.84     27157
weighted avg       0.94      0.94      0.93     27157



In [24]:
import joblib

# Save the trained and tuned model
joblib.dump(model, "../model/triage_model.pkl")

['../model/triage_model.pkl']

In [26]:
model.save_model("../model/triage_model.json")