# Data preparation and processing

- Import libraries

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import model_architecture
import numpy as np
import pandas as pd

- Download dataset via kagglehub

In [None]:
# import kagglehub

# path = kagglehub.dataset_download("mitishaagarwal/patient")

# print("Path to dataset files:", path)

- Give a variable to store the path
- Read the CSV from the path
- Display the first 5 rows of data to verify that the file was read successfully

In [2]:
# Locate the dataset inside the kagglehub download cache instead of hard-coding
# one user's home directory. kagglehub keeps its downloads under
# ~/.cache/kagglehub unless the KAGGLEHUB_CACHE environment variable points
# somewhere else, so on the original machine this resolves to the same file.
import os
from pathlib import Path

kagglehub_cache = Path(os.environ.get("KAGGLEHUB_CACHE", Path.home() / ".cache" / "kagglehub"))
path = kagglehub_cache / "datasets" / "mitishaagarwal" / "patient" / "versions" / "3" / "dataset.csv"

data = pd.read_csv(path)

# data.head()

- Count the number of rows and columns
- Display the columns' names

In [3]:
# Report the frame's dimensions first, then list every column label.
current_shape = data.shape
print("Dataset shape:", current_shape, "\n")

column_labels = data.columns
print("Columns:", column_labels)

Dataset shape: (91713, 85) 

Columns: Index(['encounter_id', 'patient_id', 'hospital_id', 'age', 'bmi',
       'elective_surgery', 'ethnicity', 'gender', 'height', 'icu_admit_source',
       'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days', 'weight',
       'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative',
       'arf_apache', 'gcs_eyes_apache', 'gcs_motor_apache',
       'gcs_unable_apache', 'gcs_verbal_apache', 'heart_rate_apache',
       'intubated_apache', 'map_apache', 'resprate_apache', 'temp_apache',
       'ventilated_apache', 'd1_diasbp_max', 'd1_diasbp_min',
       'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min',
       'd1_heartrate_max', 'd1_heartrate_min', 'd1_mbp_max', 'd1_mbp_min',
       'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min', 'd1_resprate_max',
       'd1_resprate_min', 'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max',
       'd1_sysbp_min', 'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min',
       'd1_temp_max', 'd1_t

- Drop empty column by index
- Count the number of rows and columns to ensure the column is removed
- Display the new columns' names

In [4]:
# Remove the second-to-last column (the one the notes describe as empty)
# together with the encounter and hospital identifiers in a single call.
columns_to_remove = [data.columns[-2], "encounter_id", "hospital_id"]
data = data.drop(columns=columns_to_remove)

# Confirm the new dimensions and show the remaining column labels.
print("Dataset shape:", data.shape, "\n")
print("Columns:", data.columns)

Dataset shape: (91713, 82) 

Columns: Index(['patient_id', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender',
       'height', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type',
       'pre_icu_los_days', 'weight', 'apache_2_diagnosis',
       'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache',
       'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache',
       'gcs_verbal_apache', 'heart_rate_apache', 'intubated_apache',
       'map_apache', 'resprate_apache', 'temp_apache', 'ventilated_apache',
       'd1_diasbp_max', 'd1_diasbp_min', 'd1_diasbp_noninvasive_max',
       'd1_diasbp_noninvasive_min', 'd1_heartrate_max', 'd1_heartrate_min',
       'd1_mbp_max', 'd1_mbp_min', 'd1_mbp_noninvasive_max',
       'd1_mbp_noninvasive_min', 'd1_resprate_max', 'd1_resprate_min',
       'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max', 'd1_sysbp_min',
       'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min', 'd1_temp_max',
       'd1_temp_min', 'h1_diasbp_max', 'h1_

- Check the total columns with missing values

In [5]:
# Share of missing entries per column, expressed as a percentage.
missing_values = data.isnull().mean() * 100

# Keep only the columns that actually have gaps, largest share first.
has_gaps = missing_values > 0
missing_summary = missing_values[has_gaps].sort_values(ascending=False)

print(f"Total columns with missing values: {len(missing_summary)}")

Total columns with missing values: 74


- Set a random seed to ensure the randomness is consistent

In [6]:
# Seed NumPy's global RNG so the np.random.choice draws used for the
# categorical imputation further down are repeatable between runs.
np.random.seed(42)

- Separate the columns into numerical and categorical
- Handle missing values in numerical and categorical columns separately
- Numerical columns: fill missing values with the column median
- Categorical columns: fill each missing value with a randomly chosen value from the column's distinct non-missing values
- Check whether any missing values remain

In [7]:
# Partition the columns by dtype: numeric versus everything else.
numerical_columns = data.select_dtypes(include=['number']).columns
categorical_columns = data.select_dtypes(exclude=['number']).columns

# Numeric gaps are filled with the column median.
for col in numerical_columns:
    column_values = data[col]
    if column_values.isnull().any():
        data[col] = column_values.fillna(column_values.median())

# Non-numeric gaps: every missing cell gets one np.random.choice draw from
# the column's distinct observed values (cells are visited in row order).
for col in categorical_columns:
    column_values = data[col]
    if not column_values.isnull().any():
        continue

    non_missing_values = column_values.dropna().unique()

    def fill_cell(cell, pool=non_missing_values):
        """Return `cell` unchanged unless it is missing, then draw from `pool`."""
        if pd.isnull(cell):
            return np.random.choice(pool)
        return cell

    data[col] = column_values.apply(fill_cell)

# Sanity check: total number of missing cells left in the whole frame.
missing_values_after_imputation = data.isnull().sum().sum()
print(f"Total missing values after imputation: {missing_values_after_imputation}")

Total missing values after imputation: 0


- Check the data types to identify categorical columns
- Apply one-hot encoding to the categorical columns
- Count the number of rows and columns in the data

In [8]:
# Names of the text / categorical columns that need encoding.
categorical_columns = list(data.select_dtypes(include=['object', 'category']).columns)

# Expand each of them into indicator columns; drop_first=True leaves out the
# first level of every variable.
data_encoded = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)

print("Categorical Columns:", categorical_columns, "\n")
print("Shape after encoding:", data_encoded.shape, "\n")

Categorical Columns: ['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type', 'apache_3j_bodysystem', 'apache_2_bodysystem'] 

Shape after encoding: (91713, 113) 



- Identify the numerical columns
- Remove the target variables
- Standardise the numerical features with `StandardScaler` (the one-hot encoded columns are left unchanged)

In [9]:
# Numeric (int64 / float64) columns of the encoded frame, minus the target.
numerical_columns = list(data_encoded.select_dtypes(include=['int64', 'float64']).columns)
numerical_columns.remove('hospital_death')

# Standardise those columns in place.
# NOTE(review): the scaler is fitted on every row here, before the
# train/validation/test split, and the next cell fits the same scaler again on
# X_train only - this first pass looks redundant; confirm it is intended.
scaler = StandardScaler()
scaler.fit(data_encoded[numerical_columns])
data_encoded[numerical_columns] = scaler.transform(data_encoded[numerical_columns])

print("Shape after scaling:", data_encoded.shape, "\n")

Shape after scaling: (91713, 113) 



- Separate the encoded data into X and y
- Split the data into training, validation, and testing set
- The data splitting ratio is 80:10:10 (train : validation : test)
- Apply feature scaling after splitting

In [10]:
# Label vector, and a feature matrix without the label or the patient identifier.
y = data_encoded['hospital_death']
X = data_encoded.drop(columns=['hospital_death', 'patient_id'])

# 80 % goes to training; the remaining 20 % is halved into validation and test.
# Both splits are stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Fit the scaler on the training features only, then apply it to all three parts.
scaler.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

print("Training set shape:", X_train_scaled.shape, "\n")
print("Validation set shape:", X_val_scaled.shape, "\n")
print("Test set shape:", X_test_scaled.shape, "\n")

Training set shape: (73370, 111) 

Validation set shape: (9171, 111) 

Test set shape: (9172, 111) 



# Model Architecture

In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter

# Local import: this cell draws a chart below, but pyplot is otherwise only
# imported by a later cell, so a fresh kernel would hit a NameError here.
import matplotlib.pyplot as plt

# 1. Data Loading
df = pd.read_csv('dataset.csv')

# 2. Feature & Target Separation
# Besides the label, drop the record identifiers and any column that is
# completely empty, mirroring the preprocessing section earlier in the
# notebook. Identifiers would otherwise be interpolated by SMOTE as if they
# were measurements. errors="ignore" keeps this working if a column is absent.
X = df.drop(columns=["hospital_death"])
X = X.drop(columns=["encounter_id", "patient_id", "hospital_id"], errors="ignore")
X = X.dropna(axis=1, how="all")
y = df["hospital_death"]

# 3. Handle Missing Values
# Separate numeric and categorical columns
numeric_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Fill missing values
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())  # Median for numeric
X[categorical_cols] = X[categorical_cols].fillna(X[categorical_cols].mode().iloc[0])  # Mode for categorical

# 4. Encode Categorical Data
# The encoder is re-fitted for every column, so each column gets its own
# integer codes.
label_encoder = LabelEncoder()
for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])

print(X[categorical_cols].head())

# 5. Remove Low Variance Features
selector = VarianceThreshold(threshold=0.01)
X_selected = pd.DataFrame(selector.fit_transform(X), columns=X.columns[selector.get_support()])

dropped_features = X.columns[~selector.get_support()]
selected_features = X.columns[selector.get_support()]

print("Dropped Features:\n", dropped_features)

print("Selected Features:\n", selected_features)
print(X.shape, X_selected.shape)

# 6. Split Data into Training and Test Sets (80%, 20%)
# The split happens BEFORE oversampling so that the test set contains only
# real records; stratify keeps the original class ratio in both parts.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

# 7. Handle Class Imbalance with SMOTE (training data only)
# Synthetic samples are interpolated between neighbouring minority records;
# generating them before the split would place near-copies of training rows in
# the test set and inflate the reported scores.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Print class distribution after SMOTE
class_counts = Counter(y_resampled)
print("Class Distribution After SMOTE:", class_counts)

# Visualize class distribution after SMOTE (plain matplotlib; seaborn is not
# imported anywhere in this notebook)
class_labels = sorted(class_counts)
plt.figure(figsize=(8, 6))
plt.bar([str(label) for label in class_labels], [class_counts[label] for label in class_labels])
plt.title("Class Distribution After SMOTE")
plt.xlabel("Class Labels")
plt.ylabel("Count")
plt.show()

# 8. Feature Scaling
# Fit on the (resampled) training features only, then apply to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 9. Convert Data to PyTorch Tensors
class MortalityDataset(Dataset):
    """Map-style dataset pairing a feature matrix with one label per row.

    Args:
        X: Features, one row per sample. May be a NumPy array, a pandas
            DataFrame, a nested list or a tensor.
        y: Labels, one per sample. May be a pandas Series, a NumPy array,
            a list or a tensor.

    Raises:
        ValueError: if ``X`` and ``y`` do not contain the same number of samples.

    Indexing returns ``(features, label)`` as float32 tensors; the label keeps
    a trailing dimension of size 1 so it lines up with a single-output model.
    """

    def __init__(self, X, y):
        self.X = self._to_float_tensor(X)
        # Column vector of shape (n_samples, 1).
        self.y = self._to_float_tensor(y).reshape(-1, 1)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X has {len(self.X)} samples but y has {len(self.y)}"
            )

    @staticmethod
    def _to_float_tensor(data):
        """Copy `data` into a new float32 tensor, unwrapping pandas objects first."""
        if isinstance(data, torch.Tensor):
            return data.detach().clone().to(torch.float32)
        if hasattr(data, "to_numpy"):
            # pandas Series / DataFrame: hand torch the underlying array.
            data = data.to_numpy()
        return torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

# Wrap the scaled arrays as datasets, then build the batch iterators.
train_dataset = MortalityDataset(X=X_train_scaled, y=y_train)
test_dataset = MortalityDataset(X=X_test_scaled, y=y_test)

# Only the training loader shuffles; the test loader keeps the sample order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# 11. Define the PyTorch MLP Model
class MortalityMLP(nn.Module):
    """Feed-forward binary classifier with two hidden layers (64, then 32 units).

    Each hidden layer is followed by batch normalisation, ReLU and dropout
    (p=0.2). The single output unit is passed through a sigmoid, so the
    forward pass returns values between 0 and 1.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # Hidden block 1: input_size -> 64
        self.fc1 = nn.Linear(input_size, 64)
        self.bn1 = nn.BatchNorm1d(64)
        # Hidden block 2: 64 -> 32
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        # Output head: 32 -> 1
        self.output = nn.Linear(32, 1)
        # Shared, stateless activation and dropout modules.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        hidden = x
        for linear, norm in ((self.fc1, self.bn1), (self.fc2, self.bn2)):
            hidden = self.dropout(self.relu(norm(linear(hidden))))
        logits = self.output(hidden)
        return torch.sigmoid(logits)

# 12. Initialize Model, Loss, and Optimizer
# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The network width is taken from the number of feature columns in X_train.
input_size = X_train.shape[1]
model = MortalityMLP(input_size)
model = model.to(device)

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# 13. Train the Model
# Per-epoch mean losses, kept for the learning-curve plot.
loss_values_train = []
loss_values_val = []
epochs = 10

for epoch in range(epochs):
    total_train_loss = 0
    total_val_loss = 0

    # Training Phase
    model.train()
    for inputs, labels in train_loader:
        # Move the batch to the same device as the model.
        inputs, labels = inputs.to(device), labels.to(device)

        # Gradients accumulate by default, so reset them before each batch.
        optimizer.zero_grad()

        outputs = model(inputs)            # forward pass
        loss = criterion(outputs, labels)  # batch loss

        loss.backward()   # gradients w.r.t. the parameters
        optimizer.step()  # parameter update

        total_train_loss += loss.item()

    # Validation Phase
    # NOTE(review): the "validation" loss is measured on test_loader, i.e. the
    # same data used for the final evaluation - there is no separate
    # validation split in this cell.
    model.eval()
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()

    # Average over the number of batches in each loader.
    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(test_loader)
    loss_values_train.append(avg_train_loss)
    loss_values_val.append(avg_val_loss)

    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Evaluate the Model
# Switch to inference mode and collect hard predictions for the whole test loader.
model.eval()
y_pred_list = []
y_true_list = []

# Collect all predictions and true labels
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        # Threshold the sigmoid outputs at 0.5 to obtain class labels.
        predictions = (outputs > 0.5).float()
        # Flatten the (batch, 1) tensors so the metric functions receive
        # plain 1-D label sequences.
        y_pred_list.extend(predictions.cpu().numpy().ravel())
        y_true_list.extend(labels.cpu().numpy().ravel())

# Calculate Accuracy and Print Classification Report
accuracy = accuracy_score(y_true_list, y_pred_list)
report = classification_report(y_true_list, y_pred_list)

# "\M" was an invalid escape sequence; a leading newline was intended.
print("\nModel Performance:")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)

In [None]:
import matplotlib.pyplot as plt

# Learning curves: mean loss per epoch for both loaders.
fig, ax = plt.subplots(figsize=(8, 6))
epoch_axis = range(1, epochs + 1)
ax.plot(epoch_axis, loss_values_train, label="Training Loss")
ax.plot(epoch_axis, loss_values_val, label="Validation Loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Training vs Validation Loss")
ax.legend()
plt.show()

# Optimizing Network Architecture - Deeper Neural Network
1. Increase the number of neurons in each hidden layer
2. Additional hidden layer is added
3. Increase Dropout
4. Reduce learning rate
5. Assign weight_decay (L2 Regularization) to prevent overfitting
6. Learning Scheduler 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Model Definition with Increased Complexity and Dropout
class TunedMortalityMLP(nn.Module):
    """Deeper variant of the mortality classifier: 128 -> 64 -> 32 hidden units.

    Every hidden layer is followed by batch normalisation, ReLU and dropout;
    the single output unit is passed through a sigmoid.

    Args:
        input_size: Number of input features per sample.
        dropout_rate: Dropout probability applied after each hidden layer.
            Defaults to 0.3, the value this model was originally written with,
            so ``TunedMortalityMLP(input_size)`` behaves as before. The
            parameter matches the signature of the grid-search version of
            this class defined later in the notebook.
    """

    def __init__(self, input_size, dropout_rate=0.3):
        super(TunedMortalityMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)  # Increase to 128 neurons
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)          # Add another layer
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)           # Another smaller layer
        self.bn3 = nn.BatchNorm1d(32)
        self.output = nn.Linear(32, 1)         # Output layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)  # Higher default than the baseline's 0.2

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        return torch.sigmoid(self.output(x))

# Initialize the Model
# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
input_size = X_train.shape[1]
model = TunedMortalityMLP(input_size)
model = model.to(device)

# Define Loss, Optimizer, and Scheduler
criterion = nn.BCELoss()
# Lower learning rate than the baseline, plus weight decay (L2 penalty).
optimizer = optim.Adam(
    params=model.parameters(),
    lr=0.0005,
    weight_decay=0.0001,
)
# Multiply the learning rate by 0.5 every 5 scheduler steps
# (the training loop steps it once per epoch).
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)

# Training Loop with Hyperparameter Tuning
epochs = 20
loss_values_train = []
loss_values_val = []

for epoch in range(epochs):
    train_batch_losses = []
    val_batch_losses = []

    # Training Phase: one optimisation step per batch.
    model.train()
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()  # gradients accumulate unless cleared
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()
        train_batch_losses.append(batch_loss.item())

    # Validation Phase: loss only, no gradient tracking.
    model.eval()
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            batch_loss = criterion(model(batch_x), batch_y)
            val_batch_losses.append(batch_loss.item())

    # Mean loss per batch for this epoch.
    avg_train_loss = sum(train_batch_losses) / len(train_loader)
    avg_val_loss = sum(val_batch_losses) / len(test_loader)
    loss_values_train.append(avg_train_loss)
    loss_values_val.append(avg_val_loss)

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Plot Learning Curves
fig, ax = plt.subplots(figsize=(8, 6))
epoch_axis = range(1, epochs + 1)
ax.plot(epoch_axis, loss_values_train, label="Training Loss")
ax.plot(epoch_axis, loss_values_val, label="Validation Loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Training vs Validation Loss (Tuned Model)")
ax.legend()
plt.show()

# Evaluate the Tuned Model
model.eval()
y_pred_list = []
y_true_list = []

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        probabilities = model(batch_x)
        # Threshold the sigmoid outputs at 0.5 to obtain class labels.
        hard_labels = (probabilities > 0.5).float()
        y_pred_list.extend(hard_labels.cpu().numpy())
        y_true_list.extend(batch_y.cpu().numpy())

accuracy = accuracy_score(y_true_list, y_pred_list)
report = classification_report(y_true_list, y_pred_list)

print(f"\nTuned Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)


**Observations:**
* The model improvements from tuning (added layers, dropout, L2 regularization, and a learning rate scheduler) have significantly boosted performance and ensured generalization.
* Both train loss and validation loss are relatively low and consistent.

# Hyperparameter Tuning
1. Grid search is used over different combinations of:
* Learning rate
* Epochs
* Dropout
* Batch Sizes 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
import itertools
import numpy as np

# Candidate values for each tunable setting.
learning_rates = [0.001, 0.0005]
epochs_list = [10, 20]
dropout_rates = [0.2, 0.3]
batch_sizes = [64, 128]

# Full Cartesian product: one (lr, epochs, dropout, batch_size) tuple per
# trial, with the right-most setting varying fastest.
hyperparameter_combinations = [
    (lr_value, epoch_count, drop_prob, batch_len)
    for lr_value in learning_rates
    for epoch_count in epochs_list
    for drop_prob in dropout_rates
    for batch_len in batch_sizes
]

# Model Definition with Dropout as a Hyperparameter
class TunedMortalityMLP(nn.Module):
    """Three-hidden-layer classifier (128 -> 64 -> 32) with configurable dropout.

    This definition replaces the earlier class of the same name once the cell
    runs. ``dropout_rate`` therefore defaults to 0.3, the value that earlier
    version hard-coded, so existing ``TunedMortalityMLP(input_size)`` calls
    keep working after the redefinition.

    Args:
        input_size: Number of input features per sample.
        dropout_rate: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_size, dropout_rate=0.3):
        super(TunedMortalityMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # Hidden layers: linear -> batch norm -> ReLU -> dropout.
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        # Sigmoid turns the single output into a value between 0 and 1.
        return torch.sigmoid(self.output(x))

# Function to Train and Evaluate Model
def train_and_evaluate_model(lr, epochs, dropout_rate, batch_size):
    """Train a fresh TunedMortalityMLP with one hyperparameter setting and score it.

    Reads the notebook-level names ``X_train_scaled``, ``y_train``,
    ``X_test_scaled``, ``y_test``, ``input_size`` and ``device``.

    Args:
        lr: Learning rate for the Adam optimizer.
        epochs: Number of passes over the training loader.
        dropout_rate: Dropout probability passed to the model.
        batch_size: Batch size for both data loaders.

    Returns:
        Tuple ``(avg_val_loss, val_accuracy)``: the mean per-batch BCE loss and
        the accuracy at a 0.5 threshold, both measured on the loader built
        from ``X_test_scaled`` / ``y_test``.

    NOTE(review): the returned score comes from the test split, so picking a
    configuration by it tunes on the test set - consider a separate
    validation split.
    """
    # Prepare DataLoader
    train_dataset = MortalityDataset(X_train_scaled, y_train)
    test_dataset = MortalityDataset(X_test_scaled, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize Model, Loss, and Optimizer
    model = TunedMortalityMLP(input_size, dropout_rate).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training Loop
    # The training loss is not reported by this function, so it is not
    # accumulated (the previous per-batch .item() call was dead work).
    model.train()
    for epoch in range(epochs):
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluation Loop
    model.eval()
    y_pred_list = []
    y_true_list = []
    total_val_loss = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()
            preds = (outputs > 0.5).float()
            # Flatten the (batch, 1) tensors so accuracy_score receives 1-D labels.
            y_pred_list.extend(preds.cpu().numpy().ravel())
            y_true_list.extend(labels.cpu().numpy().ravel())

    # Calculate Loss and Accuracy
    avg_val_loss = total_val_loss / len(test_loader)
    val_accuracy = accuracy_score(y_true_list, y_pred_list)

    return avg_val_loss, val_accuracy

# Hyperparameter Tuning Loop
# Each result row is (lr, epochs, dropout_rate, batch_size, loss, accuracy).
results = []
for combo in hyperparameter_combinations:
    lr, epochs, dropout_rate, batch_size = combo
    print(f"Testing Configuration: LR={lr}, Epochs={epochs}, Dropout={dropout_rate}, Batch Size={batch_size}")
    val_loss, val_accuracy = train_and_evaluate_model(lr, epochs, dropout_rate, batch_size)
    results.append((lr, epochs, dropout_rate, batch_size, val_loss, val_accuracy))
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}\n")

# Identify Best Configuration: the row with the smallest loss (position 4).
# NOTE(review): train_and_evaluate_model scores on the test split, so this
# selection is made on test data - confirm that is acceptable.
best_config = min(results, key=lambda row: row[4])
print("\nBest Hyperparameter Configuration:")
print(f"Learning Rate: {best_config[0]}, Epochs: {best_config[1]}, Dropout: {best_config[2]}, Batch Size: {best_config[3]}")
print(f"Best Validation Loss: {best_config[4]:.4f}, Best Validation Accuracy: {best_config[5]:.4f}")
