# Data preparation and processing

- Import libraries

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

- Download dataset via kagglehub

In [2]:
# Optional dataset download through kagglehub, kept commented out so the
# notebook does not fetch anything when run. Uncomment the three statements
# below to download "mitishaagarwal/patient" and print where the files were
# stored; the next cell reads the CSV from a local path.

# import kagglehub

# path = kagglehub.dataset_download("mitishaagarwal/patient")

# print("Path to dataset files:", path)

- Give a variable to store the path
- Read the CSV from the path
- Display the first 5 rows to confirm that the CSV was read successfully

In [3]:
# Location of the Kaggle "mitishaagarwal/patient" CSV.
# The default points at the kagglehub cache folder under the *current* user's
# home directory instead of one specific machine's absolute path. Set the
# PATIENT_DATASET_PATH environment variable to read the file from elsewhere.
default_path = os.path.join(
    os.path.expanduser("~"),
    ".cache", "kagglehub", "datasets", "mitishaagarwal", "patient",
    "versions", "3", "dataset.csv",
)
path = os.environ.get("PATIENT_DATASET_PATH", default_path)

# Load the whole CSV into a DataFrame used by every following cell.
data = pd.read_csv(path)

# data.head()

- Count the number of rows and columns
- Display the columns' names

In [4]:
# Report the frame's (rows, columns) tuple, followed by a blank line.
dataset_shape = data.shape
print("Dataset shape:", dataset_shape, "\n")

# Then list every column label.
column_labels = data.columns
print("Columns:", column_labels)

Dataset shape: (91713, 85) 

Columns: Index(['encounter_id', 'patient_id', 'hospital_id', 'age', 'bmi',
       'elective_surgery', 'ethnicity', 'gender', 'height', 'icu_admit_source',
       'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days', 'weight',
       'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative',
       'arf_apache', 'gcs_eyes_apache', 'gcs_motor_apache',
       'gcs_unable_apache', 'gcs_verbal_apache', 'heart_rate_apache',
       'intubated_apache', 'map_apache', 'resprate_apache', 'temp_apache',
       'ventilated_apache', 'd1_diasbp_max', 'd1_diasbp_min',
       'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min',
       'd1_heartrate_max', 'd1_heartrate_min', 'd1_mbp_max', 'd1_mbp_min',
       'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min', 'd1_resprate_max',
       'd1_resprate_min', 'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max',
       'd1_sysbp_min', 'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min',
       'd1_temp_max', 'd1_t

- Drop the empty column by index, and drop the `encounter_id` and `hospital_id` columns by name
- Count the number of rows and columns to ensure the column is removed
- Display the new columns' names

In [5]:
# The empty column sits second from the end of the raw frame and is located by
# position. It is only dropped when it is entirely null, so re-running this
# cell can no longer silently delete a real feature column.
empty_candidate = data.columns[-2]
if data[empty_candidate].isnull().all():
    data = data.drop(columns=[empty_candidate])

# Drop the encounter_id and hospital_id columns; errors="ignore" keeps the
# cell re-runnable after they have already been removed.
data = data.drop(columns=["encounter_id", "hospital_id"], errors="ignore")

# Show the new dimensions and remaining column labels to confirm the removal.
print("Dataset shape:", data.shape, "\n")
print("Columns:", data.columns)

Dataset shape: (91713, 82) 

Columns: Index(['patient_id', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender',
       'height', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type',
       'pre_icu_los_days', 'weight', 'apache_2_diagnosis',
       'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache',
       'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache',
       'gcs_verbal_apache', 'heart_rate_apache', 'intubated_apache',
       'map_apache', 'resprate_apache', 'temp_apache', 'ventilated_apache',
       'd1_diasbp_max', 'd1_diasbp_min', 'd1_diasbp_noninvasive_max',
       'd1_diasbp_noninvasive_min', 'd1_heartrate_max', 'd1_heartrate_min',
       'd1_mbp_max', 'd1_mbp_min', 'd1_mbp_noninvasive_max',
       'd1_mbp_noninvasive_min', 'd1_resprate_max', 'd1_resprate_min',
       'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max', 'd1_sysbp_min',
       'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min', 'd1_temp_max',
       'd1_temp_min', 'h1_diasbp_max', 'h1_

- Check the total columns with missing values

In [6]:
# Share of null entries per column, expressed as a percentage.
missing_values = data.isna().mean().mul(100)

# Keep only the columns that contain at least one null, largest share first.
has_missing = missing_values.gt(0)
missing_summary = missing_values.loc[has_missing].sort_values(ascending=False)

print(f"Total columns with missing values: {len(missing_summary)}")

Total columns with missing values: 74


- Set a random seed to ensure the randomness is consistent

In [7]:
# Seed NumPy's global random generator so later np.random draws repeat across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

- Separate the columns into numerical and categorical
- Handle missing values in numerical and categorical columns separately
- Numerical columns: fill missing values with the column median
- Categorical columns: fill missing values with a random choice from the column's non-missing values
- Check whether any missing values remain

In [8]:
# Split the columns by dtype so each group gets its own imputation strategy.
numerical_columns = data.select_dtypes(include=['number']).columns
categorical_columns = data.select_dtypes(exclude=['number']).columns

# Numerical columns: fill gaps with each column's median in a single call.
# Passing a Series to fillna fills column-by-column using the matching labels;
# columns without an entry in the Series are left untouched.
column_medians = data[numerical_columns].median()
data = data.fillna(column_medians)

# Categorical columns: replace each missing entry with a value drawn uniformly
# from the column's distinct non-missing values. All replacements for a column
# are drawn in one np.random.choice call and written only to the missing
# positions, instead of running a Python lambda over every row.
for col in categorical_columns:
    missing_mask = data[col].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing > 0:
        non_missing_values = data[col].dropna().unique()
        data.loc[missing_mask, col] = np.random.choice(non_missing_values, size=n_missing)

# Sanity check: total count of null cells left in the whole frame.
missing_values_after_imputation = data.isnull().sum().sum()
print(f"Total missing values after imputation: {missing_values_after_imputation}")

Total missing values after imputation: 0


- Check the data types to identify categorical columns
- Apply one-hot encoding to the categorical columns
- Count the number of rows and columns in the data

In [9]:
# Names of every object / category column in the imputed frame.
categorical_columns = list(data.select_dtypes(include=['object', 'category']).columns)

# Expand those columns into indicator columns; drop_first=True omits the
# first level of each encoded column.
data_encoded = pd.get_dummies(data=data, columns=categorical_columns, drop_first=True)

print("Categorical Columns:", categorical_columns, "\n")
# print("Data after one-hot encoding:", "\n")
# print(data_encoded.head(), "\n")
print("Shape after encoding:", data_encoded.shape, "\n")

Categorical Columns: ['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type', 'apache_3j_bodysystem', 'apache_2_bodysystem'] 

Shape after encoding: (91713, 113) 



- Identify the numerical columns
- Remove the target variables
- Standardise the numerical features of the one-hot encoded data with `StandardScaler`

In [10]:
# Every int64 / float64 column of the encoded frame, minus the target column.
numerical_columns = list(data_encoded.select_dtypes(include=['int64', 'float64']).columns)
numerical_columns.remove('hospital_death')

# NOTE(review): the scaler is fitted on all rows here, i.e. before the
# train/validation/test split, and the split cell fits it again on the
# training rows only — confirm whether this pass is still needed.
scaler = StandardScaler()
standardised_values = scaler.fit_transform(data_encoded[numerical_columns])

# Write the standardised values back over the original numerical columns.
data_encoded[numerical_columns] = standardised_values

# print("Data after scaling:", "\n")
# print(data_encoded.head(), "\n")
print("Shape after scaling:", data_encoded.shape, "\n")

Shape after scaling: (91713, 113) 



- Separate the encoded data into X and y
- Split the data into training, validation, and testing set
- Data splitting ratio is 80:10:10 (train : validation : test)
- Apply feature scaling after splitting

In [11]:
# Target is hospital_death; features are every other column except patient_id.
y = data_encoded['hospital_death']
X = data_encoded.drop(columns=['hospital_death', 'patient_id'])

# First cut: 80% training vs. 20% held out, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Second cut: halve the held-out part into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Fit the scaler on the training features only, then reuse the fitted scaler
# for the validation and test features.
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

# Report the shape of each scaled array, each followed by a blank line.
for label, scaled_part in (
    ("Training set shape:", X_train_scaled),
    ("Validation set shape:", X_val_scaled),
    ("Test set shape:", X_test_scaled),
):
    print(label, scaled_part.shape, "\n")

Training set shape: (73370, 111) 

Validation set shape: (9171, 111) 

Test set shape: (9172, 111) 



# Model Architecture

- Import external Python file which is the model architecture
- Import libraries 

In [17]:
from model_architecture import MLPModel

import torch
import torch.nn as nn

- Set initial hyperparameters
- Call the model from external Python file
- Display the architecture of the model
- Apply Binary Cross Entropy Loss
- Apply Adam

In [18]:
# Seed PyTorch's global random generator so the model's initial weights are
# repeatable across runs; the NumPy seed set earlier in the notebook does not
# affect torch.
torch.manual_seed(42)

# Initial hyperparameters.
input_dim = X_train.shape[1]  # number of feature columns after the split
hidden_layers = [128, 64]
dropout_rate = 0.3
learning_rate = 0.001
batch_size = 32
num_epochs = 50

# Build the MLP from the external model_architecture module and show its layers.
model = MLPModel(input_dim=input_dim, hidden_layers=hidden_layers, dropout_rate=dropout_rate)

print(model)

# Binary cross-entropy loss.
# NOTE(review): nn.BCELoss expects probabilities in [0, 1]; confirm that
# MLPModel.forward applies a sigmoid to the output layer (no sigmoid module
# appears in the printed architecture).
criterion = nn.BCELoss()

# Adam optimiser over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

MLPModel(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=111, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=64, bias=True)
  )
  (output_layer): Linear(in_features=64, out_features=1, bias=True)
)
