In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Import Libraries
The first step involves importing the necessary libraries for data manipulation, model building, and evaluation

In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             roc_curve, precision_recall_curve)
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif

# Extracting Valid Dataset
A function is defined to generate a synthetic dataset with specified characteristics. This dataset simulates patient records with various features related to sepsis.


In [8]:
def generate_synthetic_sepsis_dataset(n_samples=2000):
    categorical_features = {
        'sex': np.random.choice(['male', 'female'], n_samples),
        'ethnicity': np.random.choice(['white', 'black', 'hispanic', 'asian', 'other'], n_samples),
        'metastatic_cancer': np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
        'diabetes': np.random.choice([0, 1], n_samples, p=[0.70, 0.30])
    }
    quantitative_features = {
        'age': np.random.normal(65, 15, n_samples),
        'hospital_elixhauser': np.random.normal(4, 2, n_samples),
        'vent': np.random.choice([0, 1], n_samples),
        'couch': np.random.normal(1.5, 0.5, n_samples),
        'sirs': np.random.normal(3, 1, n_samples),
        'qsofa': np.random.normal(1, 0.5, n_samples),
        'anion_gap_medium': np.random.normal(15, 4, n_samples),
        'bocarbonate_medium': np.random.normal(24, 3, n_samples),
        'creatinine_medium': np.random.normal(1.2, 0.4, n_samples),
        'glucose_medium': np.random.normal(100, 25, n_samples),
        'hemoglobin_medium': np.random.normal(12.5, 2, n_samples),
        'lactate_medium': np.random.normal(2.5, 1, n_samples),
        'platelet_means': np.random.normal(250, 50, n_samples),
        'potassium_means': np.random.normal(4, 0.5, n_samples),
        'inr_means': np.random.normal(1.1, 0.2, n_samples),
        'sodium_means': np.random.normal(140, 3, n_samples),
        'wbc_means': np.random.normal(10, 2.5, n_samples),
        'heart_rate_means': np.random.normal(90, 15, n_samples),
        'sys_bp_means': np.random.normal(120, 15, n_samples),
        'dias_bp_means': np.random.normal(80, 10, n_samples),
        'resp_rate_means': np.random.normal(18, 4, n_samples),
        'temp_c_means': np.random.normal(37, 0.5, n_samples),
        'spo2_medians': np.random.normal(95, 2, n_samples),
        'urine_output': np.random.normal(1500, 500, n_samples)
    }
    outcomes = {
        'sepsis': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        'hospital_expire_flag': np.random.choice([0, 1], n_samples, p=[0.9, 0.1])
    }
    df = pd.DataFrame({**categorical_features, **quantitative_features, **outcomes})
    return df

# Data Augmentation
This function introduces missing values and addresses class imbalance in the dataset.

In [9]:
def augment_data_with_missing_and_imbalance(df, missing_ratio=0.1, imbalance_ratio=0.3):
    df_with_missing = df.copy()
    n_missing = int(missing_ratio * df_with_missing.size)
    random_indices = np.random.choice(df_with_missing.size, n_missing, replace=False)
    flat_data = df_with_missing.to_numpy().flatten()
    flat_data[random_indices] = np.nan
    df_with_missing = pd.DataFrame(flat_data.reshape(df_with_missing.shape), columns=df_with_missing.columns)
    minority_class = df_with_missing[df_with_missing['hospital_expire_flag'] == 1]
    majority_class = df_with_missing[df_with_missing['hospital_expire_flag'] == 0]
    target_majority_size = int(len(minority_class) / imbalance_ratio)
    majority_class = majority_class.sample(target_majority_size, random_state=42, replace=True)
    df_imbalanced = pd.concat([minority_class, majority_class]).sample(frac=1, random_state=42).reset_index(drop=True)
    return df_imbalanced

# Preprocessing Pipeline
The preprocessing pipeline handles missing values and prepares the data for modeling.

In [10]:
def select_important_features(X, y):
    """
    Select the most important features using mutual information.
    """
    mutual_info = mutual_info_classif(X, y)
    feature_importances = pd.Series(mutual_info, index=X.columns)
    selected_features = feature_importances[feature_importances > 0.01].index  # Adjust threshold as needed
    if len(selected_features) == 0:  # If no features meet the threshold
        return X  # Fallback to all features
    return X[selected_features]

def balance_data_with_smote(X, y):
    """
    Balance the dataset using SMOTE with validation for numeric data.
    """
    # Ensure that all features in X are numeric
    if not np.issubdtype(X.dtypes, np.number):
        X = X.apply(pd.to_numeric, errors='coerce')
    
    # Check for NaN or invalid values and fill them (fallback to median)
    if X.isnull().any().any():
        X = X.fillna(X.median())
    
    smote = SMOTE(random_state=42)
    try:
        X_balanced, y_balanced = smote.fit_resample(X, y)
    except ValueError as e:
        print(f"SMOTE Error: {e}. Falling back to simple oversampling.")
        # Fallback to naive oversampling
        minority_class = X[y == 1]
        majority_class = X[y == 0]
        minority_oversampled = minority_class.sample(len(majority_class), replace=True, random_state=42)
        X_balanced = pd.concat([majority_class, minority_oversampled])
        y_balanced = np.concatenate([np.zeros(len(majority_class)), np.ones(len(minority_oversampled))])
    return X_balanced, y_balanced


def improved_preprocessing_pipeline(df):
    """
    Full preprocessing pipeline with feature selection and SMOTE for balancing.
    """
    X = df.drop(columns=['hospital_expire_flag'], axis=1)
    y = df['hospital_expire_flag'].astype(int)

    # Handle missing values
    categorical_cols = X.select_dtypes(include=['object']).columns
    numerical_cols = X.select_dtypes(include=['number', 'bool']).columns

    if not categorical_cols.empty:
        cat_imputer = SimpleImputer(strategy='most_frequent')
        X_categorical = pd.DataFrame(cat_imputer.fit_transform(X[categorical_cols]), columns=categorical_cols)
    else:
        X_categorical = pd.DataFrame()

    if not numerical_cols.empty:
        num_imputer = SimpleImputer(strategy='median')
        X_numerical = pd.DataFrame(num_imputer.fit_transform(X[numerical_cols]), columns=numerical_cols)
    else:
        X_numerical = pd.DataFrame()

    if not X_categorical.empty:
        X_categorical_encoded = pd.get_dummies(X_categorical, drop_first=True)
    else:
        X_categorical_encoded = pd.DataFrame()

    # Combine numerical and encoded categorical features
    X_processed = pd.concat([X_categorical_encoded, X_numerical], axis=1)

    # Feature selection
    X_selected = select_important_features(X_processed, y)

    # Ensure X_selected is valid for SMOTE
    X_selected = X_selected.applymap(lambda x: int(x) if isinstance(x, bool) else x)
    X_selected = X_selected.apply(pd.to_numeric, errors='coerce').fillna(0)

    # Balance the dataset using SMOTE
    X_balanced, y_balanced = balance_data_with_smote(X_selected, y)

    # Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_balanced)

    # Stratified K-Fold
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    return X_scaled, y_balanced, cv


# Model Definition with Attention Mechanism
The model architecture includes dense layers and a custom attention layer to enhance feature representation.

In [11]:
# Step 4: Model Definition
class CustomAttentionLayer(Layer):
    def __init__(self, heads=4, key_dim=32, **kwargs):
        super().__init__(**kwargs)
        self.heads = heads
        self.key_dim = key_dim
    
    def build(self, input_shape):
        self.query_dense = Dense(self.heads * self.key_dim)
        self.key_dense = Dense(self.heads * self.key_dim)
        self.value_dense = Dense(self.heads * self.key_dim)
        self.output_dense = Dense(input_shape[-1])
        super().build(input_shape)
    
    def call(self, inputs):
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)
        scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(tf.cast(self.key_dim, tf.float32))
        attention_weights = tf.nn.softmax(scores, axis=-1)
        attended = tf.matmul(attention_weights, value)
        return self.output_dense(attended)

def create_advanced_mortality_model(input_shape):
    inputs = Input(shape=(input_shape,))
    x = Dense(256, activation='relu')(inputs)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    attention_output = CustomAttentionLayer(heads=8, key_dim=64)(x)
    combined = x + attention_output
    x = Dense(64, activation='relu')(combined)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Dense(32, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    outputs = Dense(1, activation='sigmoid')(x)
    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC()])
    return model

# Comprehensive Model Evaluation and Visualization Functions
This section includes functions for evaluating the model's performance using cross-validation and plotting ROC and Precision-Recall curves.

In [12]:
# Step 5: Comprehensive Evaluation
def plot_roc_and_pr_curves(y_true, y_pred_proba):
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(fpr, tpr, label='ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(recall, precision, label='Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()

    plt.show()

def comprehensive_model_evaluation(X, y, cv):
    cv_scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'auc': []}
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        model = create_advanced_mortality_model(X_train.shape[1])
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=3)
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32, callbacks=[early_stopping, lr_reducer], verbose=0)
        y_pred_proba = model.predict(X_val).flatten()
        y_pred = (y_pred_proba > 0.5).astype(int)
        cv_scores['accuracy'].append(accuracy_score(y_val, y_pred))
        cv_scores['precision'].append(precision_score(y_val, y_pred))
        cv_scores['recall'].append(recall_score(y_val, y_pred))
        cv_scores['f1'].append(f1_score(y_val, y_pred))
        cv_scores['auc'].append(roc_auc_score(y_val, y_pred_proba))
        plot_roc_and_pr_curves(y_val, y_pred_proba)
    print("Cross-Validation Results:")
    for metric, scores in cv_scores.items():
        print(f"{metric.capitalize()}: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

# Main Execution Block
Finally, the main execution block ties all components together to run the entire workflow from data generation through evaluation.


In [13]:
# Step 6: Main Execution
df = generate_synthetic_sepsis_dataset()
df_augmented = augment_data_with_missing_and_imbalance(df, missing_ratio=0.1, imbalance_ratio=0.3)
X, y, cv = improved_preprocessing_pipeline(df_augmented)
comprehensive_model_evaluation(X, y, cv)


  X_selected = X_selected.applymap(lambda x: int(x) if isinstance(x, bool) else x)
I0000 00:00:1745328349.241784    6592 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22272 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:01:00.0, compute capability: 8.9
I0000 00:00:1745328351.192032    7583 service.cc:152] XLA service 0x7c9218002e30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745328351.192049    7583 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2025-04-22 19:10:51.233668: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
E0000 00:00:1745328351.484504    7583 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to have matching major version and equal or hig

FailedPreconditionError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/home/sunway/miniconda3/lib/python3.12/runpy.py", line 198, in _run_module_as_main

  File "/home/sunway/miniconda3/lib/python3.12/runpy.py", line 88, in _run_code

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/sunway/miniconda3/lib/python3.12/asyncio/base_events.py", line 645, in run_forever

  File "/home/sunway/miniconda3/lib/python3.12/asyncio/base_events.py", line 1999, in _run_once

  File "/home/sunway/miniconda3/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3047, in run_cell

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3102, in _run_cell

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3306, in run_cell_async

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3489, in run_ast_nodes

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3549, in run_code

  File "/tmp/ipykernel_6592/3749745590.py", line 5, in <module>

  File "/tmp/ipykernel_6592/2819076226.py", line 31, in comprehensive_model_evaluation

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/home/sunway/miniconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

DNN library initialization failed. Look at the errors above for more details.
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_multi_step_on_iterator_5622]