Q2.1 Classic Machine Learning Methods

In [None]:
# Import libraries
import os

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.metrics import roc_auc_score, average_precision_score
from tsfresh.feature_extraction import extract_features, EfficientFCParameters
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional,MultiHeadAttention, Flatten, Input, Embedding
from tensorflow.keras.layers import LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

ModuleNotFoundError: No module named 'tsfresh'

In [None]:
# === PART 1 OF Q2.1 ===
# Read the three preprocessed splits (C is the held-out test split)
df_a, df_b, df_c = map(pd.read_parquet, (
    'processed_data/set-a.parquet',
    'processed_data/set-b.parquet',
    'processed_data/set-c.parquet',
))

# Sets A and B together form the training data; set C is used for testing
df_test = df_c
df_train = pd.concat((df_a, df_b), ignore_index=True)

# Static columns are listed explicitly; every remaining column that is not an
# identifier, the time axis or the label is treated as a time-series variable
static_vars = ['Age', 'Gender', 'Height', 'Weight']
excluded_cols = set(static_vars) | {'Hour', 'PatientID', 'RecordID', 'ICUType', 'In_hospital_death'}
time_series_vars = [col for col in df_train.columns if col not in excluded_cols]

# Feature extraction function
def extract_features(df, ts_vars=None, static_cols=None):
    """Collapse each patient's rows into a single row of summary features.

    NOTE: this name rebinds `extract_features` imported from
    `tsfresh.feature_extraction` at the top of the notebook; code that needs
    the tsfresh function must import it under a different name.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format records with a 'PatientID' column, an 'In_hospital_death'
        column and the requested feature columns.
    ts_vars : list of str, optional
        Time-series columns to summarise with mean / max / last. Defaults to
        the notebook-level `time_series_vars`.
    static_cols : list of str, optional
        Columns for which only the last value per patient is kept. Defaults
        to the notebook-level `static_vars`.

    Returns
    -------
    pandas.DataFrame
        One row per patient: 'PatientID', '<var>_<stat>' columns, the static
        columns and 'In_hospital_death'.
    """
    if ts_vars is None:
        ts_vars = time_series_vars
    if static_cols is None:
        static_cols = static_vars

    # A variable listed twice would be selected twice and yield duplicated
    # feature columns, so keep the first occurrence only (order preserved).
    ts_vars = list(dict.fromkeys(ts_vars))

    # 'last' takes the last non-null row in frame order; sort by time (stable,
    # so ties keep their original order) to make it the most recent value.
    if 'Hour' in df.columns:
        df = df.sort_values('Hour', kind='stable')

    per_patient = df.groupby('PatientID')

    features = per_patient[ts_vars].agg(['mean', 'max', 'last'])
    # Flatten the (variable, statistic) column MultiIndex into 'variable_statistic'
    features.columns = ['_'.join(col) for col in features.columns]

    # Add static variables
    static_data = per_patient[list(static_cols)].last()
    features = features.merge(static_data, left_index=True, right_index=True)

    # Add labels (aligned on the PatientID index)
    features['In_hospital_death'] = per_patient['In_hospital_death'].last()
    return features.reset_index()

# Build one feature row per patient for each split
df_train_features, df_test_features = extract_features(df_train), extract_features(df_test)

# Separate the label from the model inputs; the patient id is not a feature
id_and_label = ['PatientID', 'In_hospital_death']
y_train = df_train_features['In_hospital_death']
y_test = df_test_features['In_hospital_death']
X_train = df_train_features.drop(columns=id_and_label)
X_test = df_test_features.drop(columns=id_and_label)

# Standardize using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Instantiate the three classifiers, then fit them in the same order
logreg = LogisticRegression()
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_model = SVC(kernel='linear', probability=True, random_state=42)

for clf in (logreg, rf, svm_model):
    clf.fit(X_train, y_train)

# Evaluate models
def evaluate_model(model, X_test, y_test, name):
    """Print the AUROC and AUPRC of a fitted classifier.

    `model` must expose `predict_proba`; the second probability column is
    used as the score. `name` is the label used in the printed line.
    Nothing is returned.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    auroc, auprc = (
        metric(y_test, positive_scores)
        for metric in (roc_auc_score, average_precision_score)
    )
    print(f"{name} - AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

# Report performance of every fitted classifier on the test split
for fitted_model, label in (
    (logreg, 'Logistic Regression'),
    (rf, 'Random Forest'),
    (svm_model, 'SVM'),
):
    evaluate_model(fitted_model, X_test, y_test, label)




In [None]:
# === PART 2 OF Q2.1 ===

# Define static and dynamic variables and additional features
static_vars = ['Age', 'Gender', 'Height', 'Weight']
non_series_cols = set(static_vars) | {'Hour', 'PatientID', 'RecordID', 'ICUType', 'In_hospital_death'}
base_series_vars = [col for col in df_train.columns if col not in non_series_cols]
additional_vars = ['Creatinine', 'HR', 'PaCO2', 'PaO2', 'pH','Urine', 'Temp']

# Include additional_vars in time_series_vars.
# The base list already holds every non-static column of df_train, so a plain
# `+=` would list any variable that is already a column twice and every
# downstream column selection would contain duplicated columns. Build a new
# list (no in-place mutation, so re-running the cell is idempotent) and keep
# only the first occurrence of each name, preserving order.
time_series_vars = list(dict.fromkeys(base_series_vars + additional_vars))

# Updated feature extraction function
def extract_features_with_tsfresh(df):
    """Build one feature row per patient using tsfresh's automatic extraction.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format records containing 'PatientID', 'Hour',
        'In_hospital_death', the notebook-level `time_series_vars` columns and
        the notebook-level `static_vars` columns.

    Returns
    -------
    pandas.DataFrame
        'PatientID', the tsfresh feature columns, the static columns (last
        value per patient) and 'In_hospital_death'.
    """
    # Import under explicit local names: by the time this runs, the notebook
    # name `extract_features` refers to the hand-written one-argument extractor
    # defined in Part 1, which does not accept the tsfresh keyword arguments.
    from tsfresh.feature_extraction import extract_features as tsfresh_extract_features
    from tsfresh.utilities.dataframe_functions import impute

    # Guard against a variable being listed more than once
    value_cols = list(dict.fromkeys(time_series_vars))

    # Extract time-series features using tsfresh.
    # n_jobs is left at the tsfresh default: tsfresh documents it as a number
    # of processes (0 disables parallelism), not the scikit-learn "-1 = all
    # cores" convention. NOTE(review): confirm against the installed version.
    # NOTE(review): tsfresh may reject NaN in the value columns — confirm the
    # preprocessed data is fully imputed before calling this.
    ts_features = tsfresh_extract_features(
        df[['PatientID', 'Hour'] + value_cols],
        column_id='PatientID',
        column_sort='Hour',
        default_fc_parameters=EfficientFCParameters(),
    )

    # Some calculators return NaN/inf (e.g. on short series); replace them
    # column-wise so the downstream estimators receive finite inputs.
    ts_features = impute(ts_features)

    # Make sure the id index carries the expected name before it is merged on
    # and later turned back into a column.
    ts_features = ts_features.rename_axis('PatientID')

    # Add static variables
    static_data = df.groupby('PatientID')[static_vars].last()
    ts_features = ts_features.merge(static_data, left_index=True, right_index=True)

    # Add labels (aligned on the PatientID index)
    labels = df.groupby('PatientID')['In_hospital_death'].last()
    ts_features['In_hospital_death'] = labels
    return ts_features.rename_axis('PatientID').reset_index()

# Extract features with tsfresh.
# These frames are what Part 2 trains on; they must not be overwritten by the
# simple mean/max/last extractor from Part 1, otherwise Part 2 just repeats
# the Part 1 experiment.
df_train_features = extract_features_with_tsfresh(df_train)
df_test_features = extract_features_with_tsfresh(df_test)

# Split the feature tables into inputs and labels
label_col = 'In_hospital_death'
y_train, y_test = df_train_features[label_col], df_test_features[label_col]
X_train = df_train_features.drop(columns=['PatientID', label_col])
X_test = df_test_features.drop(columns=['PatientID', label_col])

# Standardize with training-split statistics
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Create and fit each classifier in turn (fit returns the estimator itself)
logreg = LogisticRegression().fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
svm_model = SVC(kernel='linear', probability=True, random_state=42).fit(X_train, y_train)

# Evaluate models
def evaluate_model(model, X_test, y_test, name):
    """Score a fitted classifier on the test data and print both metrics.

    Uses column 1 of `model.predict_proba(X_test)` as the ranking score for
    AUROC and AUPRC; prints one line prefixed with `name`.
    """
    death_probability = model.predict_proba(X_test)[:, 1]
    auroc, auprc = (
        roc_auc_score(y_test, death_probability),
        average_precision_score(y_test, death_probability),
    )
    print(f"{name} - AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

# Report performance, one line per classifier
models_by_name = {
    'Logistic Regression': logreg,
    'Random Forest': rf,
    'SVM': svm_model,
}
for display_name, fitted in models_by_name.items():
    evaluate_model(fitted, X_test, y_test, display_name)

Q2.2 Recurrent Neural Networks

In [None]:
# === Implementing LSTM ===

# Prepare data for LSTM
def prepare_lstm_data(df, time_series_vars, static_vars):
    # Group by PatientID and create 3D arrays for time-series data
    time_series_data = df.groupby('PatientID')[time_series_vars].apply(lambda x: x.values)
    static_data = df.groupby('PatientID')[static_vars].last()
    labels = df.groupby('PatientID')['In_hospital_death'].last()
    
    # Pad sequences to ensure uniform length
    max_timesteps = max(time_series_data.apply(len))
    time_series_data = np.array([np.pad(x, ((0, max_timesteps - len(x)), (0, 0)), mode='constant') for x in time_series_data])
    
    return time_series_data, static_data.values, labels.values

# Prepare training and test data
X_train_ts, X_train_static, y_train = prepare_lstm_data(df_a, time_series_vars, static_vars)
X_test_ts, X_test_static, y_test = prepare_lstm_data(df_c, time_series_vars, static_vars)

# Each split is padded to its own longest record. The models below are built
# for the training sequence length, so bring the test tensor to that length
# (zero-pad at the end if shorter, drop trailing steps if longer).
n_timesteps, n_ts_features = X_train_ts.shape[1], X_train_ts.shape[2]
timestep_gap = n_timesteps - X_test_ts.shape[1]
if timestep_gap > 0:
    X_test_ts = np.pad(X_test_ts, ((0, 0), (0, timestep_gap), (0, 0)), mode='constant')
elif timestep_gap < 0:
    X_test_ts = X_test_ts[:, :n_timesteps, :]

# Standardize static features with training statistics
scaler_static = StandardScaler()
X_train_static = scaler_static.fit_transform(X_train_static)
X_test_static = scaler_static.transform(X_test_static)

# Standardize time-series features with ONE scaler fitted on all training
# timesteps. Refitting the scaler on every patient would normalise each
# training record by its own statistics and leave the scaler holding only the
# last patient's mean/variance when the test data is transformed.
# Note: the zero-padded steps are part of the fitted statistics.
scaler_ts = StandardScaler()
scaler_ts.fit(X_train_ts.reshape(-1, n_ts_features))
X_train_ts = scaler_ts.transform(X_train_ts.reshape(-1, n_ts_features)).reshape(X_train_ts.shape)
X_test_ts = scaler_ts.transform(X_test_ts.reshape(-1, n_ts_features)).reshape(X_test_ts.shape)

# Build LSTM model
def build_lstm_model(input_shape_ts, input_shape_static):
    """Return a compiled two-layer LSTM binary classifier.

    `input_shape_ts` is the (timesteps, features) shape of one sequence.
    `input_shape_static` is accepted but not used by the network.
    """
    layer_stack = [
        LSTM(64, input_shape=input_shape_ts, return_sequences=True),
        Dropout(0.3),
        LSTM(32, return_sequences=False),
        Dropout(0.3),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),  # Binary classification
    ]
    model = Sequential(layer_stack)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Input shapes: one sequence is (timesteps, features); static is a flat vector
n_steps, n_feats = X_train_ts.shape[1:3]
input_shape_ts = (n_steps, n_feats)
input_shape_static = X_train_static.shape[1]

# Build the network and train it, holding out 20% of the training data
lstm_model = build_lstm_model(input_shape_ts, input_shape_static)
history = lstm_model.fit(
    X_train_ts,
    y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=32,
    verbose=1,
)

# Score the test split with the predicted probabilities
y_pred = lstm_model.predict(X_test_ts).flatten()
auroc, auprc = roc_auc_score(y_test, y_pred), average_precision_score(y_test, y_pred)

print(f"LSTM Model - Test Set C Performance: AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

In [None]:
# === Build Bidirectional LSTM model ===

def build_bidirectional_lstm_model(input_shape_ts):
    """Return a compiled two-layer bidirectional LSTM binary classifier.

    `input_shape_ts` is the (timesteps, features) shape of one sequence.
    """
    first_recurrent = Bidirectional(LSTM(64, return_sequences=True), input_shape=input_shape_ts)
    second_recurrent = Bidirectional(LSTM(32, return_sequences=False))

    model = Sequential([
        first_recurrent,
        Dropout(0.3),
        second_recurrent,
        Dropout(0.3),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),  # Binary classification
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Shape of a single input sequence: (timesteps, features)
input_shape_ts = tuple(X_train_ts.shape[1:3])

# Build the bidirectional model and train it with a 20% validation hold-out
bidirectional_lstm_model = build_bidirectional_lstm_model(input_shape_ts)
fit_options = dict(validation_split=0.2, epochs=20, batch_size=32, verbose=1)
history = bidirectional_lstm_model.fit(X_train_ts, y_train, **fit_options)

# Evaluate on the test split using predicted probabilities
y_pred = bidirectional_lstm_model.predict(X_test_ts).flatten()
auroc = roc_auc_score(y_test, y_pred)
auprc = average_precision_score(y_test, y_pred)
print(f"Bidirectional LSTM Model - Test Set C Performance: AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

Q2.3a Transformers

In [None]:
# Build Transformer model
def build_transformer_model(input_shape_ts):
    """Return a compiled single-block transformer encoder for binary classification.

    Parameters
    ----------
    input_shape_ts : tuple
        (timesteps, features) shape of one input sequence.

    Notes
    -----
    A later cell defines another `build_transformer_model` with a different
    signature; once that cell has run, this definition is no longer reachable
    under this name.
    """
    feature_dim = input_shape_ts[1]
    inputs = Input(shape=input_shape_ts)  # Input shape: (timesteps, features)

    # Multi-Head self-attention; its output has the same last dimension as
    # the query, so the residual connection with `inputs` is shape-compatible.
    attention_output = MultiHeadAttention(num_heads=4, key_dim=feature_dim)(inputs, inputs)
    attention_output = Dropout(0.3)(attention_output)
    attention_output = LayerNormalization(epsilon=1e-6)(attention_output + inputs)

    # Feed-Forward Network
    ff_output = Dense(64, activation='relu')(attention_output)
    ff_output = Dropout(0.3)(ff_output)
    ff_output = Dense(32, activation='relu')(ff_output)
    # Project back to the feature width: the residual add below needs both
    # operands to share the last dimension, which a 32-unit layer only does
    # when the data happens to have exactly 32 features.
    ff_output = Dense(feature_dim)(ff_output)
    ff_output = LayerNormalization(epsilon=1e-6)(ff_output + attention_output)

    # Flatten and Output Layer
    flatten_output = Flatten()(ff_output)
    outputs = Dense(1, activation='sigmoid')(flatten_output)  # Binary classification

    # Compile Model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# One input sequence has shape (timesteps, features)
seq_len, seq_width = X_train_ts.shape[1], X_train_ts.shape[2]
input_shape_ts = (seq_len, seq_width)

# Build the transformer and train it, validating on 20% of the training data
transformer_model = build_transformer_model(input_shape_ts)
history = transformer_model.fit(
    X_train_ts, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=32,
    verbose=1,
)

# Evaluate on the test split with the predicted probabilities
y_pred = transformer_model.predict(X_test_ts).flatten()
auroc, auprc = roc_auc_score(y_test, y_pred), average_precision_score(y_test, y_pred)
print(f"Transformer Model - Test Set C Performance: AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

Q2.3b Tokenizing Time-Series Data and Transformers 

In [None]:
# Define paths
input_path = "processed_data"
output_path = "tokenized_data"
os.makedirs(output_path, exist_ok=True)

# Define static and time-series variables
static_vars = ['Age', 'Gender', 'Height', 'Weight']

# Load set A (the training split) once: it provides both the time-series
# variable names and the data the value scalers are fitted on.
df_sample = pd.read_parquet(os.path.join(input_path, "set-a.parquet"))
time_series_vars = [col for col in df_sample.columns if col not in static_vars + ['Hour', 'PatientID', 'RecordID', 'ICUType', 'In_hospital_death']]

# Normalize time (t) to range [0,1]
def scale_time(hour):
    """Map an hour offset (scalar or array-like) onto [0, 1] for a 48-hour window."""
    return hour / 48.0  # Since we have 48-hour windows

# Encode variable names (z) as categorical integers
variable_encoder = LabelEncoder()
variable_encoder.fit(time_series_vars)

# Scale observed values (v)
value_scalers = {var: MinMaxScaler() for var in time_series_vars}

# Fit scalers using set A (training set).
# Only observed values are tokenized, so the scalers must be fitted on
# observed values as well: MinMaxScaler disregards NaN during fit, whereas
# filling gaps with 0 first would drag every variable's minimum down to 0.
# The frame is not modified, so re-running this cell gives the same result.
df_train = df_sample
for var in time_series_vars:
    column = df_train[[var]]
    if not column[var].notna().any():
        # Never observed in set A: fall back to a constant column so the
        # scaler is still fitted on finite numbers.
        column = column.fillna(0)
    value_scalers[var].fit(column)

def tokenize_patient_data(df):
    """Convert wide patient records into (PatientID, t, z, v) tokens.

    One token is produced for every non-missing cell of the notebook-level
    `time_series_vars` columns:
    - `t`: the row's 'Hour' passed through `scale_time`
    - `z`: integer code of the variable from `variable_encoder`
    - `v`: the value transformed by that variable's entry in `value_scalers`

    Rows are ordered by patient, then by the record's position within the
    patient, then by the variable's position in `time_series_vars`.

    The work is done one variable at a time on whole columns rather than one
    cell at a time, which avoids a separate scaler/encoder call per cell.
    """
    columns = ['PatientID', 't', 'z', 'v']

    # Rows without a patient id cannot be attributed; a stable sort then
    # groups patients in ascending id order while keeping each patient's
    # rows in their original order.
    ordered = df[df['PatientID'].notna()].sort_values('PatientID', kind='stable')
    patient_ids = ordered['PatientID'].to_numpy()
    t_scaled = np.asarray(scale_time(ordered['Hour']))
    var_codes = variable_encoder.transform(time_series_vars)

    pieces = []
    for var_pos, (var, code) in enumerate(zip(time_series_vars, var_codes)):
        observed = ordered[var].notna().to_numpy()  # Only use observed values
        if not observed.any():
            continue
        raw = ordered[var].to_numpy()[observed].reshape(-1, 1)
        pieces.append(pd.DataFrame({
            'PatientID': patient_ids[observed],
            't': t_scaled[observed],
            'z': code,
            'v': value_scalers[var].transform(raw).ravel(),
            # Bookkeeping columns used only to restore row-major token order
            '_row': np.flatnonzero(observed),
            '_var': var_pos,
        }))

    if not pieces:
        return pd.DataFrame(columns=columns)

    tokens = pd.concat(pieces, ignore_index=True).sort_values(['_row', '_var'])
    return tokens[columns].reset_index(drop=True)

# Tokenize every split and write the result next to the others
for set_id in ('a', 'b', 'c'):
    print(f"Tokenizing set {set_id}...")
    source_file = os.path.join(input_path, f"set-{set_id}.parquet")
    target_file = os.path.join(output_path, f"tokenized-set-{set_id}.parquet")

    df = pd.read_parquet(source_file)
    tokenized_df = tokenize_patient_data(df)
    tokenized_df.to_parquet(target_file)
    print(f"Saved tokenized set {set_id} with {len(tokenized_df)} rows.")

print("Tokenization complete. Data saved to tokenized_data/")

# Load tokenized datasets
def load_tokenized_data(set_id):
    """Read the tokenized parquet file of the split identified by `set_id`."""
    parquet_path = f"tokenized_data/tokenized-set-{set_id}.parquet"
    return pd.read_parquet(parquet_path)

# Tokenized training split (A) and test split (C)
train_df, test_df = (load_tokenized_data(split) for split in ('a', 'c'))

# Prepare input tensors
# max_seq_len: number of tokens kept per patient
# num_variables: size of the variable-code vocabulary (z values)
# NOTE(review): 41 is hard-coded; it must exceed the largest z code — confirm
max_seq_len, num_variables = 500, 41

# Pad sequences to max length
def prepare_sequences(df, max_seq_len, labels=None):
    """Build fixed-length token tensors and per-patient labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table with columns 'PatientID', 't', 'z', 'v'.
    max_seq_len : int
        Tokens kept per patient (earliest `t` first). Shorter sequences are
        zero-padded at the end.
    labels : pandas.Series or mapping, optional
        Outcome per PatientID. When omitted, `df` must carry an
        'In_hospital_death' column, whose last value per patient is used.

    Returns
    -------
    (X, y)
        X: float32 array of shape (patients, max_seq_len, 3) holding
        (t, z, v); y: label array in the same patient order (ascending id).

    Raises
    ------
    ValueError
        If no labels are available or a patient has no label.
    """
    if labels is None:
        if 'In_hospital_death' not in df.columns:
            raise ValueError(
                "prepare_sequences needs labels: pass `labels` or include an "
                "'In_hospital_death' column in df"
            )
        labels = df.groupby('PatientID')['In_hospital_death'].last()

    # Order tokens by patient and time, then keep the first max_seq_len of each
    ordered = df.sort_values(['PatientID', 't'])
    position = ordered.groupby('PatientID').cumcount().to_numpy()
    keep = position < max_seq_len
    ordered, position = ordered[keep], position[keep]

    patient_ids = pd.Index(ordered['PatientID'].unique(), name='PatientID')
    row = patient_ids.get_indexer(ordered['PatientID'])

    # Scatter every token into its (patient, position) slot; untouched slots
    # stay zero and act as padding.
    # NOTE(review): padding shares z == 0 with the first variable code.
    X = np.zeros((len(patient_ids), max_seq_len, 3), dtype='float32')
    X[row, position] = ordered[['t', 'z', 'v']].to_numpy(dtype='float32')

    y = pd.Series(labels).reindex(patient_ids)
    if y.isna().any():
        raise ValueError("missing label for at least one patient")
    return X, y.to_numpy()


def load_patient_labels(set_id):
    """Return the last 'In_hospital_death' value per PatientID of a processed split."""
    records = pd.read_parquet(
        f"processed_data/set-{set_id}.parquet",
        columns=['PatientID', 'In_hospital_death'],
    )
    return records.groupby('PatientID')['In_hospital_death'].last()


# The tokenized files hold only (PatientID, t, z, v), so the outcome labels
# are read from the processed splits they were derived from.
X_train, y_train = prepare_sequences(train_df, max_seq_len, labels=load_patient_labels('a'))
X_test, y_test = prepare_sequences(test_df, max_seq_len, labels=load_patient_labels('c'))

# Transformer model definition
def build_transformer_model(input_shape, num_variables, d_model=64, num_heads=4, ff_dim=128):
    """Return a compiled single-block transformer over (t, z, v) token triplets.

    Parameters
    ----------
    input_shape : tuple
        (sequence length, 3); the last axis holds time, variable code, value.
    num_variables : int
        Vocabulary size of the variable embedding (must exceed the largest code).
    d_model : int
        Width of the variable embedding and of the value projection.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Hidden width of the feed-forward layer.

    Notes
    -----
    This reuses the name of the time-series transformer builder defined in an
    earlier cell and replaces it once this cell has run.
    """
    inputs = Input(shape=input_shape)

    # Variable Embedding
    time_input = inputs[:, :, 0:1]
    variable_input = Embedding(input_dim=num_variables, output_dim=d_model)(inputs[:, :, 1])
    value_input = Dense(d_model)(inputs[:, :, 2:3])

    # Combine embeddings: 1 (time) + d_model (variable) + d_model (value)
    x = tf.keras.layers.Concatenate()([time_input, variable_input, value_input])
    model_dim = 2 * d_model + 1

    # Multi-Head Attention (output width follows the query, i.e. model_dim)
    attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(x, x)
    attn_output = Dropout(0.1)(attn_output)
    attn_output = LayerNormalization(epsilon=1e-6)(attn_output + x)

    # Feed-Forward Layer.
    # The second projection must return to model_dim, not d_model: the
    # residual add below requires both operands to share the last dimension.
    ff_output = Dense(ff_dim, activation='relu')(attn_output)
    ff_output = Dense(model_dim)(ff_output)
    ff_output = Dropout(0.1)(ff_output)
    ff_output = LayerNormalization(epsilon=1e-6)(ff_output + attn_output)

    # Classification Head
    x = Flatten()(ff_output)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.1)(x)
    outputs = Dense(1, activation='sigmoid')(x)  # Binary classification

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['AUC'])
    return model

# Build the token transformer; every token is a (t, z, v) triplet
token_input_shape = (max_seq_len, 3)
model = build_transformer_model(token_input_shape, num_variables)
model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
)

# Evaluate on test set using the predicted probabilities
y_pred_probs = model.predict(X_test).flatten()
auroc, auprc = (
    roc_auc_score(y_test, y_pred_probs),
    average_precision_score(y_test, y_pred_probs),
)

print(f"Test AUROC: {auroc:.4f}")
print(f"Test AUPRC: {auprc:.4f}")
