Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from collections import defaultdict as dd
from xgboost import XGBClassifier
from tqdm import tqdm
import pickle
import os

Import or define data

In [2]:
SEED = 42

FOLDS = 10

In [3]:
data = pd.read_csv('../data/curated/cleaned_data.csv')

# Preprocessing

## Split

In [4]:
# stratisfied split based on target variable mortality_status, and gender
stratify_columns = data[['mortality_status', 'gender_M']]

cv_data, test_data = train_test_split(
    data, test_size=0.3, stratify=stratify_columns, random_state=SEED)

In [5]:
cv_data.to_csv('../data/curated/modelling/train_data.csv', index=False)

In [6]:
# 10cv
stratify_columns_combined = cv_data['mortality_status'].astype(
    str) + "_" + cv_data['gender_M'].astype(str)

cv_data_dict = {'train': [], 'val': []}


skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# Perform 10-fold cross-validation
for train_index, val_index in skf.split(cv_data, stratify_columns_combined):
    train_data = cv_data.iloc[train_index]
    val_data = cv_data.iloc[val_index]

    cv_data_dict['train'].append(train_data)
    cv_data_dict['val'].append(val_data)

## SMOTE

In [7]:
# Initialize SMOTE
smote = SMOTE(random_state=SEED)

# Create a new dictionary to store SMOTE-resampled data
smote_cv_data_dict = {'train': [], 'val': cv_data_dict['val']}

# Loop through the 10 training sets and apply SMOTE
for i in range(10):
    # Extract the current training set
    train_data = cv_data_dict['train'][i]

    # Separate features and target for SMOTE
    X_train = train_data.drop(columns=['mortality_status'])
    y_train = train_data['mortality_status']

    # Apply SMOTE to the training data
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Combine the resampled features and target back into a DataFrame
    train_data_smote = X_train_smote.copy()
    train_data_smote['mortality_status'] = y_train_smote

    # Append the SMOTE-resampled training data to the new dictionary
    smote_cv_data_dict['train'].append(train_data_smote)

## Normalise

In [8]:
# find the continuous variables in the data by going through columns and checking the number of unique values
continuous_columns = []
for column in data.columns:
    if data[column].nunique() > 10 and column not in ['subject_id', 'hadm_id']:
        continuous_columns.append(column)

In [9]:
def normalise(data_dict, continuous_columns):

    for i in range(10):
        train_data = data_dict['train'][i]
        val_data = data_dict['val'][i]

        # Normalise the continuous features
        scaler = StandardScaler()
        train_data[continuous_columns] = scaler.fit_transform(
            train_data[continuous_columns])
        val_data[continuous_columns] = scaler.transform(
            val_data[continuous_columns])

        data_dict['train'][i] = train_data
        data_dict['val'][i] = val_data

    return data_dict

In [10]:
smote_cv_data_dict = normalise(smote_cv_data_dict, continuous_columns)
cv_data_dict = normalise(cv_data_dict, continuous_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data[continuous_columns] = scaler.transform(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data[continuous_columns] = scaler.transform(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data[continuous_columns] = scaler.transform(
A value is trying to be set on a copy of a slice from a Da

In [11]:
os.makedirs('../data/curated/modelling/normalise', exist_ok=True)
os.makedirs('../data/curated/modelling/smote_normalise', exist_ok=True)

for i in range(10):
    cv_data_dict['train'][i].to_csv(
        f'../data/curated/modelling/normalise/train_{i}.csv', index=False)
    cv_data_dict['val'][i].to_csv(
        f'../data/curated/modelling/normalise/val_{i}.csv', index=False)

    smote_cv_data_dict['train'][i].to_csv(
        f'../data/curated/modelling/smote_normalise/train_{i}.csv', index=False)
    cv_data_dict['val'][i].to_csv(
        f'../data/curated/modelling/smote_normalise/val_{i}.csv', index=False)

In [12]:
test_data.to_csv('../data/curated/modelling/test_data.csv', index=False)

## Feature Selection

In [13]:
def create_constraints(paired_variables: list):
    """ 
        Helper function which creates dictionary format in terms of constraints

        Input:
            - paired_variables: list of lists, where each list contains two variables which are paired together

        Output:
            - constraints: dictionary where each key is a variable and each value is a list of variables which it is paired with
            - e.g. {'A': ['B', 'C'], 'B': ['A', 'C'], 'C': ['A', 'B']
    """

    constraints = {}

    for paired_variable in paired_variables:
        for variable in paired_variable:
            constraints[variable] = [
                var for var in paired_variable if var != variable]

    return constraints

In [14]:
def get_feature_importance(model_list, constraints: dict = {}):
    """
        Gets sequential feature importance of model and returns it in a list format

        Input:
            - model: model which has been fitted - must have feature_importances_ and feature_names_in_ attributes
            - constraints: dictionary where each key is a variable and each value is a list of variables which it is paired with
            - restrictions: dictionary where each key is a variable and each value is a list of variables which it is restricted with
    """

    ordered_feature_importance = dict()

    feature_importance_dict = dd(float)

    for i in range(len(model_list[0].feature_names_in_)):

        for model in model_list:
            feature_importance_dict[str(model.feature_names_in_[
                i])] += float(model.feature_importances_[i])

        feature_importance_dict[str(model.feature_names_in_[
                                    i])] /= len(model_list)

    feature_importance_list = list(feature_importance_dict.items())

    feature_importance_list.sort(key=lambda x: x[1], reverse=True)

    curr = []
    score = 0
    for i in range(len(feature_importance_list)):

        if feature_importance_list[i][0] in curr:
            continue

        # test for constraint
        if feature_importance_list[i][0] in constraints.keys():
            for constrainted_feature in constraints[feature_importance_list[i][0]]:
                if constrainted_feature in curr:
                    continue
                else:
                    curr.append(constrainted_feature)
                    score += feature_importance_dict[constrainted_feature]

        curr.append(feature_importance_list[i][0])
        score += feature_importance_list[i][1]

        ordered_feature_importance[tuple(curr)] = score

    return ordered_feature_importance

In [15]:
# create constraints - one in all in

data_columns = data.columns.to_list()
constraints_list = []
for one_hot_column in ['gender', 'race', 'admission_type', 'insurance', 'marital_status']:
    constraints_list.append(
        [column for column in data_columns if one_hot_column in column])

In [16]:
target = 'mortality_status'

drop_columns = ['mortality_status', 'subject_id', 'hadm_id', 'stay_id']

In [17]:
constraints = create_constraints(constraints_list)
for PREPROCESSING in ['smote_normalise', 'normalise']:
    print(f'Getting feature importance ordering for {PREPROCESSING}')

    if PREPROCESSING == 'smote_normalise':
        train_data = smote_cv_data_dict['train'][0]
    else:
        train_data = cv_data_dict['train'][0]

    train_x_list = []
    train_y_list = []

    for i in range(FOLDS):

        train_x = train_data.drop(
            drop_columns, axis=1)
        train_y = train_data[target]

        train_x_list.append(train_x)
        train_y_list.append(train_y)

    # get XGB model for feature importance ordering
    xgb_model_list = []

    for i in tqdm(range(FOLDS)):
        xgb_model = XGBClassifier(max_depth=12,
                                  random_state=42,
                                  n_estimators=100,
                                  colsample_bytree=0.75,
                                  subsample=0.75
                                  )

        xgb_model.fit(train_x_list[i], train_y_list[i])

        xgb_model_list.append(xgb_model)

    # we aggregate the 10 model feature importances to unify ordering
    xgb_feature_importance_ordering = get_feature_importance(
        xgb_model_list, constraints)

    # export
    with open(f'../models/xgb_feature_importance_ordering_{PREPROCESSING}.pickle', 'wb') as f:
        pickle.dump(xgb_feature_importance_ordering, f)

Getting feature importance ordering for smote_normalise


100%|██████████| 10/10 [00:08<00:00,  1.17it/s]


Getting feature importance ordering for normalise


100%|██████████| 10/10 [00:06<00:00,  1.50it/s]


# Final Modelling

In [18]:
# normalise train data
train_data = cv_data

train_data_scaler = StandardScaler()
train_data[continuous_columns] = train_data_scaler.fit_transform(
    train_data[continuous_columns])

train_data.to_csv(
    '../data/curated/modelling/normalised_train_data.csv', index=False)

# apply to test data
test_data[continuous_columns] = train_data_scaler.transform(
    test_data[continuous_columns])

test_data.to_csv(
    '../data/curated/modelling/normalised_test_data.csv', index=False)