In [None]:
## Load modules

# Standard modules
import pandas as pd
import numpy as np

# Preprocessing modules
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,\
 OneHotEncoder, LabelEncoder, OrdinalEncoder

# Train-test split module
from sklearn.model_selection import train_test_split

# Classifier modules
from sklearn.ensemble import RandomForestClassifier

# Regression modules
from sklearn.ensemble import RandomForestRegressor

# Model selection modules
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GridSearchCV

# Pipeline module
from sklearn.pipeline import Pipeline

# Imputation modules
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

# Performance metric modules
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline
plt.rcParams['figure.figsize'] = (4.0, 4.0) # set default size of plots

pd.options.display.max_columns = None

# Module for categorical variables
from pandas.api.types import CategoricalDtype

# Modules for building custom encoders and transformers
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
## Mount Google drive folder if running in Colab
# `sys` is imported in this cell because the check below reads sys.modules
# and the notebook's import cell does not import it.
import sys

if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    # Change path below starting from /content/drive/MyDrive/Colab Notebooks/
    # depending on how data is organized inside your Colab Notebooks folder in
    # Google Drive
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/EvenSem2024MAHE'
    DATA_DIR = DIR+'/Data/'
else:
    # Outside Colab the data is read from a Data/ folder relative to the
    # current working directory
    DATA_DIR = 'Data/'

In [None]:
## Load ICU Data
# Read the comma-separated file; the first row holds the column names and
# the first column is used as the row index
file = DATA_DIR + 'ICU_filtered.csv'
dfICU = pd.read_csv(file, sep = ',', header = 0, index_col = 0)

# Report the dimensions of the frame exactly as loaded
print('ICU dataset', '-----------', sep = '\n')
print('Initial number of samples = %d' % dfICU.shape[0])
print('Initial number of features = %d\n' % dfICU.shape[1])

# Show the first five rows as the cell output
dfICU.head(5)

In [None]:
## Plot percentage of missing values (NaNs) for each feature
# Threshold, in percent of samples: features missing in more than `cutoff`
# percent of the samples are the ones flagged by the plot below
cutoff = 30
percent_missing = (dfICU.isna().sum() / dfICU.shape[0]) * 100

fig, ax = plt.subplots(figsize = (6, 4))
# Two colours are sampled from the rainbow colormap: index 0 is used for
# features above the cutoff and index 1 for features at or below it
percent_missing.plot(kind = 'bar',
                     color = cm.rainbow(np.linspace(0, 1, 2))[(percent_missing <= cutoff).values.astype(int)],
                     ax = ax)
# Dashed green reference line drawn at the cutoff level across all bars
ax.plot(np.arange(dfICU.shape[1]), np.repeat(cutoff, dfICU.shape[1]), 'g--')
fig.suptitle('Percentage Missing Values Across All Features', fontsize = 12)
ax.set_xlabel('Feature', fontsize = 12)
ax.set_ylabel('% Missing Values', fontsize = 12);

In [None]:
## Wrangle the dataframe
# Keep only the features whose missing percentage is at most the cutoff
dfICU = dfICU.loc[:, dfICU.columns[percent_missing <= cutoff]]

# Fold the one-hot-encoded ICU-type columns into a single coded column called ICU.
# Codes are written in this order, so if a row has more than one flag set the
# later column's code is the one that remains.
for icu_col, icu_code in (('CCU', 1), ('CSRU', 2), ('SICU', 3)):
    dfICU.loc[dfICU[icu_col] == 1, 'ICU'] = icu_code

# Rows where all three flags are exactly 0 receive the fourth code
dfICU.loc[(dfICU[['CCU', 'CSRU', 'SICU']] == 0).all(axis = 1), 'ICU'] = 4

# The individual flag columns are no longer needed
dfICU = dfICU.drop(columns = ['CCU', 'CSRU', 'SICU'])

In [None]:
# Create lists of ordinal, categorical, and continuous features
ordinal_features = ['GCS_first']
categorical_features = [ 'Gender', 'ICU', 'MechVent']
# Every column that is not listed as ordinal or categorical is treated as continuous
continuous_features = [col for col in dfICU.columns
                       if col not in ordinal_features + categorical_features]
dfICU.head(5)

In [None]:
## Custom ordinal encoder
class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Encode ordinal columns as integer ranks that follow a user-supplied order.

    Parameters
    ----------
    ordinal_cols_dict : dict
        Maps each ordinal column name to a sequence of that column's values
        listed in rank order. The first value is encoded as 0, the second
        as 1, and so on.
    """
    def __init__(self, ordinal_cols_dict: dict):
        self.ordinal_cols_dict = ordinal_cols_dict

    def fit(self, X: pd.DataFrame, y = None):
        """Return self unchanged; the order is supplied at construction time."""
        return self

    def transform(self, X: pd.DataFrame, y = None):
        """Return a copy of X with every configured column replaced by its rank code.

        Each configured column becomes an ordered categorical whose categories
        are the integer positions 0 .. len(order) - 1. Missing values, and
        values that do not appear in the supplied order, become NaN.

        The code of a value is its position in the supplied order, so it does
        not depend on which values happen to be present in X. The caller's
        frame is left untouched.
        """
        X = X.copy()
        for col, order in self.ordinal_cols_dict.items():
            cat_type = CategoricalDtype(categories = order, ordered = True)
            # .cat.codes gives the position of each value in `order`,
            # with -1 for NaN and for values outside `order`
            codes = X[col].astype(cat_type).cat.codes.astype('int64')
            # -1 is not one of the categories below, so the cast turns it into NaN
            code_type = CategoricalDtype(categories = list(range(len(order))), ordered = True)
            X[col] = codes.astype(code_type)
        return X

    def fit_transform(self, X: pd.DataFrame, y = None):
        """Fit (a no-op) and return the transformed copy of X."""
        self.fit(X)
        return self.transform(X)

In [None]:
## Custom categorical encoder
class CustomCategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode nominal columns as integer codes, keeping missing values as NaN.

    Parameters
    ----------
    categorical_cols : list
        Names of the columns to encode.

    Attributes
    ----------
    categories_ : dict
        Set by fit. Maps each column name to the list of its non-missing
        values in order of first appearance; the position in that list is
        the value's code.
    """
    def __init__(self, categorical_cols: list):
        self.categorical_cols = categorical_cols

    def fit(self, X: pd.DataFrame, y = None):
        """Record, per column, the distinct non-missing values in order of appearance."""
        self.categories_ = {col: list(X[col].dropna().unique())
                            for col in self.categorical_cols}
        return self

    def transform(self, X: pd.DataFrame, y = None):
        """Return a copy of X with every configured column replaced by its code.

        Each configured column becomes an unordered categorical whose
        categories are the integers 0 .. n_levels - 1. Missing values, and
        values not seen during fit, become NaN. The caller's frame is left
        untouched.

        If fit has not been called, the levels are taken from X itself so
        that calling transform on an unfitted encoder keeps working.
        """
        X = X.copy()
        learned = getattr(self, 'categories_', {})
        for col in self.categorical_cols:
            if col in learned:
                levels = learned[col]
            else:
                levels = list(X[col].dropna().unique())
            # Position of each value in `levels`; -1 marks NaN / unseen values
            codes = pd.Categorical(X[col], categories = levels).codes
            # from_codes maps the -1 sentinel to NaN
            X[col] = pd.Categorical.from_codes(codes, categories = list(range(len(levels))))
        return X

    def fit_transform(self, X: pd.DataFrame, y = None):
        """Learn the levels from X and return the transformed copy of X."""
        self.fit(X)
        return self.transform(X)

In [None]:
## Build preprocessing pipeline for ordinal, categorical, and continuous features
def _iterative_imputer(estimator, initial_strategy):
    """Build an IterativeImputer with the settings shared by all three pipelines."""
    return IterativeImputer(estimator = estimator,
                            initial_strategy = initial_strategy,
                            missing_values = np.nan,
                            max_iter = 10,
                            random_state = 0)

# Ordinal features: GCS_first levels are listed from the largest value in the
# data down to the smallest, in steps of 1
ordinal_cols_dict = {
    'GCS_first': np.arange(np.max(dfICU['GCS_first']), np.min(dfICU['GCS_first'])-1, -1)
    }
ordinal_transformer = Pipeline(steps = [
    ('customordenc', CustomOrdinalEncoder(ordinal_cols_dict)),
    ('imputer', _iterative_imputer(RandomForestClassifier(n_estimators = 3), 'most_frequent')),
])

# Categorical features: integer-code the columns, then impute with a classifier
categorical_transformer = Pipeline(steps = [
    ('customcatenc', CustomCategoricalEncoder(categorical_features)),
    ('imputer', _iterative_imputer(RandomForestClassifier(n_estimators = 3), 'most_frequent')),
])

# Continuous features: impute with a regressor, then scale
numeric_transformer = Pipeline(steps = [
    ('imputer', _iterative_imputer(RandomForestRegressor(n_estimators = 3), 'median')),
    ('scaler', RobustScaler()),
])

# Route each group of columns to its own pipeline
preprocessor = ColumnTransformer(transformers = [
    ('ord', ordinal_transformer, ordinal_features),
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, continuous_features),
])

# Fit and transform using the preprocessor.
# NOTE(review): the result is wrapped without passing column names or an index,
# so the new frame presumably carries default integer labels - confirm before
# relying on the original feature names downstream.
dfICU = pd.DataFrame(preprocessor.fit_transform(dfICU))

In [None]:
# Display the dataframe produced by the preprocessing cell as this cell's output
dfICU