<a href="https://colab.research.google.com/github/Shi-Yile/SPH6004-Assignment-1/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*  Notice: Due to computation issue, some super time-consuming chunks were run once and then commented to save running time.

In [1]:
# !pip install sklearn-genetic

In [2]:
# Import packages

# Data-processing packages
import pandas as pd
import numpy as np

# Plotting packages
import matplotlib.pyplot as plt
import seaborn as sn

# ML packages
import sklearn

from sklearn.preprocessing import StandardScaler # standardization

from sklearn.feature_selection import mutual_info_classif # mutual information

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, StratifiedKFold, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.utils.class_weight import compute_sample_weight

from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score

# feature selection algorithms (time-consuming)
# from sklearn.feature_selection import SequentialFeatureSelector
# from sklearn-genetic import GeneticSearchCV

# working directory settings
import os
from google.colab import drive

In [3]:
# give permission to load data from google drive
drive.mount('/content/drive')
os.chdir('drive/My Drive/SPH6004/Assignment-1')

Mounted at /content/drive


In [4]:
# load dataset
df = pd.read_csv('data.csv')
df_origin = df.copy()
df.head()

Unnamed: 0,id,aki,gender,admission_age,race,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,...,ggt_max,ld_ldh_min,ld_ldh_max,gcs_min,gcs_motor,gcs_verbal,gcs_eyes,gcs_unable,height,weight_admit
0,36570066,3,F,79.953141,BLACK/AFRICAN AMERICAN,96.0,104.0,100.083333,103.0,126.0,...,,236.0,318.0,15.0,6.0,5.0,4.0,0.0,157.0,110.0
1,39307659,0,F,78.194169,WHITE - RUSSIAN,72.0,134.0,97.263158,97.0,127.0,...,,,,15.0,6.0,5.0,4.0,0.0,,82.0
2,38743306,2,F,65.602396,WHITE,60.0,97.0,84.166667,95.0,143.0,...,,,,15.0,6.0,5.0,4.0,0.0,,62.1
3,32339865,2,F,64.906629,UNKNOWN,59.0,87.0,71.461538,113.0,150.0,...,,,,15.0,1.0,0.0,1.0,1.0,170.0,113.1
4,35526987,2,M,57.438861,WHITE,57.0,100.0,82.387097,81.0,127.0,...,,,,15.0,,0.0,1.0,1.0,178.0,97.4


In [5]:
# basic infomration of each column in df
df.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50920 entries, 0 to 50919
Data columns (total 162 columns):
 #    Column                  Non-Null Count  Dtype  
---   ------                  --------------  -----  
 0    id                      50920 non-null  int64  
 1    aki                     50920 non-null  int64  
 2    gender                  50920 non-null  object 
 3    admission_age           50920 non-null  float64
 4    race                    50920 non-null  object 
 5    heart_rate_min          50841 non-null  float64
 6    heart_rate_max          50841 non-null  float64
 7    heart_rate_mean         50841 non-null  float64
 8    sbp_min                 50823 non-null  float64
 9    sbp_max                 50823 non-null  float64
 10   sbp_mean                50823 non-null  float64
 11   dbp_min                 50823 non-null  float64
 12   dbp_max                 50823 non-null  float64
 13   dbp_mean                50823 non-null  float64
 14   mbp_min             

  df.info(verbose = True, null_counts = True)


## Data Pre-processing & Feature selection

In [6]:
# frequencies in 'race'
df['race'].value_counts()

WHITE                                        32637
UNKNOWN                                       5579
BLACK/AFRICAN AMERICAN                        3845
OTHER                                         1745
WHITE - OTHER EUROPEAN                         918
UNABLE TO OBTAIN                               726
ASIAN                                          614
ASIAN - CHINESE                                547
HISPANIC/LATINO - PUERTO RICAN                 530
HISPANIC OR LATINO                             501
WHITE - RUSSIAN                                430
PATIENT DECLINED TO ANSWER                     371
HISPANIC/LATINO - DOMINICAN                    337
BLACK/CAPE VERDEAN                             319
BLACK/CARIBBEAN ISLAND                         282
BLACK/AFRICAN                                  194
ASIAN - SOUTH EAST ASIAN                       168
PORTUGUESE                                     161
ASIAN - ASIAN INDIAN                           121
WHITE - EASTERN EUROPEAN       

In [7]:
# re-catergorize 'race' with fewer levels, and create dummy variables
# white, black, asian, hispanic/latino, other, unknown
# df['white'] = df['race'].apply(lambda x: 1 if 'WHITE' in x else 0)
df['race_white'] = df['race'].apply(lambda x: int('WHITE' in x))
df['race_black'] = df['race'].apply(lambda x: int('BLACK' in x))
df['race_asian'] = df['race'].apply(lambda x: int('ASIAN' in x))
df['race_hispanic/latino'] = df['race'].apply(
    lambda x: int('HISPANIC' in x or 'LATINO' in x))
df['race_unknown'] = df['race'].apply(lambda x: int(
    x == 'UNKNOWN' or x == 'UNABLE TO OBTAIN' or
    x == 'PATIENT DECLINED TO ANSWER'))
df['race_other'] = df['race'].apply(lambda x: int(
    x == 'OTHER' or x == 'PORTUGUESE' or
    x == 'AMERICAN INDIAN/ALASKA NATIVE' or
    x == 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER' or
    x == 'MULTIPLE RACE/ETHNICITY' or x == 'SOUTH AMERICAN'))

# frequency in each level
print('WHITE: ', df['race_white'].sum())
print('BLACK: ', df['race_black'].sum())
print('ASIAN: ', df['race_asian'].sum())
print('HISPANIC/LATINO: ', df['race_hispanic/latino'].sum())
print('OTHER: ', df['race_other'].sum())
print('UNKNOWN: ', df['race_unknown'].sum())

WHITE:  34204
BLACK:  4640
ASIAN:  1496
HISPANIC/LATINO:  1735
OTHER:  2169
UNKNOWN:  6676


In [8]:
# recode gender as 0-1  0: male, 1: female
df['gender_'] = df['gender'].apply(lambda x:int(x == 'F'))
df['gender_'].value_counts()

0    28440
1    22480
Name: gender_, dtype: int64

In [9]:
# re-categorise AKI as a binary column  0: aki = 0, 1: aki = 1/2/3
df['aki_'] = df['aki'].apply(
    lambda x:int(x == 1 or x == 2 or x == 3))
df['aki_'].value_counts()

1    34060
0    16860
Name: aki_, dtype: int64

In [10]:
# NAs detected, thus need further processing including feature selection
# calculate the proportion of NAs in each column
df_NA_prop = 1 - df.count() / len(df)

# extract two sets of indices from df
# whose correpsonding columns including NAs less than 10% and 50%, respectively
idx_NA10 = df_NA_prop[df_NA_prop <= 0.1].index
idx_NA50 = df_NA_prop[df_NA_prop <= 0.5].index
# idx_NA10

# extract two subsets of df based on previous indices
df_sub_NA10 = df[idx_NA10]
df_sub_NA50 = df[idx_NA50]
# df_sub_NA10.info()

#  drop rows with NAs
df_sub_com10 = df_sub_NA10.dropna()
df_sub_com50 = df_sub_NA50.dropna()
# df_sub_com10.info()

In [11]:
# df_sub_com10.info()
# df_sub_NA50.info()

In [12]:
# value count for each class in the outcome
df_sub_com10['aki_'].value_counts()

1    26484
0    12147
Name: aki_, dtype: int64

In [13]:
df_sub_com50['aki_'].value_counts()

1    10569
0     2945
Name: aki_, dtype: int64

In [14]:
# create outcome vector and predictor matrix
# drop id and race first
df_10 = df_sub_com10.drop(columns = ['id', 'aki', 'race', 'gender'])
df_50 = df_sub_com50.drop(columns = ['id', 'aki', 'race', 'gender'])

In [15]:
# split dataset
# first we extract X and y

# df_10
y_df_10 = df_10['aki_']
X_df_10 = df_10.drop(columns = ['aki_'])

# df_50
y_df_50 = df_50['aki_']
X_df_50 = df_50.drop(columns = ['aki_'])

In [16]:
# compute the mutual information of each factor with aki_binary
# case: df_10
MI_df_10 = pd.DataFrame(
    data = mutual_info_classif(X_df_10, y_df_10),
    index = X_df_10.columns,
    columns = ['MI'])

MI_df_10.sort_values(by = 'MI', ascending = False).head(15)

Unnamed: 0,MI
bun_min,0.026411
bun_max,0.024827
inr_max,0.020067
gcs_verbal,0.019179
glucose_mean,0.017967
resp_rate_mean,0.01777
admission_age,0.016637
gcs_motor,0.016544
pt_max,0.016528
sbp_min,0.01454


In [17]:
# factors with 0 MI with aki
MI_df_10[MI_df_10.MI == 0]

Unnamed: 0,MI
sbp_max,0.0
race_unknown,0.0


In [18]:
# drop columns irrelated to aki
df_10.drop(columns = MI_df_10[MI_df_10.MI == 0].index, inplace = True)

In [19]:
# case: df_50
MI_df_50 = pd.DataFrame(
    data = mutual_info_classif(X_df_50, y_df_50),
    index = X_df_50.columns,
    columns = ['MI'])

MI_df_50.sort_values(by = 'MI', ascending = False).head(15)

Unnamed: 0,MI
bun_max,0.024654
weight_admit,0.02212
bun_min,0.021474
mbp_min,0.015612
pt_max,0.014383
inr_max,0.011466
glucose_mean,0.01134
admission_age,0.011255
pt_min,0.011153
gcs_verbal,0.011098


In [20]:
# 0-MI factors
MI_df_50[MI_df_50.MI == 0]

Unnamed: 0,MI
dbp_max,0.0
temperature_mean,0.0
spo2_mean,0.0
lactate_min,0.0
pco2_min,0.0
hematocrit_max.1,0.0
platelets_min,0.0
calcium_max.1,0.0
chloride_min.1,0.0
chloride_max.1,0.0


In [21]:
df_50.drop(columns = MI_df_50[MI_df_50.MI == 0].index, inplace = True)

In [22]:
# standardization on X: (X - mean) / std
# Std = StandardScaler(copy = False)
# X_std_df_10 = pd.DataFrame(
    # data = Std.fit_transform(X_df_10),
    # columns = Std.get_feature_names_out(input_features = X_df_10.columns))

# X_std_df_50 = pd.DataFrame(
    # data = Std.fit_transform(X_df_50),
    # columns = Std.get_feature_names_out(input_features = X_df_50.columns))

In [23]:
# training - test split
X_10_train, X_10_test, y_10_train, y_10_test = train_test_split(
    X_df_10, y_df_10, test_size = 0.3, random_state = 42,
    stratify = y_df_10, shuffle = True)

X_50_train, X_50_test, y_50_train, y_50_test = train_test_split(
    X_df_50, y_df_50, test_size = 0.3, random_state = 42,
    stratify = y_df_50, shuffle = True)

## Logistic Regression

In [25]:
# logistic regression with CV
# grid for C
lr_param = {
    'model__C': np.linspace(0.005, 0.05, 10),
}

In [26]:
# GridSearch with CV for best C
stratifiedCV = StratifiedKFold(n_splits = 3)

# pipeline: standardization + logistic regression with pre-specified parameters
LR_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        class_weight = 'balanced', # handle imbalacned outcome
        solver = 'saga',
        penalty = 'l1',
        max_iter = 50000))
])

# GridSearch with CV
BestLR = GridSearchCV(
    LR_pipe,
    param_grid = lr_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)

In [27]:
# this required package sklearn-genetic
# genetic algorithm for feature selection
# very time-consuming: several hours are needed for logistic regression
# selector_LR = GeneticSelectionCV(
#     LR_pipe,
#     cv = stratifiedCV,
#     scoring = 'f1',
#     verbose = 1,
#     n_jobs = -1
# )
# selector_LR.fit(X_50_train, y_50_train)
# selection_LR.support_

In [28]:
# stepwise - forward selection
# runnig time issue again arises

# LR_pipe_C = Pipeline([
#     ('scaler', StandardScaler()),
#     ('model', LogisticRegression(
#         class_weight = 'balanced',
#         solver = 'saga',
#         penalty = 'l1',
#         C = 100,
#         max_iter = 50000
#     ))
# ])

# forward_selection = SequentialFeatureSelector(
#     LR_pipe_C,
#     n_features_to_select = 15,
#     direction = 'forward',
#     n_jobs = -1
# )

# forward_selection.fit(X_10_train, y_10_train)
# forward_selection.get_feature_names_out()

In [29]:
# case: df_10
BestLR.fit(X_10_train, y_10_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [30]:
# best C with the best f1-score
print(BestLR.best_params_)
print(BestLR.best_score_)

{'model__C': 0.045000000000000005}
0.7347461891928089


In [31]:
# predictions on testing data
y_10_BestLRpred = BestLR.predict(X_10_test)

# f1-score
print(f1_score(y_10_test, y_10_BestLRpred))

# confusion matrix
print(confusion_matrix(y_10_test, y_10_BestLRpred))

# detailed classification report
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestLRpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.7365474774333497
[[2562 1082]
 [2683 5263]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.49     0.83      0.68       0.66          0.72
recall        0.70     0.66      0.68       0.68          0.68
f1-score      0.58     0.74      0.68       0.66          0.69
support    3644.00  7946.00      0.68   11590.00      11590.00


In [32]:
# case: df_50
BestLR.fit(X_50_train, y_50_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [33]:
# best C with best f1-score
print(BestLR.best_params_)
print(BestLR.best_score_)

{'model__C': 0.04}
0.7561875542541747


In [34]:
# predictions on testing data
y_50_BestLRpred = BestLR.predict(X_50_test)

# f1-score
print(f1_score(y_50_test, y_50_BestLRpred))

# AUROC (for comparison)
print(roc_auc_score(y_50_test, y_50_BestLRpred))

# confusion matrix
print(confusion_matrix(y_50_test, y_50_BestLRpred))

# detailed classification report
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestLRpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.767491926803014
0.6879540405056571
[[ 620  264]
 [1032 2139]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.38     0.89      0.68       0.63          0.78
recall       0.70     0.67      0.68       0.69          0.68
f1-score     0.49     0.77      0.68       0.63          0.71
support    884.00  3171.00      0.68    4055.00       4055.00


# Tree-based Model

## Decision Tree

In [35]:
# Tree-base models
# decision tree
# grid for max_depth
dtc_param = {'model__max_depth': [2, 3, 4, 5, 6, 7]}

In [36]:
# standardization + single decision tree
Tree_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', DecisionTreeClassifier(
        criterion = 'entropy',  # split criteria
        class_weight = 'balanced' # handle imbalanced outcome
    ))
])

# GridSearch with CV
BestDTC = GridSearchCV(
    Tree_pipe,
    param_grid = dtc_param,
    scoring = 'f1',
    cv = stratifiedCV
)

In [37]:
# case: df_10
BestDTC.fit(X_10_train, y_10_train)

In [38]:
# best max_depth with best f1-score
print(BestDTC.best_params_)
print(BestDTC.best_score_)

{'model__max_depth': 2}
0.7589189330014632


In [39]:
# predictions on testing data
y_10_BestDTCpred = BestDTC.predict(X_10_test)

# f1-score
print(f1_score(y_10_test, y_10_BestDTCpred))

# confusion matrix
print(confusion_matrix(y_10_test, y_10_BestDTCpred))

# detailed classification report
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestDTCpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.7570914144923874
[[1753 1891]
 [1954 5992]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.47     0.76      0.67       0.62          0.67
recall        0.48     0.75      0.67       0.62          0.67
f1-score      0.48     0.76      0.67       0.62          0.67
support    3644.00  7946.00      0.67   11590.00      11590.00


In [40]:
# case: df_50
BestDTC.fit(X_50_train, y_50_train)

In [41]:
# best max_depth with best f1-score
print(BestDTC.best_params_)
print(BestDTC.best_score_)

{'model__max_depth': 2}
0.7544027250491357


In [43]:
# predictions on testing data
y_50_BestDTCpred = BestDTC.predict(X_50_test)

# f1-score
print(f1_score(y_50_test, y_50_BestDTCpred))

# confusion matrix
print(confusion_matrix(y_50_test, y_50_BestDTCpred))

# detailed classification report
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestDTCpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.7997336440819044
[[ 450  434]
 [ 769 2402]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.37     0.85       0.7       0.61          0.74
recall       0.51     0.76       0.7       0.63          0.70
f1-score     0.43     0.80       0.7       0.61          0.72
support    884.00  3171.00       0.7    4055.00       4055.00


## Random Forest

In [44]:
# random forest
rf_param = {
    'model__n_estimators': np.arange(start = 10, stop = 100, step = 10),
    'model__max_depth': np.arange(start = 2, stop = 10, step = 1)
}

In [45]:
RF_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(
        criterion = 'entropy',
        class_weight = 'balanced'
        ))
])

BestRF = GridSearchCV(
    RF_pipe,
    param_grid = rf_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)

In [46]:
BestRF.fit(X_10_train, y_10_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [47]:
print(BestRF.best_params_)
print(BestRF.best_score_)

{'model__max_depth': 9, 'model__n_estimators': 40}
0.7688253854408984


In [48]:
y_10_BestRFpred = BestRF.predict(X_10_test)

print(confusion_matrix(y_10_test, y_10_BestRFpred))
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestRFpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[2339 1305]
 [2314 5632]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.50     0.81      0.69       0.66          0.71
recall        0.64     0.71      0.69       0.68          0.69
f1-score      0.56     0.76      0.69       0.66          0.70
support    3644.00  7946.00      0.69   11590.00      11590.00


In [49]:
# case: df_50
BestRF.fit(X_50_train, y_50_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [50]:
print(BestRF.best_params_)
print(BestRF.best_score_)

{'model__max_depth': 9, 'model__n_estimators': 80}
0.8461728648991848


In [51]:
y_50_BestRFpred = BestRF.predict(X_50_test)

print(confusion_matrix(y_50_test, y_50_BestRFpred))
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestRFpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[ 429  455]
 [ 545 2626]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.44     0.85      0.75       0.65          0.76
recall       0.49     0.83      0.75       0.66          0.75
f1-score     0.46     0.84      0.75       0.65          0.76
support    884.00  3171.00      0.75    4055.00       4055.00


## AdaBoost

In [52]:
# adaboost
ada_param = {
    'model__n_estimators': [100, 200, 500],
    'model__learning_rate': np.arange(start = 0.05, stop = 0.25, step = 0.05)
}

In [53]:
Ada_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', AdaBoostClassifier())
])

BestAda = GridSearchCV(
    Ada_pipe,
    param_grid = ada_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)

In [54]:
# case: df_10
BestAda.fit(X_10_train, y_10_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [55]:
print(BestAda.best_params_)
print(BestAda.best_score_)

{'model__learning_rate': 0.1, 'model__n_estimators': 200}
0.8261643708840841


In [56]:
# feature importance
BestAda_10 = AdaBoostClassifier(n_estimators = 200, learning_rate = 0.1)
BestAda_10.fit(X_10_train, y_10_train)
FI_BestAda_10 = pd.DataFrame(
    data = BestAda_10.feature_importances_,
    index = X_10_train.columns,
    columns = ['FI']
)

In [57]:
FI_BestAda_10.sort_values(by = 'FI', ascending = False).head(15)

Unnamed: 0,FI
weight_admit,0.165
admission_age,0.095
sbp_min,0.075
gcs_verbal,0.065
bun_max,0.035
chloride_max.1,0.035
pt_max,0.035
glucose_max,0.035
gcs_eyes,0.035
inr_max,0.03


In [58]:
# unimportant features
FI_BestAda_10[FI_BestAda_10.FI == 0]

Unnamed: 0,FI
heart_rate_min,0.0
dbp_max,0.0
mbp_min,0.0
mbp_mean,0.0
resp_rate_min,0.0
temperature_min,0.0
temperature_mean,0.0
glucose_mean,0.0
hematocrit_min.1,0.0
hematocrit_max.1,0.0


In [59]:
y_10_BestAdapred = BestAda.predict(X_10_test)

print(confusion_matrix(y_10_test, y_10_BestAdapred))
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestAdapred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[1143 2501]
 [ 600 7346]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.66     0.75      0.73       0.70          0.72
recall        0.31     0.92      0.73       0.62          0.73
f1-score      0.42     0.83      0.73       0.63          0.70
support    3644.00  7946.00      0.73   11590.00      11590.00


In [60]:
# case: df_50
BestAda.fit(X_50_train, y_50_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [61]:
print(BestAda.best_params_)
print(BestAda.best_score_)

{'model__learning_rate': 0.2, 'model__n_estimators': 200}
0.8839393641720394


In [62]:
BestAda_50 = AdaBoostClassifier(n_estimators = 200, learning_rate = 0.2)
BestAda_50.fit(X_50_train, y_50_train)
FI_BestAda_50 = pd.DataFrame(
    data = BestAda_50.feature_importances_,
    index = X_50_train.columns,
    columns = ['FI']
)

In [63]:
FI_BestAda_50.sort_values(by = 'FI', ascending = False).head(15)

Unnamed: 0,FI
weight_admit,0.11
admission_age,0.05
po2_max,0.045
sbp_mean,0.04
platelets_min,0.03
ptt_max,0.03
potassium_max.1,0.03
sbp_min,0.03
sbp_max,0.025
glucose_min,0.025


In [64]:
FI_BestAda_50[FI_BestAda_50.FI == 0]

Unnamed: 0,FI
dbp_max,0.0
resp_rate_min,0.0
temperature_mean,0.0
spo2_max,0.0
glucose_mean,0.0
pco2_min,0.0
totalco2_min,0.0
hematocrit_min.1,0.0
hematocrit_max.1,0.0
platelets_max,0.0


In [65]:
y_50_BestAdapred = BestAda.predict(X_50_test)

print(confusion_matrix(y_50_test, y_50_BestAdapred))
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestAdapred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[ 152  732]
 [  80 3091]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.66     0.81       0.8       0.73          0.78
recall       0.17     0.97       0.8       0.57          0.80
f1-score     0.27     0.88       0.8       0.58          0.75
support    884.00  3171.00       0.8    4055.00       4055.00


## XGBoost

In [66]:
# xgboost
# grid for n_estimators, max_depth and learning rate
xg_param = {
    'model__n_estimators': [500, 1000, 1500],
    'model__max_depth': np.arange(start = 3, stop = 7, step = 1),
    'model__learning_rate': np.arange(start = 0.1, stop = 0.35, step = 0.05)
}

In [67]:
# standardization + xgboost classifier
XGBoost_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', XGBClassifier(
        importance_type = 'gain')) # criteria for feature importance
])

# GridSearch with CV
BestXGBoost = GridSearchCV(
    estimator = XGBoost_pipe,
    param_grid = xg_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)

In [92]:
# case: df_10

# xgboost classifier do not have 'class_weight' option to handle imbalance issue
# therefore, we compute a balanced sample weight and pass it to GridSearch
class_weight = compute_sample_weight(
    class_weight = 'balanced',
    y = y_10_train)

BestXGBoost.fit(X_10_train, y_10_train, **{'model__sample_weight': class_weight}) # self-generated class_weight passed

In [69]:
# best learning rate, max_depth and n_estimators, with best f1-score
print(BestXGBoost.best_params_)
print(BestXGBoost.best_score_)

{'model__learning_rate': 0.1, 'model__max_depth': 6, 'model__n_estimators': 1500}
0.8049383618356646


In [93]:
# we want to have a look at the feature importance
# since GridSearch.fit does not have attributes for feature importance
# we refit a xgboost classifier with optimal hyperparameters
BestXGBoost_10 = XGBClassifier(
    learning_rate = 0.1,
    max_depth = 6,
    n_estimators = 1500,
    importance_type = 'gain')

# refit on training data
BestXGBoost_10.fit(X_10_train, y_10_train, sample_weight = class_weight)

# extract feature importance from xgboost.fit
FI_BestXGBoost_10 = pd.DataFrame(
    data = BestXGBoost_10.feature_importances_,
    index = X_10_train.columns,
    columns = ['FI']
)

In [94]:
# rank fi and show the first 15 rows
FI_BestXGBoost_10.sort_values(by = 'FI', ascending = False).head(15)

Unnamed: 0,FI
gcs_unable,0.157892
gcs_verbal,0.064279
bun_max,0.028702
inr_max,0.027888
weight_admit,0.021876
bun_min,0.021467
gcs_eyes,0.018876
sbp_min,0.017702
admission_age,0.015902
spo2_min,0.015053


In [95]:
# columns with no importance
FI_BestXGBoost_10[FI_BestXGBoost_10.FI == 0]

Unnamed: 0,FI


In [73]:
# predictions on testing data
y_10_BestXGBpred = BestXGBoost.predict(X_10_test)

# f1-score
print(f1_score(y_10_test, y_10_BestXGBpred))

# confusion matrix
print(confusion_matrix(y_10_test, y_10_BestXGBpred))

# detailed classification report
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestXGBpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.8019991361757265
[[1882 1762]
 [1447 6499]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.57     0.79      0.72       0.68          0.72
recall        0.52     0.82      0.72       0.67          0.72
f1-score      0.54     0.80      0.72       0.67          0.72
support    3644.00  7946.00      0.72   11590.00      11590.00


In [None]:
# this chunk requires package sklearn-genetic
# genetic algorithm for feature selection
# very time-consuming: several hours needed
#
# selector_XGBoost = GeneticSelectionCV(
#     XGBoost_pipe,
#     cv = stratifiedCV,
#     scoring = 'f1',
#     verbose = 1,
#     n_jobs = -1
# )
#
# selector_XGBoost.fit(X_50_train, y_50_train, **{'model__sample_weight': class_weight})
# selection_XGBoost.support_

In [74]:
# case: df_50
# again, compute the sample weight manually
class_weight = compute_sample_weight(
    class_weight = 'balanced',
    y = y_50_train)

BestXGBoost.fit(X_50_train, y_50_train, **{'model__sample_weight': class_weight})

Fitting 3 folds for each of 60 candidates, totalling 180 fits


In [75]:
# best learning rate, max_depth and n_estimators with best f1-score
print(BestXGBoost.best_params_)
print(BestXGBoost.best_score_)

{'model__learning_rate': 0.25000000000000006, 'model__max_depth': 6, 'model__n_estimators': 1500}
0.8703593095484295


In [88]:
# similarly, refit model with optimal hyperparamter for feature importance
BestXGBoost_50 = XGBClassifier(
    learning_rate = 0.25,
    max_depth = 6,
    n_estimators = 1500,
    importance_type = 'gain')

BestXGBoost_50.fit(X_50_train, y_50_train, sample_weight = class_weight)

# extract feature importance
FI_BestXGBoost_50 = pd.DataFrame(
    data = BestXGBoost_50.feature_importances_,
    index = X_50_train.columns,
    columns = ['FI']
)

In [89]:
# rank and display the first 15 features
FI_BestXGBoost_50.sort_values(by = 'FI', ascending = False).head(15)

Unnamed: 0,FI
bun_min,0.048386
gcs_verbal,0.025519
weight_admit,0.024094
bun_max,0.021914
race_hispanic/latino,0.021826
race_black,0.021334
gcs_eyes,0.020613
inr_max,0.019969
po2_max,0.017662
sbp_min,0.017342


In [90]:
# features with no importance
FI_BestXGBoost_50[FI_BestXGBoost_50.FI == 0]

Unnamed: 0,FI


In [79]:
# predictions on testing data
y_50_BestXGBpred = BestXGBoost.predict(X_50_test)

# f1-score
print(f1_score(y_50_test, y_50_BestXGBpred))

# AUROC
# print(roc_auc_score(y_50_test, y_50_BestXGBpred))

# confusion matrix
print(confusion_matrix(y_50_test, y_50_BestXGBpred))

# detailed classificaition report
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestXGBpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.8712041884816756
[[ 282  602]
 [ 259 2912]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.52     0.83      0.79       0.67          0.76
recall       0.32     0.92      0.79       0.62          0.79
f1-score     0.40     0.87      0.79       0.63          0.77
support    884.00  3171.00      0.79    4055.00       4055.00


# SVM

In [80]:
# SVM
# grid for C (and kernel)
svm_param = {
    'model__C': [1, 10, 100] # np.arange(start = 1, stop = 1, step = 0.1),
    # 'kernel': ['linear', 'rbf']
    }

In [81]:
# standardization + SVC with rbf kernel
SVC_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVC(class_weight = 'balanced', kernel = 'rbf'))
])

# GridSearch + CV
BestSVC = GridSearchCV(
    SVC_pipe,
    param_grid = svm_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)

In [82]:
BestSVC.fit(X_10_train, y_10_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [83]:
# best C with best f1-score
print(BestSVC.best_params_)
print(BestSVC.best_score_)

{'model__C': 10}
0.7573708483689301


In [84]:
# predictions on testing data
y_10_BestSVCpred = BestSVC.predict(X_10_test)

# f1-score
print(f1_score(y_10_test, y_10_BestSVCpred))

# confusion matrix
print(confusion_matrix(y_10_test, y_10_BestSVCpred))

# detailed classification report
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestSVCpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.7556032747282245
[[2318 1326]
 [2316 5630]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.50     0.81      0.69       0.65          0.71
recall        0.64     0.71      0.69       0.67          0.69
f1-score      0.56     0.76      0.69       0.66          0.69
support    3644.00  7946.00      0.69   11590.00      11590.00


In [85]:
# case: df_50
BestSVC.fit(X_50_train, y_50_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [86]:
# best C with best f1-score
print(BestSVC.best_params_)
print(BestSVC.best_score_)

{'model__C': 100}
0.8329237565957466


In [87]:
# predictions on testing data
y_50_BestSVCpred = BestSVC.predict(X_50_test)

# f1-score
print(f1_score(y_50_test, y_50_BestSVCpred))

# confusion matrix
print(confusion_matrix(y_50_test, y_50_BestSVCpred))

# detailed classification report
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestSVCpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.8307548054383499
[[ 314  570]
 [ 513 2658]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.38     0.82      0.73        0.6          0.73
recall       0.36     0.84      0.73        0.6          0.73
f1-score     0.37     0.83      0.73        0.6          0.73
support    884.00  3171.00      0.73     4055.0       4055.00
