<a href="https://colab.research.google.com/github/Shi-Yile/SPH6004-Assignment-1/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install sklearn-genetic

In [2]:
# Import packages

# Data-processing packages
import pandas as pd
import numpy as np

# Plotting packages
import matplotlib.pyplot as plt
import seaborn as sn

# ML packages
import sklearn

from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import mutual_info_classif

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, StratifiedKFold, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.metrics import classification_report, confusion_matrix, f1_score

# feature selection algorithms (time-consuming)
# from sklearn.feature_selection import SequentialFeatureSelector
# from sklearn-genetic import GeneticSearchCV

# working directory settings
import os
from google.colab import drive

In [3]:
# give permission to load data from google drive
drive.mount('/content/drive')
os.chdir('drive/My Drive/SPH6004/Assignment-1')

Mounted at /content/drive


In [4]:
# load dataset
df = pd.read_csv('data.csv')
df_origin = df.copy()
df.head()

Unnamed: 0,id,aki,gender,admission_age,race,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,...,ggt_max,ld_ldh_min,ld_ldh_max,gcs_min,gcs_motor,gcs_verbal,gcs_eyes,gcs_unable,height,weight_admit
0,36570066,3,F,79.953141,BLACK/AFRICAN AMERICAN,96.0,104.0,100.083333,103.0,126.0,...,,236.0,318.0,15.0,6.0,5.0,4.0,0.0,157.0,110.0
1,39307659,0,F,78.194169,WHITE - RUSSIAN,72.0,134.0,97.263158,97.0,127.0,...,,,,15.0,6.0,5.0,4.0,0.0,,82.0
2,38743306,2,F,65.602396,WHITE,60.0,97.0,84.166667,95.0,143.0,...,,,,15.0,6.0,5.0,4.0,0.0,,62.1
3,32339865,2,F,64.906629,UNKNOWN,59.0,87.0,71.461538,113.0,150.0,...,,,,15.0,1.0,0.0,1.0,1.0,170.0,113.1
4,35526987,2,M,57.438861,WHITE,57.0,100.0,82.387097,81.0,127.0,...,,,,15.0,,0.0,1.0,1.0,178.0,97.4


In [5]:
# basic infomration of each column in df
df.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50920 entries, 0 to 50919
Data columns (total 162 columns):
 #    Column                  Non-Null Count  Dtype  
---   ------                  --------------  -----  
 0    id                      50920 non-null  int64  
 1    aki                     50920 non-null  int64  
 2    gender                  50920 non-null  object 
 3    admission_age           50920 non-null  float64
 4    race                    50920 non-null  object 
 5    heart_rate_min          50841 non-null  float64
 6    heart_rate_max          50841 non-null  float64
 7    heart_rate_mean         50841 non-null  float64
 8    sbp_min                 50823 non-null  float64
 9    sbp_max                 50823 non-null  float64
 10   sbp_mean                50823 non-null  float64
 11   dbp_min                 50823 non-null  float64
 12   dbp_max                 50823 non-null  float64
 13   dbp_mean                50823 non-null  float64
 14   mbp_min             

  df.info(verbose = True, null_counts = True)


In [6]:
# frequencies in 'race'
df['race'].value_counts()

WHITE                                        32637
UNKNOWN                                       5579
BLACK/AFRICAN AMERICAN                        3845
OTHER                                         1745
WHITE - OTHER EUROPEAN                         918
UNABLE TO OBTAIN                               726
ASIAN                                          614
ASIAN - CHINESE                                547
HISPANIC/LATINO - PUERTO RICAN                 530
HISPANIC OR LATINO                             501
WHITE - RUSSIAN                                430
PATIENT DECLINED TO ANSWER                     371
HISPANIC/LATINO - DOMINICAN                    337
BLACK/CAPE VERDEAN                             319
BLACK/CARIBBEAN ISLAND                         282
BLACK/AFRICAN                                  194
ASIAN - SOUTH EAST ASIAN                       168
PORTUGUESE                                     161
ASIAN - ASIAN INDIAN                           121
WHITE - EASTERN EUROPEAN       

In [7]:
# re-catergorize 'race' with fewer levels, and create dummy variables
# white, black, asian, hispanic/latino, other, unknown
# df['white'] = df['race'].apply(lambda x: 1 if 'WHITE' in x else 0)
df['race_white'] = df['race'].apply(lambda x: int('WHITE' in x))
df['race_black'] = df['race'].apply(lambda x: int('BLACK' in x))
df['race_asian'] = df['race'].apply(lambda x: int('ASIAN' in x))
df['race_hispanic/latino'] = df['race'].apply(
    lambda x: int('HISPANIC' in x or 'LATINO' in x))
df['race_unknown'] = df['race'].apply(lambda x: int(
    x == 'UNKNOWN' or x == 'UNABLE TO OBTAIN' or
    x == 'PATIENT DECLINED TO ANSWER'))
df['race_other'] = df['race'].apply(lambda x: int(
    x == 'OTHER' or x == 'PORTUGUESE' or
    x == 'AMERICAN INDIAN/ALASKA NATIVE' or
    x == 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER' or
    x == 'MULTIPLE RACE/ETHNICITY' or x == 'SOUTH AMERICAN'))

# frequency in each level
print('WHITE: ', df['race_white'].sum())
print('BLACK: ', df['race_black'].sum())
print('ASIAN: ', df['race_asian'].sum())
print('HISPANIC/LATINO: ', df['race_hispanic/latino'].sum())
print('OTHER: ', df['race_other'].sum())
print('UNKNOWN: ', df['race_unknown'].sum())

WHITE:  34204
BLACK:  4640
ASIAN:  1496
HISPANIC/LATINO:  1735
OTHER:  2169
UNKNOWN:  6676


In [8]:
# recode gender as 0-1  0: male, 1: female
df['gender_'] = df['gender'].apply(lambda x:int(x == 'F'))
df['gender_'].value_counts()

0    28440
1    22480
Name: gender_, dtype: int64

In [9]:
# re-categorise AKI as a binary column  0: aki = 0, 1: aki = 1/2/3
df['aki_'] = df['aki'].apply(
    lambda x:int(x == 1 or x == 2 or x == 3))
df['aki_'].value_counts()

1    34060
0    16860
Name: aki_, dtype: int64

In [10]:
# NA detected, thus need further processing including feature selection
# calculate the proportion of NAs in each column
df_NA_prop = 1 - df.count() / len(df)

# extract two sets of indices from df
# whose correpsonding columns including NAs less than 10% and 50%, respectively
idx_NA10 = df_NA_prop[df_NA_prop <= 0.1].index
idx_NA50 = df_NA_prop[df_NA_prop <= 0.5].index
# idx_NA10

# extract two subsets of df based on previous indices
df_sub_NA10 = df[idx_NA10]
df_sub_NA50 = df[idx_NA50]
# df_sub_NA10.info()

#  drop rows with NAs
df_sub_com10 = df_sub_NA10.dropna()
df_sub_com50 = df_sub_NA50.dropna()
# df_sub_com10.info()

In [11]:
# df_sub_com10.info()
# df_sub_NA50.info()

In [12]:
df_sub_com10['aki_'].value_counts()

1    26484
0    12147
Name: aki_, dtype: int64

In [13]:
df_sub_com50['aki_'].value_counts()

1    10569
0     2945
Name: aki_, dtype: int64

In [14]:
# create outcome vector and predictor matrix
# drop id and race first
df_10 = df_sub_com10.drop(columns = ['id', 'aki', 'race', 'gender'])
df_50 = df_sub_com50.drop(columns = ['id', 'aki', 'race', 'gender'])

In [15]:
# split dataset
# first we extract X and y

# df_10
y_df_10 = df_10['aki_']
X_df_10 = df_10.drop(columns = ['aki_'])

# df_50
y_df_50 = df_50['aki_']
X_df_50 = df_50.drop(columns = ['aki_'])

In [16]:
# compute the mutual information of each factor with aki_binary
# case: df_10
MI_df_10 = pd.DataFrame(
    data = mutual_info_classif(X_df_10, y_df_10),
    index = X_df_10.columns,
    columns = ['MI'])

MI_df_10.sort_values(by = 'MI', ascending = False).head(15)

Unnamed: 0,MI
bun_max,0.028571
bun_min,0.024963
pt_max,0.022819
resp_rate_mean,0.019586
glucose_mean,0.018459
gcs_verbal,0.01772
sbp_min,0.017033
admission_age,0.016678
dbp_mean,0.016451
dbp_min,0.015894


In [17]:
# factors with 0 MI with aki
MI_df_10[MI_df_10.MI == 0]

Unnamed: 0,MI
hemoglobin_max.1,0.0
chloride_min.1,0.0
race_black,0.0
race_asian,0.0
race_hispanic/latino,0.0
race_unknown,0.0
race_other,0.0


In [18]:
# drop columns irrelated to aki
df_10.drop(columns = MI_df_10[MI_df_10.MI == 0].index, inplace = True)

In [19]:
# case: df_50
MI_df_50 = pd.DataFrame(
    data = mutual_info_classif(X_df_50, y_df_50),
    index = X_df_50.columns,
    columns = ['MI'])

MI_df_50.sort_values(by = 'MI', ascending = False).head(15)

Unnamed: 0,MI
weight_admit,0.025629
bun_max,0.023822
bun_min,0.023588
mbp_min,0.02066
glucose_mean,0.013487
pt_min,0.012257
dbp_min,0.011885
sbp_min,0.01183
inr_max,0.011289
admission_age,0.011208


In [20]:
# 0-MI factors
MI_df_50[MI_df_50.MI == 0]

Unnamed: 0,MI
dbp_max,0.0
resp_rate_min,0.0
temperature_mean,0.0
spo2_max,0.0
lactate_min,0.0
pco2_min,0.0
totalco2_min,0.0
platelets_min,0.0
wbc_min,0.0
wbc_max,0.0


In [21]:
df_50.drop(columns = MI_df_50[MI_df_50.MI == 0].index, inplace = True)

In [22]:
# standardization on X: (X - mean) / std
# Std = StandardScaler(copy = False)
# X_std_df_10 = pd.DataFrame(
    # data = Std.fit_transform(X_df_10),
    # columns = Std.get_feature_names_out(input_features = X_df_10.columns))

# X_std_df_50 = pd.DataFrame(
    # data = Std.fit_transform(X_df_50),
    # columns = Std.get_feature_names_out(input_features = X_df_50.columns))

In [23]:
# training - test split
X_10_train, X_10_test, y_10_train, y_10_test = train_test_split(
    X_df_10, y_df_10, test_size = 0.3, random_state = 42,
    stratify = y_df_10, shuffle = True)

X_50_train, X_50_test, y_50_train, y_50_test = train_test_split(
    X_df_50, y_df_50, test_size = 0.3, random_state = 42,
    stratify = y_df_50, shuffle = True)

## Logistic Regression

In [24]:
# logistic regression with CV
lr_param = {
    'model__C': np.linspace(0.005, 0.05, 10),
}

In [25]:
# GridSearch with CV for best C
stratifiedCV = StratifiedKFold(n_splits = 3)
LR_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        class_weight = 'balanced',
        solver = 'saga',
        penalty = 'l1',
        max_iter = 50000))
])


BestLR = GridSearchCV(
    LR_pipe,
    param_grid = lr_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)

In [26]:
# this required package sklearn-genetic
# genetic algorithm for feature selection
# very time-consuming: several hours are needed for logistic regression
# selector_LR = GeneticSelectionCV(
#     LR_pipe,
#     cv = stratifiedCV,
#     scoring = 'f1',
#     verbose = 1,
#     n_jobs = -1
# )

In [27]:
# stepwise - forward selection
# LR_pipe_C = Pipeline([
#     ('scaler', StandardScaler()),
#     ('model', LogisticRegression(
#         class_weight = 'balanced',
#         solver = 'saga',
#         penalty = 'l1',
#         C = 100,
#         max_iter = 50000
#     ))
# ])

# forward_selection = SequentialFeatureSelector(
#     LR_pipe_C,
#     n_features_to_select = 15,
#     direction = 'forward',
#     n_jobs = -1
# )

In [28]:
# forward_selection.fit(X_10_train, y_10_train)

In [29]:
# case: df_10
BestLR.fit(X_10_train, y_10_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [30]:
print(BestLR.best_params_)
print(BestLR.best_score_)

{'model__C': 0.045000000000000005}
0.7347461891928089


In [31]:
y_10_BestLR_pred = BestLR.predict(X_10_test)

print(confusion_matrix(y_10_test, y_10_BestLR_pred))
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestLR_pred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[2562 1082]
 [2683 5263]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.49     0.83      0.68       0.66          0.72
recall        0.70     0.66      0.68       0.68          0.68
f1-score      0.58     0.74      0.68       0.66          0.69
support    3644.00  7946.00      0.68   11590.00      11590.00


In [32]:
# case: df_50
BestLR.fit(X_50_train, y_50_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [33]:
print(BestLR.best_params_)
print(BestLR.best_score_)

{'model__C': 0.04}
0.7561875542541747


In [94]:
y_50_BestLRpred = BestLR.predict(X_50_test)

print(f1_score(y_50_test, y_50_BestLRpred))

print(confusion_matrix(y_50_test, y_50_BestLRpred))
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestLRpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.767491926803014
[[ 620  264]
 [1032 2139]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.38     0.89      0.68       0.63          0.78
recall       0.70     0.67      0.68       0.69          0.68
f1-score     0.49     0.77      0.68       0.63          0.71
support    884.00  3171.00      0.68    4055.00       4055.00


# Tree-based Model

## Decision Tree

In [35]:
# Tree-base models
# decision tree
dtc_param = {'model__max_depth': [2, 3, 4, 5, 6, 7]}

In [36]:
Tree_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', DecisionTreeClassifier(
        criterion = 'entropy',
        class_weight = 'balanced'
    ))
])


BestDTC = GridSearchCV(
    Tree_pipe,
    param_grid = dtc_param,
    scoring = 'f1',
    cv = stratifiedCV
)

In [37]:
# case: df_10
BestDTC.fit(X_10_train, y_10_train)

In [38]:
print(BestDTC.best_params_)
print(BestDTC.best_score_)

{'model__max_depth': 2}
0.7589189330014632


In [39]:
y_10_BestDTCpred = BestDTC.predict(X_10_test)

print(confusion_matrix(y_10_test, y_10_BestDTCpred))
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestDTCpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[1753 1891]
 [1954 5992]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.47     0.76      0.67       0.62          0.67
recall        0.48     0.75      0.67       0.62          0.67
f1-score      0.48     0.76      0.67       0.62          0.67
support    3644.00  7946.00      0.67   11590.00      11590.00


In [40]:
# case: df_50
BestDTC.fit(X_50_train, y_50_train)

In [41]:
print(BestDTC.best_params_)
print(BestDTC.best_score_)

{'model__max_depth': 2}
0.7544027250491357


In [91]:
BestDTC.get_params()

{'cv': StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('scaler', StandardScaler()),
  ('model',
   DecisionTreeClassifier(class_weight='balanced', criterion='entropy'))],
 'estimator__verbose': False,
 'estimator__scaler': StandardScaler(),
 'estimator__model': DecisionTreeClassifier(class_weight='balanced', criterion='entropy'),
 'estimator__scaler__copy': True,
 'estimator__scaler__with_mean': True,
 'estimator__scaler__with_std': True,
 'estimator__model__ccp_alpha': 0.0,
 'estimator__model__class_weight': 'balanced',
 'estimator__model__criterion': 'entropy',
 'estimator__model__max_depth': None,
 'estimator__model__max_features': None,
 'estimator__model__max_leaf_nodes': None,
 'estimator__model__min_impurity_decrease': 0.0,
 'estimator__model__min_samples_leaf': 1,
 'estimator__model__min_samples_split': 2,
 'estimator__model__min_weight_fraction_leaf': 0.0,
 'estimator__model__random_state': 

In [89]:
y_50_BestDTCpred = BestDTC.predict(X_50_test)

print(f1_score(y_50_test, y_50_BestDTCpred))

print(confusion_matrix(y_50_test, y_50_BestDTCpred))
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestDTCpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.7997336440819044
[[ 450  434]
 [ 769 2402]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.37     0.85       0.7       0.61          0.74
recall       0.51     0.76       0.7       0.63          0.70
f1-score     0.43     0.80       0.7       0.61          0.72
support    884.00  3171.00       0.7    4055.00       4055.00


## Random Forest

In [43]:
# random forest
rf_param = {
    'model__n_estimators': np.arange(start = 10, stop = 100, step = 10),
    'model__max_depth': np.arange(start = 2, stop = 10, step = 1)
}

In [44]:
RF_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(
        criterion = 'entropy',
        class_weight = 'balanced'
        ))
])

BestRF = GridSearchCV(
    RF_pipe,
    param_grid = rf_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)

In [45]:
BestRF.fit(X_10_train, y_10_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [46]:
print(BestRF.best_params_)
print(BestRF.best_score_)

{'model__max_depth': 9, 'model__n_estimators': 90}
0.7688202316502513


In [47]:
y_10_BestRFpred = BestRF.predict(X_10_test)

print(confusion_matrix(y_10_test, y_10_BestRFpred))
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestRFpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[2320 1324]
 [2246 5700]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.51     0.81      0.69       0.66          0.72
recall        0.64     0.72      0.69       0.68          0.69
f1-score      0.57     0.76      0.69       0.66          0.70
support    3644.00  7946.00      0.69   11590.00      11590.00


In [48]:
# case: df_50
BestRF.fit(X_50_train, y_50_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [49]:
print(BestRF.best_params_)
print(BestRF.best_score_)

{'model__max_depth': 9, 'model__n_estimators': 90}
0.8474058289347299


In [50]:
y_50_BestRFpred = BestRF.predict(X_50_test)

print(confusion_matrix(y_50_test, y_50_BestRFpred))
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestRFpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[ 445  439]
 [ 550 2621]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.45     0.86      0.76       0.65          0.77
recall       0.50     0.83      0.76       0.66          0.76
f1-score     0.47     0.84      0.76       0.66          0.76
support    884.00  3171.00      0.76    4055.00       4055.00


## AdaBoost

In [51]:
# adaboost
ada_param = {
    'model__n_estimators': [100, 200, 500],
    'model__learning_rate': np.arange(start = 0.05, stop = 0.25, step = 0.05)
}

In [52]:
Ada_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', AdaBoostClassifier())
])

BestAda = GridSearchCV(
    Ada_pipe,
    param_grid = ada_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)

In [53]:
# case: df_10
BestAda.fit(X_10_train, y_10_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [54]:
print(BestAda.best_params_)
print(BestAda.best_score_)

{'model__learning_rate': 0.1, 'model__n_estimators': 200}
0.8261643708840841


In [55]:
# feature importance
BestAda_10 = AdaBoostClassifier(n_estimators = 200, learning_rate = 0.1)
BestAda_10.fit(X_10_train, y_10_train)
FI_BestAda_10 = pd.DataFrame(
    data = BestAda_10.feature_importances_,
    index = X_10_train.columns,
    columns = ['FI']
)

In [56]:
FI_BestAda_10.sort_values(by = 'FI', ascending = False).head(15)

Unnamed: 0,FI
weight_admit,0.165
admission_age,0.095
sbp_min,0.075
gcs_verbal,0.065
bun_max,0.035
chloride_max.1,0.035
pt_max,0.035
glucose_max,0.035
gcs_eyes,0.035
inr_max,0.03


In [57]:
# unimportant features
FI_BestAda_10[FI_BestAda_10.FI == 0]

Unnamed: 0,FI
heart_rate_min,0.0
dbp_max,0.0
mbp_min,0.0
mbp_mean,0.0
resp_rate_min,0.0
temperature_min,0.0
temperature_mean,0.0
glucose_mean,0.0
hematocrit_min.1,0.0
hematocrit_max.1,0.0


In [58]:
y_10_BestAdapred = BestAda.predict(X_10_test)

print(confusion_matrix(y_10_test, y_10_BestAdapred))
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestAdapred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[1143 2501]
 [ 600 7346]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.66     0.75      0.73       0.70          0.72
recall        0.31     0.92      0.73       0.62          0.73
f1-score      0.42     0.83      0.73       0.63          0.70
support    3644.00  7946.00      0.73   11590.00      11590.00


In [59]:
# case: df_50
BestAda.fit(X_50_train, y_50_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [60]:
print(BestAda.best_params_)
print(BestAda.best_score_)

{'model__learning_rate': 0.2, 'model__n_estimators': 200}
0.8839393641720394


In [61]:
BestAda_50 = AdaBoostClassifier(n_estimators = 200, learning_rate = 0.2)
BestAda_50.fit(X_50_train, y_50_train)
FI_BestAda_50 = pd.DataFrame(
    data = BestAda_50.feature_importances_,
    index = X_50_train.columns,
    columns = ['FI']
)

In [62]:
FI_BestAda_50.sort_values(by = 'FI', ascending = False).head(15)

Unnamed: 0,FI
weight_admit,0.11
admission_age,0.05
po2_max,0.045
sbp_mean,0.04
platelets_min,0.03
ptt_max,0.03
potassium_max.1,0.03
sbp_min,0.03
sbp_max,0.025
glucose_min,0.025


In [63]:
FI_BestAda_50[FI_BestAda_50.FI == 0]

Unnamed: 0,FI
dbp_max,0.0
resp_rate_min,0.0
temperature_mean,0.0
spo2_max,0.0
glucose_mean,0.0
pco2_min,0.0
totalco2_min,0.0
hematocrit_min.1,0.0
hematocrit_max.1,0.0
platelets_max,0.0


In [64]:
y_50_BestAdapred = BestAda.predict(X_50_test)

print(confusion_matrix(y_50_test, y_50_BestAdapred))
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestAdapred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[ 152  732]
 [  80 3091]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.66     0.81       0.8       0.73          0.78
recall       0.17     0.97       0.8       0.57          0.80
f1-score     0.27     0.88       0.8       0.58          0.75
support    884.00  3171.00       0.8    4055.00       4055.00


## XGBoost

In [65]:
# xgboost
xg_param = {
    'model__n_estimators': [100, 200, 500],
    'model__max_depth': np.arange(start = 2, stop = 5, step = 1),
    'model__learning_rate': np.arange(start = 0.01, stop = 0.1, step = 0.01)
}

In [66]:
XGBoost_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', XGBClassifier(importance_type = 'gain'))
])

BestXGBoost = GridSearchCV(
    estimator = XGBoost_pipe,
    param_grid = xg_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)

In [67]:
# case: df_10
BestXGBoost.fit(X_10_train, y_10_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [68]:
print(BestXGBoost.best_params_)
print(BestXGBoost.best_score_)

{'model__learning_rate': 0.02, 'model__max_depth': 4, 'model__n_estimators': 200}
0.8272823655868796


In [69]:
BestXGBoost_10 = XGBClassifier(learning_rate = 0.02, max_depth = 4,
                               n_estimators = 200, importance_type = 'gain')
BestXGBoost_10.fit(X_10_train, y_10_train)
FI_BestXGBoost_10 = pd.DataFrame(
    data = BestXGBoost_10.feature_importances_,
    index = X_10_train.columns,
    columns = ['FI']
)

In [70]:
FI_BestXGBoost_10.sort_values(by = 'FI', ascending = False).head(15)

Unnamed: 0,FI
bun_max,0.095608
inr_max,0.076064
bun_min,0.061545
gcs_verbal,0.056649
gcs_unable,0.042725
glucose_max,0.031548
potassium_max.1,0.028598
admission_age,0.026312
weight_admit,0.025985
spo2_min,0.025952


In [71]:
FI_BestXGBoost_10[FI_BestXGBoost_10.FI == 0]

Unnamed: 0,FI
race_black,0.0
race_asian,0.0
race_other,0.0


In [72]:
y_10_BestXGBpred = BestXGBoost.predict(X_10_test)

print(confusion_matrix(y_10_test, y_10_BestXGBpred))
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestXGBpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[1181 2463]
 [ 602 7344]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.66     0.75      0.74       0.71          0.72
recall        0.32     0.92      0.74       0.62          0.74
f1-score      0.44     0.83      0.74       0.63          0.70
support    3644.00  7946.00      0.74   11590.00      11590.00


In [73]:
# case: df_50
BestXGBoost.fit(X_50_train, y_50_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [74]:
print(BestXGBoost.best_params_)
print(BestXGBoost.best_score_)

{'model__learning_rate': 0.060000000000000005, 'model__max_depth': 2, 'model__n_estimators': 200}
0.8839386288153473


In [75]:
BestXGBoost_50 = XGBClassifier(learning_rate = 0.06, max_depth = 2,
                               n_estimators = 200, importance_type = 'gain')
BestXGBoost_50.fit(X_50_train, y_50_train)
FI_BestXGBoost_50 = pd.DataFrame(
    data = BestXGBoost_50.feature_importances_,
    index = X_50_train.columns,
    columns = ['FI']
)

In [76]:
FI_BestXGBoost_50.sort_values(by = 'FI', ascending = False).head(15)

Unnamed: 0,FI
bun_min,0.096028
bun_max,0.075057
gcs_verbal,0.041731
ptt_max,0.030089
weight_admit,0.028388
gcs_eyes,0.023941
admission_age,0.021603
sbp_min,0.021516
pt_max,0.019538
abs_basophils_min,0.019293


In [77]:
FI_BestXGBoost_50[FI_BestXGBoost_50.FI == 0]

Unnamed: 0,FI
pco2_min,0.0
totalco2_min,0.0
platelets_max,0.0
wbc_min,0.0
bicarbonate_min.1,0.0
bicarbonate_max.1,0.0
sodium_max.1,0.0
potassium_min.1,0.0
abs_eosinophils_min,0.0
abs_eosinophils_max,0.0


In [88]:
y_50_BestXGBpred = BestXGBoost.predict(X_50_test)

print(f1_score(y_50_test, y_50_BestXGBpred))

print(confusion_matrix(y_50_test, y_50_BestXGBpred))
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestXGBpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.8841446057500713
[[ 135  749]
 [  65 3106]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.68     0.81       0.8       0.74          0.78
recall       0.15     0.98       0.8       0.57          0.80
f1-score     0.25     0.88       0.8       0.57          0.75
support    884.00  3171.00       0.8    4055.00       4055.00


# SVM

In [80]:
# SVM
svm_param = {
    'model__C': [1, 10, 100] # np.arange(start = 1, stop = 1, step = 0.1),
    # 'kernel': ['linear', 'rbf']
    }

In [81]:
SVC_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVC(class_weight = 'balanced', kernel = 'rbf'))
])

BestSVC = GridSearchCV(
    SVC_pipe,
    param_grid = svm_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)

In [82]:
BestSVC.fit(X_10_train, y_10_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [83]:
print(BestSVC.best_params_)
print(BestSVC.best_score_)

{'model__C': 10}
0.7573708483689301


In [84]:
y_10_BestSVCpred = BestSVC.predict(X_10_test)

print(confusion_matrix(y_10_test, y_10_BestSVCpred))
print(pd.DataFrame(classification_report(
    y_10_test, y_10_BestSVCpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[2318 1326]
 [2316 5630]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.50     0.81      0.69       0.65          0.71
recall        0.64     0.71      0.69       0.67          0.69
f1-score      0.56     0.76      0.69       0.66          0.69
support    3644.00  7946.00      0.69   11590.00      11590.00


In [85]:
BestSVC.fit(X_50_train, y_50_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [86]:
print(BestSVC.best_params_)
print(BestSVC.best_score_)

{'model__C': 100}
0.8329237565957466


In [92]:
y_50_BestSVCpred = BestSVC.predict(X_50_test)

print(f1_score(y_50_test, y_50_BestSVCpred))

print(confusion_matrix(y_50_test, y_50_BestSVCpred))
print(pd.DataFrame(classification_report(
    y_50_test, y_50_BestSVCpred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

0.8307548054383499
[[ 314  570]
 [ 513 2658]]
           no aki      aki  accuracy  macro avg  weighted avg
precision    0.38     0.82      0.73        0.6          0.73
recall       0.36     0.84      0.73        0.6          0.73
f1-score     0.37     0.83      0.73        0.6          0.73
support    884.00  3171.00      0.73     4055.0       4055.00
