<a href="https://colab.research.google.com/github/Shi-Yile/SPH6004-Assignment-1/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import packages

# Data-processing packages
import pandas as pd
import numpy as np

# Plotting packages
import matplotlib.pyplot as plt
import seaborn as sn

# ML packages
import sklearn

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, StratifiedKFold, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix, recall_score, accuracy_score


import os
from google.colab import drive

In [3]:
# give permission to load data from google drive
drive.mount('/content/drive')
os.chdir('drive/My Drive/SPH6004/Assignment-1')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# load dataset
df = pd.read_csv('data.csv')
df_origin = df.copy()
df.head()

Unnamed: 0,id,aki,gender,admission_age,race,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,...,ggt_max,ld_ldh_min,ld_ldh_max,gcs_min,gcs_motor,gcs_verbal,gcs_eyes,gcs_unable,height,weight_admit
0,36570066,3,F,79.953141,BLACK/AFRICAN AMERICAN,96.0,104.0,100.083333,103.0,126.0,...,,236.0,318.0,15.0,6.0,5.0,4.0,0.0,157.0,110.0
1,39307659,0,F,78.194169,WHITE - RUSSIAN,72.0,134.0,97.263158,97.0,127.0,...,,,,15.0,6.0,5.0,4.0,0.0,,82.0
2,38743306,2,F,65.602396,WHITE,60.0,97.0,84.166667,95.0,143.0,...,,,,15.0,6.0,5.0,4.0,0.0,,62.1
3,32339865,2,F,64.906629,UNKNOWN,59.0,87.0,71.461538,113.0,150.0,...,,,,15.0,1.0,0.0,1.0,1.0,170.0,113.1
4,35526987,2,M,57.438861,WHITE,57.0,100.0,82.387097,81.0,127.0,...,,,,15.0,,0.0,1.0,1.0,178.0,97.4


In [5]:
# basic infomration of each column in df
df.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50920 entries, 0 to 50919
Data columns (total 162 columns):
 #    Column                  Non-Null Count  Dtype  
---   ------                  --------------  -----  
 0    id                      50920 non-null  int64  
 1    aki                     50920 non-null  int64  
 2    gender                  50920 non-null  object 
 3    admission_age           50920 non-null  float64
 4    race                    50920 non-null  object 
 5    heart_rate_min          50841 non-null  float64
 6    heart_rate_max          50841 non-null  float64
 7    heart_rate_mean         50841 non-null  float64
 8    sbp_min                 50823 non-null  float64
 9    sbp_max                 50823 non-null  float64
 10   sbp_mean                50823 non-null  float64
 11   dbp_min                 50823 non-null  float64
 12   dbp_max                 50823 non-null  float64
 13   dbp_mean                50823 non-null  float64
 14   mbp_min             

  df.info(verbose = True, null_counts = True)


In [6]:
# calculate the proportion of NAs in each column
df_NA_prop = 1 - df.count() / len(df)
idx_NA10 = df_NA_prop[df_NA_prop <= 0.1].index
idx_NA50 = df_NA_prop[df_NA_prop <= 0.5].index
# idx_NA10

# extract a subset of df with columns including NAs less than 10%
df_sub_NA10 = df[idx_NA10]
# df_sub_NA10.info()

#  drop rows with NAs
df_sub_com10 = df_sub_NA10.dropna()
# df_sub_com10.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38631 entries, 0 to 50918
Data columns (total 63 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 38631 non-null  int64  
 1   aki                38631 non-null  int64  
 2   gender             38631 non-null  object 
 3   admission_age      38631 non-null  float64
 4   race               38631 non-null  object 
 5   heart_rate_min     38631 non-null  float64
 6   heart_rate_max     38631 non-null  float64
 7   heart_rate_mean    38631 non-null  float64
 8   sbp_min            38631 non-null  float64
 9   sbp_max            38631 non-null  float64
 10  sbp_mean           38631 non-null  float64
 11  dbp_min            38631 non-null  float64
 12  dbp_max            38631 non-null  float64
 13  dbp_mean           38631 non-null  float64
 14  mbp_min            38631 non-null  float64
 15  mbp_max            38631 non-null  float64
 16  mbp_mean           386

In [7]:
# frequencies in 'race'
df_sub_com10['race'].value_counts()

WHITE                                        24528
UNKNOWN                                       4525
BLACK/AFRICAN AMERICAN                        2874
OTHER                                         1345
WHITE - OTHER EUROPEAN                         702
UNABLE TO OBTAIN                               515
ASIAN                                          472
HISPANIC/LATINO - PUERTO RICAN                 409
ASIAN - CHINESE                                406
HISPANIC OR LATINO                             362
WHITE - RUSSIAN                                330
HISPANIC/LATINO - DOMINICAN                    253
PATIENT DECLINED TO ANSWER                     252
BLACK/CAPE VERDEAN                             239
BLACK/CARIBBEAN ISLAND                         216
BLACK/AFRICAN                                  154
ASIAN - SOUTH EAST ASIAN                       136
PORTUGUESE                                     123
WHITE - EASTERN EUROPEAN                       101
ASIAN - ASIAN INDIAN           

In [8]:
# frequencies in 'gender'
df_sub_com10['gender'].value_counts()

M    21509
F    17122
Name: gender, dtype: int64

In [9]:
# recode gender as 0-1  0: male, 1: female
df_sub_com10['gender_'] = df_sub_com10['gender'].apply(lambda x:int(x == 'F'))
df_sub_com10['gender_'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub_com10['gender_'] = df_sub_com10['gender'].apply(lambda x:int(x == 'F'))


0    21509
1    17122
Name: gender_, dtype: int64

In [10]:
# frequencies in 'aki'
df_sub_com10['aki'].value_counts()

2    12570
0    12147
1     7368
3     6546
Name: aki, dtype: int64

In [11]:
# re-categorise AKI as a binary column  0: aki = 0, 1: aki = 1/2/3
df_sub_com10['aki_binary'] = df_sub_com10['aki'].apply(
    lambda x:int(x == 1 or x == 2 or x == 3))
df_sub_com10['aki_binary'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub_com10['aki_binary'] = df_sub_com10['aki'].apply(


1    26484
0    12147
Name: aki_binary, dtype: int64

In [38]:
# create outcome vector and predictor matrix
# drop id and race first
df = df_sub_com10.drop(columns = ['id', 'aki', 'race', 'gender'])
df.head()

Unnamed: 0,admission_age,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,sbp_mean,dbp_min,dbp_max,dbp_mean,...,pt_min,pt_max,gcs_min,gcs_motor,gcs_verbal,gcs_eyes,gcs_unable,weight_admit,gender_,aki_binary
0,79.953141,96.0,104.0,100.083333,103.0,126.0,116.136364,40.0,58.0,47.863636,...,11.9,12.0,15.0,6.0,5.0,4.0,0.0,110.0,1,1
1,78.194169,72.0,134.0,97.263158,97.0,127.0,109.833333,56.0,89.0,70.166667,...,20.0,24.7,15.0,6.0,5.0,4.0,0.0,82.0,1,0
2,65.602396,60.0,97.0,84.166667,95.0,143.0,112.153846,56.0,99.0,73.307692,...,12.1,12.1,15.0,6.0,5.0,4.0,0.0,62.1,1,1
3,64.906629,59.0,87.0,71.461538,113.0,150.0,138.16,60.0,94.0,80.2,...,12.8,12.8,15.0,1.0,0.0,1.0,1.0,113.1,1,1
5,62.002429,78.0,105.0,90.0,80.0,154.0,114.44,42.0,136.0,60.36,...,11.8,12.0,15.0,6.0,5.0,4.0,0.0,137.9,0,1


In [39]:
# split dataset
# first we extract X and y
y_df = df['aki_binary']
X_df = df.drop('aki_binary', axis = 1)

In [40]:
# standardization on X: (X - mean) / std
Std = StandardScaler(copy = False)
X_std_df = pd.DataFrame(data = Std.fit_transform(X_df),
                        columns = Std.get_feature_names_out(input_features = X_df.columns))

In [41]:
# training - test split
X_train, X_test, y_train, y_test = train_test_split(
    X_std_df, y_df, test_size = 0.3, random_state = 42,
    stratify = y_df, shuffle = True)

In [56]:
# logistic regression with CV
lr_param = {
    'C': np.linspace(0.001, 0.01, 10),
}

In [58]:
stratifiedCV = StratifiedKFold(n_splits = 3)
LR = LogisticRegression(class_weight = 'balanced', solver = 'saga', max_iter = 50000)
BestLR = GridSearchCV(
    LR,
    param_grid = lr_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)
BestLR.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [59]:
BestLR.best_params_

{'C': 0.008, 'penalty': 'l2'}

In [61]:
y_BestLR_pred = BestLR.predict(X_test)

print(confusion_matrix(y_test, y_BestLR_pred))
print(pd.DataFrame(classification_report(
    y_test, y_BestLR_pred, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[2557 1087]
 [2682 5264]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.49     0.83      0.67       0.66          0.72
recall        0.70     0.66      0.67       0.68          0.67
f1-score      0.58     0.74      0.67       0.66          0.69
support    3644.00  7946.00      0.67   11590.00      11590.00


logistic

tree-base DTc RF adaboost ostxgbo


SVC

In [None]:
# gridsearch for best C
param_grid = {'C': np.linspace(0.001, 1, 50)}
LR = LogisticRegression(class_weight = 'balanced', solver = 'newton-cholesky', max_iter = 50000)
LR_CV = GridSearchCV(LR, param_grid, cv = 5, scoring = 'f1')
LR_CV.fit(X_train, y_train)
print(LR_CV.best_params_)

{'C': 0.041775510204081635}


In [None]:
LR_opt = LogisticRegression(class_weight = 'balanced', C = 0.042, solver = 'newton-cholesky', max_iter = 50000).fit(X_train, y_train)
prob_LRoptpred_valid = LR_opt.predict_proba(X_valid)
y_LRoptpred_valid = LR_opt.predict(X_valid)

In [None]:
print(accuracy_score(y_valid, y_LRoptpred_valid))
print(roc_auc_score(y_valid, y_LRoptpred_valid))
print(recall_score(y_valid, y_LRoptpred_valid))

0.6600517687661778
0.6659096537784888
0.6501258540093492


In [None]:
print(confusion_matrix(y_valid, y_LRoptpred_valid))
print(pd.DataFrame(classification_report(
    y_valid, y_LRoptpred_valid, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[1739  812]
 [1946 3616]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.47     0.82      0.66       0.64          0.71
recall        0.68     0.65      0.66       0.67          0.66
f1-score      0.56     0.72      0.66       0.64          0.67
support    2551.00  5562.00      0.66    8113.00       8113.00


In [None]:
# Tree-base models
xg_param = {
    'n_estimators': [100, 200, 500],
    'max_depth': np.arange(start = 2, stop = 5, step = 1),
    'learning_rate': np.arange(start = 0.01, stop = 0.1, step = 0.01)
}

xg_param

{'n_estimators': [100, 200, 500],
 'max_depth': array([2, 3, 4]),
 'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09])}

In [None]:
XGBoostModel = XGBClassifier()
BestXGBoost = GridSearchCV(
    XGBoostModel,
    param_grid = xg_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)
BestXGBoost.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [None]:
BestXGBoost.best_params_

{'learning_rate': 0.08, 'max_depth': 3, 'n_estimators': 200}

In [None]:
BestXGBoost.best_score_

0.8275717310046209

In [None]:
y_BestXGBpred_valid = BestXGBoost.predict(X_valid)

In [None]:
print(confusion_matrix(y_valid, y_BestXGBpred_valid))
print(pd.DataFrame(classification_report(
    y_valid, y_BestXGBpred_valid, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))

[[ 961 1590]
 [ 581 4981]]
            no aki      aki  accuracy  macro avg  weighted avg
precision     0.62     0.76      0.73       0.69          0.72
recall        0.38     0.90      0.73       0.64          0.73
f1-score      0.47     0.82      0.73       0.65          0.71
support    2551.00  5562.00      0.73    8113.00       8113.00


In [None]:
# SVM
svm_param = {
    'C': [0.1, 1, 10],
    # 'kernel': ['linear', 'rbf']
    }

In [None]:
SVCModel = SVC(class_weight = 'balanced', kernal = 'rbf')
BestSVC = GridSearchCV(
    SVCModel,
    param_grid = svm_param,
    scoring = 'f1',
    cv = stratifiedCV,
    verbose = 1,
    n_jobs = -1
)
BestSVC.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
BestSVC.best_estimator_

In [None]:
BestSVC.best_score_

In [None]:
y_BestSVCpred_valid = BestSVC.predict(X_valid)
print(confusion_matrix(y_valid, y_BestSVCpred_valid))
print(pd.DataFrame(classification_report(
    y_valid, y_BestSVCpred_valid, labels = None,
    target_names = ['no aki', 'aki'], sample_weight = None,
    digits = 2, output_dict = True)).round(2))