In [1]:
import os
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# Import dataset
*** Note that the dataset is already pre-processed and divided into train and test sets

In [2]:
base_dir = './dataset'

train_df = pd.read_csv(os.path.join(base_dir, 'train_data.csv'))
test_df = pd.read_csv(os.path.join(base_dir, 'test_data.csv'))

### Dataset examples

In [3]:
# dataset shapes
train_df.shape, test_df.shape

((1749, 213), (752, 213))

In [4]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,case_control,RCPT_DT,MI,CHF,PVD,Stroke,Dementia,Pulmonary,Rheumatic,...,Occupation_NM_화물차량 운전사,Occupation_NM_회사 사무원,Marriage_0,Marriage_기타,Marriage_기혼,Marriage_미혼,Marriage_별거,Marriage_사별,Marriage_유,Marriage_이혼
0,0,1,2010-11-17 19:20:39,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,1,2014-06-01 11:30:25,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,1,2013-11-23 11:46:48,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3,1,2011-12-17 11:39:49,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,1,2017-08-21 09:23:52,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,case_control,RCPT_DT,MI,CHF,PVD,Stroke,Dementia,Pulmonary,Rheumatic,...,Occupation_NM_화물차량 운전사,Occupation_NM_회사 사무원,Marriage_0,Marriage_기타,Marriage_기혼,Marriage_미혼,Marriage_별거,Marriage_사별,Marriage_유,Marriage_이혼
0,0,1,2017-04-18 10:04:08,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,1,2013-12-02 02:41:03,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,1,2010-09-16 08:52:39,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3,1,2012-02-10 02:03:25,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,4,1,2014-07-20 09:30:57,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# column list for data
print(train_df.columns.tolist())

['Unnamed: 0', 'case_control', 'RCPT_DT', 'MI', 'CHF', 'PVD', 'Stroke', 'Dementia', 'Pulmonary', 'Rheumatic', 'PUD', 'LiverMild', 'LiverSevere', 'DM', 'DMcx', 'Paralysis', 'Renal', 'Cancer', 'Mets', 'HIV', 'age_score', 'CCI', 'Multiple_Primary', 'Extend_CD', 'Ca_LN_No', 'bone', 'brain_CNS_eye', 'breast', 'digestive', 'female_genital', 'lip_oral_pharynx', 'lymphoid_haematopoietic', 'male_genital', 'respiratory', 'skin', 'soft_tissue', 'thyroid_endocrinegland', 'urinary_tract', 'unspecified', 'Tx_surgery', 'Tx_chemo', 'Tx_RT', 'Tx_hormon', 'Tx_bio', 'Tx_miscellaneous', 'steroid_d30', 'immunosuppressant_d30', 'TPN_d30', 'OP30', 'Central', 'prev_other_candida_source', 'Prev_Candidemia', 'age_at_Cx', 'Gender', 'Wt', 'Ht', 'smoking', 'drinking', 'SBP', 'DBP', 'HR', 'RR', 'BT', 'SpO2', 'mean1_SBP', 'mean1_DBP', 'mean1_HR', 'mean1_RR', 'mean1_BT', 'mean1_SpO2', 'var1_SBP', 'var1_DBP', 'var1_HR', 'var1_RR', 'var1_BT', 'var1_SpO2', 'mean7_SBP', 'mean7_DBP', 'mean7_HR', 'mean7_RR', 'mean7_BT', 'm

### List of feature sets to compare
*** Note that these features are initially selected after reviews by clinitians

In [7]:
feature_sets_to_compare = [['target_antibiotics_d7', 'Antianerobes_d7', 'target_antibiotics_d30', 'BL3112', 'Prev_Candidemia', 'Antianerobes_d30', 'Central', 'BL3119', 'prev_other_candida_source', 'lymphoid_haematopoietic', 'Carbapenem_d7', 'BL2021', 'Extended_spectrum_penicillin_d30', 'BL3114', 'mean7_HR', 'Carbapenem_d30', 'TPN_d30', 'HR', 'BL201806', 'mean1_HR', 'var7_RR', 'BL201809', 'var7_SBP', 'steroid_d30', 'Gender', 'BL3118', 'Wt', 'age_at_Cx', 'BL3140', 'CCI'], ['target_antibiotics_d7', 'digestive', 'Antianerobes_d7', 'Tx_surgery', 'Tx_chemo', 'target_antibiotics_d30', 'immunosuppressant_d30', 'BL3112', 'Prev_Candidemia', 'Antianerobes_d30', 'Central', 'BL3119', 'prev_other_candida_source', 'lymphoid_haematopoietic', 'smoking', 'SBP', 'DBP', 'Carbapenem_d7', 'RR', 'BT', 'mean1_SBP', 'mean1_DBP', 'BL2021', 'mean1_RR', 'mean1_BT', 'mean7_SBP', 'mean7_DBP', 'Extended_spectrum_penicillin_d30', 'mean7_RR', 'mean7_BT', 'BL3114', 'var7_DBP', 'var7_HR', 'mean7_HR', 'var7_BT', 'Carbapenem_d30', 'TPN_d30', 'BL201810', 'HR', 'BL201806', 'BL3113', 'mean1_HR', 'var7_RR', 'BL201809', 'BL312002', 'BL3122', 'var7_SBP', 'BL3157', 'steroid_d30', 'Broad_specturm_cephalosporine_3rd_d7', 'Gender', 'Extended_spectrum_penicillin_d7', 'Glycopeptide_d7', 'BL3118', 'Wt', 'Broad_specturm_cephalosporine_3rd_d30', 'age_at_Cx', 'BL3140', 'Glycopeptide_d30', 'CCI', 'Extend_NM_Distant', 'Occupation_NM_기타 NEC, NOS (직업 : 무 & 55세 이상 남자, 여자)'], ['LiverMild', 'LiverSevere', 'DM', 'DMcx', 'target_antibiotics_d7', 'Extend_CD', 'digestive', 'Antianerobes_d7', 'Tx_surgery', 'Tx_chemo', 'Tx_RT', 'target_antibiotics_d30', 'immunosuppressant_d30', 'BL3112', 'OP30', 'Prev_Candidemia', 'Antianerobes_d30', 'Central', 'BL3119', 'prev_other_candida_source', 'lymphoid_haematopoietic', 'Ht', 'smoking', 'drinking', 'SBP', 'DBP', 'Carbapenem_d7', 'RR', 'BT', 'mean1_SBP', 'mean1_DBP', 'BL2021', 'mean1_RR', 'mean1_BT', 'var1_SBP', 'var1_DBP', 'var1_HR', 'var1_RR', 'var1_BT', 'mean7_SBP', 'mean7_DBP', 'Extended_spectrum_penicillin_d30', 'mean7_RR', 'mean7_BT', 'BL3114', 'var7_DBP', 'var7_HR', 'mean7_HR', 'var7_BT', 'BL2011', 'BL2013', 'BL201401', 'BL201402', 'BL201403', 'BL2016', 'Carbapenem_d30', 'TPN_d30', 'BL201810', 'BL201815', 'BL201816', 'BL201818', 'HR', 'BL211103', 'BL3111', 'BL201806', 'BL311201', 'BL3113', 'mean1_HR', 'BL3117', 'var7_RR', 'BL201809', 'BL3120', 'BL312002', 'BL3121', 'BL3122', 'BL3123', 'var7_SBP', 'BL3157', 'BL5044', 'steroid_d30', 'Broad_specturm_cephalosporine_3rd_d7', 'Gender', 'Extended_spectrum_penicillin_d7', 'Glycopeptide_d7', 'BL3118', 'Wt', 'Broad_specturm_cephalosporine_3rd_d30', 'age_at_Cx', 'BL3140', 'Glycopeptide_d30', 'CCI', 'LOS_at_Cx', 'Extend_NM_Distant', 'Extend_NM_Localized', 'Occupation_NM_기타 NEC, NOS (직업 : 무 & 55세 이상 남자, 여자)'], ['MI', 'CHF', 'PVD', 'Stroke', 'Dementia', 'Pulmonary', 'Rheumatic', 'PUD', 'LiverMild', 'LiverSevere', 'DM', 'DMcx', 'Paralysis', 'Renal', 'Cancer', 'Mets', 'HIV', 'age_score', 'target_antibiotics_d7', 'Multiple_Primary', 'Extend_CD', 'Ca_LN_No', 'bone', 'brain_CNS_eye', 'breast', 'digestive', 'female_genital', 'lip_oral_pharynx', 'Antianerobes_d7', 'male_genital', 'respiratory', 'skin', 'soft_tissue', 'thyroid_endocrinegland', 'urinary_tract', 'unspecified', 'Tx_surgery', 'Tx_chemo', 'Tx_RT', 'Tx_hormon', 'Tx_bio', 'Tx_miscellaneous', 'target_antibiotics_d30', 'immunosuppressant_d30', 'BL3112', 'OP30', 'Prev_Candidemia', 'Antianerobes_d30', 'Central', 'BL3119', 'prev_other_candida_source', 'lymphoid_haematopoietic', 'Ht', 'smoking', 'drinking', 'SBP', 'DBP', 'Carbapenem_d7', 'RR', 'BT', 'SpO2', 'mean1_SBP', 'mean1_DBP', 'BL2021', 'mean1_RR', 'mean1_BT', 'mean1_SpO2', 'var1_SBP', 'var1_DBP', 'var1_HR', 'var1_RR', 'var1_BT', 'var1_SpO2', 'mean7_SBP', 'mean7_DBP', 'Extended_spectrum_penicillin_d30', 'mean7_RR', 'mean7_BT', 'mean7_SpO2', 'BL3114', 'var7_DBP', 'var7_HR', 'mean7_HR', 'var7_BT', 'var7_SpO2', 'BL2011', 'BL2012', 'BL2013', 'BL2014', 'BL201401', 'BL201402', 'BL201403', 'BL2015', 'BL2016', 'BL2017', 'BL201701', 'BL201801', 'BL201802', 'BL201803', 'BL201804', 'BL201805', 'Carbapenem_d30', 'BL201807', 'BL201808', 'TPN_d30', 'BL201810', 'BL201811', 'BL201812', 'BL201813', 'BL201814', 'BL201815', 'BL201816', 'BL201817', 'BL201818', 'BL2019', 'BL2020', 'HR', 'BL211103', 'BL3111', 'BL201806', 'BL311201', 'BL3113', 'mean1_HR', 'BL3115', 'BL3116', 'BL3117', 'var7_RR', 'BL201809', 'BL3120', 'BL312002', 'BL3121', 'BL3122', 'BL3123', 'var7_SBP', 'BL3157', 'BL5044', 'steroid_d30', 'Broad_specturm_cephalosporine_3rd_d7', 'Gender', 'Extended_spectrum_penicillin_d7', 'Glycopeptide_d7', 'BL3118', 'Wt', 'Broad_specturm_cephalosporine_3rd_d30', 'age_at_Cx', 'BL3140', 'Glycopeptide_d30', 'CCI', 'LOS_at_Cx', 'Extend_NM_0', 'Extend_NM_Distant', 'Extend_NM_In situ', 'Extend_NM_Localized', 'Extend_NM_Regional direct extension and nodes', 'Extend_NM_Regional nodes only', 'Extend_NM_Regional, NOS', 'Extend_NM_Regional, direct extention only', 'Extend_NM_Unknown if extension or metastasis(unstaged, unknown, or unspecified)', 'Occupation_NM_가정주부 (나이상관없이), 기혼 (55세미만 여자)', 'Occupation_NM_각종 중개인 (부동산, 임대업, 무역)', 'Occupation_NM_경영 관리직 (회사, 금융업 경영자 및 임원, 문화재단이사장)', 'Occupation_NM_경찰관, 소방관', 'Occupation_NM_고급 기술직 (건축기술자, 항공, 선박기술자, 통역사, 정보관리자, 감정평가사)', 'Occupation_NM_공연 예술가', 'Occupation_NM_교육 종사자, 연구원, 학자 (교수, 박사, 교사, 대학시간강사, 강사, 유치원교사)', 'Occupation_NM_국회의원 및 정치인', 'Occupation_NM_기타 (각종 입시준비생, 연수생, 휴학생 등) NEC, NOS', 'Occupation_NM_기타 NEC, NOS', 'Occupation_NM_기타 NEC, NOS (직업 : 무 & 55세 이상 남자, 여자)', 'Occupation_NM_기타 사회 사업가 등 전문적인 활동인 NEC, NOS (세무사, 프리랜서, 디자이너)', 'Occupation_NM_기타 운수 관련 종사자 NEC, NOS', 'Occupation_NM_기타 일반 사무직 근로자 NEC, NOS (은행원, 교직원)', 'Occupation_NM_기타 일반 생산직 근로자 NEC, NOS', 'Occupation_NM_기타 일반 서비스직 종사자 NEC, NOS', 'Occupation_NM_기타 판매 종사자 NEC, NOS', 'Occupation_NM_농업 (농장, 양봉 및 양잠종사자)', 'Occupation_NM_대학, 대학원생', 'Occupation_NM_도, 소매 자영업자 (상업, 사업, 학원운영)', 'Occupation_NM_도, 소매 판매 종사자', 'Occupation_NM_법조계 (판사, 검사, 법무사, 회계사)', 'Occupation_NM_비공연 예술가 (작가, 화가, 영화연출자)', 'Occupation_NM_서비스직 종사자 (조리사, 웨이터, 세탁공, 이발사, 미용사, 안내원, 장의사,수위, 경비원, 여행가이드 등)', 'Occupation_NM_수렵업', 'Occupation_NM_수산업 (해녀)', 'Occupation_NM_실내 제조업 종사자 (재봉공, 제지공, 인쇄공, 가구공, 방송 및 음향 조작공,귀금속 세공공 등)', 'Occupation_NM_요식 숙박업 경영자', 'Occupation_NM_의료계 (의사, 한의사, 치과의사, 수의사, 간호사, 의료기사, 약사, 조산사)', 'Occupation_NM_일반 운송차량 운전사(택시)', 'Occupation_NM_일반 행정 공무원 (우체국, 교육 공무원, 군무원)', 'Occupation_NM_일반사병 (군복무)', 'Occupation_NM_일반장교 (중령, 대령, 중위)', 'Occupation_NM_정부 관리직 공무원 (우체국장, 조합장)', 'Occupation_NM_종교계', 'Occupation_NM_중기계 운전사', 'Occupation_NM_직종을 보고하지 않은 종사자 (직업 : 무 &55세 미만 남자)', 'Occupation_NM_청소부, 가정부 등 노동자에 준하는 자', 'Occupation_NM_축산업', 'Occupation_NM_판매 외무원 (보험설계사)', 'Occupation_NM_하사관', 'Occupation_NM_현장 육체 노동자 (공업, 광원, 목수, 채석원, 정비공, 용접공, 기계 노동자,전기공, 미장 및 도배원 등)', 'Occupation_NM_화물차량 운전사', 'Occupation_NM_회사 사무원', 'Marriage_0', 'Marriage_기타', 'Marriage_기혼', 'Marriage_미혼', 'Marriage_별거', 'Marriage_사별', 'Marriage_유', 'Marriage_이혼']]

In [8]:
# length of features sets
[len(i) for i in feature_sets_to_compare]

[30, 62, 95, 210]

# Models
*** Note that only limited information about model analysis is included in this notebook (This notebook only shows example usage of model development). For seed analysis, parameter searching, threshold analysis, and other analysis are note included.

In [9]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import tensorflow as tf

from sklearn.metrics import roc_curve, auc

### Logistic Regression
Example prediciton model using Logistic Regression with its performance using AUROC

In [11]:
model_name = 'Logistic Regression'
clf = LogisticRegression()
for list_features in feature_sets_to_compare:
    # input matrix
    trX = train_df[list_features].values
    trY = train_df['case_control'].values
    teX = test_df[list_features].values
    teY = test_df['case_control'].values
    # model training
    clf.fit(trX, trY)
    pred = clf.predict_proba(teX)[:,1]
    real = teY
    fpr, tpr, _ = roc_curve(real, pred)
    print('AUC SCORE with Model {} with {} features: {}'.format(model_name, len(list_features), auc(fpr, tpr)))

AUC SCORE with Model Logistic Regression with 30 features: 0.8007845643574175
AUC SCORE with Model Logistic Regression with 62 features: 0.800409912838426
AUC SCORE with Model Logistic Regression with 95 features: 0.7848067789886612
AUC SCORE with Model Logistic Regression with 210 features: 0.7756608742603388


### Random Forest
Example prediciton model using RF with its performance using AUROC

In [12]:
model_name = 'RF'
clf = RandomForestClassifier()
for list_features in feature_sets_to_compare:
    # input matrix
    trX = train_df[list_features].values
    trY = train_df['case_control'].values
    teX = test_df[list_features].values
    teY = test_df['case_control'].values
    # model training
    clf.fit(trX, trY)
    pred = clf.predict_proba(teX)[:,1]
    real = teY
    fpr, tpr, _ = roc_curve(real, pred)
    print('AUC SCORE with Model {} with {} features: {}'.format(model_name, len(list_features), auc(fpr, tpr)))

AUC SCORE with Model RF with 30 features: 0.8843428722548512
AUC SCORE with Model RF with 62 features: 0.8603651750393935
AUC SCORE with Model RF with 95 features: 0.8704421989840333
AUC SCORE with Model RF with 210 features: 0.8643871692873908


### Gradient Boosting
Example prediciton model using Gradient Boosting with its performance using AUROC

In [13]:
model_name = 'Gradient Boosting'
clf = GradientBoostingClassifier()
for list_features in feature_sets_to_compare:
    # input matrix
    trX = train_df[list_features].values
    trY = train_df['case_control'].values
    teX = test_df[list_features].values
    teY = test_df['case_control'].values
    # model training
    clf.fit(trX, trY)
    pred = clf.predict_proba(teX)[:,1]
    real = teY
    fpr, tpr, _ = roc_curve(real, pred)
    print('AUC SCORE with Model {} with {} features: {}'.format(model_name, len(list_features), auc(fpr, tpr)))

AUC SCORE with Model Gradient Boosting with 30 features: 0.8602549834161606
AUC SCORE with Model Gradient Boosting with 62 features: 0.8470319886282244
AUC SCORE with Model Gradient Boosting with 95 features: 0.8451807693579134
AUC SCORE with Model Gradient Boosting with 210 features: 0.8386684444248548


### Adaboost w/ RF
Example prediciton model using Adaboost w/ RF with its performance using AUROC

In [14]:
model_name = 'Adaboost w/ RF'
clf = AdaBoostClassifier(base_estimator = RandomForestClassifier())
for list_features in feature_sets_to_compare:
    # input matrix
    trX = train_df[list_features].values
    trY = train_df['case_control'].values
    teX = test_df[list_features].values
    teY = test_df['case_control'].values
    # model training
    clf.fit(trX, trY)
    pred = clf.predict_proba(teX)[:,1]
    real = teY
    fpr, tpr, _ = roc_curve(real, pred)
    print('AUC SCORE with Model {} with {} features: {}'.format(model_name, len(list_features), auc(fpr, tpr)))

AUC SCORE with Model Adaboost w/ RF with 30 features: 0.8808993840288262
AUC SCORE with Model Adaboost w/ RF with 62 features: 0.8628885632114247
AUC SCORE with Model Adaboost w/ RF with 95 features: 0.8679904353671034
AUC SCORE with Model Adaboost w/ RF with 210 features: 0.8624312679750086


### Stacking ensemble with RF and NN
Example prediciton model using Stacking ensemble with RF and NN with its performance using AUROC

In [15]:
model_name = 'Stacking w/ RF and NN'
estimators = [
    ('rf', RandomForestClassifier()),
    ('nn', MLPClassifier())
]
clf = StackingClassifier(estimators = estimators, n_jobs = -1)
for list_features in feature_sets_to_compare:
    # input matrix
    trX = train_df[list_features].values
    trY = train_df['case_control'].values
    teX = test_df[list_features].values
    teY = test_df['case_control'].values
    # model training
    clf.fit(trX, trY)
    pred = clf.predict_proba(teX)[:,1]
    real = teY
    fpr, tpr, _ = roc_curve(real, pred)
    print('AUC SCORE with Model {} with {} features: {}'.format(model_name, len(list_features), auc(fpr, tpr)))

AUC SCORE with Model Stacking w/ RF and NN with 30 features: 0.880045398948772
AUC SCORE with Model Stacking w/ RF and NN with 62 features: 0.8741721854304636
AUC SCORE with Model Stacking w/ RF and NN with 95 features: 0.8762548071095636
AUC SCORE with Model Stacking w/ RF and NN with 210 features: 0.8722989278355059


### NN (tensorflow NN)
Example prediciton mSequentialel using tensorflow NN with its performance using AUROC

In [16]:
model_name = 'TF-NN'
def make_nn():    
    clf = tf.keras.models.Sequential([
        tf.keras.layers.Dense(128, activation = 'relu'),
        tf.keras.layers.Dense(64, activation = 'relu'),
        tf.keras.layers.Dense(32, activation = 'relu'),
        tf.keras.layers.Dense(1, activation = 'sigmoid')
    ])
    clf.compile(loss = 'binary_crossentropy', optimizer = 'adam')
    return clf
for list_features in feature_sets_to_compare:
    # input matrix
    trX = train_df[list_features].values
    trY = train_df['case_control'].values
    teX = test_df[list_features].values
    teY = test_df['case_control'].values
    # model training
    clf = make_nn()
    clf.fit(trX, trY, epochs = 10, verbose = 0)
    pred = clf.predict(teX)
    real = teY
    fpr, tpr, _ = roc_curve(real, pred)
    print('AUC SCORE with Model {} with {} features: {}'.format(model_name, len(list_features), auc(fpr, tpr)))

AUC SCORE with Model TF-NN with 30 features: 0.8263710592720741
AUC SCORE with Model TF-NN with 62 features: 0.8017762889665128
AUC SCORE with Model TF-NN with 95 features: 0.8121122632257496
AUC SCORE with Model TF-NN with 210 features: 0.8232305980099393
