In [1]:
import pandas as pd
import numpy as np
import pickle, os
from dateutil import parser
from sklearn.model_selection import train_test_split

In [2]:
features_dir = './'

In [3]:
test_size = 0.2
val_size = 0.16
train_size = 0.64
assert(test_size + val_size + train_size == 1)

## Load and Merge Features

In [4]:
features = pd.read_csv(f'{features_dir}features.csv', low_memory=False)
dem_comorbidity_features = pd.read_csv(f'{features_dir}demographic_comorbidity_features.csv', low_memory=False)
labels = pd.read_csv('../phase1_teamB/final_cohort_with_outcome_labels.csv')

In [5]:
features = features.drop(['admittime', 'dischtime'], axis=1).set_index('hadm_id')
dem_comorbidity_features = dem_comorbidity_features.drop(['admittime', 'dischtime', 'Unnamed: 0'], axis=1).set_index('hadm_id')
labels = labels[["hadm_id", "48h_hf", "14d_hf", "30d_hf", "er_hf", "48h", "14d", "30d", "er"]].set_index('hadm_id')

In [6]:
features = features.join(dem_comorbidity_features, how='outer')

In [7]:
# drop all medication features
new_features = features[features.columns[1221:]]
new_features[features.columns[0]] = features[features.columns[0]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_features[features.columns[0]] = features[features.columns[0]]


In [8]:
new_features.head()

Unnamed: 0_level_0,Albumin Latest,Albumin Delta,Urea Nitrogen Latest,Urea Nitrogen Delta,"Calcium, Total Latest","Calcium, Total Delta",Bicarbonate Latest,Bicarbonate Delta,Chloride Latest,Chloride Delta,...,eth_other,eth_white,eth_unknown,insurance_medicaid,insurance_medicare,insurance_other,eci_count,ahrq_score,vanWalraven_score,stay_duration_hrs
hadm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20001395,,,37.0,19.0,9.6,0.3,20.0,-4.0,100.0,0.0,...,0,0,0,0,0,1,7,25,24,209.966667
20002497,,,22.0,6.0,9.4,0.8,30.0,-9.0,98.0,2.0,...,0,0,0,0,0,1,8,15,14,262.15
20003491,3.0,,41.0,0.0,8.7,0.5,28.0,9.0,96.0,-4.0,...,0,0,0,0,1,0,5,17,16,252.65
20003739,,,16.0,-14.0,8.9,-1.0,28.0,0.0,95.0,1.0,...,0,0,0,0,1,0,7,43,36,220.716667
20003880,2.8,0.1,60.0,-23.0,8.1,-0.4,28.0,7.0,100.0,0.0,...,0,0,0,0,1,0,8,42,31,213.05


## Test/Train/Validate Split and Feature Normalization

In [9]:
features_train_val, features_test, labels_train_val, labels_test  = train_test_split(new_features, labels, test_size=test_size)
features_train, features_val, labels_train, labels_val = train_test_split(features_train_val, labels_train_val, test_size=val_size / (val_size + train_size))
features_train = features_train.dropna(thresh=2, axis=1) # drop any columns that are all or all-but-1 NaN
features_val = features_val[features_train.columns]
features_test = features_test[features_train.columns]
feature_variances = features_train.std(axis=0)

In [10]:
# Get whether column is binary
is_binary = ((features_train == 0) | (features_train == 1) | features_train.isnull()).prod(axis=0)
normalizing_variances = is_binary + (1 - is_binary) * feature_variances

In [11]:
# normalization
feature_means = features_train.mean(axis=0)
features_train_norm = (features_train - feature_means).fillna(0) / normalizing_variances
features_val_norm = (features_val - feature_means).fillna(0) / normalizing_variances
features_test_norm = (features_test - feature_means).fillna(0) / normalizing_variances

In [12]:
features_train_norm.head()

Unnamed: 0_level_0,Albumin Latest,Albumin Delta,Urea Nitrogen Latest,Urea Nitrogen Delta,"Calcium, Total Latest","Calcium, Total Delta",Bicarbonate Latest,Bicarbonate Delta,Chloride Latest,Chloride Delta,...,eth_other,eth_white,eth_unknown,insurance_medicaid,insurance_medicare,insurance_other,eci_count,ahrq_score,vanWalraven_score,stay_duration_hrs
hadm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20676642,1.271341,-0.542752,0.130506,-0.815703,0.941214,-0.886593,-0.534631,-0.252643,0.419523,1.018004,...,0.96394,-0.691469,-0.041712,-0.033638,-0.596475,0.630113,-0.226856,-0.727364,-1.220574,0.174986
26269183,0.0,0.0,0.493624,0.244854,-0.479047,-0.009382,-0.32608,-0.463879,1.517348,0.280685,...,-0.03606,0.308531,-0.041712,-0.033638,0.403525,-0.369887,0.758818,-1.432412,-0.980608,-0.139555
28059442,-0.131008,-0.343404,-0.716769,0.341268,-0.163433,0.721628,0.508125,0.5923,0.053581,-0.825293,...,-0.03606,0.308531,-0.041712,-0.033638,0.403525,-0.369887,-0.719693,-0.374841,-0.260711,0.178605
26830929,2.147809,0.0,-0.59573,0.100232,-0.794661,-1.325199,0.508125,1.226008,-0.312361,-1.193953,...,-0.03606,-0.691469,0.958288,-0.033638,-0.596475,0.630113,0.265981,0.594599,0.219221,-0.262539
24508821,0.044286,0.0,-1.241273,-0.333632,1.099021,2.037445,0.299574,1.014772,-0.312361,0.096355,...,-0.03606,-0.691469,-0.041712,-0.033638,-0.596475,0.630113,-1.212531,-0.991757,-0.740643,-0.407873


## Train Model

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

cs = [0.1, 0.2, 0.5, 1.] # for logistic regression
penalties = ["l1", "l2"] # for logistic regression
iters = [25, 50, 100, 250] # for logistic regression
# penalties = [1] # for RandomForestClassifier, GradientBoostingClassifier
depths = [2, 5, 10] # for RandomForestClassifier, GradientBoostingClassifier

best_clfs = []
best_aucs = []
# Learn best meta-parameters
for label_header in labels.columns:
    best_auc = 0
    print(label_header)
    for c in cs: # for logistic regression
#     for depth in depths: # for RandomForestClassifier, GradientBoostingClassifier
        for pen in penalties:
            # GradientBoostingClassifier 
#             clf = GradientBoostingClassifier(max_depth = depth, ).fit(features_train_norm, labels_train[label_header])
            # most important feature for GradientBoostingClassifier
#             i = np.argmax(clf.feature_importances_)
#             print(features_train_norm.columns[i], clf.feature_importances_[i])
            # RandomForestClassifier
#             clf = RandomForestClassifier(max_depth = depth).fit(features_train_norm, labels_train[label_header])
            # LogisticRegression
            clf = LogisticRegression(solver='liblinear', C=c, penalty=pen, max_iter=100).fit(features_train_norm, labels_train[label_header])
            # most important features for LogisticRegression
#             for i in range(len(features_train_norm.columns)):
#                 if abs(clf.coef_[0][i]) > 1:
#                     print(features_train_norm.columns[i], clf.coef_[0][i])
            train_auc = roc_auc_score(labels_train[label_header], clf.predict_proba(features_train_norm)[:, 1])
            print(train_auc)
            val_auc = roc_auc_score(labels_val[label_header], clf.predict_proba(features_val_norm)[:, 1])
            print(val_auc)
            if val_auc > best_auc:
                best_auc = val_auc
                best_clf = clf
    best_clfs.append(best_clf)
    best_aucs.append(best_auc)


48h_hf
0.5
0.5
0.7796781324046245
0.5849158979924036
0.6569979167660577
0.5094614758545849
0.7897762511330566
0.5897314161692891
0.7263608606459615
0.4879951166576234
0.8033172717586629
0.5927835051546391
0.782791056406342
0.5672137818773738
0.8128508499912536
0.594139989148128
14d_hf
0.586820433600385
0.529166950612837
0.6261379155852953
0.5325986428995609
0.600722900308792
0.5209893009083032
0.630454413975762
0.5361082175644233
0.6242096710467409
0.5371012178856882
0.6337420492782481
0.5394766304189098
0.6302057692890193
0.5423193372209621
0.6352741931779488
0.5403479395243334
30d_hf
0.5810385659364982
0.5326199797013194
0.6041127310519034
0.5362737422067566
0.5893408979437301
0.5356357836740612
0.6059627716380651
0.5393910395824272
0.6005587409398206
0.5415340002899812
0.6074309485497498
0.5415397999130057
0.6037543872732505
0.5431404958677686
0.608185879840103
0.5423952443091199
er_hf
0.5802447263814278
0.45460408311332356
0.613921379374574
0.44756712818342437
0.5979847432299141
0.

In [14]:
print(best_aucs)


[0.594139989148128, 0.5423193372209621, 0.5431404958677686, 0.45460408311332356, 0.5582580733442802, 0.5329549153052848, 0.5206963931065411, 0.5038278089272039]
