In [110]:
import pandas as pd
import numpy as np
import pickle, os
from dateutil import parser
from sklearn.model_selection import train_test_split


In [111]:
features_dir = './'

In [112]:
test_size = 0.2
val_size = 0.16
train_size = 0.64
assert(test_size + val_size + train_size == 1)

## Load and Merge Features

In [200]:
features = pd.read_csv(f'{features_dir}features.csv', low_memory=False)
dem_comorbidity_features = pd.read_csv(f'{features_dir}demographic_comorbidity_features.csv', low_memory=False)
labels = pd.read_csv('../phase1_teamB/final_cohort_with_outcome_labels.csv')

In [201]:
features = features.drop(['admittime', 'dischtime'], axis=1).set_index('hadm_id')
dem_comorbidity_features = dem_comorbidity_features.drop(['admittime', 'dischtime', 'Unnamed: 0'], axis=1).set_index('hadm_id')
labels = labels[["hadm_id", "48h_hf", "14d_hf", "30d_hf", "er_hf", "48h", "14d", "30d", "er"]].set_index('hadm_id')

In [202]:
features = features.join(dem_comorbidity_features, how='outer')

In [204]:
new_features = features[features.columns[1221:]]
new_features[features.columns[0]] = features[features.columns[0]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_features[features.columns[0]] = features[features.columns[0]]


In [205]:
new_features.head()

Unnamed: 0_level_0,Albumin Latest,Albumin Delta,Urea Nitrogen Latest,Urea Nitrogen Delta,"Calcium, Total Latest","Calcium, Total Delta",Bicarbonate Latest,Bicarbonate Delta,Chloride Latest,Chloride Delta,...,eth_other,eth_white,eth_unknown,insurance_medicaid,insurance_medicare,insurance_other,eci_count,ahrq_score,vanWalraven_score,stay_duration_hrs
hadm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20001395,,,37.0,19.0,9.6,0.3,20.0,-4.0,100.0,0.0,...,0,0,0,0,0,1,7,25,24,209.966667
20002497,,,22.0,6.0,9.4,0.8,30.0,-9.0,98.0,2.0,...,0,0,0,0,0,1,8,15,14,262.15
20003491,3.0,,41.0,0.0,8.7,0.5,28.0,9.0,96.0,-4.0,...,0,0,0,0,1,0,5,17,16,252.65
20003739,,,16.0,-14.0,8.9,-1.0,28.0,0.0,95.0,1.0,...,0,0,0,0,1,0,7,43,36,220.716667
20003880,2.8,0.1,60.0,-23.0,8.1,-0.4,28.0,7.0,100.0,0.0,...,0,0,0,0,1,0,8,42,31,213.05


## Test/Train/Validate Split and Feature Normalization

In [206]:
features_train_val, features_test, labels_train_val, labels_test  = train_test_split(new_features, labels, test_size=test_size)
features_train, features_val, labels_train, labels_val = train_test_split(features_train_val, labels_train_val, test_size=val_size / (val_size + train_size))
features_train = features_train.dropna(thresh=2, axis=1)
features_val = features_val[features_train.columns]
features_test = features_test[features_train.columns]
feature_variances = features_train.std(axis=0)

In [207]:
# Get whether column is binary
is_binary = ((features_train == 0) | (features_train == 1) | features_train.isnull()).prod(axis=0)
normalizing_variances = is_binary + (1 - is_binary) * feature_variances

In [208]:
feature_means = features_train.mean(axis=0)
features_train_norm = (features_train - feature_means).fillna(0) / normalizing_variances
features_val_norm = (features_val - feature_means).fillna(0) / normalizing_variances
features_test_norm = (features_test - feature_means).fillna(0) / normalizing_variances

In [209]:
nulls = features_train_norm.isnull().sum()
nulls[nulls > 0]

Series([], dtype: int64)

## Train Model

In [222]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
cs = [0.1, 0.2, 0.5, 1.]
# penalties = ["l1", "l2"]
penalties = [1]
depths = [2, 5, 10, 20]
#iters = [25, 50, 100, 250]
best_C = 0
best_pen = "l1"
best_params = []
best_clf = 0
best_clfs = []
best_aucs = []
# Learn best meta-parameters
for label_header in labels.columns:
    best_auc = 0
    print(label_header)
#     for c in cs:
    for depth in depths:
        for pen in penalties:
            clf = GradientBoostingClassifier(max_depth = depth, ).fit(features_train_norm, labels_train[label_header])
#             clf = RandomForestClassifier(max_depth = depth).fit(features_train_norm, labels_train[label_header])
#             clf = LogisticRegression(solver='liblinear', C=c, penalty=pen, max_iter=100).fit(features_train_norm, labels_train[label_header])
#             clf = MultiOutputClassifier(LogisticRegression(solver='liblinear', C=c, penalty=pen, max_iter=100), n_jobs=4).fit(features_train_norm, labels_train)
            train_auc = roc_auc_score(labels_train[label_header], clf.predict_proba(features_train_norm)[:, 1])
            print(train_auc)
            val_auc = roc_auc_score(labels_val[label_header], clf.predict_proba(features_val_norm)[:, 1])
            print(val_auc)
            print(c)
            print(pen)
            if val_auc > best_auc:
                best_auc = val_auc
                best_clf = clf
    best_clfs.append(best_clf)
    best_aucs.append(best_auc)

48h_hf
0.9873651911014845
0.6247187265605467
1.0
1
1.0
0.6240936744728729
1.0
1
1.0
0.47187265605467127
1.0
1
1.0
0.3732186015501292
1.0
1
14d_hf
0.7744530494689373
0.5353500795750591
1.0
1
0.9739481850572302
0.5079473689545764
1.0
1
1.0
0.5726312481626777
1.0
1
1.0
0.5301726322618576
1.0
1
30d_hf
0.7262557078941188
0.5146985162515958
1.0
1
0.9281820003084803
0.5307470053204811
1.0
1
1.0
0.48748068195006933
1.0
1
1.0
0.47525151642874164
1.0
1
er_hf
0.7149125449451611
0.48719161280404255
1.0
1
0.9527105771562208
0.5251422040117908
1.0
1
0.9999923897898884
0.4892118789645008
1.0
1
1.0
0.4870462338495679
1.0
1
48h
0.9122825967938113
0.5849556809024979
1.0
1
0.9999615281037202
0.5828847703464948
1.0
1
1.0
0.580515713134569
1.0
1
1.0
0.5489605157131345
1.0
1
14d
0.6999410040533411
0.5224041644367803
1.0
1
0.9199021306343271
0.5151384511704425
1.0
1
0.9999918948765488
0.49334750550038653
1.0
1
1.0
0.4842049711601356
1.0
1
30d
0.6843926850635349
0.49053496495299714
1.0
1
0.9106777115081451
0.

In [224]:
best_aucs_gb = best_aucs
best_aucs

[0.6247187265605467,
 0.5726312481626777,
 0.5307470053204811,
 0.5251422040117908,
 0.5849556809024979,
 0.5224041644367803,
 0.49883120840994105,
 0.5215198327788153]

## Save to file

In [47]:
features.to_csv('features.csv')

In [203]:
features.columns[1221:]

Index(['rare_medication', 'Albumin Latest', 'Albumin Delta',
       'Urea Nitrogen Latest', 'Urea Nitrogen Delta', 'Calcium, Total Latest',
       'Calcium, Total Delta', 'Bicarbonate Latest', 'Bicarbonate Delta',
       'Chloride Latest',
       ...
       'eth_hispanic', 'eth_other', 'eth_white', 'eth_unknown',
       'insurance_medicaid', 'insurance_medicare', 'insurance_other',
       'eci_count', 'ahrq_score', 'vanWalraven_score'],
      dtype='object', length=175)

In [211]:
labels.sum()

48h_hf      66
14d_hf     687
30d_hf    1246
er_hf     1125
48h        200
14d       1575
30d       2716
er        4471
dtype: int64