In [1]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import cPickle as pk
np.random.seed(1) # fixed seed so the random shuffle below is reproducible

In [2]:
# paths
# Directory of per-subject note CSVs; files are looked up as '<sid>.csv'.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
NOTE_DATA_DIR = '/local/XW/DATA/MIMIC/noteevents_by_sid/'
# Text file with one "<sid>,<whitespace-separated ICD codes>" record per line.
ICD_FPATH = './subject_diag_icds.txt'
# Pickled dict from which the 'Y_train' / 'Y_val' label matrices are read.
PK_FPATH = './diag_processed_data.pk' # './processed_data_small.pk'

In [3]:
# read k-hot labels data
# NOTE(review): unpickling can execute arbitrary code -- only load PK_FPATH
# when the file comes from a trusted source.
with open(PK_FPATH, 'rb') as pk_file:  # context manager closes the handle after loading
    pk_data = pk.load(pk_file)
Y_train = pk_data['Y_train']
Y_val = pk_data['Y_val']

# Naive baseline

In [5]:
# sanity check: shapes of the train / validation label matrices
print Y_train.shape, Y_val.shape

(36917, 50) (9229, 50)


In [6]:
# peek at the first two k-hot label rows
print Y_train[:2,]

[[ 1.  0.  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  1.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  1.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]]


In [7]:
# naive algo just find which label is most common, then always predict this label
stat = Y_train.sum(axis=0)
print stat

[ 14021.   8595.   8119.   7855.   6168.   5990.   5872.   5258.   4578.
   4558.   4390.   4220.   4196.   3952.   3483.   3409.   3296.   3015.
   2882.   2810.   2829.   2718.   2670.   2539.   2362.   2304.   2348.
   2286.   2225.   2198.   2226.   2214.   2175.   2128.   2028.   2020.
   1897.   1852.   1854.   1866.   1847.   1815.   1787.   1713.   1642.
   1642.   1638.   1591.   1599.  35142.]


In [8]:
pred_naive = np.argmax(stat)
print pred_naive

49


In [9]:
acc_naive = Y_val.sum(axis=0)[pred_naive]*1.0 / Y_val.shape[0]
print 'naive baseline is: %.9f' % acc_naive

naive baseline is: 0.952107487


# SVM baseline

**The problem with an SVM is that each instance has several labels, so I split each instance into several (instance, one-label) pairs. Not sure whether this makes sense.**

This baseline uses the method described at: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [15]:
# prepare label data
from collections import Counter

N_SUBJECTS = 46146  # passed to tqdm as the expected number of lines (progress bar only)
N_LABELS = 40
K_ICDS_TOKEEP = N_LABELS - 1 # predict only on top K frequent icd codes

icd_ctr = Counter()  # icd code ---> number of occurrences over all lines
sid2icds = {} # map subject_id ---> icd codes of this patient
with open(ICD_FPATH) as icd_file:
    for record in tqdm(icd_file, total=N_SUBJECTS):
        # each record is "<subject_id>,<whitespace-separated icd codes>"
        sid, codes_field = record.split(',')
        codes = codes_field.split()
        sid2icds[sid] = set(codes)
        icd_ctr.update(codes)

100%|██████████| 46146/46146 [00:00<00:00, 90823.60it/s]


In [16]:
print icd_ctr.most_common(K_ICDS_TOKEEP)
icds = zip( *icd_ctr.most_common(K_ICDS_TOKEEP) )[0] + ('other',)
sid2khot = {} # map subject_id to k-hot vector
for sid in sid2icds.keys():
    _khot = np.zeros(N_LABELS)
    for _icd in sid2icds[sid]:
        if _icd in icds: 
            _khot[icds.index(_icd)] = 1
        else: # label 'other icds'
            _khot[-1] = 1
    sid2khot[sid] = _khot

[('4019', 17510), ('41401', 10736), ('42731', 10193), ('4280', 9802), ('5849', 7634), ('2724', 7421), ('25000', 7332), ('51881', 6632), ('5990', 5746), ('V053', 5678), ('V290', 5440), ('2720', 5320), ('53081', 5246), ('2859', 4967), ('486', 4391), ('2851', 4231), ('2762', 4120), ('2449', 3789), ('496', 3572), ('99592', 3504), ('V3000', 3503), ('0389', 3387), ('5070', 3362), ('V5861', 3184), ('3051', 2982), ('311', 2907), ('41071', 2902), ('5859', 2889), ('40390', 2814), ('2761', 2789), ('2875', 2783), ('412', 2775), ('V3001', 2707), ('4240', 2643), ('5119', 2554), ('V1582', 2534), ('78552', 2376), ('V4581', 2318), ('4241', 2302)]


In [17]:
# Random train / validation split over subject ids.
sids = list(sid2icds.keys())  # explicit list: shuffle needs a mutable sequence
np.random.shuffle(sids)
VALIDATION_SPLIT = 0.2
validset_sz = int(VALIDATION_SPLIT*len(sids))
# Split on a non-negative index: with validset_sz == 0 the negative-index form
# sids[:-0] would yield an empty training set and put every subject in validation.
n_train = len(sids) - validset_sz
train_sids = sids[:n_train]
val_sids = sids[n_train:]

In [18]:
# Per-subject k-hot label vectors, in the same order as train_sids / val_sids.
# NOTE: this rebinds Y_train / Y_val, replacing the matrices loaded from PK_FPATH.
Y_train = [sid2khot[subject_id] for subject_id in train_sids]
Y_val = [sid2khot[subject_id] for subject_id in val_sids]

To prepare the data for the SVM, we need to write our own **generator** of documents.

In [19]:
def notes_generator(fpaths):
    """Lazily yield one document string per CSV file in `fpaths`.

    Each file is read with pandas and the values of its 'text' column are
    concatenated into a single string, separated by a '=======' marker line.
    Iteration is wrapped in tqdm, so a progress bar is shown while consuming.
    """
    separator = '\n=======\n\n\n'
    for csv_path in tqdm(fpaths):
        note_texts = pd.read_csv(csv_path)['text']
        yield separator.join(note_texts)

In [21]:
def _note_csv_path(sid):
    """Return the path of the notes CSV for subject `sid` inside NOTE_DATA_DIR."""
    return os.path.join(NOTE_DATA_DIR, '%s.csv'%sid)

# one notes file per subject, in the same order as the sid lists
train_files = [_note_csv_path(sid) for sid in train_sids]
val_files = [_note_csv_path(sid) for sid in val_sids]

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
MAX_NB_WORDS = 20000 # top 20k most freq words
# bag-of-words vectorizer with its vocabulary capped at MAX_NB_WORDS terms
cnt_vect = CountVectorizer(max_features=MAX_NB_WORDS)

In [23]:
# learn the vocabulary on the training notes and build their term-count matrix
X_train_counts = cnt_vect.fit_transform(notes_generator(train_files))

100%|██████████| 36917/36917 [13:07<00:00, 46.85it/s]


In [24]:
print X_train_counts.shape
from sklearn.feature_extraction.text import TfidfTransformer
# re-weight the raw counts to tf-idf; the transformer is fitted on the training counts
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print X_train_tfidf.shape

(36917, 20000)
(36917, 20000)


**Generate (instance, one_label) pairs**

In [25]:
# Expand every multi-label instance into one (row index, label) pair per active label.
X_train_idx = []      # row index of the instance each pair comes from
Y_train_1label = []   # the single label of each pair
for row_idx, khot in enumerate(Y_train):
    active_labels = np.flatnonzero(khot == 1)
    X_train_idx.extend([row_idx] * len(active_labels))
    Y_train_1label.extend(active_labels)
len(Y_train_1label)

184904

In [26]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42) # sgd SVM
# rows are selected by X_train_idx so every (instance, one-label) pair gets its instance's features
clf.fit(X_train_tfidf[X_train_idx,], Y_train_1label) 

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)

**Predict and evaluate**

In [27]:
X_val_counts = cnt_vect.fit_transform(notes_generator(val_files))
print X_val_counts.shape
tfidf_transformer = TfidfTransformer()
X_val_tfidf = tfidf_transformer.fit_transform(X_val_counts)
print X_val_tfidf.shape

100%|██████████| 9229/9229 [03:09<00:00, 48.63it/s]


(9229, 20000)
(9229, 20000)


In [28]:
# one predicted label index per validation instance
preds = clf.predict(X_val_tfidf)

In [29]:
correct_pred = 0
for i,pred in enumerate(preds):
    if Y_val[i][pred]==1: correct_pred += 1
print '%d instances out of %d correct, relaxed accuracy = %.9f' % \
                (correct_pred, len(Y_val), correct_pred*1.0/len(Y_val))

676 instances out of 9229 correct, relaxed accuracy = 0.073247372


In [30]:
# add regularization?
# same SGD hinge-loss setup as clf, but with alpha raised from 1e-3 to 1e-1
clf2 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-1, n_iter=5, random_state=42) 

In [31]:
clf2.fit(X_train_tfidf[X_train_idx,], Y_train_1label) 
preds = clf2.predict(X_val_tfidf)
correct_pred = 0
for i,pred in enumerate(preds):
    if Y_val[i][pred]==1: correct_pred += 1
print '%d instances out of %d correct, relaxed accuracy = %.9f' % \
                (correct_pred, len(Y_val), correct_pred*1.0/len(Y_val))

8921 instances out of 9229 correct, relaxed accuracy = 0.966626937


**Dump file for later use**

In [32]:
# Bundle the tf-idf feature matrices, the k-hot label lists and the subject-id
# splits into one dict so they can be saved together.
data_svm = {
    'X_train': X_train_tfidf,
    'Y_train': Y_train,
    'X_val': X_val_tfidf,
    'Y_val': Y_val,
    'train_sids': train_sids,
    'val_sids': val_sids
}

In [33]:
# write through a context manager so the file is flushed and closed deterministically
with open('./diag_data_svm.pk','wb') as out_file:
    pk.dump(data_svm, out_file, pk.HIGHEST_PROTOCOL)

# Deep patient baseline