In [1]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import cPickle as pk
np.random.seed(1) # fix the numpy RNG seed so that runs are reproducible

In [4]:
# paths
# Directory holding one '<subject_id>.csv' notes file per subject. The location is
# machine-specific, so it can be overridden with the MIMIC_NOTE_DATA_DIR environment
# variable; without the variable the original default is used.
NOTE_DATA_DIR = os.environ.get('MIMIC_NOTE_DATA_DIR', '/local/XW/DATA/MIMIC/noteevents_by_sid/')
# Text file with one line per subject: the subject id followed by its icd codes.
ICD_FPATH = './subject_icds.txt'
# Pickled dict with (at least) the 'Y_train' and 'Y_val' label matrices.
PK_FPATH = './processed_data.pk' # './processed_data_small.pk'

In [5]:
# read k-hot labels data
# NOTE(review): unpickling can execute arbitrary code -- only load files that were
# produced by this project.
# The `with` block closes the file handle as soon as loading is done.
with open(PK_FPATH, 'rb') as pk_file:
    pk_data = pk.load(pk_file)
Y_train = pk_data['Y_train']
Y_val = pk_data['Y_val']

# Naive baseline

In [8]:
print Y_train.shape, Y_val.shape

(33509, 40) (8377, 40)


In [33]:
print Y_train[:2,]

[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.
   1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  1.]]


In [10]:
# naive algo just find which label is most common, then always predict this label
stat = Y_train.sum(axis=0)
print stat

[  8820.   7283.   6563.   6642.   5054.   5369.   4284.   4618.   3908.
   3463.   3512.   3140.   2512.   2458.   1720.   1961.   1905.   2053.
   2071.   1929.   1027.   1602.   1655.   1684.   1516.   1515.   1203.
   1063.   1375.   1344.   1205.   1113.   1197.    977.   1206.   1191.
   1107.    970.    937.  22061.]


In [12]:
pred_naive = np.argmax(stat)
print pred_naive

39


In [15]:
acc_naive = Y_val.sum(axis=0)[pred_naive]*1.0 / Y_val.shape[0]
print 'niave baseline is: %.9f' % acc_naive

niave baseline is: 0.662528351


# SVM baseline

**The problem with an SVM is that each instance has several labels, so I split each instance into several (instance, one-label) pairs... NOT SURE IF THIS MAKES SENSE.**

This baseline uses the method described at: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [23]:
# prepare label data
from collections import Counter
N_LABELS = 40
K_ICDS_TOKEEP = N_LABELS - 1 # predict only on top K frequent icd codes
N_SUBJECTS = 41886 # only used as the progress-bar total below

icd_ctr = Counter() # icd code ---> number of occurrences over all subjects
sid2icds = {} # map subject_id ---> icd codes of this patient
with open(ICD_FPATH) as icd_file:
    for line in tqdm(icd_file, total=N_SUBJECTS):
        # each line: the subject id followed by that subject's icd codes
        fields = [int(tok) for tok in line.split()]
        sid = fields[0]
        _icds = fields[1:]
        sid2icds[sid] = set(_icds)
        icd_ctr.update(_icds)

100%|██████████| 41886/41886 [00:00<00:00, 117835.04it/s]


In [31]:
print icd_ctr.most_common(K_ICDS_TOKEEP)
icds = zip( *icd_ctr.most_common(K_ICDS_TOKEEP) )[0] + ('other',)
sid2khot = {} # map subject_id to k-hot vector
for sid in sid2icds.keys():
    _khot = np.zeros(N_LABELS)
    for _icd in sid2icds[sid]:
        if _icd in icds: 
            _khot[icds.index(_icd)] = 1
        else: # label 'other icds'
            _khot[-1] = 1
    sid2khot[sid] = _khot

[(3893, 14631), (9604, 10234), (966, 9245), (9671, 8983), (9904, 7235), (3961, 6831), (9672, 5973), (9955, 5748), (8856, 5324), (3891, 4669), (3615, 4397), (9915, 4228), (8872, 3536), (3722, 3306), (3995, 3235), (3324, 3222), (4513, 2911), (9390, 2721), (3723, 2702), (9983, 2391), (5491, 2209), (331, 2198), (8853, 2112), (640, 2084), (9907, 2026), (3612, 1901), (3491, 1825), (3895, 1812), (311, 1768), (9920, 1737), (3404, 1705), (3897, 1667), (40, 1646), (8841, 1564), (4311, 1530), (3613, 1502), (3521, 1419), (66, 1321), (3606, 1255)]


In [28]:
# Shuffle the subject ids and hold out the last VALIDATION_SPLIT fraction of
# them as the validation set.
sids = list(sid2icds.keys())
np.random.shuffle(sids)
VALIDATION_SPLIT = 0.2
validset_sz = int(VALIDATION_SPLIT*len(sids))
# Split at an explicit non-negative index. With negative slicing, a validation
# size of 0 would turn sids[:-0] into an empty train set and sids[-0:] into a
# validation set containing every subject.
split_at = len(sids) - validset_sz
train_sids = sids[:split_at]
val_sids = sids[split_at:]

In [93]:
# Collect the k-hot vector of every subject in split order, so that entry i of
# Y_train / Y_val belongs to train_sids[i] / val_sids[i].
Y_train, Y_val = [[sid2khot[s] for s in split_sids]
                  for split_sids in (train_sids, val_sids)]

To prepare the data for the SVM, we need to write our own **generator** of documents.

In [44]:
def notes_generator(fpaths):
    """Lazily yield one document string per CSV file.

    Each file in `fpaths` is read with pandas and the entries of its 'text'
    column are concatenated, separated by a '=======' marker line. Progress
    over `fpaths` is reported through tqdm.
    """
    separator = '\n=======\n\n\n'
    for fpath in tqdm(fpaths):
        notes = pd.read_csv(fpath)['text']
        yield separator.join(notes)

In [102]:
# one notes file per subject, named '<subject_id>.csv' inside NOTE_DATA_DIR
train_files = []
for sid in train_sids:
    train_files.append(os.path.join(NOTE_DATA_DIR, '%d.csv' % sid))
val_files = []
for sid in val_sids:
    val_files.append(os.path.join(NOTE_DATA_DIR, '%d.csv' % sid))

In [103]:
from sklearn.feature_extraction.text import CountVectorizer
MAX_NB_WORDS = 20000 # vocabulary cap: keep only the top 20k most frequent words
# bag-of-words counter; created here, fitted later on the training notes
cnt_vect = CountVectorizer(max_features=MAX_NB_WORDS)

In [104]:
# learn the vocabulary from the training notes and count words per document;
# the documents are streamed from disk one at a time by the generator
train_docs = notes_generator(train_files)
X_train_counts = cnt_vect.fit_transform(train_docs)

100%|██████████| 33509/33509 [10:40<00:00, 52.30it/s]


In [105]:
print X_train_counts.shape
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print X_train_tfidf.shape

(33509, 20000)
(33509, 20000)


**Generate (instance, one_label) pairs**

In [106]:
# Expand every multi-label instance into one (row index, label) pair per active
# label. Only row indices are stored; the feature rows are looked up later.
X_train_idx = []
Y_train_1label = []
for row, khot in enumerate(Y_train):
    active_labels = np.where(khot == 1)[0]
    X_train_idx.extend([row] * len(active_labels))
    Y_train_1label.extend(active_labels)
len(Y_train_1label)

125627

In [107]:
from sklearn.linear_model import SGDClassifier
# sgd SVM: hinge loss with an l2 penalty
svm_params = dict(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
clf = SGDClassifier(**svm_params)
# indexing with X_train_idx repeats an instance's row once per label it carries
clf.fit(X_train_tfidf[X_train_idx,], Y_train_1label)

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)

**Predict and evaluate**

In [108]:
X_val_counts = cnt_vect.fit_transform(notes_generator(val_files))
print X_val_counts.shape
tfidf_transformer = TfidfTransformer()
X_val_tfidf = tfidf_transformer.fit_transform(X_val_counts)
print X_val_tfidf.shape

100%|██████████| 8377/8377 [02:48<00:00, 49.81it/s]


(8377, 20000)
(8377, 20000)


In [109]:
# predict a single label index for each row of the validation matrix
preds = clf.predict(X_val_tfidf)

In [110]:
correct_pred = 0
for i,pred in enumerate(preds):
    if Y_val[i][pred]==1: correct_pred += 1
print '%d instances out of %d correct, relaxed accuracy = %.9f' % \
                (correct_pred, len(Y_val), correct_pred*1.0/len(Y_val))

1182 instances out of 8377 arecorrect, relaxed accuracy = 0.141100633


In [117]:
# add regularization? same sgd SVM as before, but with alpha raised from 1e-3 to 1e-1
svm2_params = dict(loss='hinge', penalty='l2', alpha=1e-1, n_iter=5, random_state=42)
clf2 = SGDClassifier(**svm2_params)

In [118]:
clf2.fit(X_train_tfidf[X_train_idx,], Y_train_1label) 
preds = clf2.predict(X_val_tfidf)
correct_pred = 0
for i,pred in enumerate(preds):
    if Y_val[i][pred]==1: correct_pred += 1
print '%d instances out of %d correct, relaxed accuracy = %.9f' % \
                (correct_pred, len(Y_val), correct_pred*1.0/len(Y_val))

5562 instances out of 8377 correct, relaxed accuracy = 0.663960845


**Dump file for later use**

In [111]:
# bundle the tf-idf features, the k-hot labels and the subject ids of both splits
data_svm = dict(
    X_train=X_train_tfidf,
    Y_train=Y_train,
    X_val=X_val_tfidf,
    Y_val=Y_val,
    train_sids=train_sids,
    val_sids=val_sids,
)

In [112]:
# The `with` block closes (and thereby flushes) the output file as soon as the
# dump is finished, instead of leaving the handle open.
with open('./data_svm.pk', 'wb') as out_file:
    pk.dump(data_svm, out_file, pk.HIGHEST_PROTOCOL)

# Deep patient baseline