In [1]:
import os
import pickle
import csv
from datetime import datetime
import numpy as np
import torch
from torch.utils.data import Dataset
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

# Root directory holding the note dumps and the embedding files.
# The historical location stays the default; set the NOTES_BASE_PATH
# environment variable to run the notebook against another directory.
base_path = os.environ.get('NOTES_BASE_PATH', '/media/disk3/disk3')
# True when torch reports an available CUDA device.
use_cuda = torch.cuda.is_available()

In [202]:
import sklearn
#ndarray -> normalize -> map2word -> tensor    
def read_embeddings(vecidx_path, vec_path, dim=400):
    """Load word vectors from a pair of TSV files and L2-normalise each one.

    Args:
        vecidx_path: TSV file whose first column holds one word per row.
        vec_path: TSV file holding one vector per row, in the same row order
            as ``vecidx_path``.
        dim: number of leading columns of ``vec_path`` kept for each vector
            (defaults to 400, the value that used to be hard-coded).

    Returns:
        dict mapping each word to a 1-D float array of length ``dim``.
        Every vector is scaled to unit L2 norm; all-zero vectors stay zero.

    Raises:
        ValueError: if the two files disagree on the number of rows, or a
            vector row has fewer than ``dim`` columns.
    """
    with open(vecidx_path, 'r') as f:
        words = [row[0] for row in csv.reader(f, delimiter='\t')]

    # Zero-initialised so that no row can ever expose uninitialised memory.
    vecs = np.zeros((len(words), dim))
    n_rows = 0
    with open(vec_path, 'r') as f:
        for i, row in enumerate(csv.reader(f, delimiter='\t')):
            if i >= len(words):
                raise ValueError('%s has more rows than %s' % (vec_path, vecidx_path))
            values = row[:dim]
            if len(values) != dim:
                raise ValueError('row %d of %s has %d columns, expected at least %d'
                                 % (i, vec_path, len(values), dim))
            vecs[i, :] = np.asarray(values, dtype=float)
            n_rows = i + 1
    if n_rows != len(words):
        raise ValueError('%s has %d rows but %s has %d'
                         % (vec_path, n_rows, vecidx_path, len(words)))

    # Row-wise L2 normalisation done in numpy, so the function does not rely
    # on the sklearn.preprocessing submodule having been imported elsewhere.
    norms = np.sqrt((vecs * vecs).sum(axis=1, keepdims=True))
    norms[norms == 0.0] = 1.0  # leave all-zero rows untouched
    vecs = vecs / norms

    word2vec = {word: vecs[i, :].reshape(-1) for i, word in enumerate(words)}
    return word2vec

def read_data_dump(data_path):
    """Load and return the object pickled in the file at ``data_path``.

    The file is opened in binary mode: pickle streams are bytes, and text
    mode breaks on Python 3 and on platforms that translate newlines.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this on
    # dumps produced by a trusted source.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    return data

def read_vecs(vec_path):
    """Load and return the pickled embeddings stored at ``vec_path``.

    The file is opened in binary mode: pickle streams are bytes, and text
    mode breaks on Python 3 and on platforms that translate newlines.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this on
    # files produced by a trusted source.
    with open(vec_path, 'rb') as f:
        pretrained = pickle.load(f)
    return pretrained


def store_formatted():
    """Pre-clean every admission's notes once and cache the result on disk.

    Used to optimize preprocessing time by pre-cleaning instead of doing it
    in the per-sample class. Reads ``notes_dump.pkl`` from ``base_path``; for
    each entry that has a ``'notes'`` list, sorts the notes by their
    ``'date'`` (``%Y-%m-%d``) and replaces each note with its token list from
    ``clean_str_no_stopwords``. The result is written to
    ``notes_dump_cleaned.pkl`` in the same directory.

    Returns:
        set of every distinct token seen in the cleaned notes. (Previously
        the vocabulary was collected and then discarded.)
    """
    rawdata = read_data_dump(os.path.join(base_path, 'notes_dump.pkl'))
    vocabs = set()
    for key in rawdata:
        record = rawdata[key]
        if 'notes' not in record:
            continue
        ordered = sorted(record['notes'],
                         key=lambda x: datetime.strptime(x['date'], '%Y-%m-%d'))
        record['notes'] = [clean_str_no_stopwords(note['note']) for note in ordered]
        for tokens in record['notes']:
            vocabs.update(tokens)
    # Binary mode is required for pickle; `with` closes the file even on error.
    with open(os.path.join(base_path, 'notes_dump_cleaned.pkl'), 'wb') as f:
        pickle.dump(rawdata, f)
    return vocabs

1458


In [207]:
# Keep only the embeddings of words that occur in the cleaned notes, plus the
# two special vectors, and persist the reduced table.
# NOTE(review): neither `pretrained` nor `vocabs` is assigned at module level
# above this cell in the visible notebook; both must already exist in the
# kernel. Confirm the intended execution order.
vocab_lookup = set(vocabs)  # hashed membership test instead of a linear list scan per word
pretrained_filtered = {}
pretrained_filtered['unknown'] = pretrained['unknown']
pretrained_filtered['pad'] = pretrained['pad']
for word in pretrained:
    if word in vocab_lookup:
        pretrained_filtered[word] = pretrained[word]
# Binary mode is required for pickle; `with` closes the file even on error.
with open(os.path.join(base_path, 'filtered_embeddings.pkl'), 'wb') as f:
    pickle.dump(pretrained_filtered, f)
print(len(pretrained))
print(len(pretrained_filtered))

KeyboardInterrupt: 

In [194]:
def get_labels(data, ccs_path):
    """Attach a CCS label to every admission in ``data``.

    Args:
        data: dict keyed by admission id; each value must provide
            ``['labels']['icd']``, whose first element is looked up.
        ccs_path: comma-separated file with one header row; column 0 is the
            lookup code and column 1 the value stored as the label.

    Returns:
        The same ``data`` dict, modified in place: entries whose first code is
        present in the mapping gain a ``'ccs_label'`` key. Codes missing from
        the mapping are printed and the entry is left without that key.
    """
    ccs_mapping = {}
    # Text mode works with the csv module on both Python 2 and Python 3.
    with open(ccs_path, 'r') as csvf:
        reader = csv.reader(csvf, delimiter=',')
        next(reader, None)  # skip the header row; tolerates an empty file
        for row in reader:
            if len(row) < 2:
                continue  # blank or malformed line
            ccs_mapping[row[0]] = row[1]

    for hadm in data:
        _seq1_label = data[hadm]['labels']['icd'][0]
        try:
            data[hadm]['ccs_label'] = ccs_mapping[_seq1_label]
        except KeyError:
            # Unmapped code: report it and carry on with the other admissions.
            print(_seq1_label)
    return data
        

In [120]:
from sklearn.feature_extraction import stop_words
import re
import random

def clean_str_no_stopwords(s):
    """Tokenise a note: strip ``[** ... **]`` spans, lowercase, drop stop words.

    Each ``[** ... **]`` span and each run of whitespace is replaced by a
    single space; the text is then lowercased and split on whitespace.

    Args:
        s: raw note text.

    Returns:
        list of lowercase tokens that are not in
        ``stop_words.ENGLISH_STOP_WORDS``.
    """
    # Non-greedy `.*?`: with the greedy form, two bracketed spans on the same
    # line were matched as one and all the text between them was deleted.
    # Newlines are whitespace, so `\s+` already covers them.
    tokens = re.sub(r'\[\*\*.*?\*\*\]|\s+', ' ', s).lower().split()
    return [token for token in tokens if token not in stop_words.ENGLISH_STOP_WORDS]

class Datum(object):
    '''
    One sample: a label plus one averaged embedding vector per note.

    data       -- dict with ['labels']['icd'] (first element is the label key)
                  and ['notes'], a list of dicts with 'date' (%Y-%m-%d) and
                  'note' (raw text)
    label_map  -- mapping from that first code to the stored label
    embeddings -- mapping token -> vector; must contain 'pad' and 'unknown'
    log_unk    -- shared dict updated in place with out-of-vocabulary stats:
                  token -> {'count': total occurrences,
                            'notes': number of notes containing the token}
    '''
    def __init__(self, data, label_map, embeddings, log_unk):
        self.label = label_map[data['labels']['icd'][0]]
        self.notes = data['notes']
        self.embeddings = embeddings
        self.padding_embedding = embeddings['pad']
        # Logs missing vocabs in notes & in total
        self.log_unk = log_unk
        # True once at least one unknown token has been logged for this sample.
        self.logged = False
        self.preprocess_notes()

    def preprocess_notes(self):
        '''Sort notes by date, tokenise them, and replace self.notes with one
        averaged embedding vector per note (sequence padding happens later,
        at collation time).'''
        ordered = sorted(self.notes, key=lambda x: datetime.strptime(x['date'], '%Y-%m-%d'))
        tokenised = [clean_str_no_stopwords(note['note']) for note in ordered]
        self.notes = [self._embed_note(tokens) for tokens in tokenised]

    def _embed_note(self, tokens):
        '''Return the mean embedding of one note's tokens and log unknown tokens.'''
        vectors = []
        seen_unknown = set()  # unknown tokens already counted for THIS note
        for token in tokens:
            vector = self.embeddings.get(token, None)
            if vector is None:
                entry = self.log_unk.setdefault(token, {'count': 0, 'notes': 0})
                entry['count'] += 1
                # Count each note at most once per token. The previous single
                # per-instance flag was shared by all tokens and all notes.
                if token not in seen_unknown:
                    entry['notes'] += 1
                    seen_unknown.add(token)
                self.logged = True
                vector = self.embeddings['unknown']
            vectors.append(vector)
        if not vectors:
            # A note with no remaining tokens would make np.mean return a NaN
            # scalar, which turns the whole batch into a dtype=object array.
            # Represent it with (a copy of) the padding vector instead.
            return np.array(self.embeddings['pad'], dtype=float)
        return np.mean(vectors, 0)
        

In [200]:
#Getting only priority #1 labels now
# Word -> vector table built from the vocabulary / vector TSV pair under base_path.
pretrained = read_embeddings(*(
    os.path.join(base_path, rel_path)
    for rel_path in ('ri-3gram-400-tsv/vocab.tsv', 'ri-3gram-400-tsv/vectors.tsv')
))
# rawdata = read_data_dump(os.path.join(base_path, 'notes_dump.pkl'))

# #TODO Map labels to CCS
# label_map = get_labels(rawdata)
# log_unk = {}
# dataset = []
# for adm in rawdata:
# #     if len(dataset) > 1000:
# #         break
#     if 'notes' in rawdata[adm]:
#         dataset.append(Datum(rawdata[adm], label_map, pretrained, log_unk))
# ###dataset = [Datum(data[adm], label_map, pretrained, log_unk) for adm in data if 'notes' in data[adm]]    
# random.shuffle(dataset)
# margin = int(len(dataset)*0.8)
# trainset = dataset[:margin]
# valset = dataset[margin:]

In [161]:
# Mini-batch size, number of DataLoader worker processes, and the hidden
# size handed to the LSTM model.
batch_size, num_workers, hidden_dim = 64, 2, 100

class Dataloader(Dataset):
    """Thin ``Dataset`` adapter around an indexable collection of samples."""

    def __init__(self, data):
        super(Dataloader, self).__init__()
        # Kept by reference; the collection is not copied.
        self.data = data

    def __len__(self):
        """Number of stored samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at ``index`` unchanged."""
        sample = self.data[index]
        return sample
    
def padding_collation(batch):
    """Collate samples, left-padding each note sequence to the batch maximum.

    Every sample must expose ``notes`` (a list), ``label`` and
    ``padding_embedding``. Shorter sequences get copies of the sample's own
    ``padding_embedding`` prepended until all sequences share one length.

    Returns:
        ``[padded_sequences, labels]``, both in the order of ``batch``.
    """
    longest = max(len(sample.notes) for sample in batch)
    padded_sequences = []
    labels = []
    for sample in batch:
        missing = longest - len(sample.notes)
        front_padding = [sample.padding_embedding for _ in range(missing)]
        padded_sequences.append(front_padding + sample.notes)
        labels.append(sample.label)
    return [padded_sequences, labels]

# Both loaders shuffle, use the same worker count, and collate with
# padding_collation so every batch holds equal-length note sequences.
train_loader = torch.utils.data.DataLoader(
    Dataloader(trainset),
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=padding_collation,
)
val_loader = torch.utils.data.DataLoader(
    Dataloader(valset),
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=padding_collation,
)

In [195]:
class LSTMModel(nn.Module):
    """Single-layer LSTM over a sequence of vectors followed by a linear
    classifier applied to the output of the last time step.

    Args:
        embed_dim: size of each input vector.
        hidden_dim: LSTM hidden size.
        labels: sized collection of classes; ``len(labels)`` is the number of
            output logits.
        batch_size: default batch size used by ``init_hidden()`` when no
            explicit size is given.
    """
    def __init__(self, embed_dim, hidden_dim, labels, batch_size):
        super(LSTMModel, self).__init__()
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embed_dim, hidden_dim)
        self.lin = nn.Linear(hidden_dim, len(labels))

    def init_hidden(self, batch_size=None):
        """Return a zero ``(h0, c0)`` pair of shape (1, batch, hidden_dim).

        ``batch_size`` defaults to the size given at construction, so existing
        zero-argument calls behave as before. The state is placed on the GPU
        exactly when the model's own parameters live there.
        """
        if batch_size is None:
            batch_size = self.batch_size
        hidden1 = Variable(torch.zeros(1, batch_size, self.hidden_dim))
        hidden2 = Variable(torch.zeros(1, batch_size, self.hidden_dim))
        if next(self.parameters()).is_cuda:
            return (hidden1.cuda(), hidden2.cuda())
        return (hidden1, hidden2)

    def forward(self, x, hidden=None):
        """Map a batch-first input (batch x seqlen x emb_dim) to class logits.

        ``hidden`` may be omitted, in which case the LSTM starts from zeros.
        Returns a (batch x n_labels) tensor.
        """
        # seqlen x batch x emb_dim
        x = torch.transpose(x, 1, 0)
        x, _hidden = self.lstm(x, hidden)
        # Output of the last time step is already (batch x hidden_dim); taking
        # it directly avoids hard-wiring the construction-time batch size.
        x = x[-1, :, :]
        x = self.lin(x)
        return x

In [197]:

# Classifier over 400-d note vectors with one logit per entry of label_map.
model = LSTMModel(400, hidden_dim, label_map, batch_size)
crit = nn.CrossEntropyLoss()

# Move the model to the GPU *before* building the optimizer, as the
# torch.optim documentation advises.
if use_cuda:
    model.cuda()
    crit.cuda()

opti = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.5, 0.999))

RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/torch/lib/THC/generic/THCStorage.cu:66

In [169]:
def evaluate(model, loader, batch_size):
    """Return the classification accuracy of ``model`` over ``loader``.

    Args:
        model: module exposing ``init_hidden()`` and callable as
            ``model(x, hidden)``, returning per-class scores.
        loader: iterable of ``[sequences, labels]`` batches.
        batch_size: only batches holding exactly this many samples are
            scored, because ``init_hidden()`` is called without a size.

    Returns:
        float in [0, 1]: fraction of correctly predicted samples among the
        scored batches, or NaN when no batch could be scored.

    The model is switched to eval mode for the duration of the call and put
    back into train mode afterwards if that is how it arrived.
    """
    first_param = next(iter(model.parameters()), None)
    on_gpu = first_param is not None and first_param.is_cuda  # inputs follow the model
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    skipped = 0

    try:
        for batch in loader:
            if len(batch[0]) != batch_size:
                continue
            try:
                batch_x = torch.from_numpy(np.array(batch[0])).float()
                batch_y = torch.from_numpy(np.array(batch[1])).long()
            except (TypeError, ValueError, RuntimeError):
                # Ragged or non-numeric batch: keep going, but do not hide it.
                skipped += 1
                continue
            if on_gpu:
                batch_x, batch_y = batch_x.cuda(), batch_y.cuda()

            x = Variable(batch_x)
            hidden = model.init_hidden()
            x = model(x, hidden)
            _, predicted = torch.max(x.data, 1)
            total += batch_y.size(0)
            # int() keeps the running count a plain Python number.
            correct += int((predicted == batch_y).sum())
    finally:
        if was_training:
            model.train()

    if skipped:
        print("evaluate: skipped %d batch(es) that could not be converted to tensors" % skipped)
    if total == 0:
        return float('nan')
    return correct / float(total)

In [179]:
# One pass over train_loader. Every 100 steps the current loss and the
# validation accuracy are printed and logged; the logs are pickled at the end.
step = 0
step_log = []
loss_log = []
val_acc_log = []
skipped_batches = 0
torch.manual_seed(1)
model.train()
for batch in train_loader:
    # init_hidden() builds a state for exactly batch_size samples, so smaller
    # batches (e.g. the final partial one) are not used.
    if len(batch[0]) != batch_size:
        continue
    try:
        batch_x = torch.from_numpy(np.array(batch[0])).float()
        batch_y = torch.from_numpy(np.array(batch[1])).long()
    except (TypeError, ValueError, RuntimeError):
        # Batch could not be converted to a numeric tensor; count it rather
        # than dropping it silently.
        skipped_batches += 1
        continue

    if use_cuda:
        batch_x, batch_y = batch_x.cuda(), batch_y.cuda()

    model.zero_grad()
    x = Variable(batch_x)
    hidden = model.init_hidden()
    x = model(x, hidden)

    loss = crit(x, Variable(batch_y.view(batch_size)))
    loss.backward()
    opti.step()

    if step % 100 == 0:
        val_acc = evaluate(model, val_loader, batch_size)
        # evaluate() switches the model to eval mode; make sure training
        # continues in train mode.
        model.train()
        print("Step: %d, Loss: %.4f, Validation Acc: %.2f"%(step, loss.data[0], val_acc))
        step_log.append(step)
        loss_log.append(loss.data[0])
        val_acc_log.append(val_acc)
    step += 1

if skipped_batches:
    print("Skipped %d training batch(es) that could not be converted to tensors" % skipped_batches)

# Binary mode is required for pickle; `with` closes the file even on error.
with open('results.pkl', 'wb') as f:
    pickle.dump({'step': step_log, 'loss': loss_log, 'val': val_acc_log}, f)

250
<type 'list'>
object


RuntimeError: can't convert a given np.ndarray to a tensor - it has an invalid type. The only supported types are: double, float, int64, int32, and uint8.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Training loss at each logged step, labelled so the figure can be read on its own.
plt.plot(step_log, loss_log)
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training loss')
plt.show()