In [1]:
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None

In [2]:
mimic_data = pd.read_csv("data/text_binary.csv")
mimic_data.head()

Unnamed: 0,study_id,subject_id,text,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,58522792,16567081,"b"" FINAL REPOR...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,58213163,16567081,b' FINAL REPOR...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,59835582,16043746,b' FINAL REPOR...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,51487790,16456872,b' FINAL REPOR...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,59750073,16824069,b' FINAL REPOR...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [3]:
binary_data = mimic_data.copy()
binary_data.text = binary_data.text.str.lower()
binary_data.head()

Unnamed: 0,study_id,subject_id,text,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,58522792,16567081,"b"" final repor...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,58213163,16567081,b' final repor...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,59835582,16043746,b' final repor...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,51487790,16456872,b' final repor...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,59750073,16824069,b' final repor...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Train/Test Split

In [56]:
labels = binary_data[['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
                      'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding','Pleural Effusion', 'Pleural Other', 
                      'Pneumonia', 'Pneumothorax', 'Support Devices']]
labels['labels'] = labels.values.tolist()
y = labels[['labels']]
X = binary_data[['text']]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

## Text Cleaning & Tokenization

In [24]:
# remove all '\\n' from the text
re_newlines = re.compile('\\\\n')
def sub_newlines(x): return re_newlines.sub('',x)

# remove all special characters from the text, keep only alphanumeric and spaces
re_letters = re.compile('[^A-Za-z0-9 ]')
def sub_letters(x): return re_letters.sub('', x)

# remove excessive spacing otherwise you end up with " " substrings
re_spaces = re.compile('\s+')
def sub_spaces(x): return re_spaces.sub(' ', x)
                
# tokenize all words.
my_tok = spacy.load('en')
def spacy_tok(x): 
    return [tok.text for tok in my_tok.tokenizer(sub_spaces
                                                 (sub_letters
                                                 (sub_newlines(x))))]

In [25]:
sub_spaces(sub_letters(sub_newlines(binary_data.text[0])))

'b final report type of examination chest pa and lateral indication yearold male patient with recent pneumonia diagnosed and treated at another facility xray not available now with continued cough and wheeze history of copd remaining evidence of pneumonia findings pa and lateral chest views were obtained with patient in upright position analysis is performed in direct comparison with the next preceding chest examination of the heart size remains normal no typical configurational abnormality is seen the thoracic aorta is moderately widened and somewhat elongated but no local contour abnormalities are identified the pulmonary vasculature is not congested there exists however some irregular peripheral vascular distribution most marked on the bases and coinciding with some slightly hypertranslucent pulmonary areas and flattened low positioned diaphragms are indicative of copd when direct comparison is made with the previous examination of there is a hazy mild degree of density in the left 

In [27]:
# spacy_tok(binary_data.text[0])[1:]

In [28]:
def get_counts(text):
    counts = Counter()
    for word in text:
        counts.update(spacy_tok(word)[1:])
    return counts

In [30]:
word_count = get_counts(binary_data.text)

In [31]:
len(word_count.keys())

37093

In [32]:
for word in list(word_count):
    if word_count[word] < 3:
        del word_count[word]

In [33]:
len(word_count.keys())

15527

In [34]:
vocab2index = {"<PAD>":0, "UNK":1} # init with padding and unknown
words = ["<PAD>", "UNK"]
for word in word_count:
    vocab2index[word] = len(words)
    words.append(word)

In [35]:
len(words)

15529

In [36]:
def encode_sentence(text):
    enc = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in text.split()])
    return enc

## Dataset and Data Loader

In [60]:
class Binary_Mimic(Dataset):
    def __init__(self, X, y, vocab):
        self.x = [encode_sentence(x) for x in X.text]
        self.y = y.labels
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        return x, y

In [61]:
b_train = Binary_Mimic(X_train, y_train, vocab2index)
b_valid = Binary_Mimic(X_val, y_val, vocab2index)

In [63]:
def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (sentences, labels).
    
    Need custom collate_fn because merging sequences (including padding) is not 
    supported in default. Sequences are padded to the maximum length of mini-batch 
    sequences (dynamic padding).
    
    Args:
        data: list of tuple (sentence, label). 
            - list of word indices of variable length
            - label, 0 or 1
    Returns:
        packed_batch: (PackedSequence), see torch.nn.utils.rnn.pack_padded_sequence
        sencences: torch tensor of shape (batch_size, max_len).
        labels: torch tensor of shape (batch_size, 1).
        lengths: list; valid length for each padded sentence. 
    """
    # Sort a data list by sentences length (descending order).
    data.sort(key=lambda x: len(x[0]), reverse=True)
    sentences, labels = zip(*data)
    
    # stack labels
    labels = torch.Tensor(labels)
    
    # Merge sentences
    lengths = [len(s) for s in sentences]
   
    sents = torch.zeros(len(sentences), max(lengths)).long()
    for i, s in enumerate(sentences):
        end = lengths[i]
        sents[i, :end] = torch.Tensor(s[:end])        
    
    return sents, lengths, labels

In [64]:
train_loader = DataLoader(b_train, batch_size=2, shuffle=True, collate_fn=collate_fn)
sents, lengths, labels = next(iter(train_loader))
sents

tensor([[   1,    2,    1,    1,   30,    5,    1,  376,  565,    1,  641,    1,
            1,    1,    1,    7,  194,  182,    1,    1,    1,    9,    1,    1,
            1,    8,    9,   10,  194,    5,   46,    1,    1,    1,  354,   57,
           42,  340, 1134,  500,  409,  410,   46,  102,    1,  128,    1,   97,
          168,  452,  267,  404,  268,  399,   38,   46,    1,    1,   46,   49,
           42,  354,   60,    1,  131,   38,   50,  250,    1,   46,  301,    1,
           73,   42,   53,  139,  163,  153,    1,  169,    1,  419,   42,    1,
            1,  112,   42,  327,  165,   46,  301,    1,    1, 1165,  122,   68,
          354,   57, 1423,   46,  107,   58,    1,    1,    1,   53,   33,    5,
            1],
        [   1,    2,    1,    7,   83,    1,    1,    1,  134,    1,  150,  151,
           70,    1,    1,    1,   73,   68,  134,  162,  298,  139,  270,   15,
            1,    1,  669,   38,  356,  107,    1,   73,  720,  165,  253,    1,
           7

## Basic GRU Model:

In [80]:
class GRUModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(GRUModel,self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 14)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, lengths):
        print(x.shape)
        x = self.embeddings(x)
        x = self.dropout(x)
        pack = pack_padded_sequence(x, lengths, batch_first=True)
        out_pack, ht = self.gru(pack)
        return self.linear(ht[-1])

In [111]:
def train_epocs(model, optimizer, train_dl, valid_dl, epochs=10):
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long()#.cuda()
            y = y.float()#.cuda()
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = nn.BCEWithLogitsLoss()(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, valid_dl)
        print("Epoch #%.f: train loss %.3f val loss %.3f and val accuracy %.3f" % 
              (i+1,sum_loss/total, val_loss, val_acc))
        
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        x = x.long()#.cuda()
        y = y.float()#.cuda()
        y_hat = model(x, s)
        loss = nn.BCEWithLogitsLoss()(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [112]:
batch_size = 50000
train_dl = DataLoader(b_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(b_valid, batch_size=batch_size, collate_fn=collate_fn)

In [113]:
vocab_size = len(words)
print(vocab_size)
model = GRUModel(vocab_size, 50, 50)#.cuda()

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

15529


In [114]:
train_epocs(model, optimizer, train_dl, valid_dl, epochs=10)

torch.Size([50000, 520])


KeyboardInterrupt: 