<a href="https://colab.research.google.com/github/Ruisong4/CS598-DLH-Group-103/blob/main/CS598_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load the libraries we need

In [None]:
from google.colab import drive
import os
from gensim.models import KeyedVectors
import numpy as np
from collections import defaultdict
from torch.utils.data import DataLoader, SubsetRandomSampler, random_split, Dataset
from sklearn.model_selection import KFold
import torch.nn as nn
import torch.nn.functional as F
import nltk
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# Download the NLTK "punkt" tokenizer package; nltk.word_tokenize is called on every note later in the notebook.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Set up the GPU

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print('Using device:', device)

Using device: cuda


Set up the working directory

In [None]:
# Colab-specific setup: adjust the mount point and the data folder to match your own environment.
# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive/')
# Make the data folder the working directory so that the relative file names
# used in later cells ("PubMed-w2v.bin", 'x.txt', ...) resolve against it.
os.chdir("/content/drive/My Drive/CS598_DATA")

Load word2vec from file

In [None]:
# Load the pre-trained word vectors from a word2vec file in binary format.
# NOTE(review): the name `model` is rebound to a ReadmissionModel in a later cell,
# so this cell must run before the W2V wrapper is instantiated.
model = KeyedVectors.load_word2vec_format("PubMed-w2v.bin", binary=True)

# Utility wrapper that also serves vectors for words missing from the model.
class W2V:
  """Embedding lookup that falls back to random vectors for unknown words.

  Words contained in the wrapped model are answered by the model itself. Any
  other word gets a vector drawn uniformly from [-1, 1) the first time it is
  requested; that vector is cached in `unknow_words`, so repeated lookups of
  the same word return the same array.
  """

  def __init__(self, model=None):
    # `model` must expose `vector_size`, support `in` and item lookup.
    self.w2v = model
    self.embedding_size = self.w2v.vector_size
    self.unknow_words = dict()

  def __getitem__(self, key):
    """Return the embedding for `key`, creating a random one if necessary."""
    if key in self.w2v:
      return self.w2v[key]
    fallback = self.unknow_words.get(key)
    if fallback is None:
      # First time this out-of-vocabulary word is seen: draw and remember it.
      fallback = np.random.uniform(-1, 1, (self.embedding_size,))
      self.unknow_words[key] = fallback
    return fallback

# Wrap the loaded vectors so that words missing from the model also get an embedding.
w2v = W2V(model)
print("Word2Vec Model Loaded")

Word2Vec Model Loaded


load the data from file

In [None]:
def _read_notes(path):
  """Return every line of `path` unchanged (trailing newlines included)."""
  with open(path) as handle:
    return [row for row in handle]


def _read_labels(path):
  """Return the lines of `path` parsed as integers, as a NumPy array."""
  with open(path) as handle:
    return np.array([int(row) for row in handle])


# Each line of an x file is kept as one note; each line of a y file is one label.
x_general = _read_notes('x.txt')
x_thirty_days = _read_notes('x_30.txt')
y_general = _read_labels('y.txt')
y_thirty_days = _read_labels('y_30.txt')

# sanity check: notes and labels line up, and the labels span exactly 0..1
assert len(x_general) == len(y_general)
assert len(x_thirty_days) == len(y_thirty_days)
assert y_general.max() == 1 and y_general.min() == 0
assert y_thirty_days.max() == 1 and y_thirty_days.min() == 0
print("Data File Loaded")

Data File Loaded


Tokenize the discharge notes into words and calculate the max, min, and average document length

In [None]:
def _tokenize_notes(notes):
  """Tokenize each note and keep only the purely alphanumeric tokens.

  Returns the list of token lists together with the maximum, minimum and
  mean number of kept tokens per note.
  """
  tokenized = [
      [token for token in nltk.word_tokenize(text) if token.isalnum()]
      for text in notes
  ]
  lengths = [len(tokens) for tokens in tokenized]
  longest = max(lengths, default=0)
  shortest = min(lengths, default=np.inf)
  mean_length = sum(lengths) / len(tokenized)
  return tokenized, longest, shortest, mean_length


(x_general_tokenized, x_general_max_words,
 x_general_min_words, x_general_avg_words) = _tokenize_notes(x_general)

(x_thirty_days_tokenized, x_thirty_days_max_words,
 x_thirty_days_min_words, x_thirty_days_avg_words) = _tokenize_notes(x_thirty_days)

print("Finish tokenizing all words")
print("In general dataset the max word count:", x_general_max_words, "min count:", x_general_min_words, "avg:", x_general_avg_words)
print("In 30-days dataset the max word count:", x_thirty_days_max_words, "min count:", x_thirty_days_min_words, "avg:", x_thirty_days_avg_words)

Finish tokenizing all words
In general dataset the max word count: 4115 min count: 13 avg: 1196.8879310344828
In 30-days dataset the max word count: 3553 min count: 21 avg: 1224.2037845705968


1.   Create DataSet to hold the data
2.   Generate train and val datasets (90% train, 10% val, according to the paper)
3.   Creating DataLoader

In [None]:
class ReadmissionDataSet(Dataset):
  """Dataset of tokenized notes served as zero-padded embedding matrices.

  Args:
    notes: sequence of token lists, one list per note.
    labels: sequence of labels aligned with `notes`.
    w2v: embedding lookup supporting `w2v[token]` and exposing `embedding_size`.
    max_len: number of rows that shorter notes are padded to.
  """

  def __init__(self, notes, labels, w2v, max_len):
    self.x = notes
    self.y = labels
    self.max_len = max_len
    self.w2v = w2v

  def __len__(self):
    return len(self.x)

  def __getitem__(self, index):
    """Return `(embeddings, label)` for the note at `index`.

    `embeddings` is a float32 array with one row per token followed by
    all-zero rows up to `max_len`. Notes that already have `max_len` or more
    tokens are returned as they are (no padding, no truncation).
    """
    # Use the lookup this dataset was built with, not a module-level global.
    embedding_size = self.w2v.embedding_size
    tokens = self.x[index]
    if len(tokens) > 0:
      note = np.array([self.w2v[w] for w in tokens], dtype=np.float32)
    else:
      # A note without tokens would otherwise become a 1-D array that cannot
      # be concatenated with the 2-D padding below.
      note = np.zeros((0, embedding_size), dtype=np.float32)
    padded = note
    if len(note) < self.max_len:
      pad = np.zeros((self.max_len - len(note), embedding_size), dtype=np.float32)
      padded = np.concatenate([padded, pad])
    return padded, self.y[index]

# Pad every note to the longest note of its own dataset.
general_data_set = ReadmissionDataSet(x_general_tokenized, y_general, w2v, x_general_max_words)
thirty_days_data_set = ReadmissionDataSet(x_thirty_days_tokenized, y_thirty_days, w2v, x_thirty_days_max_words)

# 90% of each dataset goes to training; the remainder is held out.
general_training_size = int(0.9 * len(general_data_set))
thirty_days_training_size = int(0.9 * len(thirty_days_data_set))

general_test_size = len(general_data_set) - general_training_size
thirty_days_test_size = len(thirty_days_data_set) - thirty_days_training_size

# Seed each split so the held-out notes are identical on every run
# (598 is the same seed the KFold in cross_validate uses).
general_train_dataset, general_test_dataset = random_split(
    general_data_set,
    [general_training_size, general_test_size],
    generator=torch.Generator().manual_seed(598))
thirty_days_train_dataset, thirty_days_test_dataset = random_split(
    thirty_days_data_set,
    [thirty_days_training_size, thirty_days_test_size],
    generator=torch.Generator().manual_seed(598))

Define the model

In [None]:
class ReadmissionModel(nn.Module):
  """CNN classifier over a sequence of word embeddings.

  Three parallel convolutions spanning 1, 2 and 3 consecutive rows (each
  covering the full embedding width) are applied to the input. Every feature
  map goes through ReLU and is max-pooled over the whole sequence; the pooled
  features are concatenated, passed through dropout and mapped to class
  logits by a linear layer.

  Args:
    embedding_size: width of each word vector (default 200).
    num_filters: output channels of each convolution (default 64).
    dropout: dropout probability applied before the linear layer (default 0.5).
    num_classes: number of output logits (default 2).
  """

  def __init__(self, embedding_size=200, num_filters=64, dropout=0.5, num_classes=2):
    super(ReadmissionModel, self).__init__()
    # Attribute names and creation order are kept so state dicts and seeded
    # initialisation stay compatible with the default configuration.
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(1, embedding_size))
    self.conv2 = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(2, embedding_size))
    self.conv3 = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(3, embedding_size))
    self.dropout = nn.Dropout(p=dropout)
    self.linear = nn.Linear(num_filters * 3, num_classes)

  @staticmethod
  def _conv_relu_pool(conv, x):
    """Apply `conv`, ReLU and max pooling over the sequence; returns (batch, channels)."""
    out = conv(x)
    # The kernel spans the whole embedding width, so dim 3 has size 1.
    out = torch.squeeze(out, dim=3)
    out = F.relu(out)
    out = F.max_pool1d(out, kernel_size=out.shape[2])
    return torch.squeeze(out, dim=2)

  def forward(self, x):
    """Map `x` of shape (batch, seq_len, embedding_size) to (batch, num_classes) logits."""
    # Add the single input channel expected by Conv2d.
    unsqueezed = torch.unsqueeze(x, 1)
    pooled = [
        self._conv_relu_pool(conv, unsqueezed)
        for conv in (self.conv1, self.conv2, self.conv3)
    ]
    out = torch.cat(pooled, dim=1)
    out = self.dropout(out)
    return self.linear(out)


helper function to calculate accuracy

In [None]:
def accuracy(output, labels):
    """Fraction of rows of `output` whose arg-max along dim 1 equals the label.

    The result is returned as a 0-dim float tensor.
    """
    return (output.argmax(dim=1) == labels).sum().float() / len(labels)

Helper function to train the model

In [None]:
def train_model(model, train_loader, n_epoch, optimizer, criterion, device):
  """Train `model` in place for `n_epoch` passes over `train_loader`.

  Args:
    model: module to optimise; it is switched to training mode.
    train_loader: iterable of `(data, target)` tensor batches supporting `len()`.
    n_epoch: number of passes over `train_loader`.
    optimizer: optimizer that updates the parameters of `model`.
    criterion: loss callable applied to `(model(data), target)`.
    device: device each batch is moved to before the forward pass.

  Returns:
    The same `model` object after training.

  After every epoch the mean batch loss and the mean batch accuracy (in
  percent) are printed.
  """
  model.train()
  for epoch in range(n_epoch):
    epoch_loss = 0
    epoch_acc = 0
    for data, target in train_loader:
      data = data.to(device)
      target = target.to(device)

      optimizer.zero_grad()

      # The raw model output is passed to the criterion as-is.
      y_hat = model(data)
      acc = accuracy(y_hat, target)

      loss = criterion(y_hat, target)
      loss.backward()
      optimizer.step()

      epoch_loss += loss.item()
      epoch_acc += acc.item()
    print(f"Epoch {epoch}: loss: {epoch_loss / len(train_loader)} acc: {100*epoch_acc / len(train_loader)}")
  return model

Helper function to test the model

In [None]:
def test_model(model, test_loader):
  model.eval()
  Y_pred = []
  Y_test = []

  for data, target in test_loader:
    data = data.to(device)
    target = target.to(device)
    Y_test.extend(target.tolist())
    y_hat = model(data)
    y_hat = y_hat.argmax(dim=1)
    Y_pred.extend(y_hat.tolist())

  Y_test = np.array(Y_test)

  return Y_pred, Y_test

define collate function

In [None]:
def collate_fn(data):
  """Batch `(array, label)` samples into a float tensor and an int64 label tensor.

  The arrays are stacked along a new leading dimension, so they must all
  share the same shape.
  """
  features = torch.stack([torch.from_numpy(sample[0]) for sample in data], dim=0).float()
  targets = torch.tensor([sample[1] for sample in data], dtype=torch.int64)
  return features, targets

10-fold Cross Validation

In [None]:
def cross_validate(model, dataset, n_splits, batch_size, n_epoch, optimizer, criterion, device):
  """Run k-fold cross-validation of `model` on `dataset`.

  For every fold the model weights and the optimizer state are reset, the
  model is trained on the training indices and evaluated on the held-out
  indices. Per-fold precision, recall, F1 and accuracy are printed.

  Args:
    model: module to train; it is re-initialised at the start of every fold.
    dataset: indexable dataset supporting `len()`.
    n_splits: number of folds.
    batch_size: batch size of the train and test loaders.
    n_epoch: training epochs per fold.
    optimizer: optimizer bound to the parameters of `model`.
    criterion: loss passed on to `train_model`.
    device: device passed on to `train_model`.

  Returns:
    A list with one dict per fold, holding the keys "fold", "precision",
    "recall", "f1" and "acc".
  """

  kfold = KFold(n_splits=n_splits, shuffle=True, random_state=598)

  model_performance = []

  for fold, (train_idx, test_idx) in enumerate(kfold.split(dataset)):
    print("Fold", fold, "begins")
    train_subsampler = SubsetRandomSampler(train_idx)
    test_subsampler = SubsetRandomSampler(test_idx)
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_subsampler, collate_fn=collate_fn)
    test_loader = DataLoader(dataset, batch_size=batch_size, sampler=test_subsampler, collate_fn=collate_fn)

    # clear model weight for next fold; modules() also reaches nested layers
    count = 0
    for layer in model.modules():
      if hasattr(layer, "reset_parameters"):
        count += 1
        layer.reset_parameters()
    print("resetting weight in", count, "layers")

    # Drop per-parameter optimizer state (e.g. running averages) so nothing
    # learned in the previous fold carries over into this one.
    optimizer.state.clear()

    train_model(model, train_loader, n_epoch, optimizer, criterion, device)

    Y_pred, Y_test = test_model(model, test_loader)
    acc = accuracy_score(Y_test, Y_pred)
    p, r, f, _ = precision_recall_fscore_support(Y_test, Y_pred, average='binary')
    print("Fold", fold, "results: ", "percision:", p, "recall", r, "f1", f, "acc", acc)
    model_performance.append({"fold": fold, "precision": p, "recall": r, "f1": f, "acc": acc})

  return model_performance

In [None]:
# Move the model to the target device before building the optimizer, as the
# torch.optim documentation recommends.
model = ReadmissionModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.CrossEntropyLoss().to(device)
print(model)
# 10 folds, batch size 32, 10 epochs per fold.
cross_validate(model, general_train_dataset, 10, 32, 10, optimizer, criterion, device)

ReadmissionModel(
  (conv1): Conv2d(1, 64, kernel_size=(1, 200), stride=(1, 1))
  (conv2): Conv2d(1, 64, kernel_size=(2, 200), stride=(1, 1))
  (conv3): Conv2d(1, 64, kernel_size=(3, 200), stride=(1, 1))
  (dropout): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=192, out_features=2, bias=True)
)
Fold 0 begins
resetting weight in 4 layers
Epoch 0: loss: 0.7042130821353787 acc: 54.0521978021978
Epoch 1: loss: 0.6712891590464246 acc: 58.567994505494504
Epoch 2: loss: 0.6545111776053251 acc: 61.092032967032964
Epoch 3: loss: 0.6336868588741009 acc: 64.02815934065934
Epoch 4: loss: 0.621084199665667 acc: 65.43612637362638
Epoch 5: loss: 0.6019299899811273 acc: 66.75824175824175
Epoch 6: loss: 0.5831157776353123 acc: 68.9217032967033
Epoch 7: loss: 0.5749684136647445 acc: 69.72870879120879
Epoch 8: loss: 0.5544580802485183 acc: 71.17101648351648
Epoch 9: loss: 0.5357003814571506 acc: 73.30013736263736
Fold 0 results:  percision: 0.6198830409356725 recall 0.660436137071651 f1 0

KeyboardInterrupt: ignored

In [None]:
# Count the trainable parameters of a freshly constructed model.
# NOTE(review): this rebinds `model`, replacing the instance used for cross-validation above.
model = ReadmissionModel()
sum(p.numel() for p in model.parameters() if p.requires_grad)

77378