### Data modeling imports

In [None]:
# Natural Language Processing libraries
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Machine learning libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score

# PyTorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, vocab
import torchtext
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# General-purpose libraries
import os
import random
import re
import string
import time
from datetime import timedelta, datetime
from tqdm import tqdm

### Constants and paths

In [None]:
# Directory holding the CSV inputs. Set the DL4HC_DATA_PATH environment
# variable to point at your own copy; the fallback is the original local path.
DATA_PATH = os.environ.get("DL4HC_DATA_PATH", "/Users/sg/dev/dl4hc_proj/data/data_source")

In [None]:
# Preprocessing objects shared by the classical-ML cells below.
vectorizer = TfidfVectorizer(max_features=600)
encoder = LabelEncoder()

# Read both CSV files found under DATA_PATH.
data = pd.read_csv(os.path.join(DATA_PATH, 'data.csv'))
data_ns = pd.read_csv(os.path.join(DATA_PATH, 'data_ns.csv'))

In [None]:
def run_svm(df, encoder, vectorizer):
    """Train and score one linear SVM per morbidity class.

    For every distinct value in ``df["class"]`` the matching rows are split
    80/20, the ``data`` text column is vectorized, a linear ``SVC`` is fitted
    on the training part, and macro/micro F1 on the held-out part are printed.

    Args:
        df: DataFrame with ``class``, ``data`` and ``judgment`` columns.
        encoder: Label encoder (e.g. ``LabelEncoder``); re-fitted on the
            labels of each class.
        vectorizer: Text vectorizer (e.g. ``TfidfVectorizer``); re-fitted on
            the training texts of each class.

    Returns:
        None. The scores are printed.
    """
    for morbidity in df["class"].unique():
        subset = df[df['class'] == morbidity]

        # Encode on the whole subset so a label that only lands in the test
        # split still has a code.
        labels = encoder.fit_transform(subset['judgment'])
        X_train, X_test, y_train, y_test = train_test_split(
            subset['data'], labels, test_size=0.20, shuffle=True)

        # Learn the vocabulary/IDF weights from the training texts only and
        # reuse them for the test texts; re-fitting on the test split would
        # produce columns that do not line up with what the SVM was trained on.
        train_tfidf = vectorizer.fit_transform(X_train)
        test_tfidf = vectorizer.transform(X_test)

        SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
        SVM.fit(train_tfidf, y_train)
        predictions_SVM = SVM.predict(test_tfidf)
        f1_macro = f1_score(y_test, predictions_SVM, average='macro')
        f1_micro = f1_score(y_test, predictions_SVM, average='micro')

        print(morbidity)
        print("f1-macro", f1_macro)
        print("f1-micro", f1_micro)
    
# Run the per-class SVM evaluation on `data` with the shared encoder and vectorizer.
run_svm(data, encoder, vectorizer)

### SVM performs poorly on some classes. The data is skewed. Training SVM with parameters from the paper and ExtraTreesClassifier.

In [None]:
# Single SVM trained on all morbidity classes pooled together.
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['judgment'], test_size=0.20, shuffle=True)

# Fit the TF-IDF vocabulary on the training texts only, then project the test
# texts onto that same vocabulary so both matrices share their columns.
Train_X_Tfidf = vectorizer.fit_transform(X_train)
Test_X_Tfidf = vectorizer.transform(X_test)

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, y_train)
predictions_SVM = SVM.predict(Test_X_Tfidf)

print("accuracy_score", accuracy_score(y_test, predictions_SVM))
f1_macro = f1_score(y_test, predictions_SVM, average='macro')
f1_micro = f1_score(y_test, predictions_SVM, average='micro')
print("f1-macro", f1_macro)
print("f1-micro", f1_micro)

### Trying out different hyperparameters

In [None]:
# Feature selection: keep the TF-IDF columns that SelectFromModel retains
# based on an extra-trees ensemble fitted on the training matrix.
feature_selection_model = ExtraTreesClassifier(n_estimators=100)
transformer = SelectFromModel(feature_selection_model)
Train_X_Tfidf_selected = transformer.fit_transform(Train_X_Tfidf, y_train)

# 5-fold grid search over SVC kernel and gamma on the selected features.
param_grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': [0.1, 1, 10],
}
SVM = svm.SVC(verbose=True)
grid_search = GridSearchCV(SVM, param_grid, cv=5, verbose=2)
grid_search.fit(Train_X_Tfidf_selected, y_train)
print("Best Hyperparameters: ", grid_search.best_params_)


In [None]:
# Refit an SVC with the winning grid-search parameters on the selected training features.
best_svm_model = svm.SVC(kernel=grid_search.best_params_['kernel'], gamma=grid_search.best_params_['gamma'])
best_svm_model.fit(Train_X_Tfidf_selected, y_train)

# Score the tuned model itself: apply the fitted feature selector to the test
# matrix and predict, instead of reusing the predictions of the earlier SVM.
Test_X_Tfidf_selected = transformer.transform(Test_X_Tfidf)
predictions_best_svm = best_svm_model.predict(Test_X_Tfidf_selected)

print("accuracy_score", accuracy_score(y_test, predictions_best_svm))
f1_macro = f1_score(y_test, predictions_best_svm, average='macro')
f1_micro = f1_score(y_test, predictions_best_svm, average='micro')
print("f1-macro", f1_macro)
print("f1-micro", f1_micro)

In [None]:
SVM = svm.SVC(verbose=True)

# SelectKBest is itself a feature selector (univariate scores, library-default
# k). It exposes no coef_/feature_importances_, so it cannot be wrapped in
# SelectFromModel; use it directly as the transformer.
feature_selection_model = sklearn.feature_selection.SelectKBest()
transformer = feature_selection_model
Train_X_Tfidf_selected = transformer.fit_transform(Train_X_Tfidf, y_train)

param_grid = {'kernel': ['linear'], 'gamma': [0.1]}
grid_search = GridSearchCV(SVM, param_grid, cv=5, verbose = 2)
grid_search.fit(Train_X_Tfidf_selected, y_train)
print("Best Hyperparameters: ", grid_search.best_params_)

best_svm_model = svm.SVC(kernel=grid_search.best_params_['kernel'], gamma=grid_search.best_params_['gamma'])
best_svm_model.fit(Train_X_Tfidf_selected, y_train)

# Evaluate this model on the test matrix reduced to the same selected columns,
# rather than re-printing the scores of an earlier model's predictions.
Test_X_Tfidf_selected = transformer.transform(Test_X_Tfidf)
predictions_kbest_svm = best_svm_model.predict(Test_X_Tfidf_selected)

print("accuracy_score", accuracy_score(y_test, predictions_kbest_svm))
f1_macro = f1_score(y_test, predictions_kbest_svm, average='macro')
f1_micro = f1_score(y_test, predictions_kbest_svm, average='micro')
print("f1-macro", f1_macro)
print("f1-micro", f1_micro)

### Calculating F-1 score and comparing other models 

In [None]:
# Models compared in this cell. A list that also contained
# svm.SVC(kernel='linear', gamma=0.1, verbose=True) used to be assigned first
# and was immediately overwritten, so the SVC was never part of the comparison.
models = [RandomForestClassifier(verbose=True), sklearn.naive_bayes.GaussianNB()]

# NOTE(review): kfold is defined but the cross-validation below runs with cv=5;
# confirm which splitter was intended before switching.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

X_train_text, X_test_text, y_train, y_test = train_test_split(data['data'], data['judgment'], test_size=0.20, shuffle=True)
# Fit TF-IDF on the training texts only and reuse it for the test texts so the
# columns match. Densify because GaussianNB does not accept sparse matrices.
X_train = vectorizer.fit_transform(X_train_text).toarray()
X_test = vectorizer.transform(X_test_text).toarray()

# Cross-validated micro F1 on the training split.
for model in models:
    f1_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_micro')
    avg_f1_score = np.mean(f1_scores)
    print(f"Model: {type(model).__name__}, Avg F-1 Score: {avg_f1_score}")


# Micro F1 on the held-out test split after fitting on the full training split.
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_f1_micro = f1_score(y_test, y_pred, average='micro')
    print(f"Model: {type(model).__name__}, F-1 Score on Test Data: {test_f1_micro}")

### Creating and training DL model

In [None]:
class RecordsDataset(Dataset):
    """Dataset over the rows of ``dataframe`` whose ``disease`` column equals ``disease``.

    Each item is a ``(text, judgment)`` pair taken from the columns of the
    same names.
    """

    def __init__(self, dataframe, disease):
        self.disease = disease
        # Work on an independent copy of the matching rows, renumbered from 0
        # (the previous index is kept as an ``index`` column).
        matching_rows = dataframe['disease'] == disease
        self.df = dataframe[matching_rows].copy().reset_index()

    def __len__(self):
        """Number of rows for this disease."""
        return len(self.df)

    def __getitem__(self, i):
        """Return ``(text, judgment)`` for the row at position ``i``."""
        record = self.df.iloc[i]
        return record['text'], record['judgment']

In [None]:
class MorbidityLSTM(nn.Module):
    """Two stacked bidirectional LSTMs over pretrained embeddings with a 2-way linear head.

    Construction reads the notebook-level ``vocab`` and ``glove_embeddings``
    objects, so both must be defined before instantiating the model.

    The final linear layer is sized ``hidden_dim2 * max_tokens``, so ``forward``
    only works for inputs whose sequence length equals ``max_tokens``.
    """

    def __init__(self):
        super().__init__()
        self.max_tokens = 1500
        self.dropout = 0.1
        self.hidden_size = 128
        self.embedding_dim = 300

        # Build the initial embedding matrix row by row from the pretrained vectors.
        weights = np.zeros((len(vocab), self.embedding_dim))
        # Cover every vocabulary index; stopping at len(vocab) - 1 would leave
        # the last token with an all-zero embedding.
        for i in range(len(vocab)):
            word = vocab.lookup_token(i)
            weights[i] = glove_embeddings.get_vecs_by_tokens(word)

        # freeze=False: the embeddings are fine-tuned during training.
        self.em = nn.Embedding.from_pretrained(torch.tensor(weights).float(), freeze=False)
        self.hidden_dim1 = self.hidden_size
        self.hidden_dim2 = int(self.hidden_size/2)
        self.num_layers = 1
        # Each bidirectional LSTM uses half the target width per direction, so
        # its concatenated output has hidden_dim1 / hidden_dim2 features.
        self.bilstm1 = nn.LSTM(input_size = self.embedding_dim, hidden_size = int(self.hidden_dim1/2), bidirectional = True,
                               batch_first = True, num_layers = self.num_layers)
        self.bilstm2 = nn.LSTM(input_size = self.hidden_dim1, hidden_size = int(self.hidden_dim2/2), bidirectional = True,
                               batch_first = True, num_layers=self.num_layers)

        self.do = nn.Dropout(self.dropout)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(self.hidden_dim2 * self.max_tokens, 2)


    def forward(self, x):
        """Map a batch of token-index sequences to two output scores per sample.

        Args:
            x: Integer index tensor fed to the embedding layer; the LSTMs are
                batch-first, and the sequence length must equal ``max_tokens``.

        Returns:
            The output of the final linear layer (2 values per sample).
        """
        x = self.em(x)
        x, _ = self.bilstm1(x)
        x, _ = self.bilstm2(x)
        x = self.flatten(x)
        x = self.do(x)
        return self.fc1(x)

In [None]:
def train(model, train_dataloader, n_epoch, lr):
    """Train ``model`` with Adam and cross-entropy loss.

    Args:
        model: ``nn.Module`` returning class scores for a batch of inputs.
        train_dataloader: Iterable yielding ``(X, Y)`` tensor batches.
        n_epoch: Number of passes over ``train_dataloader``.
        lr: Learning rate passed to Adam.

    Returns:
        ``(model, loss_list)`` where ``loss_list`` holds one scalar numpy loss
        value per processed batch, in processing order across all epochs.
    """
    # Send every batch to the device the model lives on, so inputs and targets
    # always agree with each other and with the model.
    device = next(model.parameters()).device

    model.train()
    loss_list = []
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_func = nn.CrossEntropyLoss()

    for _ in range(n_epoch):
        for X, Y in tqdm(train_dataloader):
            optimizer.zero_grad()
            y_hat = model(X.to(device))
            loss = loss_func(y_hat, Y.to(device))
            loss.backward()
            optimizer.step()
            loss_list.append(loss.detach().cpu().numpy())

    return model, loss_list

def _eval(model, dataloader):
    model.eval()
    pred_all = []
    Y_test = []
    for X, Y in tqdm(dataloader):
        y_hat = emodel(X.to(torch.device('cpu')))
        pred_all.append(y_hat.cpu().data.numpy())
        Y_test.append(Y.cpu().data.numpy())
    pred_all = np.concatenate(pred_all, axis=0)
    Y_test = np.concatenate(Y_test, axis=0)

    return pred_all, Y_test

def _collate(batch):
    batch_size = len(batch)
    X = torch.zeros(batch_size, len(batch[0][0]), dtype=torch.long)
    Y = torch.zeros((batch_size), dtype=torch.long)
    for i in range(len(batch)):
        x, y = batch[i]
        vectors = vocab.lookup_indices(x)
        X[i] = torch.tensor(vectors).long()
        Y[i] = torch.tensor(float(y == True))
        
    return X,Y

In [None]:
df = pd.read_csv("./dl4hc/data.csv")
# NOTE: torch.load unpickles arbitrary Python objects; only load a vocab file you trust.
# NOTE: this rebinds `vocab`, hiding the `vocab` function imported from torchtext.vocab.
vocab = torch.load("./dl4hc/glove_vocab.obj")
glove_embeddings = torchtext.vocab.GloVe(name='6B', dim=300)
lr = 0.01
n_epoch = 25

# Train and evaluate one model per disease. The disease list comes from `df`,
# the same frame the datasets are built from.
for disease in df["disease"].unique():
    model = MorbidityLSTM()
    ds = RecordsDataset(df, disease)
    ds_train, ds_test = train_test_split(ds, test_size=0.20, shuffle=True)
    train_loader = torch.utils.data.DataLoader(ds_train, batch_size = 32, collate_fn=_collate)
    val_loader = torch.utils.data.DataLoader(ds_test, batch_size = 32, collate_fn=_collate)
    model, loss_list = train(model, train_loader, n_epoch, lr)
    pred, truth = _eval(model, val_loader)
    # evaluate_predictions is expected to be defined elsewhere in the notebook.
    auroc, f1, f1_macro, f1_micro = evaluate_predictions(truth, pred)
    print(auroc, f1, f1_macro, f1_micro)